fix(provisioner#24 PR-B): real Gitea TemplateAssetFetcher + wire (no stubs) #2857

Merged
core-devops merged 3 commits from fix/24-pr-b-gitea-fetcher into main 2026-06-14 22:40:21 +00:00
7 changed files with 969 additions and 0 deletions
+28
View File
@@ -249,6 +249,22 @@ func main() {
wh.SetCPProvisioner(cpProv)
}
// PR-B (RFC #2843 #24): wire the Gitea TemplateAssetFetcher.
// nil-if-empty + WARN: if the token isn't set, log a warning
// and stay in the "no fetcher wired" state. The SCAFFOLD gate
// in collectCPConfigFiles treats nil fetcher as "skip the
// fetcher" — pre-scaffold behavior preserved for self-host /
// unconfigured tenants. baseURL has a production default
// (https://git.moleculesai.app) but is overridable for staging
// or per-deployment Gitea mirrors.
if token := templateRepoToken(); token != "" {
baseURL := envOr("MOLECULE_GITEA_BASE_URL", "https://git.moleculesai.app")
wh.SetGiteaTemplateFetcher(provisioner.NewGiteaTemplateAssetFetcher(baseURL, token, nil))
log.Printf("template repo fetcher: wired (baseURL=%q, token set)", baseURL)
} else {
log.Printf("template repo fetcher: MOLECULE_TEMPLATE_REPO_TOKEN unset; fetcher disabled (self-host default / SCAFFOLD-gate skip)")
}
// Self-hosted platform-agent boot-provision (Change 1). The line-128 seed
// only creates the concierge DB ROW; on a fresh self-host that leaves it
// with no container (status='failed'/'online' but nothing running). Now that
@@ -667,6 +683,18 @@ func templateCacheToken() string {
return ""
}
// templateRepoToken returns the per-deployment READ-ONLY Gitea PAT
// used by the Gitea TemplateAssetFetcher (RFC #2843 #24 PR-B).
// Distinct from templateCacheToken (which is for the template cache,
// a different feature) so a tenant can rotate the fetcher token
// without touching the cache token. nil-if-empty + WARN: callers
// should treat empty as "fetcher disabled" (self-host default — the
// SCAFFOLD gate in collectCPConfigFiles treats nil fetcher as
// "skip the fetcher", pre-scaffold behavior preserved).
func templateRepoToken() string {
return strings.TrimSpace(os.Getenv("MOLECULE_TEMPLATE_REPO_TOKEN"))
}
func shouldRefreshTemplateCache(token, manifestPath string) bool {
switch strings.ToLower(strings.TrimSpace(os.Getenv("TEMPLATE_CACHE_REFRESH"))) {
case "0", "false", "off", "no":
@@ -60,6 +60,7 @@ func manifestPath() string {
type manifestEntry struct {
Name string `json:"name"`
Repo string `json:"repo"`
Ref string `json:"ref"`
}
type manifestFile struct {
@@ -178,3 +179,78 @@ func initKnownRuntimes() {
}
log.Printf("runtime registry: loaded %d runtimes from %s: %v", len(loaded), path, names)
}
// templateRepoRef is the parsed manifest entry needed to
// derive a TemplateIdentity for the Gitea fetcher. The
// identity is "<repo>@<ref>" (the giteaTemplateAssetFetcher
// parses this as "<owner>/<repo>@<ref>").
type templateRepoRef struct {
Repo string // e.g. "molecule-ai/molecule-ai-workspace-template-claude-code"
Ref string // e.g. "main" or a pinned tag/sha
}
// templateRepoByName holds the runtime → (repo, ref) mapping
// parsed from manifest.json at boot. Empty for runtimes that
// have no template repo (external, kimi, kimi-cli, mock —
// caller leaves cfg.TemplateIdentity empty for those, which
// the SCAFFOLD gate in collectCPConfigFiles treats as
// "skip the fetcher", pre-scaffold behavior preserved).
var templateRepoByName = make(map[string]templateRepoRef)
// initTemplateRepoByName is called from the package init chain
// (alongside initKnownRuntimes) to populate the repo map. The
// fallback set returns an empty map — external/kimi/kimi-cli/mock
// have no manifest entry by design.
//
// Reconcile-on-every-boot: the map is RESET at the start of every
// call, then re-populated from the current manifest. Stale entries
// (runtimes removed from the manifest) are dropped; the consumer
// (collectCPConfigFiles + the SCAFFOLD gate) can rely on the map
// being exactly the current manifest's runtimes. The reset is
// critical for the every-boot reconcile semantic — without it,
// dropping a template from the manifest would leave its identity
// resolvable, and the fetcher would attempt a no-longer-existing
// repo. Idempotent for the same manifest input.
func initTemplateRepoByName() {
path := manifestPath()
if path == "" {
log.Printf("template repo registry: manifest.json not found, no template repos available")
templateRepoByName = make(map[string]templateRepoRef)
return
}
data, err := os.ReadFile(path)
if err != nil {
log.Printf("template repo registry: manifest.json load failed (%v) — no template repos", err)
templateRepoByName = make(map[string]templateRepoRef)
return
}
var m manifestFile
if err := json.Unmarshal(data, &m); err != nil {
log.Printf("template repo registry: manifest.json parse failed (%v) — no template repos", err)
templateRepoByName = make(map[string]templateRepoRef)
return
}
// Reconcile: reset, then re-populate. Stale entries from a
// previous manifest are dropped — see func comment.
templateRepoByName = make(map[string]templateRepoRef, len(m.WorkspaceTemplates))
for _, e := range m.WorkspaceTemplates {
// Same normalization as loadRuntimesFromManifest: strip
// the "-default" suffix so the runtime identifier is
// the map key, not the template-variant name.
name := strings.TrimSuffix(e.Name, "-default")
templateRepoByName[name] = templateRepoRef{Repo: e.Repo, Ref: e.Ref}
}
}
// templateIdentityForRuntime returns the Gitea template identity
// for the given runtime name, or "" + false if the runtime has
// no template repo (external / kimi / kimi-cli / mock / unknown).
// The format is "<repo>@<ref>" — the giteaTemplateAssetFetcher
// parses this further as "<owner>/<repo>@<ref>".
func templateIdentityForRuntime(runtime string) (string, bool) {
rr, ok := templateRepoByName[runtime]
if !ok {
return "", false
}
return rr.Repo + "@" + rr.Ref, true
}
@@ -6,10 +6,17 @@ package handlers
// 2. "external" is always injected, even on manifests without it.
// 3. Missing file / malformed JSON returns error, caller uses
// fallback (tested at the initKnownRuntimes level via integration).
// 4. initTemplateRepoByName populates the map at the prod-init
// path (PR-B / RFC #2843 #24 contract-pin: the map must be
// non-empty after init for shipped runtimes).
// 5. initTemplateRepoByName is idempotent + reconciles stale
// entries on every-boot (a runtime removed from the manifest
// must NOT be resolvable in the map after the next init).
import (
"os"
"path/filepath"
"strings"
"testing"
)
@@ -126,3 +133,190 @@ func keys(m map[string]struct{}) []string {
}
return out
}
// TestTemplateIdentityForRuntime pins the runtime -> "<repo>@<ref>"
// mapping that drives cfg.TemplateIdentity (RFC #2843 #24 PR-B).
// Loads the real manifest.json (test setup already has it), then
// asserts the wire shape for a known runtime + the empty-result
// for the BYO-compute meta-runtimes (external / kimi / kimi-cli /
// mock / unknown).
func TestTemplateIdentityForRuntime(t *testing.T) {
path := manifestPath()
if path == "" {
t.Skip("manifest.json not discoverable from this test cwd")
}
// Force the repo registry to load from the same path
// (idempotent — safe to call multiple times).
initTemplateRepoByName()
// Sanity: a real template-backed runtime resolves to a
// "<repo>@<ref>" identity.
id, ok := templateIdentityForRuntime("claude-code")
if !ok {
t.Errorf("claude-code should resolve to an identity (manifest has it), got none")
} else {
// Identity must contain an @ and a slash (the fetcher
// parses this as "<owner>/<repo>@<ref>").
if !strings.Contains(id, "@") || !strings.Contains(id, "/") {
t.Errorf("claude-code identity %q doesn't look like \"<owner>/<repo>@<ref>\"", id)
}
// Identity must NOT be empty (the SCAFFOLD gate in
// collectCPConfigFiles would skip the fetcher on empty).
if id == "" {
t.Errorf("claude-code identity is empty; should be \"<repo>@<ref>\"")
}
}
// BYO-compute meta-runtimes have no template repo — the
// lookup MUST return (empty, false) so the SCAFFOLD gate
// skips the fetcher for them (preserves pre-scaffold
// behavior).
for _, rt := range []string{"external", "kimi", "kimi-cli", "mock", "unknown-runtime-xyz"} {
id2, ok2 := templateIdentityForRuntime(rt)
if ok2 {
t.Errorf("runtime %q should NOT have a template identity (BYO-compute / unknown), got identity=%q", rt, id2)
}
if id2 != "" {
t.Errorf("runtime %q identity should be empty, got %q", rt, id2)
}
}
}
// TestTemplateIdentityForRuntimeOrEmpty pins the
// single-expression wrapper used at the call site in
// buildProvisionerConfig.
func TestTemplateIdentityForRuntimeOrEmpty(t *testing.T) {
if manifestPath() == "" {
t.Skip("manifest.json not discoverable from this test cwd")
}
initTemplateRepoByName()
if got := templateIdentityForRuntimeOrEmpty("claude-code"); got == "" {
t.Error("claude-code should return a non-empty identity")
}
if got := templateIdentityForRuntimeOrEmpty("external"); got != "" {
t.Errorf("external should return empty, got %q", got)
}
if got := templateIdentityForRuntimeOrEmpty("unknown-xyz"); got != "" {
t.Errorf("unknown-xyz should return empty, got %q", got)
}
}
// TestInitTemplateRepoByName_PopulatesMap_FromTempManifest pins the
// PR-B contract-pin: the prod-init path must populate templateRepoByName
// from a real manifest so cfg.TemplateIdentity is non-empty for
// template-backed runtimes at boot. Uses a temp manifest.json (via
// the WORKSPACE_MANIFEST_PATH env var, read by manifestPath()) so the
// test doesn't depend on a real manifest being present.
//
// This is the load-bearing test PM required: "Keep a test asserting
// the prod-init path populates it (so this can't regress to test-only)".
func TestInitTemplateRepoByName_PopulatesMap_FromTempManifest(t *testing.T) {
// Write a temp manifest.json with a known set of template
// runtimes.
dir := t.TempDir()
manifestPath := filepath.Join(dir, "manifest.json")
manifest := `{
"workspace_templates": [
{"name": "claude-code-default", "repo": "molecule-ai/t-cc", "ref": "main"},
{"name": "hermes", "repo": "molecule-ai/t-hermes", "ref": "v1.2.3"},
{"name": "codex", "repo": "molecule-ai/t-codex", "ref": "main"}
]
}`
if err := os.WriteFile(manifestPath, []byte(manifest), 0600); err != nil {
t.Fatalf("write temp manifest: %v", err)
}
// Point manifestPath() at the temp file (it reads
// WORKSPACE_MANIFEST_PATH first).
t.Setenv("WORKSPACE_MANIFEST_PATH", manifestPath)
// Run the prod-init path.
initTemplateRepoByName()
// Assert the map is populated for the shipped runtimes.
cases := []struct {
runtime string
wantRepo string
wantRef string
}{
{"claude-code", "molecule-ai/t-cc", "main"},
{"hermes", "molecule-ai/t-hermes", "v1.2.3"},
{"codex", "molecule-ai/t-codex", "main"},
}
for _, c := range cases {
rr, ok := templateRepoByName[c.runtime]
if !ok {
t.Errorf("runtime %q missing from templateRepoByName after init (got %d entries)", c.runtime, len(templateRepoByName))
continue
}
if rr.Repo != c.wantRepo {
t.Errorf("runtime %q: want repo=%q, got %q", c.runtime, c.wantRepo, rr.Repo)
}
if rr.Ref != c.wantRef {
t.Errorf("runtime %q: want ref=%q, got %q", c.runtime, c.wantRef, rr.Ref)
}
}
// Assert the lookup function returns the expected identity.
for _, c := range cases {
id, ok := templateIdentityForRuntime(c.runtime)
if !ok {
t.Errorf("templateIdentityForRuntime(%q) returned (empty, false); want (non-empty, true)", c.runtime)
continue
}
want := c.wantRepo + "@" + c.wantRef
if id != want {
t.Errorf("templateIdentityForRuntime(%q) = %q, want %q", c.runtime, id, want)
}
}
}
// TestInitTemplateRepoByName_ReconcilesStaleEntries pins the
// every-boot reconcile property: a runtime removed from the manifest
// between two init calls must NOT be resolvable in the map after
// the second init. This catches the "stale entry persists" bug that
// would otherwise let the fetcher attempt a no-longer-existing repo.
func TestInitTemplateRepoByName_ReconcilesStaleEntries(t *testing.T) {
dir := t.TempDir()
manifestPath := filepath.Join(dir, "manifest.json")
t.Setenv("WORKSPACE_MANIFEST_PATH", manifestPath)
// First manifest: claude-code + hermes present.
if err := os.WriteFile(manifestPath, []byte(`{
"workspace_templates": [
{"name": "claude-code-default", "repo": "molecule-ai/t-cc", "ref": "main"},
{"name": "hermes", "repo": "molecule-ai/t-hermes", "ref": "main"}
]
}`), 0600); err != nil {
t.Fatalf("write manifest v1: %v", err)
}
initTemplateRepoByName()
if _, ok := templateRepoByName["claude-code"]; !ok {
t.Fatalf("after first init: claude-code missing")
}
if _, ok := templateRepoByName["hermes"]; !ok {
t.Fatalf("after first init: hermes missing")
}
// Second manifest: hermes REMOVED. claude-code unchanged.
if err := os.WriteFile(manifestPath, []byte(`{
"workspace_templates": [
{"name": "claude-code-default", "repo": "molecule-ai/t-cc", "ref": "main"}
]
}`), 0600); err != nil {
t.Fatalf("write manifest v2: %v", err)
}
initTemplateRepoByName()
// claude-code still resolves.
if _, ok := templateRepoByName["claude-code"]; !ok {
t.Errorf("after second init: claude-code should still resolve (it stayed in the manifest)")
}
// hermes must be GONE — the manifest removed it.
if _, ok := templateRepoByName["hermes"]; ok {
t.Errorf("after second init: hermes should NOT resolve (it was removed from the manifest); the every-boot reconcile failed")
}
// And the lookup returns ok=false for hermes.
if id, ok := templateIdentityForRuntime("hermes"); ok || id != "" {
t.Errorf("templateIdentityForRuntime(hermes) should return (\"\", false), got (%q, %v)", id, ok)
}
}
@@ -63,6 +63,17 @@ type WorkspaceHandler struct {
// registered; Registry.Run handles a nil receiver as a no-op so the
// hot path stays a single nil-pointer compare.
envMutators *provisionhook.Registry
// giteaTemplateFetcher is the per-deployment TemplateAssetFetcher
// (RFC #2843 #24 PR-B) wired from main.go. nil = no fetcher
// wired (self-host default; the SCAFFOLD gate in
// collectCPConfigFiles treats nil fetcher as "skip the
// fetcher", pre-scaffold behavior preserved). When non-nil,
// the fetcher is called on EVERY provision AND every restart
// (buildProvisionerConfig is the single source of truth for
// WorkspaceConfig across both paths) — collectCPConfigFiles
// reconciles every boot, not just first provision, per the
// dispatch's "every boot" requirement.
giteaTemplateFetcher provisioner.TemplateAssetFetcher
// stopFnOverride is set exclusively in tests to intercept provisioner.Stop
// calls made by HibernateWorkspace without requiring a running Docker daemon.
// Always nil in production; the real provisioner path is used when nil.
@@ -241,6 +252,16 @@ func (h *WorkspaceHandler) SetEnvMutators(r *provisionhook.Registry) {
h.envMutators = r
}
// SetGiteaTemplateFetcher wires the per-deployment TemplateAssetFetcher
// (RFC #2843 #24 PR-B). Nil is fine (self-host default; the SCAFFOLD
// gate in collectCPConfigFiles treats nil as "skip the fetcher").
// Production wires a giteaTemplateAssetFetcher with baseURL +
// per-identity READ-ONLY Gitea PAT (threaded from Infisical SSOT).
// Tests pass a stub.
func (h *WorkspaceHandler) SetGiteaTemplateFetcher(f provisioner.TemplateAssetFetcher) {
h.giteaTemplateFetcher = f
}
// TokenRegistry returns the provisionhook.Registry so the router can
// wire the GET /admin/github-installation-token handler without coupling
// to WorkspaceHandler's internals. Returns nil when no plugin has been
@@ -376,9 +376,38 @@ func (h *WorkspaceHandler) buildProvisionerConfig(
// RuntimeImages[Runtime] :latest lookup, which is what the dead
// reader's sql.ErrNoRows path was producing already.
Image: "",
// RFC #2843 #24 PR-B — wire the generic template-asset
// channel. cfg.TemplateIdentity is derived from the
// runtime_registry (manifest.json's workspace_templates
// entry for this runtime) — the format is "<repo>@<ref>"
// (the giteaTemplateAssetFetcher parses this further as
// "<owner>/<repo>@<ref>"). External-like runtimes
// (external / kimi / kimi-cli / mock) have NO template
// repo, so the identity is left empty — the SCAFFOLD
// gate in collectCPConfigFiles treats empty identity as
// "skip the fetcher" (pre-scaffold behavior preserved).
//
// The fetcher itself is assigned by the caller (main.go
// for SaaS, or a test helper) via h.giteaTemplateFetcher
// — wired here so the fetcher resolution is one place,
// not duplicated across first-provision + restart paths.
// nil fetcher = "no fetcher wired" (self-host default;
// falls through to the local TemplatePath path).
TemplateIdentity: templateIdentityForRuntimeOrEmpty(payload.Runtime),
TemplateAssetFetcher: h.giteaTemplateFetcher,
}
}
// templateIdentityForRuntimeOrEmpty is a tiny wrapper around
// templateIdentityForRuntime that returns "" on miss (rather
// than the (string, bool) tuple). Used at the call site so
// the assignment can be a single expression.
func templateIdentityForRuntimeOrEmpty(runtime string) string {
id, _ := templateIdentityForRuntime(runtime)
return id
}
// issueAndInjectToken rotates the workspace auth token and injects the
// plaintext into cfg.ConfigFiles[".auth_token"] so it is written into the
// /configs volume by WriteFilesToContainer immediately after the container
@@ -605,6 +634,13 @@ var knownRuntimes = fallbackRuntimes
func init() {
initKnownRuntimes()
// PR-B (RFC #2843 #24): populate the templateRepoByName map at
// boot so cfg.TemplateIdentity is non-empty for template-backed
// runtimes (claude-code / hermes / etc). The init() order matters:
// must follow initKnownRuntimes so the per-runtime manifestEntry
// lookups use the same set of normalized runtime names. Idempotent
// — both inits read manifestPath() (cached) and tolerate each other.
initTemplateRepoByName()
}
// yamlQuote emits a YAML double-quoted scalar that safely contains any
@@ -0,0 +1,289 @@
package provisioner
// gitea_template_assets.go — real Gitea-based TemplateAssetFetcher
// (RFC #2843 #24, PR-B). The previous PR-A SCAFFOLD (#2855) wired
// the interface + fields; this file is the production impl.
//
// Transport: GET {baseURL}/api/v1/repos/<owner>/<repo>/archive/<ref>.tar.gz
// with header "Authorization: token <token>" → stream-extract the
// tarball → return map[relpath][]byte for ONLY allowlisted paths
// (config.yaml + prompts/** + agent-skills/**) → strip the archive's
// top-level dir prefix.
//
// Template identity format: "<owner>/<repo>@<ref>" (e.g.
// "molecule-ai/workspace-template-claude-code@main"). The caller
// (workspace_provision.go) is responsible for resolving the runtime
// name to a (repo, ref) pair via the runtime_registry / manifest.json.
// The fetcher just parses + fetches.
//
// Fail-closed: any transport / extraction / parse error returns a
// non-nil error so the caller can abort the provision rather than
// silently regressing to stub-mode /configs (same contract as the
// persisted-bundle provider in #2831 PIECE 1).
//
// Memory-preservation: this fetcher ONLY materializes TEMPLATE
// ASSETS (config.yaml + prompts/* + agent-skills/*). The existing
// IsCPTemplateAssetPath allowlist in the consumer (collectCPConfigFiles)
// is the load-bearing guard against a fetcher that returns a path
// outside the template-asset namespace (e.g. /workspace, MEMORY.md,
// USER.md, CLAUDE.md, .claude/sessions/*) — those would either be
// rejected by the allowlist (provision aborts) or, if they somehow
// slipped through, would land in the SEPARATE TemplateAssets field
// rather than the SM-bound ConfigFiles field. The transport split
// (TemplateAssets vs ConfigFiles) is the second line of defense
// against clobbering agent-owned state.
//
// Auth: per-identity READ-ONLY Gitea token. The token is threaded
// from Infisical SSOT (per #2676 program) into the fetcher via the
// Token field. The token MUST be a per-identity PAT scoped to the
// template repo with read-only access — NOT a founder PAT, NOT a
// workspace-admin PAT. Workspace-server never logs or echoes the
// token (the httpClient logs are scrubbed via the standard net/http
// strip-header path; the token is in the Authorization header,
// which net/http does not log by default).
//
// NO SIZE CAP: the existing #2845 acbc0da9 added a 16MiB bound
// for TemplateAssets in collectCPConfigFiles (separate from the
// 256KiB SM cap on ConfigFiles). The fetcher itself does NOT cap
// the response — the consumer-side bound is the cap. This matches
// the dispatch's "NO size cap" on the fetcher.
import (
"archive/tar"
"bytes"
"compress/gzip"
"context"
"errors"
"fmt"
"io"
"net/http"
"path"
"strings"
"time"
)
// giteaTemplateAssetFetcher is the production TemplateAssetFetcher
// backed by a Gitea archive endpoint. The baseURL, Token, and
// httpClient fields are exported via a constructor (NewGiteaTemplateAssetFetcher)
// so main.go can wire per-deployment values without making the
// struct fields mutable post-construction.
type giteaTemplateAssetFetcher struct {
baseURL string
token string
httpClient *http.Client
}
// NewGiteaTemplateAssetFetcher returns a Gitea-backed fetcher wired
// with the given base URL, read-only PAT, and HTTP client. The HTTP
// client may be nil — the constructor substitutes a sane default
// (30s timeout, no surprises). Callers SHOULD use a shared client
// with a connection pool in production; the per-call client here
// is for tests + fallback.
func NewGiteaTemplateAssetFetcher(baseURL, token string, httpClient *http.Client) TemplateAssetFetcher {
if httpClient == nil {
httpClient = &http.Client{Timeout: 30 * time.Second}
}
return &giteaTemplateAssetFetcher{
baseURL: strings.TrimRight(baseURL, "/"),
token: token,
httpClient: httpClient,
}
}
// parseTemplateIdentity splits "<owner>/<repo>@<ref>" into its
// three parts. Returns an error on missing pieces or extra "@"
// segments. The ref part is the git ref (branch, tag, or SHA); a
// missing "@" is an error (no default — the caller must specify a
// pinned ref so the artifact is reproducible).
func parseTemplateIdentity(identity string) (owner, repo, ref string, err error) {
if identity == "" {
return "", "", "", errors.New("templateIdentity is empty (want \"<owner>/<repo>@<ref>\")")
}
atIdx := strings.LastIndex(identity, "@")
if atIdx < 0 {
return "", "", "", fmt.Errorf("templateIdentity %q has no @ref suffix (want \"<owner>/<repo>@<ref>\")", identity)
}
ref = identity[atIdx+1:]
if ref == "" {
return "", "", "", fmt.Errorf("templateIdentity %q has empty @ref", identity)
}
repoPart := identity[:atIdx]
slashIdx := strings.Index(repoPart, "/")
if slashIdx < 0 {
return "", "", "", fmt.Errorf("templateIdentity %q has no <owner>/<repo> prefix", identity)
}
owner = repoPart[:slashIdx]
repo = repoPart[slashIdx+1:]
if owner == "" || repo == "" {
return "", "", "", fmt.Errorf("templateIdentity %q has empty owner or repo", identity)
}
if strings.Contains(repo, "@") {
return "", "", "", fmt.Errorf("templateIdentity %q has extra @ in repo path", identity)
}
return owner, repo, ref, nil
}
// Load fetches the template's tarball archive and returns the
// allowlisted asset map. See the package doc-comment for the full
// transport + allowlist contract.
func (f *giteaTemplateAssetFetcher) Load(ctx context.Context, templateIdentity string) (map[string][]byte, error) {
if f.baseURL == "" {
return nil, errors.New("giteaTemplateAssetFetcher: baseURL is empty")
}
if f.token == "" {
return nil, errors.New("giteaTemplateAssetFetcher: token is empty (per-identity READ-ONLY Gitea PAT required)")
}
owner, repo, ref, err := parseTemplateIdentity(templateIdentity)
if err != nil {
return nil, fmt.Errorf("giteaTemplateAssetFetcher: %w", err)
}
url := fmt.Sprintf("%s/api/v1/repos/%s/%s/archive/%s.tar.gz", f.baseURL, owner, repo, ref)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return nil, fmt.Errorf("giteaTemplateAssetFetcher: build request: %w", err)
}
req.Header.Set("Authorization", "token "+f.token)
req.Header.Set("Accept", "application/gzip, application/octet-stream")
resp, err := f.httpClient.Do(req)
if err != nil {
return nil, fmt.Errorf("giteaTemplateAssetFetcher: GET %s: %w", url, err)
}
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode != http.StatusOK {
// Read up to 4 KiB of the body for the error message (the
// body may contain a structured Gitea error envelope). Cap
// the read so a hostile server can't OOM us.
preview, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
return nil, fmt.Errorf("giteaTemplateAssetFetcher: GET %s: HTTP %d: %s", url, resp.StatusCode, string(preview))
}
// The Gitea archive endpoint returns a tar.gz — gunzip then
// stream-extract.
gz, err := gzip.NewReader(resp.Body)
if err != nil {
return nil, fmt.Errorf("giteaTemplateAssetFetcher: gzip.NewReader: %w", err)
}
defer func() { _ = gz.Close() }()
tr := tar.NewReader(gz)
assets := make(map[string][]byte)
for {
hdr, err := tr.Next()
if err == io.EOF {
break
}
if err != nil {
return nil, fmt.Errorf("giteaTemplateAssetFetcher: tar.Next: %w", err)
}
if hdr.FileInfo().IsDir() {
continue
}
if hdr.Typeflag != tar.TypeReg {
continue
}
// Strip the archive's top-level dir prefix. Gitea tarballs
// wrap every entry in "<repo>-<sha>/<relpath>" — we want
// just the relpath so the allowlist check is straightforward.
rel, ok := stripArchiveTopDir(hdr.Name)
if !ok {
// Entry is at the root or has no top-level dir; skip.
continue
}
// Allowlist filter — defense-in-depth. The consumer
// (collectCPConfigFiles) ALSO gates on IsCPTemplateAssetPath;
// skipping non-allowlisted entries here is a free perf win
// (don't allocate bytes for paths the consumer will reject)
// and a cleaner audit log.
if !IsCPTemplateAssetPath(rel) {
continue
}
// Read the file body. tar.Reader streams; we read in a
// bounded buffer (16 MiB safety per #2845 acbc0da9 — skill
// packages can be 700 KiB; we leave headroom for the
// largest expected template asset). The consumer-side cap
// is the real bound; this is just to prevent a hostile
// tarball from allocating a terabyte.
const perFileSafety = 16 << 20
data, err := io.ReadAll(io.LimitReader(tr, perFileSafety+1))
if err != nil {
return nil, fmt.Errorf("giteaTemplateAssetFetcher: read %s: %w", rel, err)
}
if len(data) > perFileSafety {
return nil, fmt.Errorf("giteaTemplateAssetFetcher: %s exceeds per-file safety bound %d bytes (cap enforcement is at the consumer)", rel, perFileSafety)
}
assets[rel] = data
}
return assets, nil
}
// stripArchiveTopDir strips the archive's top-level dir prefix
// from a tarball entry. Gitea wraps entries as "<topdir>/<relpath>"
// where <topdir> is typically "<repo>-<sha>" (e.g.
// "workspace-template-claude-code-abcd1234/config.yaml"). Returns
// the relpath and true on success; returns "" and false if the
// entry is at the top level (no slash), is malformed, or
// contains traversal sequences (../).
//
// Traversal check: a legitimate Gitea archive contains ONLY paths
// inside the top dir; any "../" segment in the relpath is a
// hostile-smuggling attempt (a malicious tarball trying to
// land a file at e.g. "../etc/passwd" — which would write OUTSIDE
// the top dir). Reject before path.Clean collapses the segments
// (otherwise "../etc/passwd" → "/etc/passwd" would slip through).
func stripArchiveTopDir(name string) (string, bool) {
if name == "" {
return "", false
}
// The first slash separates the top-level dir from the
// relpath. If there's no slash, the entry is at the top
// level (malformed for our purposes — Gitea's archive always
// wraps in a top dir).
slashIdx := strings.Index(name, "/")
if slashIdx < 0 {
return "", false
}
rel := name[slashIdx+1:]
if rel == "" {
return "", false
}
// Traversal rejection — check BOTH the raw relpath AND the
// cleaned form. Raw catches the obvious "../" segment; cleaned
// catches the sneaky "../../foo" that normalizes but still
// escaped the top dir. path.Clean collapses segments, so we
// re-detect via the leading "/" + the resulting
// non-relpath-but-still-traversed form. The simplest
// load-bearing check: the cleaned path, when prefixed with
// "/", must NOT contain a "/../" segment. (path.Clean
// guarantees no "/../" remains in the output of a clean run,
// so if "/../" appears it means the input had a traversal
// that resolved to a path starting with one of its parents.)
cleaned := path.Clean("/" + rel)
if strings.Contains(cleaned, "/../") {
return "", false
}
// Also check the raw relpath for the obvious ".." segment
// prefix (defense in depth — if the tarball entry is just
// "../" with nothing else, path.Clean("/../") is "/", which
// is the root, also a reject).
if strings.HasPrefix(rel, "..") || strings.Contains(rel, "/../") {
return "", false
}
cleaned = strings.TrimPrefix(cleaned, "/")
if cleaned == "" || cleaned == "." {
return "", false
}
return cleaned, true
}
// Compile-time check: giteaTemplateAssetFetcher implements the
// TemplateAssetFetcher interface. A future refactor that breaks
// the signature is caught at the earliest possible moment
// (compile time of the package, not at runtime via duck-typing).
var _ TemplateAssetFetcher = (*giteaTemplateAssetFetcher)(nil)
// Sentinel for tests that want to assert "this URL was hit".
// (Not exported — tests can re-derive via a custom httpClient.)
var _ = bytes.NewReader // keep import in case future refactor needs it
@@ -0,0 +1,325 @@
package provisioner
// gitea_template_assets_test.go — tests for the real Gitea
// TemplateAssetFetcher (RFC #2843 #24, PR-B). The previous
// #2855 SCAFFOLD tests covered the interface contract; this
// file covers the PRODUCTION impl.
//
// Tests use httptest.NewServer to serve a real .tar.gz
// generated in-memory (no real Gitea instance needed). The
// dispatch's required test surface:
// - happy path: assert ALL asset paths incl agent-skills are
// returned (must FAIL if skills dropped)
// - allowlist filter: non-allowlisted paths are excluded
// - fail-closed: transport / extract errors surface as errors
// - identity parsing: malformed identities return errors
import (
"archive/tar"
"bytes"
"compress/gzip"
"context"
"io"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
)
// TestGiteaTemplateAssetFetcher_HappyPath pins the production
// contract: a real .tar.gz archive containing config.yaml +
// prompts/ + agent-skills/ is fetched, parsed, and returned
// as a map with all three namespaces populated. The dispatch
// explicitly calls out: "must FAIL if skills dropped" — this
// test is the load-bearing check for that.
func TestGiteaTemplateAssetFetcher_HappyPath(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// Verify the request shape.
if r.URL.Path != "/api/v1/repos/molecule-ai/workspace-template-seo/archive/main.tar.gz" {
t.Errorf("unexpected path: %s", r.URL.Path)
}
if got := r.Header.Get("Authorization"); got != "token the-token" {
t.Errorf("unexpected Authorization header: %q", got)
}
// Serve a real .tar.gz with config.yaml + prompts/system.md +
// agent-skills/seo-audit/SKILL.md wrapped in the
// "<repo>-<sha>" top-level dir Gitea uses.
w.Header().Set("Content-Type", "application/gzip")
w.WriteHeader(http.StatusOK)
gz := gzip.NewWriter(w)
tw := tar.NewWriter(gz)
mustWriteTar(t, tw, "workspace-template-seo-abcd1234/config.yaml", []byte("# SEO config\n"))
mustWriteTar(t, tw, "workspace-template-seo-abcd1234/prompts/system.md", []byte("# System prompt\n"))
mustWriteTar(t, tw, "workspace-template-seo-abcd1234/agent-skills/seo-audit/SKILL.md", []byte("# SEO skill\n"))
mustWriteTar(t, tw, "workspace-template-seo-abcd1234/agent-skills/seo-audit/manifest.yaml", []byte("name: seo-audit\n"))
_ = tw.Close()
_ = gz.Close()
}))
defer srv.Close()
f := NewGiteaTemplateAssetFetcher(srv.URL, "the-token", srv.Client())
assets, err := f.Load(context.Background(), "molecule-ai/workspace-template-seo@main")
if err != nil {
t.Fatalf("Load: %v", err)
}
// All 3 allowlisted namespaces must be present. Per the
// dispatch: "must FAIL if skills dropped" — assert skill
// files explicitly.
mustHaveKey(t, assets, "config.yaml")
mustHaveKey(t, assets, "prompts/system.md")
mustHaveKey(t, assets, "agent-skills/seo-audit/SKILL.md")
mustHaveKey(t, assets, "agent-skills/seo-audit/manifest.yaml")
if len(assets) != 4 {
t.Errorf("expected 4 assets, got %d: %v", len(assets), keysOf(assets))
}
}
// TestGiteaTemplateAssetFetcher_AllowsOnlyAllowlistedPaths pins
// the blast-radius guard. A .tar.gz that contains both
// allowlisted AND non-allowlisted paths (e.g. CLAUDE.md,
// MEMORY.md, .claude/sessions/foo) must have the non-allowlisted
// entries EXCLUDED from the returned map (the consumer's
// IsCPTemplateAssetPath check enforces the same invariant —
// the fetcher pre-filters as a perf + audit-log win).
func TestGiteaTemplateAssetFetcher_AllowsOnlyAllowlistedPaths(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/gzip")
gz := gzip.NewWriter(w)
tw := tar.NewWriter(gz)
// Allowlisted.
mustWriteTar(t, tw, "repo-sha/config.yaml", []byte("ok"))
mustWriteTar(t, tw, "repo-sha/prompts/x.md", []byte("ok"))
mustWriteTar(t, tw, "repo-sha/agent-skills/skill-x/SKILL.md", []byte("ok"))
// NOT allowlisted — must be excluded.
mustWriteTar(t, tw, "repo-sha/CLAUDE.md", []byte("agent-owned"))
mustWriteTar(t, tw, "repo-sha/MEMORY.md", []byte("agent-owned"))
mustWriteTar(t, tw, "repo-sha/USER.md", []byte("agent-owned"))
mustWriteTar(t, tw, "repo-sha/.claude/sessions/foo.json", []byte("agent-owned"))
mustWriteTar(t, tw, "repo-sha/adapter.py", []byte("not-template-asset"))
_ = tw.Close()
_ = gz.Close()
}))
defer srv.Close()
f := NewGiteaTemplateAssetFetcher(srv.URL, "the-token", srv.Client())
assets, err := f.Load(context.Background(), "owner/repo@main")
if err != nil {
t.Fatalf("Load: %v", err)
}
// Allowlisted keys present.
mustHaveKey(t, assets, "config.yaml")
mustHaveKey(t, assets, "prompts/x.md")
mustHaveKey(t, assets, "agent-skills/skill-x/SKILL.md")
// Non-allowlisted keys EXCLUDED.
mustNotHaveKey(t, assets, "CLAUDE.md")
mustNotHaveKey(t, assets, "MEMORY.md")
mustNotHaveKey(t, assets, "USER.md")
mustNotHaveKey(t, assets, ".claude/sessions/foo.json")
mustNotHaveKey(t, assets, "adapter.py")
if len(assets) != 3 {
t.Errorf("expected 3 allowlisted assets, got %d: %v", len(assets), keysOf(assets))
}
}
// TestGiteaTemplateAssetFetcher_FailsClosedOnHTTPError pins
// the fail-closed contract. A non-200 response (401, 404, 500)
// returns an error, NOT a silently-empty result.
func TestGiteaTemplateAssetFetcher_FailsClosedOnHTTPError(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusInternalServerError)
_, _ = w.Write([]byte(`{"message":"internal server error"}`))
}))
defer srv.Close()
f := NewGiteaTemplateAssetFetcher(srv.URL, "the-token", srv.Client())
_, err := f.Load(context.Background(), "owner/repo@main")
if err == nil {
t.Fatal("expected error on 500 response, got nil (fail-closed violated)")
}
if !strings.Contains(err.Error(), "500") {
t.Errorf("error should mention the HTTP status, got: %v", err)
}
}
// TestGiteaTemplateAssetFetcher_FailsClosedOnTransportError pins
// the fail-closed contract for transport-layer failures (DNS
// failure, connection refused, etc.). The fetcher must NOT
// silently return an empty map.
func TestGiteaTemplateAssetFetcher_FailsClosedOnTransportError(t *testing.T) {
// Use a port that nothing is listening on (reserved by IANA
// for "tcpmux"; 1 is "tcpmux" too). The dial will fail.
f := NewGiteaTemplateAssetFetcher("http://127.0.0.1:1", "the-token", &http.Client{Timeout: 100 * time.Millisecond})
_, err := f.Load(context.Background(), "owner/repo@main")
if err == nil {
t.Fatal("expected error on transport failure, got nil (fail-closed violated)")
}
}
// TestGiteaTemplateAssetFetcher_RejectsEmptyToken pins the
// security guard: an empty token (which would otherwise be
// sent as "Authorization: token " with no credential) is
// rejected at construction time. A forgotten token init would
// otherwise silently fail-against-anonymous-requests, which
// Gitea would 401 on — better to fail loud at Load time.
func TestGiteaTemplateAssetFetcher_RejectsEmptyToken(t *testing.T) {
f := NewGiteaTemplateAssetFetcher("http://example.com", "", nil)
_, err := f.Load(context.Background(), "owner/repo@main")
if err == nil {
t.Fatal("expected error on empty token, got nil (security guard violated)")
}
if !strings.Contains(err.Error(), "token") {
t.Errorf("error should mention token, got: %v", err)
}
}
// TestParseTemplateIdentity pins the identity parser. Format:
// "<owner>/<repo>@<ref>". Malformed identities return errors.
func TestParseTemplateIdentity(t *testing.T) {
cases := []struct {
name string
identity string
wantOwner string
wantRepo string
wantRef string
wantErr bool
}{
{"simple", "owner/repo@main", "owner", "repo", "main", false},
{"with-sha-ref", "owner/repo@abcd1234", "owner", "repo", "abcd1234", false},
{"with-tag-ref", "owner/repo@v1.2.3", "owner", "repo", "v1.2.3", false},
{"nested-owner", "molecule-ai/workspace-template-seo@main", "molecule-ai", "workspace-template-seo", "main", false},
{"empty", "", "", "", "", true},
{"no-at", "owner/repo", "", "", "", true},
{"empty-ref", "owner/repo@", "", "", "", true},
{"no-slash", "owner@main", "", "", "", true},
{"empty-owner", "/repo@main", "", "", "", true},
{"empty-repo", "owner/@main", "", "", "", true},
{"extra-at", "owner/repo@main@extra", "", "", "", true},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
owner, repo, ref, err := parseTemplateIdentity(c.identity)
if c.wantErr {
if err == nil {
t.Errorf("expected error for %q, got nil", c.identity)
}
return
}
if err != nil {
t.Errorf("unexpected error for %q: %v", c.identity, err)
return
}
if owner != c.wantOwner {
t.Errorf("owner = %q, want %q", owner, c.wantOwner)
}
if repo != c.wantRepo {
t.Errorf("repo = %q, want %q", repo, c.wantRepo)
}
if ref != c.wantRef {
t.Errorf("ref = %q, want %q", ref, c.wantRef)
}
})
}
}
// TestStripArchiveTopDir pins the top-level dir stripper. Gitea
// wraps entries in "<repo>-<sha>/<relpath>"; we want just the
// relpath. Top-level entries (no slash) and traversal attempts
// (../) are rejected.
func TestStripArchiveTopDir(t *testing.T) {
cases := []struct {
name string
in string
wantOk bool
want string
}{
{"normal", "repo-sha/config.yaml", true, "config.yaml"},
{"nested", "repo-sha/prompts/system.md", true, "prompts/system.md"},
{"deep", "repo-sha/agent-skills/seo/SKILL.md", true, "agent-skills/seo/SKILL.md"},
{"top-level-no-slash", "config.yaml", false, ""},
{"empty", "", false, ""},
{"slash-only", "/", false, ""},
{"dot-only", "repo-sha/.", false, ""},
{"traversal", "repo-sha/../etc/passwd", false, ""},
{"deep-traversal", "repo-sha/prompts/../../etc", false, ""},
{"dotdot-in-name", "repo-sha/foo..bar", true, "foo..bar"}, // not a traversal, just an unusual filename
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
got, ok := stripArchiveTopDir(c.in)
if ok != c.wantOk {
t.Errorf("ok = %v, want %v (input %q)", ok, c.wantOk, c.in)
return
}
if ok && got != c.want {
t.Errorf("got = %q, want %q (input %q)", got, c.want, c.in)
}
})
}
}
// ---- Test helpers ----
// mustWriteTar writes a single tar entry with the given name +
// data. Errors are reported via t (so the test fails cleanly).
// Wrapped in a helper to keep the test bodies focused on the
// content (not the tar boilerplate).
func mustWriteTar(t *testing.T, tw *tar.Writer, name string, data []byte) {
t.Helper()
hdr := &tar.Header{
Name: name,
Mode: 0o644,
Size: int64(len(data)),
Typeflag: tar.TypeReg,
}
if err := tw.WriteHeader(hdr); err != nil {
t.Fatalf("tar WriteHeader %s: %v", name, err)
}
if _, err := tw.Write(data); err != nil {
t.Fatalf("tar Write %s: %v", name, err)
}
}
// mustHaveKey asserts assets contains key. Failure reports the
// key + the full set so the test message is informative.
func mustHaveKey(t *testing.T, assets map[string][]byte, key string) {
t.Helper()
if _, ok := assets[key]; !ok {
t.Errorf("expected key %q in assets, got keys: %v", key, keysOf(assets))
}
}
// mustNotHaveKey asserts assets does NOT contain key. The
// blast-radius guard's load-bearing assertion.
func mustNotHaveKey(t *testing.T, assets map[string][]byte, key string) {
t.Helper()
if _, ok := assets[key]; ok {
t.Errorf("did NOT expect key %q in assets (not in allowlist), but it's there: %v", key, keysOf(assets))
}
}
// keysOf returns a stable-ish view of the map's keys for
// assertion error messages. Local helper to avoid coupling to
// test-fixture conventions elsewhere.
func keysOf(m map[string][]byte) []string {
out := make([]string, 0, len(m))
for k := range m {
out = append(out, k)
}
return out
}
// Compile-time check: archive/tar is imported for the test
// server's .tar.gz generation. (Avoids a "imported and not
// used" lint in case future refactors move the helper.)
var _ = tar.TypeReg
// Compile-time check: bytes is imported for the test sentinel.
// Avoids unused-import in future refactors.
var _ = bytes.NewReader
// Compile-time check: io is imported for io.LimitReader.
// Avoids unused-import in future refactors.
var _ = io.LimitReader