diff --git a/workspace-server/cmd/server/main.go b/workspace-server/cmd/server/main.go index 507538f7e..af060eb98 100644 --- a/workspace-server/cmd/server/main.go +++ b/workspace-server/cmd/server/main.go @@ -249,6 +249,22 @@ func main() { wh.SetCPProvisioner(cpProv) } + // PR-B (RFC #2843 #24): wire the Gitea TemplateAssetFetcher. + // nil-if-empty + WARN: if the token isn't set, log a warning + // and stay in the "no fetcher wired" state. The SCAFFOLD gate + // in collectCPConfigFiles treats nil fetcher as "skip the + // fetcher" — pre-scaffold behavior preserved for self-host / + // unconfigured tenants. baseURL has a production default + // (https://git.moleculesai.app) but is overridable for staging + // or per-deployment Gitea mirrors. + if token := templateRepoToken(); token != "" { + baseURL := envOr("MOLECULE_GITEA_BASE_URL", "https://git.moleculesai.app") + wh.SetGiteaTemplateFetcher(provisioner.NewGiteaTemplateAssetFetcher(baseURL, token, nil)) + log.Printf("template repo fetcher: wired (baseURL=%q, token set)", baseURL) + } else { + log.Printf("template repo fetcher: MOLECULE_TEMPLATE_REPO_TOKEN unset; fetcher disabled (self-host default / SCAFFOLD-gate skip)") + } + // Self-hosted platform-agent boot-provision (Change 1). The line-128 seed // only creates the concierge DB ROW; on a fresh self-host that leaves it // with no container (status='failed'/'online' but nothing running). Now that @@ -667,6 +683,18 @@ func templateCacheToken() string { return "" } +// templateRepoToken returns the per-deployment READ-ONLY Gitea PAT +// used by the Gitea TemplateAssetFetcher (RFC #2843 #24 PR-B). +// Distinct from templateCacheToken (which is for the template cache, +// a different feature) so a tenant can rotate the fetcher token +// without touching the cache token. nil-if-empty + WARN: callers +// should treat empty as "fetcher disabled" (self-host default — the +// SCAFFOLD gate in collectCPConfigFiles treats nil fetcher as +// "skip the fetcher", pre-scaffold behavior preserved). +func templateRepoToken() string { + return strings.TrimSpace(os.Getenv("MOLECULE_TEMPLATE_REPO_TOKEN")) +} + func shouldRefreshTemplateCache(token, manifestPath string) bool { switch strings.ToLower(strings.TrimSpace(os.Getenv("TEMPLATE_CACHE_REFRESH"))) { case "0", "false", "off", "no": diff --git a/workspace-server/internal/handlers/runtime_registry.go b/workspace-server/internal/handlers/runtime_registry.go index 5900fe313..317b50165 100644 --- a/workspace-server/internal/handlers/runtime_registry.go +++ b/workspace-server/internal/handlers/runtime_registry.go @@ -60,6 +60,7 @@ func manifestPath() string { type manifestEntry struct { Name string `json:"name"` Repo string `json:"repo"` + Ref string `json:"ref"` } type manifestFile struct { @@ -178,3 +179,78 @@ func initKnownRuntimes() { } log.Printf("runtime registry: loaded %d runtimes from %s: %v", len(loaded), path, names) } + +// templateRepoRef is the parsed manifest entry needed to +// derive a TemplateIdentity for the Gitea fetcher. The +// identity is "@" (the giteaTemplateAssetFetcher +// parses this as "/@"). +type templateRepoRef struct { + Repo string // e.g. "molecule-ai/molecule-ai-workspace-template-claude-code" + Ref string // e.g. "main" or a pinned tag/sha +} + +// templateRepoByName holds the runtime → (repo, ref) mapping +// parsed from manifest.json at boot. Empty for runtimes that +// have no template repo (external, kimi, kimi-cli, mock — +// caller leaves cfg.TemplateIdentity empty for those, which +// the SCAFFOLD gate in collectCPConfigFiles treats as +// "skip the fetcher", pre-scaffold behavior preserved). +var templateRepoByName = make(map[string]templateRepoRef) + +// initTemplateRepoByName is called from the package init chain +// (alongside initKnownRuntimes) to populate the repo map. The +// fallback set returns an empty map — external/kimi/kimi-cli/mock +// have no manifest entry by design. +// +// Reconcile-on-every-boot: the map is RESET at the start of every +// call, then re-populated from the current manifest. Stale entries +// (runtimes removed from the manifest) are dropped; the consumer +// (collectCPConfigFiles + the SCAFFOLD gate) can rely on the map +// being exactly the current manifest's runtimes. The reset is +// critical for the every-boot reconcile semantic — without it, +// dropping a template from the manifest would leave its identity +// resolvable, and the fetcher would attempt a no-longer-existing +// repo. Idempotent for the same manifest input. +func initTemplateRepoByName() { + path := manifestPath() + if path == "" { + log.Printf("template repo registry: manifest.json not found, no template repos available") + templateRepoByName = make(map[string]templateRepoRef) + return + } + data, err := os.ReadFile(path) + if err != nil { + log.Printf("template repo registry: manifest.json load failed (%v) — no template repos", err) + templateRepoByName = make(map[string]templateRepoRef) + return + } + var m manifestFile + if err := json.Unmarshal(data, &m); err != nil { + log.Printf("template repo registry: manifest.json parse failed (%v) — no template repos", err) + templateRepoByName = make(map[string]templateRepoRef) + return + } + // Reconcile: reset, then re-populate. Stale entries from a + // previous manifest are dropped — see func comment. + templateRepoByName = make(map[string]templateRepoRef, len(m.WorkspaceTemplates)) + for _, e := range m.WorkspaceTemplates { + // Same normalization as loadRuntimesFromManifest: strip + // the "-default" suffix so the runtime identifier is + // the map key, not the template-variant name. + name := strings.TrimSuffix(e.Name, "-default") + templateRepoByName[name] = templateRepoRef{Repo: e.Repo, Ref: e.Ref} + } +} + +// templateIdentityForRuntime returns the Gitea template identity +// for the given runtime name, or "" + false if the runtime has +// no template repo (external / kimi / kimi-cli / mock / unknown). +// The format is "@" — the giteaTemplateAssetFetcher +// parses this further as "/@". +func templateIdentityForRuntime(runtime string) (string, bool) { + rr, ok := templateRepoByName[runtime] + if !ok { + return "", false + } + return rr.Repo + "@" + rr.Ref, true +} diff --git a/workspace-server/internal/handlers/runtime_registry_test.go b/workspace-server/internal/handlers/runtime_registry_test.go index 0a2f49aa5..58a0b391e 100644 --- a/workspace-server/internal/handlers/runtime_registry_test.go +++ b/workspace-server/internal/handlers/runtime_registry_test.go @@ -6,10 +6,17 @@ package handlers // 2. "external" is always injected, even on manifests without it. // 3. Missing file / malformed JSON returns error, caller uses // fallback (tested at the initKnownRuntimes level via integration). +// 4. initTemplateRepoByName populates the map at the prod-init +// path (PR-B / RFC #2843 #24 contract-pin: the map must be +// non-empty after init for shipped runtimes). +// 5. initTemplateRepoByName is idempotent + reconciles stale +// entries on every-boot (a runtime removed from the manifest +// must NOT be resolvable in the map after the next init). import ( "os" "path/filepath" + "strings" "testing" ) @@ -126,3 +133,190 @@ func keys(m map[string]struct{}) []string { } return out } + +// TestTemplateIdentityForRuntime pins the runtime -> "@" +// mapping that drives cfg.TemplateIdentity (RFC #2843 #24 PR-B). +// Loads the real manifest.json (test setup already has it), then +// asserts the wire shape for a known runtime + the empty-result +// for the BYO-compute meta-runtimes (external / kimi / kimi-cli / +// mock / unknown). +func TestTemplateIdentityForRuntime(t *testing.T) { + path := manifestPath() + if path == "" { + t.Skip("manifest.json not discoverable from this test cwd") + } + // Force the repo registry to load from the same path + // (idempotent — safe to call multiple times). + initTemplateRepoByName() + + // Sanity: a real template-backed runtime resolves to a + // "@" identity. + id, ok := templateIdentityForRuntime("claude-code") + if !ok { + t.Errorf("claude-code should resolve to an identity (manifest has it), got none") + } else { + // Identity must contain an @ and a slash (the fetcher + // parses this as "/@"). + if !strings.Contains(id, "@") || !strings.Contains(id, "/") { + t.Errorf("claude-code identity %q doesn't look like \"/@\"", id) + } + // Identity must NOT be empty (the SCAFFOLD gate in + // collectCPConfigFiles would skip the fetcher on empty). + if id == "" { + t.Errorf("claude-code identity is empty; should be \"@\"") + } + } + + // BYO-compute meta-runtimes have no template repo — the + // lookup MUST return (empty, false) so the SCAFFOLD gate + // skips the fetcher for them (preserves pre-scaffold + // behavior). + for _, rt := range []string{"external", "kimi", "kimi-cli", "mock", "unknown-runtime-xyz"} { + id2, ok2 := templateIdentityForRuntime(rt) + if ok2 { + t.Errorf("runtime %q should NOT have a template identity (BYO-compute / unknown), got identity=%q", rt, id2) + } + if id2 != "" { + t.Errorf("runtime %q identity should be empty, got %q", rt, id2) + } + } +} + +// TestTemplateIdentityForRuntimeOrEmpty pins the +// single-expression wrapper used at the call site in +// buildProvisionerConfig. +func TestTemplateIdentityForRuntimeOrEmpty(t *testing.T) { + if manifestPath() == "" { + t.Skip("manifest.json not discoverable from this test cwd") + } + initTemplateRepoByName() + if got := templateIdentityForRuntimeOrEmpty("claude-code"); got == "" { + t.Error("claude-code should return a non-empty identity") + } + if got := templateIdentityForRuntimeOrEmpty("external"); got != "" { + t.Errorf("external should return empty, got %q", got) + } + if got := templateIdentityForRuntimeOrEmpty("unknown-xyz"); got != "" { + t.Errorf("unknown-xyz should return empty, got %q", got) + } +} + +// TestInitTemplateRepoByName_PopulatesMap_FromTempManifest pins the +// PR-B contract-pin: the prod-init path must populate templateRepoByName +// from a real manifest so cfg.TemplateIdentity is non-empty for +// template-backed runtimes at boot. Uses a temp manifest.json (via +// the WORKSPACE_MANIFEST_PATH env var, read by manifestPath()) so the +// test doesn't depend on a real manifest being present. +// +// This is the load-bearing test PM required: "Keep a test asserting +// the prod-init path populates it (so this can't regress to test-only)". +func TestInitTemplateRepoByName_PopulatesMap_FromTempManifest(t *testing.T) { + // Write a temp manifest.json with a known set of template + // runtimes. + dir := t.TempDir() + manifestPath := filepath.Join(dir, "manifest.json") + manifest := `{ + "workspace_templates": [ + {"name": "claude-code-default", "repo": "molecule-ai/t-cc", "ref": "main"}, + {"name": "hermes", "repo": "molecule-ai/t-hermes", "ref": "v1.2.3"}, + {"name": "codex", "repo": "molecule-ai/t-codex", "ref": "main"} + ] + }` + if err := os.WriteFile(manifestPath, []byte(manifest), 0600); err != nil { + t.Fatalf("write temp manifest: %v", err) + } + // Point manifestPath() at the temp file (it reads + // WORKSPACE_MANIFEST_PATH first). + t.Setenv("WORKSPACE_MANIFEST_PATH", manifestPath) + + // Run the prod-init path. + initTemplateRepoByName() + + // Assert the map is populated for the shipped runtimes. + cases := []struct { + runtime string + wantRepo string + wantRef string + }{ + {"claude-code", "molecule-ai/t-cc", "main"}, + {"hermes", "molecule-ai/t-hermes", "v1.2.3"}, + {"codex", "molecule-ai/t-codex", "main"}, + } + for _, c := range cases { + rr, ok := templateRepoByName[c.runtime] + if !ok { + t.Errorf("runtime %q missing from templateRepoByName after init (got %d entries)", c.runtime, len(templateRepoByName)) + continue + } + if rr.Repo != c.wantRepo { + t.Errorf("runtime %q: want repo=%q, got %q", c.runtime, c.wantRepo, rr.Repo) + } + if rr.Ref != c.wantRef { + t.Errorf("runtime %q: want ref=%q, got %q", c.runtime, c.wantRef, rr.Ref) + } + } + + // Assert the lookup function returns the expected identity. + for _, c := range cases { + id, ok := templateIdentityForRuntime(c.runtime) + if !ok { + t.Errorf("templateIdentityForRuntime(%q) returned (empty, false); want (non-empty, true)", c.runtime) + continue + } + want := c.wantRepo + "@" + c.wantRef + if id != want { + t.Errorf("templateIdentityForRuntime(%q) = %q, want %q", c.runtime, id, want) + } + } +} + +// TestInitTemplateRepoByName_ReconcilesStaleEntries pins the +// every-boot reconcile property: a runtime removed from the manifest +// between two init calls must NOT be resolvable in the map after +// the second init. This catches the "stale entry persists" bug that +// would otherwise let the fetcher attempt a no-longer-existing repo. +func TestInitTemplateRepoByName_ReconcilesStaleEntries(t *testing.T) { + dir := t.TempDir() + manifestPath := filepath.Join(dir, "manifest.json") + t.Setenv("WORKSPACE_MANIFEST_PATH", manifestPath) + + // First manifest: claude-code + hermes present. + if err := os.WriteFile(manifestPath, []byte(`{ + "workspace_templates": [ + {"name": "claude-code-default", "repo": "molecule-ai/t-cc", "ref": "main"}, + {"name": "hermes", "repo": "molecule-ai/t-hermes", "ref": "main"} + ] + }`), 0600); err != nil { + t.Fatalf("write manifest v1: %v", err) + } + initTemplateRepoByName() + if _, ok := templateRepoByName["claude-code"]; !ok { + t.Fatalf("after first init: claude-code missing") + } + if _, ok := templateRepoByName["hermes"]; !ok { + t.Fatalf("after first init: hermes missing") + } + + // Second manifest: hermes REMOVED. claude-code unchanged. + if err := os.WriteFile(manifestPath, []byte(`{ + "workspace_templates": [ + {"name": "claude-code-default", "repo": "molecule-ai/t-cc", "ref": "main"} + ] + }`), 0600); err != nil { + t.Fatalf("write manifest v2: %v", err) + } + initTemplateRepoByName() + + // claude-code still resolves. + if _, ok := templateRepoByName["claude-code"]; !ok { + t.Errorf("after second init: claude-code should still resolve (it stayed in the manifest)") + } + // hermes must be GONE — the manifest removed it. + if _, ok := templateRepoByName["hermes"]; ok { + t.Errorf("after second init: hermes should NOT resolve (it was removed from the manifest); the every-boot reconcile failed") + } + // And the lookup returns ok=false for hermes. + if id, ok := templateIdentityForRuntime("hermes"); ok || id != "" { + t.Errorf("templateIdentityForRuntime(hermes) should return (\"\", false), got (%q, %v)", id, ok) + } +} diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go index 34d3eb3df..caf40cedf 100644 --- a/workspace-server/internal/handlers/workspace.go +++ b/workspace-server/internal/handlers/workspace.go @@ -63,6 +63,17 @@ type WorkspaceHandler struct { // registered; Registry.Run handles a nil receiver as a no-op so the // hot path stays a single nil-pointer compare. envMutators *provisionhook.Registry + // giteaTemplateFetcher is the per-deployment TemplateAssetFetcher + // (RFC #2843 #24 PR-B) wired from main.go. nil = no fetcher + // wired (self-host default; the SCAFFOLD gate in + // collectCPConfigFiles treats nil fetcher as "skip the + // fetcher", pre-scaffold behavior preserved). When non-nil, + // the fetcher is called on EVERY provision AND every restart + // (buildProvisionerConfig is the single source of truth for + // WorkspaceConfig across both paths) — collectCPConfigFiles + // reconciles every boot, not just first provision, per the + // dispatch's "every boot" requirement. + giteaTemplateFetcher provisioner.TemplateAssetFetcher // stopFnOverride is set exclusively in tests to intercept provisioner.Stop // calls made by HibernateWorkspace without requiring a running Docker daemon. // Always nil in production; the real provisioner path is used when nil. @@ -241,6 +252,16 @@ func (h *WorkspaceHandler) SetEnvMutators(r *provisionhook.Registry) { h.envMutators = r } +// SetGiteaTemplateFetcher wires the per-deployment TemplateAssetFetcher +// (RFC #2843 #24 PR-B). Nil is fine (self-host default; the SCAFFOLD +// gate in collectCPConfigFiles treats nil as "skip the fetcher"). +// Production wires a giteaTemplateAssetFetcher with baseURL + +// per-identity READ-ONLY Gitea PAT (threaded from Infisical SSOT). +// Tests pass a stub. +func (h *WorkspaceHandler) SetGiteaTemplateFetcher(f provisioner.TemplateAssetFetcher) { + h.giteaTemplateFetcher = f +} + // TokenRegistry returns the provisionhook.Registry so the router can // wire the GET /admin/github-installation-token handler without coupling // to WorkspaceHandler's internals. Returns nil when no plugin has been diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go index 4636cc904..036ff92d7 100644 --- a/workspace-server/internal/handlers/workspace_provision.go +++ b/workspace-server/internal/handlers/workspace_provision.go @@ -376,9 +376,38 @@ func (h *WorkspaceHandler) buildProvisionerConfig( // RuntimeImages[Runtime] :latest lookup, which is what the dead // reader's sql.ErrNoRows path was producing already. Image: "", + + // RFC #2843 #24 PR-B — wire the generic template-asset + // channel. cfg.TemplateIdentity is derived from the + // runtime_registry (manifest.json's workspace_templates + // entry for this runtime) — the format is "@" + // (the giteaTemplateAssetFetcher parses this further as + // "/@"). External-like runtimes + // (external / kimi / kimi-cli / mock) have NO template + // repo, so the identity is left empty — the SCAFFOLD + // gate in collectCPConfigFiles treats empty identity as + // "skip the fetcher" (pre-scaffold behavior preserved). + // + // The fetcher itself is assigned by the caller (main.go + // for SaaS, or a test helper) via h.giteaTemplateFetcher + // — wired here so the fetcher resolution is one place, + // not duplicated across first-provision + restart paths. + // nil fetcher = "no fetcher wired" (self-host default; + // falls through to the local TemplatePath path). + TemplateIdentity: templateIdentityForRuntimeOrEmpty(payload.Runtime), + TemplateAssetFetcher: h.giteaTemplateFetcher, } } +// templateIdentityForRuntimeOrEmpty is a tiny wrapper around +// templateIdentityForRuntime that returns "" on miss (rather +// than the (string, bool) tuple). Used at the call site so +// the assignment can be a single expression. +func templateIdentityForRuntimeOrEmpty(runtime string) string { + id, _ := templateIdentityForRuntime(runtime) + return id +} + // issueAndInjectToken rotates the workspace auth token and injects the // plaintext into cfg.ConfigFiles[".auth_token"] so it is written into the // /configs volume by WriteFilesToContainer immediately after the container @@ -605,6 +634,13 @@ var knownRuntimes = fallbackRuntimes func init() { initKnownRuntimes() + // PR-B (RFC #2843 #24): populate the templateRepoByName map at + // boot so cfg.TemplateIdentity is non-empty for template-backed + // runtimes (claude-code / hermes / etc). The init() order matters: + // must follow initKnownRuntimes so the per-runtime manifestEntry + // lookups use the same set of normalized runtime names. Idempotent + // — both inits read manifestPath() (cached) and tolerate each other. + initTemplateRepoByName() } // yamlQuote emits a YAML double-quoted scalar that safely contains any diff --git a/workspace-server/internal/provisioner/gitea_template_assets.go b/workspace-server/internal/provisioner/gitea_template_assets.go new file mode 100644 index 000000000..c8b7705b0 --- /dev/null +++ b/workspace-server/internal/provisioner/gitea_template_assets.go @@ -0,0 +1,289 @@ +package provisioner + +// gitea_template_assets.go — real Gitea-based TemplateAssetFetcher +// (RFC #2843 #24, PR-B). The previous PR-A SCAFFOLD (#2855) wired +// the interface + fields; this file is the production impl. +// +// Transport: GET {baseURL}/api/v1/repos///archive/.tar.gz +// with header "Authorization: token " → stream-extract the +// tarball → return map[relpath][]byte for ONLY allowlisted paths +// (config.yaml + prompts/** + agent-skills/**) → strip the archive's +// top-level dir prefix. +// +// Template identity format: "/@" (e.g. +// "molecule-ai/workspace-template-claude-code@main"). The caller +// (workspace_provision.go) is responsible for resolving the runtime +// name to a (repo, ref) pair via the runtime_registry / manifest.json. +// The fetcher just parses + fetches. +// +// Fail-closed: any transport / extraction / parse error returns a +// non-nil error so the caller can abort the provision rather than +// silently regressing to stub-mode /configs (same contract as the +// persisted-bundle provider in #2831 PIECE 1). +// +// Memory-preservation: this fetcher ONLY materializes TEMPLATE +// ASSETS (config.yaml + prompts/* + agent-skills/*). The existing +// IsCPTemplateAssetPath allowlist in the consumer (collectCPConfigFiles) +// is the load-bearing guard against a fetcher that returns a path +// outside the template-asset namespace (e.g. /workspace, MEMORY.md, +// USER.md, CLAUDE.md, .claude/sessions/*) — those would either be +// rejected by the allowlist (provision aborts) or, if they somehow +// slipped through, would land in the SEPARATE TemplateAssets field +// rather than the SM-bound ConfigFiles field. The transport split +// (TemplateAssets vs ConfigFiles) is the second line of defense +// against clobbering agent-owned state. +// +// Auth: per-identity READ-ONLY Gitea token. The token is threaded +// from Infisical SSOT (per #2676 program) into the fetcher via the +// Token field. The token MUST be a per-identity PAT scoped to the +// template repo with read-only access — NOT a founder PAT, NOT a +// workspace-admin PAT. Workspace-server never logs or echoes the +// token (the httpClient logs are scrubbed via the standard net/http +// strip-header path; the token is in the Authorization header, +// which net/http does not log by default). +// +// NO SIZE CAP: the existing #2845 acbc0da9 added a 16MiB bound +// for TemplateAssets in collectCPConfigFiles (separate from the +// 256KiB SM cap on ConfigFiles). The fetcher itself does NOT cap +// the response — the consumer-side bound is the cap. This matches +// the dispatch's "NO size cap" on the fetcher. + +import ( + "archive/tar" + "bytes" + "compress/gzip" + "context" + "errors" + "fmt" + "io" + "net/http" + "path" + "strings" + "time" +) + +// giteaTemplateAssetFetcher is the production TemplateAssetFetcher +// backed by a Gitea archive endpoint. The baseURL, Token, and +// httpClient fields are exported via a constructor (NewGiteaTemplateAssetFetcher) +// so main.go can wire per-deployment values without making the +// struct fields mutable post-construction. +type giteaTemplateAssetFetcher struct { + baseURL string + token string + httpClient *http.Client +} + +// NewGiteaTemplateAssetFetcher returns a Gitea-backed fetcher wired +// with the given base URL, read-only PAT, and HTTP client. The HTTP +// client may be nil — the constructor substitutes a sane default +// (30s timeout, no surprises). Callers SHOULD use a shared client +// with a connection pool in production; the per-call client here +// is for tests + fallback. +func NewGiteaTemplateAssetFetcher(baseURL, token string, httpClient *http.Client) TemplateAssetFetcher { + if httpClient == nil { + httpClient = &http.Client{Timeout: 30 * time.Second} + } + return &giteaTemplateAssetFetcher{ + baseURL: strings.TrimRight(baseURL, "/"), + token: token, + httpClient: httpClient, + } +} + +// parseTemplateIdentity splits "/@" into its +// three parts. Returns an error on missing pieces or extra "@" +// segments. The ref part is the git ref (branch, tag, or SHA); a +// missing "@" is an error (no default — the caller must specify a +// pinned ref so the artifact is reproducible). +func parseTemplateIdentity(identity string) (owner, repo, ref string, err error) { + if identity == "" { + return "", "", "", errors.New("templateIdentity is empty (want \"/@\")") + } + atIdx := strings.LastIndex(identity, "@") + if atIdx < 0 { + return "", "", "", fmt.Errorf("templateIdentity %q has no @ref suffix (want \"/@\")", identity) + } + ref = identity[atIdx+1:] + if ref == "" { + return "", "", "", fmt.Errorf("templateIdentity %q has empty @ref", identity) + } + repoPart := identity[:atIdx] + slashIdx := strings.Index(repoPart, "/") + if slashIdx < 0 { + return "", "", "", fmt.Errorf("templateIdentity %q has no / prefix", identity) + } + owner = repoPart[:slashIdx] + repo = repoPart[slashIdx+1:] + if owner == "" || repo == "" { + return "", "", "", fmt.Errorf("templateIdentity %q has empty owner or repo", identity) + } + if strings.Contains(repo, "@") { + return "", "", "", fmt.Errorf("templateIdentity %q has extra @ in repo path", identity) + } + return owner, repo, ref, nil +} + +// Load fetches the template's tarball archive and returns the +// allowlisted asset map. See the package doc-comment for the full +// transport + allowlist contract. +func (f *giteaTemplateAssetFetcher) Load(ctx context.Context, templateIdentity string) (map[string][]byte, error) { + if f.baseURL == "" { + return nil, errors.New("giteaTemplateAssetFetcher: baseURL is empty") + } + if f.token == "" { + return nil, errors.New("giteaTemplateAssetFetcher: token is empty (per-identity READ-ONLY Gitea PAT required)") + } + owner, repo, ref, err := parseTemplateIdentity(templateIdentity) + if err != nil { + return nil, fmt.Errorf("giteaTemplateAssetFetcher: %w", err) + } + + url := fmt.Sprintf("%s/api/v1/repos/%s/%s/archive/%s.tar.gz", f.baseURL, owner, repo, ref) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return nil, fmt.Errorf("giteaTemplateAssetFetcher: build request: %w", err) + } + req.Header.Set("Authorization", "token "+f.token) + req.Header.Set("Accept", "application/gzip, application/octet-stream") + + resp, err := f.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("giteaTemplateAssetFetcher: GET %s: %w", url, err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + // Read up to 4 KiB of the body for the error message (the + // body may contain a structured Gitea error envelope). Cap + // the read so a hostile server can't OOM us. + preview, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return nil, fmt.Errorf("giteaTemplateAssetFetcher: GET %s: HTTP %d: %s", url, resp.StatusCode, string(preview)) + } + + // The Gitea archive endpoint returns a tar.gz — gunzip then + // stream-extract. + gz, err := gzip.NewReader(resp.Body) + if err != nil { + return nil, fmt.Errorf("giteaTemplateAssetFetcher: gzip.NewReader: %w", err) + } + defer func() { _ = gz.Close() }() + + tr := tar.NewReader(gz) + assets := make(map[string][]byte) + for { + hdr, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + return nil, fmt.Errorf("giteaTemplateAssetFetcher: tar.Next: %w", err) + } + if hdr.FileInfo().IsDir() { + continue + } + if hdr.Typeflag != tar.TypeReg { + continue + } + // Strip the archive's top-level dir prefix. Gitea tarballs + // wrap every entry in "-/" — we want + // just the relpath so the allowlist check is straightforward. + rel, ok := stripArchiveTopDir(hdr.Name) + if !ok { + // Entry is at the root or has no top-level dir; skip. + continue + } + // Allowlist filter — defense-in-depth. The consumer + // (collectCPConfigFiles) ALSO gates on IsCPTemplateAssetPath; + // skipping non-allowlisted entries here is a free perf win + // (don't allocate bytes for paths the consumer will reject) + // and a cleaner audit log. + if !IsCPTemplateAssetPath(rel) { + continue + } + // Read the file body. tar.Reader streams; we read in a + // bounded buffer (16 MiB safety per #2845 acbc0da9 — skill + // packages can be 700 KiB; we leave headroom for the + // largest expected template asset). The consumer-side cap + // is the real bound; this is just to prevent a hostile + // tarball from allocating a terabyte. + const perFileSafety = 16 << 20 + data, err := io.ReadAll(io.LimitReader(tr, perFileSafety+1)) + if err != nil { + return nil, fmt.Errorf("giteaTemplateAssetFetcher: read %s: %w", rel, err) + } + if len(data) > perFileSafety { + return nil, fmt.Errorf("giteaTemplateAssetFetcher: %s exceeds per-file safety bound %d bytes (cap enforcement is at the consumer)", rel, perFileSafety) + } + assets[rel] = data + } + return assets, nil +} + +// stripArchiveTopDir strips the archive's top-level dir prefix +// from a tarball entry. Gitea wraps entries as "/" +// where is typically "-" (e.g. +// "workspace-template-claude-code-abcd1234/config.yaml"). Returns +// the relpath and true on success; returns "" and false if the +// entry is at the top level (no slash), is malformed, or +// contains traversal sequences (../). +// +// Traversal check: a legitimate Gitea archive contains ONLY paths +// inside the top dir; any "../" segment in the relpath is a +// hostile-smuggling attempt (a malicious tarball trying to +// land a file at e.g. "../etc/passwd" — which would write OUTSIDE +// the top dir). Reject before path.Clean collapses the segments +// (otherwise "../etc/passwd" → "/etc/passwd" would slip through). +func stripArchiveTopDir(name string) (string, bool) { + if name == "" { + return "", false + } + // The first slash separates the top-level dir from the + // relpath. If there's no slash, the entry is at the top + // level (malformed for our purposes — Gitea's archive always + // wraps in a top dir). + slashIdx := strings.Index(name, "/") + if slashIdx < 0 { + return "", false + } + rel := name[slashIdx+1:] + if rel == "" { + return "", false + } + // Traversal rejection — check BOTH the raw relpath AND the + // cleaned form. Raw catches the obvious "../" segment; cleaned + // catches the sneaky "../../foo" that normalizes but still + // escaped the top dir. path.Clean collapses segments, so we + // re-detect via the leading "/" + the resulting + // non-relpath-but-still-traversed form. The simplest + // load-bearing check: the cleaned path, when prefixed with + // "/", must NOT contain a "/../" segment. (path.Clean + // guarantees no "/../" remains in the output of a clean run, + // so if "/../" appears it means the input had a traversal + // that resolved to a path starting with one of its parents.) + cleaned := path.Clean("/" + rel) + if strings.Contains(cleaned, "/../") { + return "", false + } + // Also check the raw relpath for the obvious ".." segment + // prefix (defense in depth — if the tarball entry is just + // "../" with nothing else, path.Clean("/../") is "/", which + // is the root, also a reject). + if strings.HasPrefix(rel, "..") || strings.Contains(rel, "/../") { + return "", false + } + cleaned = strings.TrimPrefix(cleaned, "/") + if cleaned == "" || cleaned == "." { + return "", false + } + return cleaned, true +} + +// Compile-time check: giteaTemplateAssetFetcher implements the +// TemplateAssetFetcher interface. A future refactor that breaks +// the signature is caught at the earliest possible moment +// (compile time of the package, not at runtime via duck-typing). +var _ TemplateAssetFetcher = (*giteaTemplateAssetFetcher)(nil) + +// Sentinel for tests that want to assert "this URL was hit". +// (Not exported — tests can re-derive via a custom httpClient.) +var _ = bytes.NewReader // keep import in case future refactor needs it diff --git a/workspace-server/internal/provisioner/gitea_template_assets_test.go b/workspace-server/internal/provisioner/gitea_template_assets_test.go new file mode 100644 index 000000000..ee03b7042 --- /dev/null +++ b/workspace-server/internal/provisioner/gitea_template_assets_test.go @@ -0,0 +1,325 @@ +package provisioner + +// gitea_template_assets_test.go — tests for the real Gitea +// TemplateAssetFetcher (RFC #2843 #24, PR-B). The previous +// #2855 SCAFFOLD tests covered the interface contract; this +// file covers the PRODUCTION impl. +// +// Tests use httptest.NewServer to serve a real .tar.gz +// generated in-memory (no real Gitea instance needed). The +// dispatch's required test surface: +// - happy path: assert ALL asset paths incl agent-skills are +// returned (must FAIL if skills dropped) +// - allowlist filter: non-allowlisted paths are excluded +// - fail-closed: transport / extract errors surface as errors +// - identity parsing: malformed identities return errors + +import ( + "archive/tar" + "bytes" + "compress/gzip" + "context" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" +) + +// TestGiteaTemplateAssetFetcher_HappyPath pins the production +// contract: a real .tar.gz archive containing config.yaml + +// prompts/ + agent-skills/ is fetched, parsed, and returned +// as a map with all three namespaces populated. The dispatch +// explicitly calls out: "must FAIL if skills dropped" — this +// test is the load-bearing check for that. +func TestGiteaTemplateAssetFetcher_HappyPath(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Verify the request shape. + if r.URL.Path != "/api/v1/repos/molecule-ai/workspace-template-seo/archive/main.tar.gz" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + if got := r.Header.Get("Authorization"); got != "token the-token" { + t.Errorf("unexpected Authorization header: %q", got) + } + // Serve a real .tar.gz with config.yaml + prompts/system.md + + // agent-skills/seo-audit/SKILL.md wrapped in the + // "-" top-level dir Gitea uses. + w.Header().Set("Content-Type", "application/gzip") + w.WriteHeader(http.StatusOK) + gz := gzip.NewWriter(w) + tw := tar.NewWriter(gz) + mustWriteTar(t, tw, "workspace-template-seo-abcd1234/config.yaml", []byte("# SEO config\n")) + mustWriteTar(t, tw, "workspace-template-seo-abcd1234/prompts/system.md", []byte("# System prompt\n")) + mustWriteTar(t, tw, "workspace-template-seo-abcd1234/agent-skills/seo-audit/SKILL.md", []byte("# SEO skill\n")) + mustWriteTar(t, tw, "workspace-template-seo-abcd1234/agent-skills/seo-audit/manifest.yaml", []byte("name: seo-audit\n")) + _ = tw.Close() + _ = gz.Close() + })) + defer srv.Close() + + f := NewGiteaTemplateAssetFetcher(srv.URL, "the-token", srv.Client()) + assets, err := f.Load(context.Background(), "molecule-ai/workspace-template-seo@main") + if err != nil { + t.Fatalf("Load: %v", err) + } + + // All 3 allowlisted namespaces must be present. Per the + // dispatch: "must FAIL if skills dropped" — assert skill + // files explicitly. + mustHaveKey(t, assets, "config.yaml") + mustHaveKey(t, assets, "prompts/system.md") + mustHaveKey(t, assets, "agent-skills/seo-audit/SKILL.md") + mustHaveKey(t, assets, "agent-skills/seo-audit/manifest.yaml") + if len(assets) != 4 { + t.Errorf("expected 4 assets, got %d: %v", len(assets), keysOf(assets)) + } +} + +// TestGiteaTemplateAssetFetcher_AllowsOnlyAllowlistedPaths pins +// the blast-radius guard. A .tar.gz that contains both +// allowlisted AND non-allowlisted paths (e.g. CLAUDE.md, +// MEMORY.md, .claude/sessions/foo) must have the non-allowlisted +// entries EXCLUDED from the returned map (the consumer's +// IsCPTemplateAssetPath check enforces the same invariant — +// the fetcher pre-filters as a perf + audit-log win). +func TestGiteaTemplateAssetFetcher_AllowsOnlyAllowlistedPaths(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/gzip") + gz := gzip.NewWriter(w) + tw := tar.NewWriter(gz) + // Allowlisted. + mustWriteTar(t, tw, "repo-sha/config.yaml", []byte("ok")) + mustWriteTar(t, tw, "repo-sha/prompts/x.md", []byte("ok")) + mustWriteTar(t, tw, "repo-sha/agent-skills/skill-x/SKILL.md", []byte("ok")) + // NOT allowlisted — must be excluded. + mustWriteTar(t, tw, "repo-sha/CLAUDE.md", []byte("agent-owned")) + mustWriteTar(t, tw, "repo-sha/MEMORY.md", []byte("agent-owned")) + mustWriteTar(t, tw, "repo-sha/USER.md", []byte("agent-owned")) + mustWriteTar(t, tw, "repo-sha/.claude/sessions/foo.json", []byte("agent-owned")) + mustWriteTar(t, tw, "repo-sha/adapter.py", []byte("not-template-asset")) + _ = tw.Close() + _ = gz.Close() + })) + defer srv.Close() + + f := NewGiteaTemplateAssetFetcher(srv.URL, "the-token", srv.Client()) + assets, err := f.Load(context.Background(), "owner/repo@main") + if err != nil { + t.Fatalf("Load: %v", err) + } + + // Allowlisted keys present. + mustHaveKey(t, assets, "config.yaml") + mustHaveKey(t, assets, "prompts/x.md") + mustHaveKey(t, assets, "agent-skills/skill-x/SKILL.md") + // Non-allowlisted keys EXCLUDED. + mustNotHaveKey(t, assets, "CLAUDE.md") + mustNotHaveKey(t, assets, "MEMORY.md") + mustNotHaveKey(t, assets, "USER.md") + mustNotHaveKey(t, assets, ".claude/sessions/foo.json") + mustNotHaveKey(t, assets, "adapter.py") + if len(assets) != 3 { + t.Errorf("expected 3 allowlisted assets, got %d: %v", len(assets), keysOf(assets)) + } +} + +// TestGiteaTemplateAssetFetcher_FailsClosedOnHTTPError pins +// the fail-closed contract. A non-200 response (401, 404, 500) +// returns an error, NOT a silently-empty result. +func TestGiteaTemplateAssetFetcher_FailsClosedOnHTTPError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + _, _ = w.Write([]byte(`{"message":"internal server error"}`)) + })) + defer srv.Close() + + f := NewGiteaTemplateAssetFetcher(srv.URL, "the-token", srv.Client()) + _, err := f.Load(context.Background(), "owner/repo@main") + if err == nil { + t.Fatal("expected error on 500 response, got nil (fail-closed violated)") + } + if !strings.Contains(err.Error(), "500") { + t.Errorf("error should mention the HTTP status, got: %v", err) + } +} + +// TestGiteaTemplateAssetFetcher_FailsClosedOnTransportError pins +// the fail-closed contract for transport-layer failures (DNS +// failure, connection refused, etc.). The fetcher must NOT +// silently return an empty map. +func TestGiteaTemplateAssetFetcher_FailsClosedOnTransportError(t *testing.T) { + // Use a port that nothing is listening on (reserved by IANA + // for "tcpmux"; 1 is "tcpmux" too). The dial will fail. + f := NewGiteaTemplateAssetFetcher("http://127.0.0.1:1", "the-token", &http.Client{Timeout: 100 * time.Millisecond}) + _, err := f.Load(context.Background(), "owner/repo@main") + if err == nil { + t.Fatal("expected error on transport failure, got nil (fail-closed violated)") + } +} + +// TestGiteaTemplateAssetFetcher_RejectsEmptyToken pins the +// security guard: an empty token (which would otherwise be +// sent as "Authorization: token " with no credential) is +// rejected at construction time. A forgotten token init would +// otherwise silently fail-against-anonymous-requests, which +// Gitea would 401 on — better to fail loud at Load time. +func TestGiteaTemplateAssetFetcher_RejectsEmptyToken(t *testing.T) { + f := NewGiteaTemplateAssetFetcher("http://example.com", "", nil) + _, err := f.Load(context.Background(), "owner/repo@main") + if err == nil { + t.Fatal("expected error on empty token, got nil (security guard violated)") + } + if !strings.Contains(err.Error(), "token") { + t.Errorf("error should mention token, got: %v", err) + } +} + +// TestParseTemplateIdentity pins the identity parser. Format: +// "/@". Malformed identities return errors. +func TestParseTemplateIdentity(t *testing.T) { + cases := []struct { + name string + identity string + wantOwner string + wantRepo string + wantRef string + wantErr bool + }{ + {"simple", "owner/repo@main", "owner", "repo", "main", false}, + {"with-sha-ref", "owner/repo@abcd1234", "owner", "repo", "abcd1234", false}, + {"with-tag-ref", "owner/repo@v1.2.3", "owner", "repo", "v1.2.3", false}, + {"nested-owner", "molecule-ai/workspace-template-seo@main", "molecule-ai", "workspace-template-seo", "main", false}, + {"empty", "", "", "", "", true}, + {"no-at", "owner/repo", "", "", "", true}, + {"empty-ref", "owner/repo@", "", "", "", true}, + {"no-slash", "owner@main", "", "", "", true}, + {"empty-owner", "/repo@main", "", "", "", true}, + {"empty-repo", "owner/@main", "", "", "", true}, + {"extra-at", "owner/repo@main@extra", "", "", "", true}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + owner, repo, ref, err := parseTemplateIdentity(c.identity) + if c.wantErr { + if err == nil { + t.Errorf("expected error for %q, got nil", c.identity) + } + return + } + if err != nil { + t.Errorf("unexpected error for %q: %v", c.identity, err) + return + } + if owner != c.wantOwner { + t.Errorf("owner = %q, want %q", owner, c.wantOwner) + } + if repo != c.wantRepo { + t.Errorf("repo = %q, want %q", repo, c.wantRepo) + } + if ref != c.wantRef { + t.Errorf("ref = %q, want %q", ref, c.wantRef) + } + }) + } +} + +// TestStripArchiveTopDir pins the top-level dir stripper. Gitea +// wraps entries in "-/"; we want just the +// relpath. Top-level entries (no slash) and traversal attempts +// (../) are rejected. +func TestStripArchiveTopDir(t *testing.T) { + cases := []struct { + name string + in string + wantOk bool + want string + }{ + {"normal", "repo-sha/config.yaml", true, "config.yaml"}, + {"nested", "repo-sha/prompts/system.md", true, "prompts/system.md"}, + {"deep", "repo-sha/agent-skills/seo/SKILL.md", true, "agent-skills/seo/SKILL.md"}, + {"top-level-no-slash", "config.yaml", false, ""}, + {"empty", "", false, ""}, + {"slash-only", "/", false, ""}, + {"dot-only", "repo-sha/.", false, ""}, + {"traversal", "repo-sha/../etc/passwd", false, ""}, + {"deep-traversal", "repo-sha/prompts/../../etc", false, ""}, + {"dotdot-in-name", "repo-sha/foo..bar", true, "foo..bar"}, // not a traversal, just an unusual filename + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got, ok := stripArchiveTopDir(c.in) + if ok != c.wantOk { + t.Errorf("ok = %v, want %v (input %q)", ok, c.wantOk, c.in) + return + } + if ok && got != c.want { + t.Errorf("got = %q, want %q (input %q)", got, c.want, c.in) + } + }) + } +} + +// ---- Test helpers ---- + +// mustWriteTar writes a single tar entry with the given name + +// data. Errors are reported via t (so the test fails cleanly). +// Wrapped in a helper to keep the test bodies focused on the +// content (not the tar boilerplate). +func mustWriteTar(t *testing.T, tw *tar.Writer, name string, data []byte) { + t.Helper() + hdr := &tar.Header{ + Name: name, + Mode: 0o644, + Size: int64(len(data)), + Typeflag: tar.TypeReg, + } + if err := tw.WriteHeader(hdr); err != nil { + t.Fatalf("tar WriteHeader %s: %v", name, err) + } + if _, err := tw.Write(data); err != nil { + t.Fatalf("tar Write %s: %v", name, err) + } +} + +// mustHaveKey asserts assets contains key. Failure reports the +// key + the full set so the test message is informative. +func mustHaveKey(t *testing.T, assets map[string][]byte, key string) { + t.Helper() + if _, ok := assets[key]; !ok { + t.Errorf("expected key %q in assets, got keys: %v", key, keysOf(assets)) + } +} + +// mustNotHaveKey asserts assets does NOT contain key. The +// blast-radius guard's load-bearing assertion. +func mustNotHaveKey(t *testing.T, assets map[string][]byte, key string) { + t.Helper() + if _, ok := assets[key]; ok { + t.Errorf("did NOT expect key %q in assets (not in allowlist), but it's there: %v", key, keysOf(assets)) + } +} + +// keysOf returns a stable-ish view of the map's keys for +// assertion error messages. Local helper to avoid coupling to +// test-fixture conventions elsewhere. +func keysOf(m map[string][]byte) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + return out +} + +// Compile-time check: archive/tar is imported for the test +// server's .tar.gz generation. (Avoids a "imported and not +// used" lint in case future refactors move the helper.) +var _ = tar.TypeReg + +// Compile-time check: bytes is imported for the test sentinel. +// Avoids unused-import in future refactors. +var _ = bytes.NewReader + +// Compile-time check: io is imported for io.LimitReader. +// Avoids unused-import in future refactors. +var _ = io.LimitReader