harden(org-external): token via http.extraHeader, .complete cache marker, ref '..' deny, naming cleanup #107

Merged
claude-ceo-assistant merged 1 commits from fix/external-resolver-hardening into staging 2026-05-08 13:04:00 +00:00
3 changed files with 211 additions and 48 deletions

View File

@ -2,8 +2,6 @@ package handlers
import (
"context"
"crypto/sha1"
"encoding/hex"
"fmt"
"net/url"
"os"
@ -149,6 +147,11 @@ func resolveExternalMapping(n *yaml.Node, currentDir, rootDir string, visited ma
if !safeRefPattern.MatchString(ref.Ref) {
return fmt.Errorf("!external at line %d: ref %q contains disallowed characters", n.Line, ref.Ref)
}
// Defense-in-depth: even though git itself rejects refs containing
// `..`, the regex above currently allows them. Reject explicitly.
if strings.Contains(ref.Ref, "..") {
return fmt.Errorf("!external at line %d: ref %q must not contain '..'", n.Line, ref.Ref)
}
if strings.Contains(ref.Path, "..") || strings.HasPrefix(ref.Path, "/") {
return fmt.Errorf("!external at line %d: path %q must be relative-and-down-only", n.Line, ref.Path)
}
@ -225,7 +228,7 @@ func resolveExternalMapping(n *yaml.Node, currentDir, rootDir string, visited ma
if err != nil {
return fmt.Errorf("!external at line %d: cannot compute cache prefix: %w", n.Line, err)
}
rewriteFilesDirAndIncludes(root, cachePrefix)
rewriteFilesDir(root, cachePrefix)
// Replace the !external mapping with the resolved content in-place.
*n = *root
@ -235,19 +238,20 @@ func resolveExternalMapping(n *yaml.Node, currentDir, rootDir string, visited ma
return nil
}
// rewriteFilesDirAndIncludes walks the yaml node tree and prepends
// cachePrefix to every files_dir scalar value. Idempotent: if a
// files_dir value already starts with the prefix, no-op.
// rewriteFilesDir walks the yaml node tree and prepends cachePrefix to
// every files_dir scalar value. Idempotent: if a files_dir value already
// starts with the prefix, no-op.
//
// !include paths are NOT rewritten. They resolve relative to their
// containing file's directory (subDir in expandNode), and after fetch
// that directory IS inside the cache, so relative paths Just Work
// without any rewrite. Rewriting them would double-prefix.
// !include paths are intentionally NOT rewritten. They resolve relative
// to their containing file's directory (subDir in expandNode), and after
// fetch that directory IS inside the cache, so relative !include paths
// Just Work without any rewrite. Rewriting them would double-prefix on
// recursive resolution.
//
// files_dir DOES need rewriting because it's consumed at workspace-
// provisioning time relative to orgBaseDir (the parent template's root),
// not relative to the workspace.yaml's containing dir.
func rewriteFilesDirAndIncludes(n *yaml.Node, cachePrefix string) {
func rewriteFilesDir(n *yaml.Node, cachePrefix string) {
if n == nil {
return
}
@ -262,7 +266,7 @@ func rewriteFilesDirAndIncludes(n *yaml.Node, cachePrefix string) {
}
}
for _, child := range n.Content {
rewriteFilesDirAndIncludes(child, cachePrefix)
rewriteFilesDir(child, cachePrefix)
}
}
@ -280,53 +284,110 @@ func safeRepoCacheDir(host, repoPath string) string {
// clone the repo at ref into the cache dir. Cache key includes the
// resolved SHA, so different SHAs of the same ref get different cache
// dirs (no overwrite).
//
// Token handling — important for security. The auth token never enters
// the clone URL (and therefore never lands in the cloned repo's
// .git/config) and never appears in returned errors. We use git's
// `http.extraHeader` config option (passed via `-c`), which sends an
// Authorization header per-request without persisting it. The token is
// briefly visible in the `git` process's argv (so other local users
// with the same uid could see it via `ps`), which is the same exposure
// it has via the env var that supplied it.
//
// Cache validity uses a `.complete` marker written after a successful
// clone+rename. Cache-hit checks for the marker, not just the dir
// existence — a partially-written cache (clone failed mid-way, or a
// concurrent caller wrote a half-baked cache dir) is treated as cache
// miss and re-fetched cleanly.
type gitFetcher struct{}
// cacheCompleteMarker is the filename written after a successful clone.
// Cache-hit requires this marker; without it, the cache dir is treated
// as partially-written and re-fetched.
const cacheCompleteMarker = ".complete"
// Fetch resolves ref → SHA via `git ls-remote`, then `git clone --depth=1`
// if the cache dir is missing. Auth via MOLECULE_GITEA_TOKEN injected
// into the URL.
// if the cache dir is missing or incomplete. Auth via MOLECULE_GITEA_TOKEN
// injected via http.extraHeader (never via URL).
func (g *gitFetcher) Fetch(ctx context.Context, rootDir, host, repoPath, ref string) (string, string, error) {
cacheRoot := filepath.Join(rootDir, externalCacheDirName, safeRepoCacheDir(host, repoPath))
if err := os.MkdirAll(cacheRoot, 0o755); err != nil {
return "", "", fmt.Errorf("mkdir cache root: %w", err)
}
cloneURL := buildAuthedURL(host, repoPath)
cloneURL := buildExternalCloneURL(host, repoPath)
gitArgs := func(extra ...string) []string {
args := authConfigArgs()
return append(args, extra...)
}
// 1. Resolve ref → SHA (so cache dir is content-addressable).
sha, err := g.resolveRefToSHA(ctx, cloneURL, ref)
sha, err := g.resolveRefToSHA(ctx, cloneURL, ref, gitArgs)
if err != nil {
return "", "", fmt.Errorf("ls-remote: %w", err)
return "", "", fmt.Errorf("ls-remote: %s", redactToken(err.Error()))
}
cacheDir := filepath.Join(cacheRoot, sha)
if _, statErr := os.Stat(filepath.Join(cacheDir, ".git")); statErr == nil {
// Cache hit.
// Cache-hit requires the .complete marker AND the .git dir.
// Without the marker, cache is partially-written → treat as miss.
if isCacheComplete(cacheDir) {
return cacheDir, sha, nil
}
// 2. Clone.
tmpDir := cacheDir + ".tmp." + shortHash(time.Now().String())
cmd := exec.CommandContext(ctx, "git", "clone", "--quiet", "--depth=1", "-b", ref, cloneURL, tmpDir)
// Cache miss or partially-written — clean any stale cacheDir before
// cloning (a previous broken attempt would otherwise block rename).
os.RemoveAll(cacheDir)
// 2. Clone into a sibling tmp dir; atomic rename on success.
tmpDir, err := os.MkdirTemp(cacheRoot, sha+".tmp.")
if err != nil {
return "", "", fmt.Errorf("mkdir tmp: %w", err)
}
// MkdirTemp creates the dir; git clone refuses to clone into a
// non-empty dir. Remove + recreate empty.
os.RemoveAll(tmpDir)
cloneAndConfig := append(gitArgs("clone", "--quiet", "--depth=1", "-b", ref, cloneURL, tmpDir))
cmd := exec.CommandContext(ctx, "git", cloneAndConfig...)
cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0")
if out, err := cmd.CombinedOutput(); err != nil {
os.RemoveAll(tmpDir)
return "", "", fmt.Errorf("git clone: %w: %s", err, strings.TrimSpace(string(out)))
return "", "", fmt.Errorf("git clone: %w: %s", err, redactToken(strings.TrimSpace(string(out))))
}
// Atomic rename to final cache path.
if err := os.Rename(tmpDir, cacheDir); err != nil {
// Race: another import beat us to it. The other party's content
// is at the same SHA, so it's equivalent. Cleanup our tmp.
// Write the .complete marker BEFORE the rename. If rename succeeds,
// the marker is in place. If rename loses the race (concurrent
// fetcher won), our tmp gets cleaned up and we trust the winner.
if err := os.WriteFile(filepath.Join(tmpDir, cacheCompleteMarker), []byte(time.Now().UTC().Format(time.RFC3339)), 0o644); err != nil {
os.RemoveAll(tmpDir)
if _, statErr := os.Stat(filepath.Join(cacheDir, ".git")); statErr != nil {
return "", "", fmt.Errorf("rename clone to cache: %w", err)
return "", "", fmt.Errorf("write complete marker: %w", err)
}
if err := os.Rename(tmpDir, cacheDir); err != nil {
// Race: another import beat us. Validate THEIR cache, accept it.
os.RemoveAll(tmpDir)
if isCacheComplete(cacheDir) {
return cacheDir, sha, nil
}
return "", "", fmt.Errorf("rename clone to cache (and winner's cache is incomplete): %w", err)
}
return cacheDir, sha, nil
}
func (g *gitFetcher) resolveRefToSHA(ctx context.Context, cloneURL, ref string) (string, error) {
cmd := exec.CommandContext(ctx, "git", "ls-remote", cloneURL, ref)
// isCacheComplete reports whether cacheDir contains both the cloned
// repo (.git) and the .complete marker. Treats partial state as miss.
func isCacheComplete(cacheDir string) bool {
if _, err := os.Stat(filepath.Join(cacheDir, ".git")); err != nil {
return false
}
if _, err := os.Stat(filepath.Join(cacheDir, cacheCompleteMarker)); err != nil {
return false
}
return true
}
func (g *gitFetcher) resolveRefToSHA(ctx context.Context, cloneURL, ref string, gitArgs func(...string) []string) (string, error) {
args := gitArgs("ls-remote", cloneURL, ref)
cmd := exec.CommandContext(ctx, "git", args...)
cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0")
out, err := cmd.Output()
if err != nil {
@ -345,16 +406,34 @@ func (g *gitFetcher) resolveRefToSHA(ctx context.Context, cloneURL, ref string)
return line, nil
}
func buildAuthedURL(host, repoPath string) string {
token := os.Getenv("MOLECULE_GITEA_TOKEN")
// buildExternalCloneURL constructs the clone URL WITHOUT auth in userinfo.
// Auth is layered on via authConfigArgs's http.extraHeader.
func buildExternalCloneURL(host, repoPath string) string {
u := url.URL{Scheme: "https", Host: host, Path: "/" + repoPath + ".git"}
if token != "" {
u.User = url.UserPassword("oauth2", token)
}
return u.String()
}
func shortHash(s string) string {
h := sha1.Sum([]byte(s))
return hex.EncodeToString(h[:4])
// authConfigArgs returns the `-c http.extraHeader=Authorization: token X`
// args to pass to git, OR an empty slice if no token is set. The token
// goes into the request headers (not the URL or .git/config), so it
// doesn't persist on disk and doesn't appear in clone error output.
func authConfigArgs() []string {
token := os.Getenv("MOLECULE_GITEA_TOKEN")
if token == "" {
return nil
}
return []string{"-c", "http.extraHeader=Authorization: token " + token}
}
// redactToken scrubs the auth token from a string before it's logged
// or returned in an error. Belt-and-braces: with the http.extraHeader
// approach the token shouldn't appear in git's output, but if some
// future git version or libcurl debug mode emits it, this catches it.
func redactToken(s string) string {
token := os.Getenv("MOLECULE_GITEA_TOKEN")
if token == "" || len(token) < 8 {
return s
}
return strings.ReplaceAll(s, token, "<redacted-token>")
}

View File

@ -2,7 +2,6 @@ package handlers
import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
@ -290,6 +289,91 @@ func TestGitFetcher_DirectFetch_CacheHit(t *testing.T) {
if _, err := os.Stat(stamp); err != nil {
t.Errorf("cache hit not honored — stamp file disappeared: %v", err)
}
_ = fmt.Sprint
}
// TestGitFetcher_RejectsRefWithDoubleDot: defense-in-depth on ref input.
// safeRefPattern allows '.' as a regex character, so ".." would match
// without an explicit deny. Verify it's rejected even though git itself
// would also reject the resulting clone.
func TestGitFetcher_RejectsRefWithDoubleDot(t *testing.T) {
rootDir := t.TempDir()
src := []byte(`workspaces:
- !external
repo: molecule-ai/x
ref: foo..bar
path: x.yaml
`)
_, err := resolveYAMLIncludes(src, rootDir)
if err == nil {
t.Fatalf("expected '..' rejection")
}
if !strings.Contains(err.Error(), "..") {
t.Errorf("expected '..' in error; got %v", err)
}
}
// TestGitFetcher_CacheValidatedByCompleteMarker: a partially-written
// cache (the .git dir exists but no .complete marker) is treated as
// cache-miss and re-fetched. Catches the broken-cache-permanence bug.
func TestGitFetcher_CacheValidatedByCompleteMarker(t *testing.T) {
if _, err := exec.LookPath("git"); err != nil {
t.Skipf("git not found: %v", err)
}
if runtime.GOOS == "windows" {
t.Skip("skipping on windows")
}
fixtures := t.TempDir()
barePath := filepath.Join(fixtures, "test.git")
workPath := filepath.Join(fixtures, "w")
mustGit(t, "", "init", "--bare", "-b", "main", barePath)
mustGit(t, "", "clone", barePath, workPath)
mustGit(t, workPath, "config", "user.email", "t@e")
mustGit(t, workPath, "config", "user.name", "T")
mustWriteFile(t, filepath.Join(workPath, "good.txt"), "from-network")
mustGit(t, workPath, "add", ".")
mustGit(t, workPath, "commit", "-m", "seed")
mustGit(t, workPath, "push", "origin", "main")
t.Setenv("GIT_CONFIG_COUNT", "1")
t.Setenv("GIT_CONFIG_KEY_0", "url."+barePath+".insteadOf")
t.Setenv("GIT_CONFIG_VALUE_0", "https://git.moleculesai.app/molecule-ai/marker-test.git")
rootDir := t.TempDir()
g := &gitFetcher{}
// First fetch — populates the cache (creates .complete marker).
cacheDir1, _, err := g.Fetch(context.Background(), rootDir, "git.moleculesai.app", "molecule-ai/marker-test", "main")
if err != nil {
t.Fatalf("first Fetch: %v", err)
}
marker := filepath.Join(cacheDir1, cacheCompleteMarker)
if _, err := os.Stat(marker); err != nil {
t.Fatalf("first fetch should have written .complete marker: %v", err)
}
// Now simulate a partial cache: delete the marker but leave .git
// in place. The next Fetch should treat this as cache-miss and
// re-fetch (NOT silently use the partial cache).
if err := os.Remove(marker); err != nil {
t.Fatal(err)
}
// Drop a sentinel file the second fetch will clobber if it re-fetches.
sentinel := filepath.Join(cacheDir1, "_should_be_clobbered")
if err := os.WriteFile(sentinel, []byte("partial"), 0o644); err != nil {
t.Fatal(err)
}
cacheDir2, _, err := g.Fetch(context.Background(), rootDir, "git.moleculesai.app", "molecule-ai/marker-test", "main")
if err != nil {
t.Fatalf("second Fetch: %v", err)
}
if cacheDir1 != cacheDir2 {
t.Errorf("cache dirs differ across fetches: %q vs %q", cacheDir1, cacheDir2)
}
if _, err := os.Stat(filepath.Join(cacheDir2, cacheCompleteMarker)); err != nil {
t.Errorf("re-fetch should have re-written .complete marker: %v", err)
}
if _, err := os.Stat(sentinel); err == nil {
t.Errorf("sentinel still present — re-fetch did NOT clobber partial cache")
}
}

View File

@ -243,11 +243,11 @@ func TestResolveExternalMapping_MissingRequiredFields(t *testing.T) {
}
}
// TestRewriteFilesDirAndIncludes: verify the path-rewrite walker
// TestRewriteFilesDir: verify the path-rewrite walker
// prefixes files_dir scalars. !include scalars are NOT rewritten —
// they resolve relative to their containing file's dir, which post-
// fetch is naturally inside the cache.
func TestRewriteFilesDirAndIncludes(t *testing.T) {
func TestRewriteFilesDir(t *testing.T) {
src := `name: Foo
files_dir: dev-lead
children:
@ -260,7 +260,7 @@ inner:
if err := yaml.Unmarshal([]byte(src), &n); err != nil {
t.Fatal(err)
}
rewriteFilesDirAndIncludes(&n, ".external-cache/foo/bar")
rewriteFilesDir(&n, ".external-cache/foo/bar")
out, err := yaml.Marshal(&n)
if err != nil {
@ -280,9 +280,9 @@ inner:
}
}
// TestRewriteFilesDirAndIncludes_Idempotent: re-running the rewriter
// TestRewriteFilesDir_Idempotent: re-running the rewriter
// on already-prefixed files_dir doesn't double-prefix.
func TestRewriteFilesDirAndIncludes_Idempotent(t *testing.T) {
func TestRewriteFilesDir_Idempotent(t *testing.T) {
src := `files_dir: .external-cache/foo/bar/dev-lead
inner:
files_dir: .external-cache/foo/bar/dev-lead/sub
@ -291,7 +291,7 @@ inner:
if err := yaml.Unmarshal([]byte(src), &n); err != nil {
t.Fatal(err)
}
rewriteFilesDirAndIncludes(&n, ".external-cache/foo/bar")
rewriteFilesDir(&n, ".external-cache/foo/bar")
out, _ := yaml.Marshal(&n)
got := string(out)