forked from molecule-ai/molecule-core
harden(org-external): token via http.extraHeader, .complete cache marker, ref '..' deny, naming cleanup
Self-review of molecule-core PR #105 + #106 (the !external resolver chain) surfaced 3 real correctness/security gaps and 2 readability nits. Fixes all four in one PR since they're the same file's hardening. (1) TOKEN LEAKAGE — fixed Before: gitFetcher built clone URLs with auth in userinfo (https://oauth2:TOKEN@host/repo.git). Two leak paths: a. Token persisted in cloned repo's .git/config b. Token could appear in clone error output captured via cmd.CombinedOutput() After: clone URL has no userinfo (https://host/repo.git). Auth is layered on via -c http.extraHeader=Authorization: token ... which sends the header per-request without persisting. Plus a redactToken() pass over any error string before it surfaces in fmt.Errorf, as belt-and-braces. Tradeoff: token now visible in 'ps aux' for the duration of the git child process (same as before via env var), but no longer in any persistent state. (2) CACHE-VALIDITY FOOTGUN — fixed Before: cache-hit was 'cacheDir/.git exists'. A clone interrupted after .git was created but before content finished writing would leave a partially-written cache that subsequent imports treated as hit, returning stale/incomplete content forever (no self-heal). After: cache-hit also requires a .complete marker file written only AFTER successful clone+rename. Partially-written cache is treated as cache-miss and re-fetched cleanly (after RemoveAll on the partial dir to avoid blocking the new clone's mkdir). (3) REF '..' DENY — fixed Before: safeRefPattern '^[a-zA-Z0-9_./-]+$' allowed '..' as a substring. Git itself rejects most refs containing '..', but defense-in-depth says don't depend on the downstream tool's validation when sanitizing input at the boundary. After: explicit strings.Contains(ref.Ref, '..') check. (4) NAMING CLEANUP — fixed Before: rewriteFilesDirAndIncludes() — name claims to rewrite !include scalars but doesn't (we removed that during PR-A development; double-prefix bug). Misleading for readers. After: rewriteFilesDir(). Docstring updated to explicitly explain why !include paths are NOT rewritten (relative to subDir, naturally inside cache). Also: removed unused buildAuthedURL() (replaced by buildExternalCloneURL + authConfigArgs split), removed unused shortHash() helper (replaced by os.MkdirTemp), removed unused crypto/sha1 + encoding/hex + fmt imports, removed stray '_ = fmt.Sprint' line in integration test. NEW TESTS - TestGitFetcher_RejectsRefWithDoubleDot (defense-in-depth on ref input) - TestGitFetcher_CacheValidatedByCompleteMarker (partial cache → re-fetch) VERIFIED LOCALLY 2026-05-08 Full ./internal/handlers/ suite: ok (7.8s, 14 external-resolver tests + all existing tests). Two new tests cover the two new behaviors. Refs: internal#77 — extraction RFC molecule-core#105 (resolver), #106 (tests) — original implementation Hongming code-review-and-quality skill invocation 2026-05-08 + 'fix all'
This commit is contained in:
parent
d9056db5b4
commit
c72d0a5383
@ -2,8 +2,6 @@ package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha1"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"os"
|
||||
@ -149,6 +147,11 @@ func resolveExternalMapping(n *yaml.Node, currentDir, rootDir string, visited ma
|
||||
if !safeRefPattern.MatchString(ref.Ref) {
|
||||
return fmt.Errorf("!external at line %d: ref %q contains disallowed characters", n.Line, ref.Ref)
|
||||
}
|
||||
// Defense-in-depth: even though git itself rejects refs containing
|
||||
// `..`, the regex above currently allows them. Reject explicitly.
|
||||
if strings.Contains(ref.Ref, "..") {
|
||||
return fmt.Errorf("!external at line %d: ref %q must not contain '..'", n.Line, ref.Ref)
|
||||
}
|
||||
if strings.Contains(ref.Path, "..") || strings.HasPrefix(ref.Path, "/") {
|
||||
return fmt.Errorf("!external at line %d: path %q must be relative-and-down-only", n.Line, ref.Path)
|
||||
}
|
||||
@ -225,7 +228,7 @@ func resolveExternalMapping(n *yaml.Node, currentDir, rootDir string, visited ma
|
||||
if err != nil {
|
||||
return fmt.Errorf("!external at line %d: cannot compute cache prefix: %w", n.Line, err)
|
||||
}
|
||||
rewriteFilesDirAndIncludes(root, cachePrefix)
|
||||
rewriteFilesDir(root, cachePrefix)
|
||||
|
||||
// Replace the !external mapping with the resolved content in-place.
|
||||
*n = *root
|
||||
@ -235,19 +238,20 @@ func resolveExternalMapping(n *yaml.Node, currentDir, rootDir string, visited ma
|
||||
return nil
|
||||
}
|
||||
|
||||
// rewriteFilesDirAndIncludes walks the yaml node tree and prepends
|
||||
// cachePrefix to every files_dir scalar value. Idempotent: if a
|
||||
// files_dir value already starts with the prefix, no-op.
|
||||
// rewriteFilesDir walks the yaml node tree and prepends cachePrefix to
|
||||
// every files_dir scalar value. Idempotent: if a files_dir value already
|
||||
// starts with the prefix, no-op.
|
||||
//
|
||||
// !include paths are NOT rewritten. They resolve relative to their
|
||||
// containing file's directory (subDir in expandNode), and after fetch
|
||||
// that directory IS inside the cache, so relative paths Just Work
|
||||
// without any rewrite. Rewriting them would double-prefix.
|
||||
// !include paths are intentionally NOT rewritten. They resolve relative
|
||||
// to their containing file's directory (subDir in expandNode), and after
|
||||
// fetch that directory IS inside the cache, so relative !include paths
|
||||
// Just Work without any rewrite. Rewriting them would double-prefix on
|
||||
// recursive resolution.
|
||||
//
|
||||
// files_dir DOES need rewriting because it's consumed at workspace-
|
||||
// provisioning time relative to orgBaseDir (the parent template's root),
|
||||
// not relative to the workspace.yaml's containing dir.
|
||||
func rewriteFilesDirAndIncludes(n *yaml.Node, cachePrefix string) {
|
||||
func rewriteFilesDir(n *yaml.Node, cachePrefix string) {
|
||||
if n == nil {
|
||||
return
|
||||
}
|
||||
@ -262,7 +266,7 @@ func rewriteFilesDirAndIncludes(n *yaml.Node, cachePrefix string) {
|
||||
}
|
||||
}
|
||||
for _, child := range n.Content {
|
||||
rewriteFilesDirAndIncludes(child, cachePrefix)
|
||||
rewriteFilesDir(child, cachePrefix)
|
||||
}
|
||||
}
|
||||
|
||||
@ -280,53 +284,110 @@ func safeRepoCacheDir(host, repoPath string) string {
|
||||
// clone the repo at ref into the cache dir. Cache key includes the
|
||||
// resolved SHA, so different SHAs of the same ref get different cache
|
||||
// dirs (no overwrite).
|
||||
//
|
||||
// Token handling — important for security. The auth token never enters
|
||||
// the clone URL (and therefore never lands in the cloned repo's
|
||||
// .git/config) and never appears in returned errors. We use git's
|
||||
// `http.extraHeader` config option (passed via `-c`), which sends an
|
||||
// Authorization header per-request without persisting it. The token is
|
||||
// briefly visible in the `git` process's argv (so other local users
|
||||
// with the same uid could see it via `ps`), which is the same exposure
|
||||
// it has via the env var that supplied it.
|
||||
//
|
||||
// Cache validity uses a `.complete` marker written after a successful
|
||||
// clone+rename. Cache-hit checks for the marker, not just the dir
|
||||
// existence — a partially-written cache (clone failed mid-way, or a
|
||||
// concurrent caller wrote a half-baked cache dir) is treated as cache
|
||||
// miss and re-fetched cleanly.
|
||||
type gitFetcher struct{}
|
||||
|
||||
// cacheCompleteMarker is the filename written after a successful clone.
|
||||
// Cache-hit requires this marker; without it, the cache dir is treated
|
||||
// as partially-written and re-fetched.
|
||||
const cacheCompleteMarker = ".complete"
|
||||
|
||||
// Fetch resolves ref → SHA via `git ls-remote`, then `git clone --depth=1`
|
||||
// if the cache dir is missing. Auth via MOLECULE_GITEA_TOKEN injected
|
||||
// into the URL.
|
||||
// if the cache dir is missing or incomplete. Auth via MOLECULE_GITEA_TOKEN
|
||||
// injected via http.extraHeader (never via URL).
|
||||
func (g *gitFetcher) Fetch(ctx context.Context, rootDir, host, repoPath, ref string) (string, string, error) {
|
||||
cacheRoot := filepath.Join(rootDir, externalCacheDirName, safeRepoCacheDir(host, repoPath))
|
||||
if err := os.MkdirAll(cacheRoot, 0o755); err != nil {
|
||||
return "", "", fmt.Errorf("mkdir cache root: %w", err)
|
||||
}
|
||||
|
||||
cloneURL := buildAuthedURL(host, repoPath)
|
||||
cloneURL := buildExternalCloneURL(host, repoPath)
|
||||
gitArgs := func(extra ...string) []string {
|
||||
args := authConfigArgs()
|
||||
return append(args, extra...)
|
||||
}
|
||||
|
||||
// 1. Resolve ref → SHA (so cache dir is content-addressable).
|
||||
sha, err := g.resolveRefToSHA(ctx, cloneURL, ref)
|
||||
sha, err := g.resolveRefToSHA(ctx, cloneURL, ref, gitArgs)
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("ls-remote: %w", err)
|
||||
return "", "", fmt.Errorf("ls-remote: %s", redactToken(err.Error()))
|
||||
}
|
||||
|
||||
cacheDir := filepath.Join(cacheRoot, sha)
|
||||
if _, statErr := os.Stat(filepath.Join(cacheDir, ".git")); statErr == nil {
|
||||
// Cache hit.
|
||||
// Cache-hit requires the .complete marker AND the .git dir.
|
||||
// Without the marker, cache is partially-written → treat as miss.
|
||||
if isCacheComplete(cacheDir) {
|
||||
return cacheDir, sha, nil
|
||||
}
|
||||
|
||||
// 2. Clone.
|
||||
tmpDir := cacheDir + ".tmp." + shortHash(time.Now().String())
|
||||
cmd := exec.CommandContext(ctx, "git", "clone", "--quiet", "--depth=1", "-b", ref, cloneURL, tmpDir)
|
||||
// Cache miss or partially-written — clean any stale cacheDir before
|
||||
// cloning (a previous broken attempt would otherwise block rename).
|
||||
os.RemoveAll(cacheDir)
|
||||
|
||||
// 2. Clone into a sibling tmp dir; atomic rename on success.
|
||||
tmpDir, err := os.MkdirTemp(cacheRoot, sha+".tmp.")
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("mkdir tmp: %w", err)
|
||||
}
|
||||
// MkdirTemp creates the dir; git clone refuses to clone into a
|
||||
// non-empty dir. Remove + recreate empty.
|
||||
os.RemoveAll(tmpDir)
|
||||
cloneAndConfig := append(gitArgs("clone", "--quiet", "--depth=1", "-b", ref, cloneURL, tmpDir))
|
||||
cmd := exec.CommandContext(ctx, "git", cloneAndConfig...)
|
||||
cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0")
|
||||
if out, err := cmd.CombinedOutput(); err != nil {
|
||||
os.RemoveAll(tmpDir)
|
||||
return "", "", fmt.Errorf("git clone: %w: %s", err, strings.TrimSpace(string(out)))
|
||||
return "", "", fmt.Errorf("git clone: %w: %s", err, redactToken(strings.TrimSpace(string(out))))
|
||||
}
|
||||
// Atomic rename to final cache path.
|
||||
if err := os.Rename(tmpDir, cacheDir); err != nil {
|
||||
// Race: another import beat us to it. The other party's content
|
||||
// is at the same SHA, so it's equivalent. Cleanup our tmp.
|
||||
|
||||
// Write the .complete marker BEFORE the rename. If rename succeeds,
|
||||
// the marker is in place. If rename loses the race (concurrent
|
||||
// fetcher won), our tmp gets cleaned up and we trust the winner.
|
||||
if err := os.WriteFile(filepath.Join(tmpDir, cacheCompleteMarker), []byte(time.Now().UTC().Format(time.RFC3339)), 0o644); err != nil {
|
||||
os.RemoveAll(tmpDir)
|
||||
if _, statErr := os.Stat(filepath.Join(cacheDir, ".git")); statErr != nil {
|
||||
return "", "", fmt.Errorf("rename clone to cache: %w", err)
|
||||
return "", "", fmt.Errorf("write complete marker: %w", err)
|
||||
}
|
||||
|
||||
if err := os.Rename(tmpDir, cacheDir); err != nil {
|
||||
// Race: another import beat us. Validate THEIR cache, accept it.
|
||||
os.RemoveAll(tmpDir)
|
||||
if isCacheComplete(cacheDir) {
|
||||
return cacheDir, sha, nil
|
||||
}
|
||||
return "", "", fmt.Errorf("rename clone to cache (and winner's cache is incomplete): %w", err)
|
||||
}
|
||||
return cacheDir, sha, nil
|
||||
}
|
||||
|
||||
func (g *gitFetcher) resolveRefToSHA(ctx context.Context, cloneURL, ref string) (string, error) {
|
||||
cmd := exec.CommandContext(ctx, "git", "ls-remote", cloneURL, ref)
|
||||
// isCacheComplete reports whether cacheDir contains both the cloned
|
||||
// repo (.git) and the .complete marker. Treats partial state as miss.
|
||||
func isCacheComplete(cacheDir string) bool {
|
||||
if _, err := os.Stat(filepath.Join(cacheDir, ".git")); err != nil {
|
||||
return false
|
||||
}
|
||||
if _, err := os.Stat(filepath.Join(cacheDir, cacheCompleteMarker)); err != nil {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (g *gitFetcher) resolveRefToSHA(ctx context.Context, cloneURL, ref string, gitArgs func(...string) []string) (string, error) {
|
||||
args := gitArgs("ls-remote", cloneURL, ref)
|
||||
cmd := exec.CommandContext(ctx, "git", args...)
|
||||
cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0")
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
@ -345,16 +406,34 @@ func (g *gitFetcher) resolveRefToSHA(ctx context.Context, cloneURL, ref string)
|
||||
return line, nil
|
||||
}
|
||||
|
||||
func buildAuthedURL(host, repoPath string) string {
|
||||
token := os.Getenv("MOLECULE_GITEA_TOKEN")
|
||||
// buildExternalCloneURL constructs the clone URL WITHOUT auth in userinfo.
|
||||
// Auth is layered on via authConfigArgs's http.extraHeader.
|
||||
func buildExternalCloneURL(host, repoPath string) string {
|
||||
u := url.URL{Scheme: "https", Host: host, Path: "/" + repoPath + ".git"}
|
||||
if token != "" {
|
||||
u.User = url.UserPassword("oauth2", token)
|
||||
}
|
||||
return u.String()
|
||||
}
|
||||
|
||||
func shortHash(s string) string {
|
||||
h := sha1.Sum([]byte(s))
|
||||
return hex.EncodeToString(h[:4])
|
||||
// authConfigArgs returns the `-c http.extraHeader=Authorization: token X`
|
||||
// args to pass to git, OR an empty slice if no token is set. The token
|
||||
// goes into the request headers (not the URL or .git/config), so it
|
||||
// doesn't persist on disk and doesn't appear in clone error output.
|
||||
func authConfigArgs() []string {
|
||||
token := os.Getenv("MOLECULE_GITEA_TOKEN")
|
||||
if token == "" {
|
||||
return nil
|
||||
}
|
||||
return []string{"-c", "http.extraHeader=Authorization: token " + token}
|
||||
}
|
||||
|
||||
// redactToken scrubs the auth token from a string before it's logged
|
||||
// or returned in an error. Belt-and-braces: with the http.extraHeader
|
||||
// approach the token shouldn't appear in git's output, but if some
|
||||
// future git version or libcurl debug mode emits it, this catches it.
|
||||
func redactToken(s string) string {
|
||||
token := os.Getenv("MOLECULE_GITEA_TOKEN")
|
||||
if token == "" || len(token) < 8 {
|
||||
return s
|
||||
}
|
||||
return strings.ReplaceAll(s, token, "<redacted-token>")
|
||||
}
|
||||
|
||||
|
||||
@ -2,7 +2,6 @@ package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
@ -290,6 +289,91 @@ func TestGitFetcher_DirectFetch_CacheHit(t *testing.T) {
|
||||
if _, err := os.Stat(stamp); err != nil {
|
||||
t.Errorf("cache hit not honored — stamp file disappeared: %v", err)
|
||||
}
|
||||
|
||||
_ = fmt.Sprint
|
||||
}
|
||||
|
||||
// TestGitFetcher_RejectsRefWithDoubleDot: defense-in-depth on ref input.
|
||||
// safeRefPattern allows '.' as a regex character, so ".." would match
|
||||
// without an explicit deny. Verify it's rejected even though git itself
|
||||
// would also reject the resulting clone.
|
||||
func TestGitFetcher_RejectsRefWithDoubleDot(t *testing.T) {
|
||||
rootDir := t.TempDir()
|
||||
src := []byte(`workspaces:
|
||||
- !external
|
||||
repo: molecule-ai/x
|
||||
ref: foo..bar
|
||||
path: x.yaml
|
||||
`)
|
||||
_, err := resolveYAMLIncludes(src, rootDir)
|
||||
if err == nil {
|
||||
t.Fatalf("expected '..' rejection")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "..") {
|
||||
t.Errorf("expected '..' in error; got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestGitFetcher_CacheValidatedByCompleteMarker: a partially-written
|
||||
// cache (the .git dir exists but no .complete marker) is treated as
|
||||
// cache-miss and re-fetched. Catches the broken-cache-permanence bug.
|
||||
func TestGitFetcher_CacheValidatedByCompleteMarker(t *testing.T) {
|
||||
if _, err := exec.LookPath("git"); err != nil {
|
||||
t.Skipf("git not found: %v", err)
|
||||
}
|
||||
if runtime.GOOS == "windows" {
|
||||
t.Skip("skipping on windows")
|
||||
}
|
||||
|
||||
fixtures := t.TempDir()
|
||||
barePath := filepath.Join(fixtures, "test.git")
|
||||
workPath := filepath.Join(fixtures, "w")
|
||||
mustGit(t, "", "init", "--bare", "-b", "main", barePath)
|
||||
mustGit(t, "", "clone", barePath, workPath)
|
||||
mustGit(t, workPath, "config", "user.email", "t@e")
|
||||
mustGit(t, workPath, "config", "user.name", "T")
|
||||
mustWriteFile(t, filepath.Join(workPath, "good.txt"), "from-network")
|
||||
mustGit(t, workPath, "add", ".")
|
||||
mustGit(t, workPath, "commit", "-m", "seed")
|
||||
mustGit(t, workPath, "push", "origin", "main")
|
||||
t.Setenv("GIT_CONFIG_COUNT", "1")
|
||||
t.Setenv("GIT_CONFIG_KEY_0", "url."+barePath+".insteadOf")
|
||||
t.Setenv("GIT_CONFIG_VALUE_0", "https://git.moleculesai.app/molecule-ai/marker-test.git")
|
||||
|
||||
rootDir := t.TempDir()
|
||||
g := &gitFetcher{}
|
||||
|
||||
// First fetch — populates the cache (creates .complete marker).
|
||||
cacheDir1, _, err := g.Fetch(context.Background(), rootDir, "git.moleculesai.app", "molecule-ai/marker-test", "main")
|
||||
if err != nil {
|
||||
t.Fatalf("first Fetch: %v", err)
|
||||
}
|
||||
marker := filepath.Join(cacheDir1, cacheCompleteMarker)
|
||||
if _, err := os.Stat(marker); err != nil {
|
||||
t.Fatalf("first fetch should have written .complete marker: %v", err)
|
||||
}
|
||||
|
||||
// Now simulate a partial cache: delete the marker but leave .git
|
||||
// in place. The next Fetch should treat this as cache-miss and
|
||||
// re-fetch (NOT silently use the partial cache).
|
||||
if err := os.Remove(marker); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
// Drop a sentinel file the second fetch will clobber if it re-fetches.
|
||||
sentinel := filepath.Join(cacheDir1, "_should_be_clobbered")
|
||||
if err := os.WriteFile(sentinel, []byte("partial"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
cacheDir2, _, err := g.Fetch(context.Background(), rootDir, "git.moleculesai.app", "molecule-ai/marker-test", "main")
|
||||
if err != nil {
|
||||
t.Fatalf("second Fetch: %v", err)
|
||||
}
|
||||
if cacheDir1 != cacheDir2 {
|
||||
t.Errorf("cache dirs differ across fetches: %q vs %q", cacheDir1, cacheDir2)
|
||||
}
|
||||
if _, err := os.Stat(filepath.Join(cacheDir2, cacheCompleteMarker)); err != nil {
|
||||
t.Errorf("re-fetch should have re-written .complete marker: %v", err)
|
||||
}
|
||||
if _, err := os.Stat(sentinel); err == nil {
|
||||
t.Errorf("sentinel still present — re-fetch did NOT clobber partial cache")
|
||||
}
|
||||
}
|
||||
|
||||
@ -243,11 +243,11 @@ func TestResolveExternalMapping_MissingRequiredFields(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestRewriteFilesDirAndIncludes: verify the path-rewrite walker
|
||||
// TestRewriteFilesDir: verify the path-rewrite walker
|
||||
// prefixes files_dir scalars. !include scalars are NOT rewritten —
|
||||
// they resolve relative to their containing file's dir, which post-
|
||||
// fetch is naturally inside the cache.
|
||||
func TestRewriteFilesDirAndIncludes(t *testing.T) {
|
||||
func TestRewriteFilesDir(t *testing.T) {
|
||||
src := `name: Foo
|
||||
files_dir: dev-lead
|
||||
children:
|
||||
@ -260,7 +260,7 @@ inner:
|
||||
if err := yaml.Unmarshal([]byte(src), &n); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
rewriteFilesDirAndIncludes(&n, ".external-cache/foo/bar")
|
||||
rewriteFilesDir(&n, ".external-cache/foo/bar")
|
||||
|
||||
out, err := yaml.Marshal(&n)
|
||||
if err != nil {
|
||||
@ -280,9 +280,9 @@ inner:
|
||||
}
|
||||
}
|
||||
|
||||
// TestRewriteFilesDirAndIncludes_Idempotent: re-running the rewriter
|
||||
// TestRewriteFilesDir_Idempotent: re-running the rewriter
|
||||
// on already-prefixed files_dir doesn't double-prefix.
|
||||
func TestRewriteFilesDirAndIncludes_Idempotent(t *testing.T) {
|
||||
func TestRewriteFilesDir_Idempotent(t *testing.T) {
|
||||
src := `files_dir: .external-cache/foo/bar/dev-lead
|
||||
inner:
|
||||
files_dir: .external-cache/foo/bar/dev-lead/sub
|
||||
@ -291,7 +291,7 @@ inner:
|
||||
if err := yaml.Unmarshal([]byte(src), &n); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
rewriteFilesDirAndIncludes(&n, ".external-cache/foo/bar")
|
||||
rewriteFilesDir(&n, ".external-cache/foo/bar")
|
||||
|
||||
out, _ := yaml.Marshal(&n)
|
||||
got := string(out)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user