fix(clone-manifest): don't block tokenless dev bootstrap on private templates #3193

Merged
agent-reviewer-cr2 merged 2 commits from fix/clone-manifest-tolerant-tokenless-bootstrap into main 2026-06-23 23:00:33 +00:00
4 changed files with 236 additions and 21 deletions
+10
View File
@@ -114,3 +114,13 @@ jobs:
# contract (e.g. accepting skipped/failure as success, or re-inlining
# the logic into ci.yml) fails CI.
run: bash .gitea/scripts/tests/test_all_required_check.sh
- name: clone-manifest.sh tolerant-bootstrap contract (network-free)
# Regression lock for the strict/best-effort split in clone-manifest.sh:
# tokenless dev bootstrap must skip ONLY `private: true` repos and still
# hard-fail on any public-repo clone failure (so a real outage / bad ref
# is never swallowed as a missing-creds skip). git-stubbed, no network.
run: |
set -euo pipefail
command -v jq >/dev/null 2>&1 || { sudo apt-get update -qq && sudo apt-get install -y -qq jq; }
sh scripts/test-clone-manifest-tolerant.sh
+4 -4
View File
@@ -1,5 +1,5 @@
{
"_comment": "Platform template registry. Repos may be public or platform-private; CI and runtime template-cache refresh clone them with the SSOT-managed template read token, then strip .git metadata before use. Customer/private tenant templates remain outside this platform manifest.",
"_comment": "Platform template registry. Repos may be public or platform-private; CI and runtime template-cache refresh clone them with the SSOT-managed template read token, then strip .git metadata before use. Customer/private tenant templates remain outside this platform manifest. An entry with `\"private\": true` requires MOLECULE_GITEA_TOKEN to clone — clone-manifest.sh's best-effort (tokenless) mode SKIPS exactly these and still hard-fails on any UNMARKED (public) repo that fails to clone, so a real outage / bad ref / deleted public repo is never silently swallowed as a missing-creds skip.",
"_pinning_contract": "RFC #2927 — every entry's `ref` is pinned to an immutable commit SHA (not a branch like `main` and not a mutable tag). The previous `ref:main` exposure made provisioning non-reproducible — a merge to ANY template's `main` instantly reached every subsequent provision. Pinning restores: (a) reproducible identity (same SHA → same config.yaml + prompts + skills on every boot); (b) auditable provenance (the SHA is the artifact's content-address); (c) explicit upgrades (bumping a pin is a reviewed PR, not silent). CI test TestManifest_RefPinningCompleteness (workspace-server/internal/handlers/manifest_pinning_test.go) asserts the pinning contract: (1) every ref is a 40-char commit SHA, (2) every pinned SHA is reachable in the named repo, (3) workspace_template entries include config.yaml in the pinned ref's tree. To bump a pin: PR with the new SHA, tests run, driver reviews the diff. PLATFORM-AGENT is now pinned (its config.yaml exists at the pinned SHA): clone-manifest.sh stages it at .tenant-bundle-deps/workspace-configs-templates/platform-agent, which Dockerfile.platform-agent COPYs to bake the concierge identity into the molecule-platform-agent image (publish-workspace-server-image.yml). The concierge (kind=platform) provisions on that image (core#2495).",
"version": 1,
"plugins": [
@@ -30,9 +30,9 @@
{"name": "hermes", "repo": "molecule-ai/molecule-ai-workspace-template-hermes", "ref": "ca7e1efafb982f6d97a6a188067fd9198b2f18b7"},
{"name": "openclaw", "repo": "molecule-ai/molecule-ai-workspace-template-openclaw", "ref": "143e69b56f2530433141f5a87373e8a76578c52e"},
{"name": "codex", "repo": "molecule-ai/molecule-ai-workspace-template-codex", "ref": "070447a0afdf66ae6f2bb166ac3e2b2884456951"},
{"name": "google-adk", "repo": "molecule-ai/molecule-ai-workspace-template-google-adk", "ref": "3f9fd7ef6ea4dd912bb65446607f3c3c991ea76e"},
{"name": "seo-agent", "repo": "molecule-ai/molecule-ai-workspace-template-seo-agent", "ref": "f6a18eb4716a040fb24c5fb79830c50c5368a2da", "runtime": "claude-code"},
{"name": "platform-agent", "repo": "molecule-ai/molecule-ai-workspace-template-platform-agent", "ref": "776916df9ada9c03b761f9485b2ed0933a9c9140", "runtime": "claude-code"}
{"name": "google-adk", "repo": "molecule-ai/molecule-ai-workspace-template-google-adk", "ref": "3f9fd7ef6ea4dd912bb65446607f3c3c991ea76e", "private": true},
{"name": "seo-agent", "repo": "molecule-ai/molecule-ai-workspace-template-seo-agent", "ref": "f6a18eb4716a040fb24c5fb79830c50c5368a2da", "runtime": "claude-code", "private": true},
{"name": "platform-agent", "repo": "molecule-ai/molecule-ai-workspace-template-platform-agent", "ref": "776916df9ada9c03b761f9485b2ed0933a9c9140", "runtime": "claude-code", "private": true}
],
"org_templates": [
{"name": "molecule-dev", "repo": "molecule-ai/molecule-ai-org-template-molecule-dev", "ref": "990d7b23f65dadd7afe05958a77eeb74082b4feb"},
+95 -17
View File
@@ -7,11 +7,20 @@
#
# Requires: git, jq (lighter than python3 — ~2MB vs ~50MB in Alpine)
#
# Auth (optional):
# Repos in manifest.json may be public or platform-private. CI and
# operator refresh jobs should set MOLECULE_GITEA_TOKEN to the
# SSOT-managed template read token. Anonymous clone still works for
# public entries, but private platform templates depend on the token.
# Auth (optional) — two modes, keyed on MOLECULE_GITEA_TOKEN:
# STRICT (token set; CI / operator refresh): the token grants access to
# the private platform templates, so ANY clone failure is a genuine
# error and aborts (exit 1). This is the build-correctness path.
# BEST-EFFORT (no token; ecosystem contributor via setup.sh/dev-start.sh):
# a contributor shouldn't need creds to spin up a local dev env. Clone
# what's public; SKIP (with a warning) ONLY repos the manifest marks
# `"private": true` — those need a token. A failure of any UNMARKED
# (public) repo still ABORTS (exit 1), so a bad ref / deleted repo /
# network outage is never swallowed as a missing-creds skip. Exit 0 when
# the only failures were private skips. The palette is then sparse but
# the platform runs.
# Set MOLECULE_GITEA_TOKEN to the SSOT-managed template read token to
# populate the full set.
#
# The token (when set) never enters the Docker image: this script runs
# in the trusted CI context BEFORE `docker buildx build`, populates
@@ -38,6 +47,24 @@ MANIFEST_JSON="$(_strip_comments)"
EXPECTED=0
CLONED=0
SKIPPED=0
# Operating mode — strict vs best-effort, keyed on MOLECULE_GITEA_TOKEN.
#
# STRICT=1 — token set (CI / operator refresh). The token grants access to
# the private platform templates, so ANY clone failure is a genuine error
# and must fail the build.
#
# STRICT=0 — no token (ecosystem contributor running
# infra/scripts/setup.sh → dev-start.sh). The private platform templates
# (seo-agent, platform-agent, google-adk — internal IP) are simply not
# fetchable, and hard-failing on them blocked local bootstrap on creds a
# contributor doesn't have (and shouldn't need). In this mode we clone what
# is public and SKIP, with a warning, ONLY the entries the manifest marks
# `"private": true`; when those are the only failures we exit 0 — the
# Canvas template palette is then sparse but the platform runs. Any
# UNMARKED (public) repo that fails to clone still aborts with exit 1
# (real network / manifest breakage, not just missing creds).
case "${MOLECULE_GITEA_TOKEN:-}" in
  '') STRICT=0 ;;
  *)  STRICT=1 ;;
esac
# clone_one_with_retry — clone a single repo, retrying on transient failure.
#
@@ -56,10 +83,13 @@ CLONED=0
# The durable fix is more runner RAM/swap (tracked with Infra-SRE); this
# just stops a single flake from being release-blocking.
#
# Args: <target_dir> <name> <clone_url> <display_url> <ref>
# Args: <target_dir> <name> <clone_url> <display_url> <ref> [max_attempts]
# max_attempts defaults to 3 (CI: retry transient SIGKILL/network flakes).
# Best-effort callers pass 1 — a tokenless private-repo clone fails on auth,
# not a transient flake, so retrying just wastes the backoff window.
clone_one_with_retry() {
local tdir="$1" name="$2" url="$3" display="$4" ref="$5"
local attempt=1 max_attempts=3 backoff
local attempt=1 max_attempts="${6:-3}" backoff
while : ; do
# A killed attempt can leave a partial directory behind; git clone
@@ -85,7 +115,11 @@ clone_one_with_retry() {
fi
if [ "$attempt" -ge "$max_attempts" ]; then
echo "::error::clone failed after ${max_attempts} attempts: ${display}" >&2
# Single-attempt best-effort callers handle their own (friendlier)
# messaging; only the retrying CI path emits the ::error:: annotation.
if [ "$max_attempts" -gt 1 ]; then
echo "::error::clone failed after ${max_attempts} attempts: ${display}" >&2
fi
return 1
fi
backoff=$((attempt * 3)) # 3s, then 6s
@@ -107,10 +141,15 @@ clone_category() {
local i=0
while [ "$i" -lt "$count" ]; do
local name repo ref
local name repo ref private
name=$(echo "$MANIFEST_JSON" | jq -r ".${category}[$i].name")
repo=$(echo "$MANIFEST_JSON" | jq -r ".${category}[$i].repo")
ref=$(echo "$MANIFEST_JSON" | jq -r ".${category}[$i].ref // \"main\"")
# `private: true` marks repos that REQUIRE a token to clone. Only
# these may be skipped in best-effort (tokenless) mode; an unmarked
# (public) repo that fails is a genuine error and must fail the run
# even without a token. (manifest.json _comment.)
private=$(echo "$MANIFEST_JSON" | jq -r ".${category}[$i].private // false")
# Idempotent: skip if the target already looks populated. Lets the
# README quickstart rerun setup.sh safely without having to delete
@@ -140,8 +179,33 @@ clone_category() {
fi
echo " cloning $display_url -> $target_dir/$name (ref=$ref)"
clone_one_with_retry "$target_dir" "$name" "$clone_url" "$display_url" "$ref"
CLONED=$((CLONED + 1))
if [ "$STRICT" -eq 1 ]; then
# Token present → genuine clone. Retry transient flakes; a final
# failure is a real error and must abort the build.
if clone_one_with_retry "$target_dir" "$name" "$clone_url" "$display_url" "$ref" 3; then
CLONED=$((CLONED + 1))
else
echo "::error::clone failed for '$name' ($display_url) with MOLECULE_GITEA_TOKEN set — genuine failure, not a missing-creds skip" >&2
exit 1
fi
else
# No token → best effort. A failure is only TOLERATED for a repo
# explicitly marked `private: true` (needs creds we don't have).
# A failure of any UNMARKED (public) repo — bad ref, deleted repo,
# DNS/network outage, git regression, Gitea non-auth error — is a
# GENUINE error and must still abort, so a real outage can never be
# silently swallowed as a missing-creds skip.
if clone_one_with_retry "$target_dir" "$name" "$clone_url" "$display_url" "$ref" 1; then
CLONED=$((CLONED + 1))
elif [ "$private" = "true" ]; then
echo " ⚠ skipping '$name' — marked private and MOLECULE_GITEA_TOKEN is unset (set the token to include it). Bootstrap continues with a reduced template palette." >&2
SKIPPED=$((SKIPPED + 1))
rm -rf "$target_dir/$name" # drop any partial dir so a later token-backed run re-clones cleanly
else
echo "::error::clone failed for PUBLIC repo '$name' ($display_url) — genuine failure (not a missing-creds skip). Check the manifest ref / network / repo existence. (If this repo is actually private, mark it \"private\": true in manifest.json.)" >&2
exit 1
fi
fi
i=$((i + 1))
done
@@ -158,10 +222,24 @@ clone_category "org_templates" "$ORG_DIR"
echo "==> Cloning plugins..."
clone_category "plugins" "$PLUGINS_DIR"
# Verify all repos were cloned
if [ "$CLONED" -ne "$EXPECTED" ]; then
echo "::error::Expected $EXPECTED repos but only cloned $CLONED — some clones failed"
exit 1
# Verify the outcome and print the run summary.
# Reads STRICT, CLONED, EXPECTED and SKIPPED, all set earlier in this script.
if [ "$STRICT" -eq 1 ]; then
  # Token present: every expected repo must have cloned.
  if [ "$CLONED" -ne "$EXPECTED" ]; then
    # Diagnostic goes to stderr, matching the ::error:: annotations emitted
    # by the clone loop.
    echo "::error::Expected $EXPECTED repos but only cloned $CLONED — some clones failed" >&2
    exit 1
  fi
  echo "==> Done. $CLONED/$EXPECTED repos cloned successfully."
else
  # No token, and we got here — so every failure was a tolerated `private`
  # skip (any PUBLIC-repo failure would already have exited 1 above). A real
  # outage can't reach this point: it fails the public clones first. setup.sh
  # tolerates an empty palette (the platform falls through to a bare
  # default), so we exit 0. CLONED==0 here just means every manifest entry
  # was marked private — warn loudly, but still don't block bootstrap.
  if [ "$CLONED" -eq 0 ] && [ "$EXPECTED" -gt 0 ]; then
    echo " ⚠ WARNING: 0/$EXPECTED template/plugin repos cloned ($SKIPPED private, skipped) — every manifest entry needs MOLECULE_GITEA_TOKEN. The platform will start with an EMPTY template palette. Set the token to populate it." >&2
  else
    echo "==> Done (best-effort, no MOLECULE_GITEA_TOKEN). $CLONED/$EXPECTED cloned, $SKIPPED skipped (marked private; set the token to include them)."
  fi
fi
echo "==> Done. $CLONED/$EXPECTED repos cloned successfully."
+127
View File
@@ -0,0 +1,127 @@
#!/bin/sh
# test-clone-manifest-tolerant.sh — local, network-free test of the
# strict/best-effort behavior in clone-manifest.sh.
#
# Stubs `git` so a clone fails for a repo listed in $PRIVATE_REPOS when the
# URL is anonymous (no oauth2: userinfo) and for any repo in $HARD_FAIL_REPOS
# (fails even with a token) — mirroring real Gitea. The SKIP decision in
# clone-manifest.sh is driven by the manifest's `"private": true` flag, NOT by
# which repo the stub fails, so these tests prove the safety boundary:
#
# A. no token → public clone, MARKED-private skip, exit 0
# B. token set → every repo clones, exit 0
# C. token set + genuine failure → exit 1 (strict)
# E. no token + a PUBLIC (unmarked) repo hard-fails → exit 1
# (the key negative case: best-effort must NOT swallow public failures)
# D. no token + EVERY entry marked private → exit 0 + empty-palette warning
#
# Run: sh scripts/test-clone-manifest-tolerant.sh
# Exit: 0 all pass, 1 otherwise.
set -eu

# Locate the script under test next to this file, so the suite can be run
# from any working directory.
HERE=$(dirname -- "$0")
HERE=$(cd -- "$HERE" && pwd)
CLONE_SH="$HERE/clone-manifest.sh"

# Scratch area: stub binaries, the manifest and every clone target live
# under $WORK, which is removed when the script exits.
WORK=$(mktemp -d)
trap 'rm -rf "$WORK"' EXIT
mkdir -p "$WORK/bin"

# git stub. Any subcommand other than `clone` is a successful no-op.
# `clone` takes the last https:// argument as the URL and the final argument
# as the target directory, and fails when:
#   - no https:// URL was passed (reported explicitly; previously an empty
#     repo name could accidentally match an unset HARD_FAIL_REPOS list), OR
#   - repo basename is in $HARD_FAIL_REPOS (fails even with a token), OR
#   - repo basename is in $PRIVATE_REPOS and the URL is anonymous
#     (no `oauth2:` userinfo), OR
#   - the target directory / marker file cannot be created (previously
#     masked by the unconditional `exit 0`).
# On success it creates <target>/STUB so cases can tell a clone happened.
cat > "$WORK/bin/git" <<'STUB'
#!/bin/sh
[ "${1:-}" = "clone" ] || exit 0
url=""; target=""
for a in "$@"; do
  case "$a" in https://*) url="$a" ;; esac
  target="$a"
done
if [ -z "$url" ]; then
  echo "git stub: no https:// URL in: $*" >&2
  exit 1
fi
repo=$(basename "$url" .git)
case " ${HARD_FAIL_REPOS:-} " in *" $repo "*) exit 1 ;; esac
authed=0; case "$url" in *oauth2:*) authed=1 ;; esac
case " ${PRIVATE_REPOS:-} " in
  *" $repo "*) [ "$authed" -eq 1 ] || exit 1 ;;
esac
mkdir -p "$target" || exit 1
echo stub > "$target/STUB" || exit 1
exit 0
STUB
chmod +x "$WORK/bin/git"

# sleep stub. clone-manifest.sh's retry loop computes a 3s-then-6s backoff
# between attempts, and case C (token set, clone always failing) walks all
# three attempts. Assuming that backoff is a plain `sleep` resolved through
# PATH (TODO confirm), this no-op keeps the suite from idling ~9s; it is
# harmless if the script never sleeps.
cat > "$WORK/bin/sleep" <<'STUB'
#!/bin/sh
exit 0
STUB
chmod +x "$WORK/bin/sleep"

# write_manifest — replace the manifest under test with whatever is on stdin.
write_manifest() { cat > "$WORK/manifest.json"; }

# Default manifest: 2 public + 1 MARKED-private workspace template.
write_manifest <<'JSON'
{
"version": 1, "plugins": [], "org_templates": [],
"workspace_templates": [
{"name": "pub-a", "repo": "molecule-ai/pub-a", "ref": "main"},
{"name": "priv-x", "repo": "molecule-ai/priv-x", "ref": "main", "private": true},
{"name": "pub-b", "repo": "molecule-ai/pub-b", "ref": "main"}
]
}
JSON
# fail <message> — report a failed assertion on stderr and abort the whole
# test run with exit status 1.
# printf replaces echo because messages embed captured clone-manifest.sh
# output, and some `sh` implementations make echo interpret backslash
# sequences in its arguments.
fail() {
  printf 'FAIL: %s\n' "$1" >&2
  exit 1
}
# run <env opts/assignments...> — execute clone-manifest.sh on the test
# manifest with the stub directory first on PATH.
#   Arguments: passed straight to env(1). env opts (-u) must precede
#              NAME=VALUE assignments (BSD env), so "$@" goes first.
#   Sets:      OUT — combined stdout+stderr of the script
#              rc  — the script's exit status
#   Writes:    $WORK/last.out — copy of OUT
#   Returns:   the script's exit status.
run() {
  rc=0
  # Capture the status with `||` so a non-zero exit never trips `set -e`
  # here: rc and last.out are recorded even when run is called outside a
  # conditional.
  OUT=$(env "$@" PATH="$WORK/bin:$PATH" \
    sh "$CLONE_SH" "$WORK/manifest.json" "$WORK/ws" "$WORK/org" "$WORK/plugins" 2>&1) || rc=$?
  printf '%s\n' "$OUT" > "$WORK/last.out"
  return "$rc"
}
# reset — delete the clone targets of the previous case so the next one
# starts from an empty tree (clone-manifest.sh skips targets that already
# look populated). ${WORK:?} aborts instead of expanding to /ws, /org and
# /plugins if WORK is ever empty or unset.
reset() {
  rm -rf -- "${WORK:?}/ws" "${WORK:?}/org" "${WORK:?}/plugins"
}
# --- A. no token → marked-private skipped, public cloned, exit 0 ----------
# Tokenless run over the default manifest: both public entries must clone,
# priv-x must be skipped with a warning, and the summary must say 2/3.
reset
run -u MOLECULE_GITEA_TOKEN PRIVATE_REPOS="priv-x" \
  || fail "A: tokenless run should exit 0 (got $?)"
for pub in pub-a pub-b; do
  [ -f "$WORK/ws/$pub/STUB" ] || fail "A: public repos not cloned"
done
if [ -d "$WORK/ws/priv-x" ]; then
  fail "A: private repo should have been skipped"
fi
case "$OUT" in
  *"skipping 'priv-x'"*) ;;
  *) fail "A: missing skip warning for priv-x" ;;
esac
case "$OUT" in
  *"2/3 cloned, 1 skipped"*) ;;
  *) fail "A: summary wrong: $(echo "$OUT" | tail -1)" ;;
esac
echo "ok A: tokenless → 2 public cloned, 1 marked-private skipped, exit 0"
# --- B. token set → all clone, exit 0 ------------------------------------
# With a token the stub lets the private repo through, so all three entries
# must clone and the strict-mode summary must report 3/3.
reset
if ! run MOLECULE_GITEA_TOKEN=tok PRIVATE_REPOS="priv-x"; then
  fail "B: tokened run should exit 0"
fi
if [ ! -f "$WORK/ws/priv-x/STUB" ]; then
  fail "B: private repo should clone with token"
fi
case "$OUT" in
  *"3/3 repos cloned successfully"*) ;;
  *) fail "B: summary wrong: $(echo "$OUT" | tail -1)" ;;
esac
echo "ok B: with token → all 3 cloned, exit 0"
# --- C. token set + genuine failure → exit 1 -----------------------------
# pub-b fails even though a token is present, so strict mode must abort.
# The status is pinned to exactly 1 (the script's explicit `exit 1`), so an
# unrelated crash with some other non-zero status cannot pass as the
# expected strict failure.
reset
if run MOLECULE_GITEA_TOKEN=tok HARD_FAIL_REPOS=pub-b; then
  fail "C: a genuine clone failure with token set must exit 1"
else
  got=$?
  [ "$got" -eq 1 ] || fail "C: expected exit 1 from clone-manifest.sh, got $got"
fi
echo "$OUT" | grep -q "genuine failure" || fail "C: missing genuine-failure error"
echo "ok C: with token + real failure → exit 1 (strict preserved)"
# --- E. no token + PUBLIC repo hard-fails → exit 1 (the key boundary) -----
# pub-a clones, priv-x is skipped (marked private), pub-b fails and is NOT
# marked private → must abort. Proves best-effort does NOT swallow a genuine
# public failure (bad ref / deleted repo / outage), even after a success.
# The status is pinned to exactly 1, and the priv-x skip warning is asserted
# so the case proves the skip happened before the abort, as described.
reset
if run -u MOLECULE_GITEA_TOKEN PRIVATE_REPOS="priv-x" HARD_FAIL_REPOS=pub-b; then
  fail "E: tokenless run must exit 1 when a PUBLIC repo fails to clone"
else
  got=$?
  [ "$got" -eq 1 ] || fail "E: expected exit 1 from clone-manifest.sh, got $got"
fi
echo "$OUT" | grep -q "skipping 'priv-x'" || fail "E: priv-x should have been skipped before the abort"
echo "$OUT" | grep -q "PUBLIC repo 'pub-b'" || fail "E: missing public-failure error for pub-b"
echo "ok E: tokenless + public hard-fail → exit 1 (no fail-open on public repos)"
# --- D. no token + EVERY entry marked private → exit 0 + warning ----------
# Swap in a manifest whose entries are all private: a tokenless run clones
# nothing, yet must still exit 0 and print the empty-palette warning.
write_manifest <<'JSON'
{
"version": 1, "plugins": [], "org_templates": [],
"workspace_templates": [
{"name": "priv-a", "repo": "molecule-ai/priv-a", "ref": "main", "private": true},
{"name": "priv-b", "repo": "molecule-ai/priv-b", "ref": "main", "private": true}
]
}
JSON
reset
run -u MOLECULE_GITEA_TOKEN PRIVATE_REPOS="priv-a priv-b" \
  || fail "D: all-private tokenless run must exit 0 (got $?)"
case "$OUT" in
  *"EMPTY template palette"*) ;;
  *) fail "D: missing empty-palette warning" ;;
esac
echo "ok D: tokenless + all-private → exit 0 with empty-palette warning"

# Every case above aborts through fail on error, so reaching this line means
# the whole suite passed.
echo "PASS: clone-manifest.sh tolerant tokenless bootstrap"