Merge branch 'main' into fix/178-canvas-shared-auth-headers
Some checks failed
CodeQL / Analyze (${{ matrix.language }}) (go) (pull_request) Successful in 8s
CodeQL / Analyze (${{ matrix.language }}) (javascript-typescript) (pull_request) Successful in 8s
CodeQL / Analyze (${{ matrix.language }}) (python) (pull_request) Successful in 8s
Block internal-flavored paths / Block forbidden paths (pull_request) Successful in 16s
pr-guards / disable-auto-merge-on-push (pull_request) Failing after 8s
CI / Detect changes (pull_request) Successful in 19s
E2E API Smoke Test / detect-changes (pull_request) Successful in 17s
E2E Staging Canvas (Playwright) / detect-changes (pull_request) Successful in 17s
Handlers Postgres Integration / detect-changes (pull_request) Successful in 16s
Harness Replays / detect-changes (pull_request) Successful in 16s
Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 15s
Runtime PR-Built Compatibility / detect-changes (pull_request) Successful in 16s
CI / Shellcheck (E2E scripts) (pull_request) Successful in 7s
CI / Platform (Go) (pull_request) Successful in 9s
CI / Python Lint & Test (pull_request) Successful in 8s
E2E API Smoke Test / E2E API Smoke Test (pull_request) Successful in 11s
Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Successful in 12s
Runtime PR-Built Compatibility / PR-built wheel + import smoke (pull_request) Successful in 9s
Harness Replays / Harness Replays (pull_request) Failing after 1m20s
E2E Staging Canvas (Playwright) / Canvas tabs E2E (pull_request) Successful in 6m26s
CI / Canvas (Next.js) (pull_request) Failing after 8m35s
CI / Canvas Deploy Reminder (pull_request) Has been skipped

This commit is contained in:
claude-ceo-assistant 2026-05-08 00:20:46 +00:00
commit 9c82b2a61c
29 changed files with 4853 additions and 483 deletions


@@ -154,30 +154,71 @@ jobs:
exit 0
fi
# Upstream is publish-workspace-server-image. Check E2E state.
# The jq filter must defend against TWO empty cases that gh
# CLI emits indistinguishably:
# 1. gh exits non-zero (network blip, auth issue) → handled
# by the `|| echo "none/none"` fallback below.
# 2. gh exits zero but returns `[]` (no E2E run on this
# main SHA — the common case for canvas-only / cmd-only
# / sweep-only changes whose paths don't trigger E2E).
# Without `(.[0] // {})`, jq sees `null` and emits
# "null/none" — which the case statement below has no
# branch for, so it falls into *) → exit 1.
# Surfaced 2026-04-30 the first time the App-token chain
# (#2389) actually fired auto-promote-on-e2e from a publish
# upstream — every prior run was E2E-upstream which
# short-circuits before this gate.
RESULT=$(gh run list \
--repo "$REPO" \
--workflow e2e-staging-saas.yml \
--branch main \
--commit "$SHA" \
--limit 1 \
--json status,conclusion \
--jq '(.[0] // {}) | "\(.status // "none")/\(.conclusion // "none")"' \
2>/dev/null || echo "none/none")
# Upstream is publish-workspace-server-image. Check E2E state
# for the same SHA via Gitea's commit-status API.
#
# GitHub-era this was `gh run list --workflow=X --commit=SHA
# --json status,conclusion` returning either `[]` (no run on
# this SHA) or `[{status, conclusion}]` (the run's state).
# Gitea has NO workflow-runs API at all — `/api/v1/repos/.../
# actions/runs` returns 404 (verified 2026-05-07, issue #75).
# However Gitea Actions DOES emit a commit status per workflow
# job, with `context = "<Workflow Name> / <Job Name> (<event>)"`,
# which is exactly what we need: each E2E run leg becomes one
# status row on the SHA, and the aggregate state encodes the
# run's outcome.
#
# Mapping:
#   0 matched contexts          → "none/none"         (E2E paths-filtered out — same semantic as before)
#   any context = pending       → "in_progress/none"  (defer)
#   any context = error|failure → "completed/failure" (abort)
#   all contexts = success      → "completed/success" (proceed)
#
# The "completed/cancelled" and "completed/timed_out" buckets
# don't have direct Gitea analogs (Gitea statuses are
# success / failure / error / pending / warning). Per-SHA
# concurrency cancellation surfaces as `error` on Gitea, which
# we map to "completed/failure" rather than "completed/cancelled"
# — losing the soft-defer semantic of the cancelled bucket on
# this fleet. Tradeoff: the staleness alarm (auto-promote-stale-
# alarm.yml) still catches a stuck :latest within 4h, and a
# legitimate cancel is rare enough that aborting + manual
# re-dispatch is acceptable. If we measure cancel frequency
# > 1/week, revisit by reading the run-step-summary text via
# a follow-up script.
#
# Network or auth blips collapse to "none/none" via the curl
# `|| echo "[]"` fallback, matching the pre-Gitea behaviour where
# an empty list also degenerated to none/none.
GITEA_API_URL="${GITHUB_SERVER_URL:-https://git.moleculesai.app}/api/v1"
STATUSES_JSON=$(curl --fail-with-body -sS \
-H "Authorization: token ${GH_TOKEN}" \
-H "Accept: application/json" \
"${GITEA_API_URL}/repos/${REPO}/commits/${SHA}/statuses?limit=100" \
2>/dev/null || echo "[]")
RESULT=$(printf '%s' "$STATUSES_JSON" | jq -r '
# Filter to E2E Staging SaaS (full lifecycle) statuses.
# Match by leading workflow-name prefix so the "<job>
# (<event>)" tail is irrelevant. Gitea emits the workflow
# name verbatim from the YAML `name:` field.
[.[] | select(.context | startswith("E2E Staging SaaS (full lifecycle) /"))] as $rows
| if ($rows | length) == 0 then
"none/none"
elif any($rows[]; .status == "pending") then
"in_progress/none"
elif any($rows[]; .status == "failure" or .status == "error") then
"completed/failure"
elif all($rows[]; .status == "success") then
"completed/success"
else
# Mixed / unknown — fall through to *) bucket below.
"completed/" + ($rows[0].status // "unknown")
end
' 2>/dev/null || echo "none/none")
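# Worked example of the filter above (hypothetical status rows,
# shown only to illustrate the mapping — real context names come
# from the E2E workflow's YAML):
#   [{"context":"E2E Staging SaaS (full lifecycle) / run (push)","status":"success"},
#    {"context":"E2E Staging SaaS (full lifecycle) / verify (push)","status":"pending"}]
# → "in_progress/none": one pending leg defers the whole promote,
#   even though the other leg is already green.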
echo "E2E Staging SaaS for ${SHA:0:7}: $RESULT"
@@ -199,16 +240,13 @@
exit 1
;;
completed/cancelled)
# cancelled ≠ failure. Per-SHA concurrency cancels older E2E
# runs when a newer push lands (memory:
# feedback_concurrency_group_per_sha) — the newer SHA will
# have its own E2E + promote chain. Treat the same as
# in_progress: defer without aborting, let the next E2E run
# promote when it lands.
#
# Caught 2026-05-05 02:03 on sha 31f9a5e — auto-promote
# blocked the whole chain because this case fell through to
# exit 1 instead of clean defer.
# GitHub-era only: cancelled ≠ failure. Gitea statuses
# don't expose a "cancelled" state — a per-SHA concurrency
# cancellation surfaces as `failure` or `error` on Gitea
# and is now handled by the failure branch above. This
# arm is kept for backwards compatibility / dual-host
# operation (if we ever add a non-Gitea fallback) but
# under the post-#75 flow it's unreachable.
echo "proceed=false" >> "$GITHUB_OUTPUT"
{
echo "## ⏭ Auto-promote deferred — E2E Staging SaaS was cancelled"


@@ -2,61 +2,148 @@ name: Auto-promote staging → main
# Fires after any of the staging-branch quality gates complete. When ALL
# required gates are green on the same staging SHA, opens (or re-uses)
# a PR `staging → main` and enables auto-merge so the merge queue lands
# it. Closes the gap that historically let features sit on staging for
# weeks waiting for a bulk promotion PR (see molecule-core#1496 for the
# 1172-commit example).
# a PR `staging → main` and schedules Gitea auto-merge so the PR lands
# automatically once approval + status checks are satisfied.
#
# 2026-04-28 rewrite (PR #142): the previous version did a direct
# `git merge --ff-only origin staging && git push origin main`. That
# breaks against main's branch-protection ruleset, which requires
# status checks "set by the expected GitHub apps" — direct pushes
# can't satisfy that condition (only PR merges through the queue can).
# The workflow was failing every tick with:
# remote: error: GH006: Protected branch update failed for refs/heads/main.
# remote: - Required status checks ... were not set by the expected GitHub apps.
# Fix: mirror the PR-based pattern from auto-sync-main-to-staging.yml
# (the reverse-direction sync, fixed in #2234 for the same reason).
# Both directions now use the same merge-queue path that humans use,
# no special-case bypass.
# ============================================================
# What this workflow does
# ============================================================
#
# Safety model:
# - Runs ONLY on workflow_run events for the staging branch.
# - Requires EVERY named gate workflow to have the same head_sha and
# all be `conclusion == success`. If any of them is red, skipped,
# cancelled, or pending, we abort (stay on the current main).
# - The PR base=main head=staging path lets GitHub itself enforce
# branch protection. If main has diverged from staging or required
# checks aren't satisfied, the merge queue declines the PR — no
# need for a manual ff-only ancestry check here.
# - Loop safety: the auto-sync-main-to-staging workflow fires when
# main lands the auto-promote PR, but its merge into staging is by
# GITHUB_TOKEN which doesn't trigger downstream workflow_run events
# (GitHub Actions safety). So this workflow doesn't re-fire from
# its own promote landing.
# 1. On a workflow_run completion event for one of the staging gate
# workflows (CI, E2E Staging Canvas, E2E API Smoke, CodeQL),
# checks if the combined status on the staging head SHA is green.
# 2. If green, opens (or re-uses) a PR `head: staging → base: main`
# via Gitea REST `POST /api/v1/repos/.../pulls`.
# 3. Schedules auto-merge via `POST /api/v1/repos/.../pulls/{index}/merge`
# with `merge_when_checks_succeed: true`. Gitea waits for the
# approval requirement on `main` (`required_approvals: 1`) and
# the status-check gates, then merges.
# 4. The merge commit lands on `main` and fires
# `publish-workspace-server-image.yml` naturally via its
# `on: push: branches: [main]` trigger — no explicit dispatch
# needed (see "Why no workflow_dispatch tail" below).
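#
# Condensed REST shape of steps 2–3 (a sketch — auth headers,
# status capture, and idempotency are handled properly in the
# job below; $GITEA_API / $REPO / $PR_NUM are illustrative):
#
#   curl -X POST -H "Authorization: token $GITEA_TOKEN" \
#     -H "Content-Type: application/json" \
#     -d '{"title":"staging → main","base":"main","head":"staging"}' \
#     "$GITEA_API/repos/$REPO/pulls"
#   curl -X POST -H "Authorization: token $GITEA_TOKEN" \
#     -H "Content-Type: application/json" \
#     -d '{"Do":"merge","merge_when_checks_succeed":true}' \
#     "$GITEA_API/repos/$REPO/pulls/$PR_NUM/merge"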
#
# Toggle via repo variable AUTO_PROMOTE_ENABLED (true/unset). When
# unset, the workflow logs what it would have done but doesn't open
# the PR — useful for dry-running the gate logic without surfacing
# a noisy PR while staging CI is still flaky.
# `auto-sync-main-to-staging.yml` is the reverse-direction
# counterpart (main → staging, fast-forward push). Together they
# keep the staging-superset-of-main invariant tight.
#
# **One-time repo setting (load-bearing):** this workflow opens the
# staging→main PR via `gh pr create` using the default GITHUB_TOKEN.
# Since GitHub's 2022 default change, that token cannot create or
# approve PRs unless the repo opts in. The toggle is at:
# ============================================================
# Why Gitea REST (and not `gh pr create`)
# ============================================================
#
# Settings → Actions → General → Workflow permissions
# → ✅ Allow GitHub Actions to create and approve pull requests
# Pre-2026-05-06 this workflow used `gh pr create`, `gh pr merge --auto`,
# `gh run list`, and `gh workflow run` against GitHub. After the
# GitHub→Gitea cutover those calls fail because:
#
# Without it, every workflow_run fails with:
# - `gh pr create / merge / view / list` route to GitHub GraphQL
# (`/api/graphql`). Gitea does not expose a GraphQL endpoint;
# every call returns `HTTP 405 Method Not Allowed` — same root
# cause as #65 (auto-sync) which PR #66 fixed by dropping `gh`
# entirely.
# - `gh run list --workflow=...` is GitHub-shaped; Gitea has the
# simpler `GET /repos/.../commits/{ref}/status` combined-status
# endpoint instead.
# - `gh workflow run X.yml` calls `POST /repos/.../actions/workflows/{id}/dispatches`,
# which does NOT exist on Gitea 1.22.6 (verified via swagger.v1.json).
#
# pull request create failed: GraphQL: GitHub Actions is not
# permitted to create or approve pull requests (createPullRequest)
# So this workflow uses direct `curl` calls to Gitea REST. No `gh`
# CLI dependency, no GraphQL, no missing-endpoint footgun.
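#
# Two-line probe that reproduces the failure shape (a sketch;
# run against this fleet's Gitea host, result per the 405
# behaviour described above):
#
#   curl -s -o /dev/null -w '%{http_code}\n' -X POST \
#     https://git.moleculesai.app/api/graphql      # → 405, no GraphQL
#   curl -s -o /dev/null -w '%{http_code}\n' \
#     https://git.moleculesai.app/api/v1/version   # → 200, REST is live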
#
# Observed 2026-04-29 01:43 UTC blocking promotion of fcd87b9 (PRs
# #2248 + #2249); manually bridged via PR #2252. Re-check this
# setting if auto-promote starts failing with createPullRequest
# errors after a repo or org admin change.
# ============================================================
# Why no workflow_dispatch tail (was load-bearing on GitHub, dead on Gitea)
# ============================================================
#
# The GitHub-era version had a 60-line polling step that waited for
# the promote PR to merge, then explicitly dispatched
# `publish-workspace-server-image.yml` on `--ref main`. That step
# existed because GitHub's GITHUB_TOKEN-initiated merges suppress
# downstream `on: push` workflows (the documented "no recursion" rule
# — https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow).
# The explicit dispatch was the workaround.
#
# Gitea Actions does NOT have this no-recursion rule. PR #66's auto-
# sync merge to main fired `auto-promote-staging` on the next push
# trigger naturally. So the cascade fires on the natural push event;
# the explicit dispatch is dead code. (And even if we wanted to
# preserve it, Gitea has no `workflow_dispatch` REST endpoint.)
#
# Removed in this rewrite. If we ever observe the cascade misfire,
# operator can push an empty commit to `main` to wake it.
#
# ============================================================
# Why open a PR (and not direct push)
# ============================================================
#
# `main` branch protection has `enable_push: false` with NO
# `push_whitelist_usernames`. Direct push is impossible for any
# persona, including admins. PR-mediated merge is the only path,
# which is intentional: prod state mutations (and staging→main IS a
# prod mutation, since the next deploy fans out to tenants) require
# Hongming's approval per `feedback_prod_apply_needs_hongming_chat_go`.
#
# The auto-merge schedule preserves this gate: `merge_when_checks_succeed`
# does NOT bypass `required_approvals: 1`. Gitea waits for BOTH
# approval AND green checks before merging. Hongming reviews via the
# canvas/chat-handle of the PR notification, approves, and Gitea
# auto-merges within seconds.
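#
# To inspect an armed PR by hand (sketch; the PR index 42 is
# hypothetical):
#
#   curl -s -H "Authorization: token $GITEA_TOKEN" \
#     "https://git.moleculesai.app/api/v1/repos/$REPO/pulls/42" \
#     | jq '{state, mergeable}'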
#
# ============================================================
# Identity + token (anti-bot-ring per saved-memory
# `feedback_per_agent_gitea_identity_default`)
# ============================================================
#
# This workflow uses `secrets.AUTO_SYNC_TOKEN` — a personal access
# token issued to the `devops-engineer` Gitea persona. NOT the
# founder PAT. The bot-ring fingerprint that triggered the GitHub
# org suspension on 2026-05-06 was characterised by founder PAT
# acting as CI at machine speed.
#
# Token scope: `push: true` (read+write) on this repo. The persona
# can: open PRs, comment on PRs, schedule auto-merge. The persona
# CANNOT bypass main's branch protection (`required_approvals: 1`
# still applies — only Hongming's review unblocks merge).
#
# Authorship: the PR is opened by `devops-engineer`; the merge
# commit credits Hongming-as-approver and `devops-engineer` as
# the merger.
#
# ============================================================
# Failure modes & operational notes
# ============================================================
#
# A — staging gates not all green at trigger time:
# - The combined-status check returns `state: pending|failure`.
# Workflow exits 0 with a step-summary "not all green; staying
# on current main". Re-fires on the next gate completion.
#
# B — Gitea PR-create returns non-201 (e.g. 422 already-exists):
# - Idempotent: the workflow first GETs the existing open
# staging→main PR. If found, reuse it; if not, POST a new one.
# 422 should never surface; if it does (race), step summary
# captures the body and the next workflow_run picks up.
#
# C — `merge_when_checks_succeed` schedule fails:
# - 422 with "Pull request is not mergeable" if there are
# conflicts or stale base. Step summary surfaces it; operator
# (or `auto-sync-main-to-staging`) needs to bring staging up
# to date with main first. Workflow exits 1 to surface red.
#
# D — `AUTO_SYNC_TOKEN` rotated / wrong scope:
# - 401/403 on first REST call. Step summary surfaces it.
# Re-issue the token from `~/.molecule-ai/personas/` on the
# operator host and update the repo Actions secret.
#
# ============================================================
# Loop safety
# ============================================================
#
# When the promote PR merges to main, `auto-sync-main-to-staging.yml`
# fires (on:push:main) and pushes the merge commit back to staging.
# That push to staging is by `devops-engineer`, NOT this workflow's
# token, and triggers the staging gate workflows. When they all
# complete, we end up back here — but the tree-diff guard catches
# it: staging tree == main tree (the merge commit changes nothing),
# so we skip and the cycle terminates.
on:
workflow_run:
@@ -74,26 +161,16 @@
default: "false"
permissions:
contents: write
contents: read
pull-requests: write
# actions: write is needed by the post-merge dispatch tail step
# (#2358 / #2357) — `gh workflow run publish-workspace-server-image.yml`
# POSTs to /actions/workflows/.../dispatches which requires this scope.
# Without it the call 403s and the publish/canary/redeploy chain still
# doesn't run on staging→main promotions, undoing #2358.
actions: write
# Serialize auto-promote runs. Multiple staging gate completions can land
# in quick succession (CI + E2E + CodeQL all finish within seconds of
# each other on a green PR) — without this, two parallel runs both:
# 1. Open / re-use the same promote PR.
# 2. Both call `gh pr merge --auto` (idempotent — fine).
# 3. Both poll for the same mergedAt and both `gh workflow run` publish
# → 2× redundant publish builds racing for the same `:staging-latest`
# retag, and 2× canary-verify chains.
# cancel-in-progress: false because we don't want a brand-new run to kill
# a polling-tail that's about to dispatch — the polling tail's 30 min cap
# is the right backstop, not workflow-level cancel.
# 1. Would race the GET-or-POST PR step.
# 2. Would both call merge-schedule (idempotent — fine on Gitea).
# cancel-in-progress: false because the second run on a fresh staging
# tip should NOT kill the first which has already opened the PR.
concurrency:
group: auto-promote-staging
cancel-in-progress: false
@@ -111,126 +188,112 @@ jobs:
all_green: ${{ steps.gates.outputs.all_green }}
head_sha: ${{ steps.gates.outputs.head_sha }}
steps:
# Skip empty-tree promotes (the perpetual auto-promote↔auto-sync cycle
# observed 2026-05-03). Sequence: auto-promote merges via the staging
# merge-queue's MERGE strategy, creating a merge commit on main that
# staging doesn't have. auto-sync then merges main back into staging
# via another merge commit (the queue's MERGE strategy applies on
# the staging side too, even when the workflow's local FF would
# have sufficed). Now staging has a new merge-commit SHA whose
# tree == main's tree — but auto-promote sees "staging ahead of
# main by 1" and opens YET another empty promote PR. Each round
# costs ~30-40 min wallclock, ~2 manual approvals, and burns a
# full CodeQL Go run (~15 min). Without this guard the cycle
# repeats indefinitely.
#
# Long-term fix is to switch the merge_queue ruleset's
# `merge_method` away from MERGE so FF-able PRs land cleanly,
# but that's a broader change affecting every staging PR's
# commit shape. This guard is the one-line surgical fix that
# breaks the cycle without touching merge-queue config.
#
# Fail-open: if `git diff` errors for any reason, fall through
# to the gate check (preserve existing behavior). Only skip
# when the diff is DEFINITIVELY empty.
# Skip empty-tree promotes (the perpetual auto-promote↔auto-sync
# cycle observed pre-cutover on GitHub). On Gitea the cycle shape
# is different (auto-sync uses fast-forward, no merge commit),
# but the tree-diff guard is cheap insurance and protects against
# any future merge-style regression.
- name: Checkout for tree-diff check
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
ref: staging
- name: Skip if staging tree == main tree (perpetual-cycle break)
- name: Skip if staging tree == main tree (cycle-break safety)
id: tree-diff
env:
HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
run: |
set -eu
git fetch origin main --depth=50 || { echo "::warning::git fetch main failed — proceeding (fail-open)"; exit 0; }
# Compare staging tip's tree against main's tree. `git diff
# --quiet` exits 0 if no differences, 1 if there are.
if git diff --quiet origin/main "$HEAD_SHA" -- 2>/dev/null; then
{
echo "## Skipped — no code to promote"
echo "## Skipped — no code to promote"
echo
echo "staging tip (\`${HEAD_SHA:0:8}\`) and \`main\` have identical trees."
echo "This is the auto-promote↔auto-sync merge-commit cycle: staging has a"
echo "new SHA (a sync-back merge commit) but the underlying file tree is"
echo "already on main, so there's no real code to ship."
echo
echo "Skipping to avoid opening an empty promote PR. Cycle terminates here."
echo "Skipping to avoid opening an empty promote PR."
} >> "$GITHUB_STEP_SUMMARY"
echo "::notice::auto-promote: staging tree == main tree — no code to promote, skipping"
echo "skip=true" >> "$GITHUB_OUTPUT"
else
echo "skip=false" >> "$GITHUB_OUTPUT"
fi
- name: Check all required gates on this SHA
- name: Check combined status on staging head
if: steps.tree-diff.outputs.skip != 'true'
id: gates
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
REPO: ${{ github.repository }}
GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }}
run: |
set -euo pipefail
# Required gate workflow files. Use file paths (relative to
# .github/workflows/) rather than display names because:
# Gitea-native combined-status endpoint aggregates every
# check context attached to a SHA. This is structurally
# cleaner than the GitHub-era per-workflow `gh run list`
# loop because:
#
# 1. `gh run list --workflow=<name>` is ambiguous when two
# workflows have the same `name:` — observed 2026-04-28
# with "CodeQL" matching both `codeql.yml` (explicit) and
# GitHub's UI-configured Code-quality default setup
# (internal "codeql"). gh CLI returns "could not resolve
# to a unique workflow" → empty result → gate evaluated
# as missing/none → auto-promote dead-locked despite all
# checks actually passing.
# 1. There's no risk of "workflow name collision" (the
# GitHub-era code had to switch from `--workflow=NAME`
# to `--workflow=FILE.YML` to disambiguate "CodeQL"
# between the explicit workflow and GitHub's UI-
# configured default setup; Gitea has no such
# duplicate-name surface).
# 2. Gitea's combined state already encodes the AND
# across all contexts: success only if EVERY context
# is success. Pending or failure on any context
# produces non-success state.
#
# 2. File paths are the unique identifier for workflows;
# `name:` is just a display string and can collide.
#
# When adding/removing a gate, update this list AND the
# branch-protection required-checks list (which uses check-run
# display names, not workflow names; the two are decoupled and
# should be kept in sync manually).
GATES=(
"ci.yml"
"e2e-staging-canvas.yml"
"e2e-api.yml"
"codeql.yml"
)
# See https://docs.gitea.com/api/1.22 for the schema —
# `state` is one of: success, pending, failure, error.
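# Response shape (illustrative — field names per that schema):
#   { "state": "pending", "total_count": 4,
#     "statuses": [ { "context": "CI / Platform (Go) (push)",
#                     "status": "success" }, ... ] }
# One pending or failing context pins the combined `state` to
# non-success, which is exactly the AND we want.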
echo "head_sha=${HEAD_SHA}" >> "$GITHUB_OUTPUT"
echo "Checking gates on SHA ${HEAD_SHA}"
echo "Checking combined status on SHA ${HEAD_SHA}"
ALL_GREEN=true
for gate in "${GATES[@]}"; do
# Query the most recent run of this workflow on this SHA.
# event=push to avoid picking up PR runs. branch=staging to
# guard against someone dispatching the gate on a non-staging
# branch at the same SHA.
RESULT=$(gh run list \
--repo "$REPO" \
--workflow "$gate" \
--branch staging \
--event push \
--commit "$HEAD_SHA" \
--limit 1 \
--json status,conclusion \
--jq '.[0] | "\(.status)/\(.conclusion // "none")"' \
2>/dev/null || echo "missing/none")
# `set +e` for the http-code capture pattern; restore
# immediately. Pattern hardened per `feedback_curl_status_capture_pollution`.
BODY_FILE=$(mktemp)
set +e
STATUS=$(curl -sS \
-H "Authorization: token ${GITEA_TOKEN}" \
-H "Accept: application/json" \
-o "${BODY_FILE}" \
-w "%{http_code}" \
"${GITEA_HOST}/api/v1/repos/${REPO}/commits/${HEAD_SHA}/status")
CURL_RC=$?
set -e
echo " $gate → $RESULT"
# Only completed/success counts. completed/failure or
# in_progress/anything or no record at all = abort.
if [ "$RESULT" != "completed/success" ]; then
ALL_GREEN=false
if [ "${CURL_RC}" -ne 0 ] || [ "${STATUS}" != "200" ]; then
echo "::error::combined-status fetch failed: curl=${CURL_RC} http=${STATUS}"
cat "${BODY_FILE}" | head -c 500 || true
rm -f "${BODY_FILE}"
echo "all_green=false" >> "$GITHUB_OUTPUT"
exit 0
fi
done
echo "all_green=${ALL_GREEN}" >> "$GITHUB_OUTPUT"
if [ "$ALL_GREEN" != "true" ]; then
echo "::notice::auto-promote: not all gates are green on ${HEAD_SHA} — staying on current main"
STATE=$(jq -r '.state // "missing"' < "${BODY_FILE}")
TOTAL=$(jq -r '.total_count // 0' < "${BODY_FILE}")
rm -f "${BODY_FILE}"
echo "Combined status: state=${STATE} total_count=${TOTAL}"
if [ "${STATE}" = "success" ] && [ "${TOTAL}" -gt 0 ]; then
echo "all_green=true" >> "$GITHUB_OUTPUT"
echo "::notice::All gates green on ${HEAD_SHA} (${TOTAL} contexts)"
else
echo "all_green=false" >> "$GITHUB_OUTPUT"
{
echo "## Not promoting — combined status not green"
echo
echo "- SHA: \`${HEAD_SHA:0:8}\`"
echo "- Combined state: \`${STATE}\`"
echo "- Context count: ${TOTAL}"
echo
echo "Will re-fire on the next gate completion. Investigate any red gate via the Actions UI."
} >> "$GITHUB_STEP_SUMMARY"
echo "::notice::auto-promote: combined status is ${STATE} on ${HEAD_SHA} — staying on current main"
fi
promote:
@@ -247,188 +310,183 @@ jobs:
# Repo variable AUTO_PROMOTE_ENABLED=true flips this on. While
# it's unset, the workflow dry-runs (logs what it would have
# done) but doesn't open the promote PR. Set the variable in
# Settings → Secrets and variables → Actions → Variables.
# Settings → Actions → Variables.
if [ "${AUTO_PROMOTE_ENABLED:-}" != "true" ] && [ "${FORCE_INPUT:-false}" != "true" ]; then
{
echo "## Auto-promote disabled"
echo "## Auto-promote disabled"
echo
echo "Repo variable \`AUTO_PROMOTE_ENABLED\` is not set to \`true\`."
echo "All gates are green on staging; would have opened a promote PR to \`main\`."
echo
echo "To enable: Settings → Secrets and variables → Actions → Variables → \`AUTO_PROMOTE_ENABLED=true\`."
echo "To enable: Settings → Actions → Variables → \`AUTO_PROMOTE_ENABLED=true\`."
echo "To test once manually: workflow_dispatch with \`force=true\`."
} >> "$GITHUB_STEP_SUMMARY"
echo "::notice::auto-promote disabled — dry run only"
exit 0
fi
# Mint the App token BEFORE the promote-PR step so the auto-merge
# call can use it. GITHUB_TOKEN-initiated merges suppress the
# downstream `push` event on main, breaking the
# publish-workspace-server-image → canary-verify → redeploy-tenants
# chain (issue #2357). Using the App token here means the
# merge-queue-landed merge IS able to fire the cascade naturally;
# the polling tail below stays as defense-in-depth.
- name: Mint App token for promote-PR + downstream dispatch
if: ${{ vars.AUTO_PROMOTE_ENABLED == 'true' || github.event.inputs.force == 'true' }}
id: app-token
uses: actions/create-github-app-token@1b10c78c7865c340bc4f6099eb2f838309f1e8c3 # v3.1.1
with:
app-id: ${{ secrets.MOLECULE_AI_APP_ID }}
private-key: ${{ secrets.MOLECULE_AI_APP_PRIVATE_KEY }}
- name: Open (or reuse) staging → main promote PR + enable auto-merge
- name: Open or reuse promote PR + schedule auto-merge
if: ${{ vars.AUTO_PROMOTE_ENABLED == 'true' || github.event.inputs.force == 'true' }}
env:
GH_TOKEN: ${{ steps.app-token.outputs.token }}
GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
REPO: ${{ github.repository }}
TARGET_SHA: ${{ needs.check-all-gates-green.outputs.head_sha }}
GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }}
run: |
set -euo pipefail
# Look for an existing open promote PR (idempotent on re-run
# of the workflow). The PR's head IS the staging branch — the
# whole point is "advance main to staging's tip", so we don't
# need a per-SHA branch like auto-sync-main-to-staging uses.
PR_NUM=$(gh pr list --repo "$REPO" \
--base main --head staging --state open \
--json number --jq '.[0].number // ""')
API="${GITEA_HOST}/api/v1/repos/${REPO}"
AUTH=(-H "Authorization: token ${GITEA_TOKEN}" -H "Accept: application/json")
if [ -z "$PR_NUM" ]; then
# http_get BODY_FILE URL
# Writes the response body to BODY_FILE and echoes the HTTP
# status code on stdout. Curl status capture pattern per
# `feedback_curl_status_capture_pollution`: http_code comes out
# via -w, the body goes to a tempfile, and the set +e/-e
# bracket protects pipeline state.
http_get() {
local body_file="$1"; shift
local url="$1"; shift
set +e
local code
code=$(curl -sS "${AUTH[@]}" -o "${body_file}" -w "%{http_code}" "${url}")
local rc=$?
set -e
if [ "${rc}" -ne 0 ]; then
echo "::error::curl GET failed (rc=${rc}) on ${url}"
return 99
fi
echo "${code}"
}
http_post_json() {
local body_file="$1"; shift
local data="$1"; shift
local url="$1"; shift
set +e
local code
code=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \
-X POST -d "${data}" -o "${body_file}" -w "%{http_code}" "${url}")
local rc=$?
set -e
if [ "${rc}" -ne 0 ]; then
echo "::error::curl POST failed (rc=${rc}) on ${url}"
return 99
fi
echo "${code}"
}
# Step 1: look for an existing open staging→main promote PR
# (idempotent on workflow re-run). Gitea doesn't have a
# head/base filter on the list endpoint that's as ergonomic
# as gh's, but the dedicated `/pulls/{base}/{head}` lookup
# works.
BODY=$(mktemp)
STATUS=$(http_get "${BODY}" "${API}/pulls/main/staging") || true
PR_NUM=""
if [ "${STATUS}" = "200" ]; then
STATE=$(jq -r '.state // "missing"' < "${BODY}")
if [ "${STATE}" = "open" ]; then
PR_NUM=$(jq -r '.number // ""' < "${BODY}")
echo "::notice::Re-using existing open promote PR #${PR_NUM}"
fi
fi
rm -f "${BODY}"
# Step 2: if no open PR, create one.
if [ -z "${PR_NUM}" ]; then
TITLE="staging → main: auto-promote ${TARGET_SHA:0:7}"
BODY_FILE=$(mktemp)
cat > "$BODY_FILE" <<EOFBODY
Automated promotion of \`staging\` (\`${TARGET_SHA:0:8}\`) to \`main\`. All required staging gates green at this SHA: CI, E2E Staging Canvas, E2E API Smoke, CodeQL.
BODY_TEXT=$(cat <<EOFBODY
Automated promotion of \`staging\` (\`${TARGET_SHA:0:8}\`) to \`main\`. All required staging gates are green at this SHA (combined status reported success).
This PR is auto-generated by \`.github/workflows/auto-promote-staging.yml\` whenever every required gate completes green on the same staging SHA. It exists because main's branch protection requires status checks "set by the expected GitHub apps" — direct \`git push\` from a workflow can't satisfy that, only PR merges through the queue can.
This PR is auto-generated by \`.github/workflows/auto-promote-staging.yml\` whenever every required gate completes green on the same staging SHA.
Merge queue lands this; no human action needed unless gates fail. Reverse-direction sync (the merge commit on main → staging) is handled by \`auto-sync-main-to-staging.yml\`.
**Approval gate:** \`main\` branch protection requires 1 approval before this can land. Once approved, Gitea will auto-merge (the workflow scheduled \`merge_when_checks_succeed: true\` immediately after open).
The reverse-direction sync (the merge commit on \`main\` → \`staging\`) is handled automatically by \`auto-sync-main-to-staging.yml\` after this PR lands.
---
- Source: staging at \`${TARGET_SHA}\`
- Opened by: \`devops-engineer\` persona (anti-bot-ring; never founder PAT)
- Refs: #65, #73, #195
EOFBODY
PR_URL=$(gh pr create --repo "$REPO" \
--base main --head staging \
--title "$TITLE" \
--body-file "$BODY_FILE")
PR_NUM=$(echo "$PR_URL" | grep -oE '[0-9]+$' | tail -1)
rm -f "$BODY_FILE"
echo "::notice::Opened PR #${PR_NUM}"
)
REQ=$(jq -n \
--arg title "${TITLE}" \
--arg body "${BODY_TEXT}" \
--arg base "main" \
--arg head "staging" \
'{title:$title, body:$body, base:$base, head:$head}')
BODY=$(mktemp)
STATUS=$(http_post_json "${BODY}" "${REQ}" "${API}/pulls")
if [ "${STATUS}" = "201" ]; then
PR_NUM=$(jq -r '.number // ""' < "${BODY}")
echo "::notice::Opened promote PR #${PR_NUM}"
else
echo "::notice::Re-using existing promote PR #${PR_NUM}"
echo "::error::Failed to create promote PR: HTTP ${STATUS}"
jq -r '.message // .' < "${BODY}" | head -c 500
rm -f "${BODY}"
exit 1
fi
rm -f "${BODY}"
fi
# Enable auto-merge — the merge queue picks it up once
# required gates are green on the merge_group ref.
if ! gh pr merge "$PR_NUM" --repo "$REPO" --auto --merge 2>&1; then
echo "::warning::Failed to enable auto-merge on PR #${PR_NUM} — operator may need to merge manually."
fi
# Step 3: schedule auto-merge. merge_when_checks_succeed
# tells Gitea to wait for both:
# - all required status checks to pass
# - the required-approvals gate (1 approval on main)
# before merging. On approval+green, Gitea merges within
# seconds. On any check failing or approval being denied,
# the schedule stays armed but doesn't fire.
#
# Idempotent: re-arming on an already-armed PR is a no-op.
REQ=$(jq -n '{Do:"merge", merge_when_checks_succeed:true}')
BODY=$(mktemp)
STATUS=$(http_post_json "${BODY}" "${REQ}" "${API}/pulls/${PR_NUM}/merge")
# Gitea returns:
# - 200/204 on successful immediate merge (gates already green AND approved)
# - 405 "Please try again later" when scheduled successfully but waiting
# - 422 on "Pull request is not mergeable" (conflict, stale base, etc.)
#
# 405 here is benign — Gitea's way of saying "scheduled, not merging now".
# We treat 200/204/405 as success, anything else as failure.
case "${STATUS}" in
200|204)
MERGE_OUTCOME="merged-immediately"
echo "::notice::Promote PR #${PR_NUM} merged immediately (gates+approval already green)"
;;
405)
MERGE_OUTCOME="auto-merge-scheduled"
echo "::notice::Promote PR #${PR_NUM}: auto-merge scheduled (Gitea will land on approval+green)"
;;
422)
MERGE_OUTCOME="not-mergeable"
echo "::warning::Promote PR #${PR_NUM}: not mergeable (conflict, stale base, or already merging)."
jq -r '.message // .' < "${BODY}" | head -c 500
;;
*)
echo "::error::Unexpected status ${STATUS} on merge schedule"
jq -r '.message // .' < "${BODY}" | head -c 500
rm -f "${BODY}"
exit 1
;;
esac
rm -f "${BODY}"
{
echo "## ✅ Auto-promote PR opened"
echo "## Auto-promote PR opened"
echo
echo "- Source: staging at \`${TARGET_SHA:0:8}\`"
echo "- PR: #${PR_NUM}"
echo "- Outcome: \`${MERGE_OUTCOME}\`"
echo
echo "Merge queue lands the PR once required gates are green; no human action needed unless gates fail."
} >> "$GITHUB_STEP_SUMMARY"
# Hand the PR number to the next step so we can dispatch the
# tenant-redeploy chain after the merge queue lands the merge.
echo "promote_pr_num=${PR_NUM}" >> "$GITHUB_OUTPUT"
id: promote_pr
# The App token minted above (before the promote-PR step) is
# also used by the polling tail below. Defense-in-depth: with
# the merge-queue-landed merge now using the App token, the
# main-branch push event SHOULD fire the publish/canary/redeploy
# cascade naturally — but if for any reason it doesn't (e.g. an
# unrelated event-suppression edge case), the explicit dispatches
# below still wake the chain.
- name: Wait for promote merge, then dispatch publish + redeploy (#2357)
# Defense-in-depth dispatch. With the auto-merge call above
# now using the App token (this commit), the merge-queue-landed
# merge SHOULD fire publish-workspace-server-image naturally
# via on:push:[main] — App-token-initiated pushes DO trigger
# workflow_run cascades, unlike GITHUB_TOKEN-initiated ones
# (the documented "no recursion" rule —
# https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow).
#
# This explicit dispatch stays as belt-and-suspenders for any
# edge case where the natural cascade misfires. If it never
# observably fires after this token swap (i.e. the publish
# workflow has already started by the time we get here), the
# second dispatch is a harmless no-op (publish-workspace-server-image
# has its own concurrency group that dedupes).
#
# See PR for #2357: pre-fix the merge action was via
# GITHUB_TOKEN, suppressing the cascade and forcing this tail
# to be the SOLE chain trigger. With the auto-merge token swap
# the tail becomes redundant in the happy path; keep until
# we've observed >=10 successful natural cascades, then drop.
if: steps.promote_pr.outputs.promote_pr_num != ''
env:
GH_TOKEN: ${{ steps.app-token.outputs.token }}
REPO: ${{ github.repository }}
PR_NUM: ${{ steps.promote_pr.outputs.promote_pr_num }}
run: |
# Poll for merge — max 30 min (60 × 30s). The merge queue
# typically lands within 5-10 min when gates are green. Break
# early if the PR is closed without merging (operator action,
# gates flipped red post-approval, branch-protection rejection)
# so we don't tie up a runner for the full 30 min on a dead PR.
MERGED=""
STATE=""
for _ in $(seq 1 60); do
VIEW=$(gh pr view "$PR_NUM" --repo "$REPO" --json mergedAt,state)
MERGED=$(echo "$VIEW" | jq -r '.mergedAt // ""')
STATE=$(echo "$VIEW" | jq -r '.state // ""')
if [ -n "$MERGED" ] && [ "$MERGED" != "null" ]; then
echo "::notice::Promote PR #${PR_NUM} merged at ${MERGED}"
break
fi
if [ "$STATE" = "CLOSED" ]; then
echo "::warning::Promote PR #${PR_NUM} was closed without merging — skipping deploy dispatch."
exit 0
fi
sleep 30
done
if [ -z "$MERGED" ] || [ "$MERGED" = "null" ]; then
echo "::warning::Promote PR #${PR_NUM} didn't merge within 30min — skipping deploy dispatch (manually run \`gh workflow run publish-workspace-server-image.yml --ref main\` once it lands)."
exit 0
fi
# Dispatch publish on main using the App token. App-initiated
# workflow_dispatch DOES propagate the workflow_run cascade,
# unlike GITHUB_TOKEN-initiated dispatch.
# publish completes → canary-verify chains via workflow_run →
# redeploy-tenants-on-main chains via workflow_run + branches:[main].
if gh workflow run publish-workspace-server-image.yml \
--repo "$REPO" --ref main 2>&1; then
echo "::notice::Dispatched publish-workspace-server-image on ref=main as molecule-ai App — canary-verify and redeploy-tenants-on-main will chain via workflow_run."
{
echo "## 🚀 Tenant redeploy chain dispatched"
echo
echo "- publish-workspace-server-image (workflow_dispatch on \`main\`, actor: \`molecule-ai[bot]\`)"
echo "- canary-verify will chain on completion"
echo "- redeploy-tenants-on-main will chain on canary green"
} >> "$GITHUB_STEP_SUMMARY"
if [ "${MERGE_OUTCOME}" = "auto-merge-scheduled" ]; then
echo "Gitea will auto-merge once Hongming approves and all checks are green. No human action needed beyond approval."
elif [ "${MERGE_OUTCOME}" = "merged-immediately" ]; then
echo "Merged immediately. \`publish-workspace-server-image.yml\` will fire naturally on the resulting \`main\` push."
else
echo "::error::Failed to dispatch publish-workspace-server-image. Run manually: gh workflow run publish-workspace-server-image.yml --ref main"
fi
# ALSO dispatch auto-sync-main-to-staging.yml. Same root cause as
# publish above (issue #2357): the merge-queue-initiated push to
# main is by GITHUB_TOKEN → no `on: push` triggers fire downstream.
# Without this dispatch, every staging→main promote leaves staging
# one merge commit BEHIND main, which silently dead-locks the NEXT
# promote PR as `mergeStateStatus: BEHIND` because main's
# branch-protection has `strict: true`. Verified empirically on
# 2026-05-02 against PR #2442 (Phase 2 promote): only the explicit
# publish-workspace-server-image dispatch fired on the previous
# promote SHA 76c604fb, while auto-sync silently no-op'd, leaving
# staging behind for ~24h until manually bridged.
if gh workflow run auto-sync-main-to-staging.yml \
--repo "$REPO" --ref main 2>&1; then
echo "::notice::Dispatched auto-sync-main-to-staging on ref=main as molecule-ai App — staging will absorb the new main merge commit via PR + merge queue."
else
echo "::error::Failed to dispatch auto-sync-main-to-staging. Run manually: gh workflow run auto-sync-main-to-staging.yml --ref main"
echo "PR is not auto-merging. Operator may need to bring staging up to date with main, then re-trigger this workflow via workflow_dispatch."
fi
} >> "$GITHUB_STEP_SUMMARY"

.github/workflows/auto-sync-canary.yml (new file, +404 lines)

@@ -0,0 +1,404 @@
name: Auto-sync canary — AUTO_SYNC_TOKEN rotation drift
# Synthetic health check for the AUTO_SYNC_TOKEN secret consumed by
# auto-sync-main-to-staging.yml (PR #66) and publish-workspace-server-image.yml.
#
# ============================================================
# Why this workflow exists
# ============================================================
#
# PR #66 fixed auto-sync (replaced GitHub-era `gh pr create` — which
# 405s on Gitea's GraphQL endpoint — with a direct git push from the
# `devops-engineer` persona's `AUTO_SYNC_TOKEN`). Hostile self-review
# weakest spot #3 of that PR:
#
# "Token rotation silently breaks auto-sync. If AUTO_SYNC_TOKEN is
# rotated without updating the repo secret, every push to main
# fails red on the auto-sync push step. The workflow surfaces the
# failure mode in the step summary (failure mode B in the header),
# but there's no proactive monitoring."
#
# Detection latency under the status quo: rotation is only caught on
# the next push to `main`. During quiet periods (no main push for
# many hours) the staging-superset-of-main invariant silently breaks.
#
# This workflow closes the gap: every 6 hours, it fires the auth
# surface that auto-sync depends on and emits a red workflow status
# if AUTO_SYNC_TOKEN has drifted out of validity.
#
# ============================================================
# What this checks (Option B — read-only verify)
# ============================================================
#
# 1. `GET /api/v1/user` against Gitea with the token → validates the
# token authenticates AND resolves to `devops-engineer` (catches
# the case where the token was regenerated under a different
# persona by mistake).
# 2. `GET /api/v1/repos/molecule-ai/molecule-core` with the token →
# validates the token has `read:repository` scope on this repo
# (the v2 scope contract — see saved memory
# `reference_persona_token_v2_scope`).
# 3. `git push --dry-run` of the current staging SHA back to
# `refs/heads/staging` via `https://oauth2:<token>@<gitea>/...`
# → validates the EXACT HTTPS basic-auth path that
# `actions/checkout` + `git push origin staging` use inside
# auto-sync-main-to-staging.yml. NOP by construction (push the
# current tip to itself = "Everything up-to-date"); auth is
# checked at the smart-protocol handshake BEFORE the empty-diff
# computation, so bad token → exit 128 with "Authentication
# failed". `git ls-remote` is NOT used here because Gitea
# falls back to anonymous read on public repos and would
# silently green-light a rotated token.
#
# Each step exits non-zero with an actionable error message if it
# fails. The workflow status itself is the operator-facing surface.
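#
# Manual spot-check of probe 1 from any host holding the token
# (a sketch of the same call the step below makes):
#
#   curl -s -H "Authorization: token $AUTO_SYNC_TOKEN" \
#     https://git.moleculesai.app/api/v1/user | jq -r .login
#   # expect: devops-engineer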
#
# ============================================================
# What this does NOT check (intentional)
# ============================================================
#
# - **Branch-protection authz** (failure mode C in auto-sync header):
# would require an actual write to staging. Already monitored by
# `branch-protection-drift.yml` daily. Don't duplicate.
# - **Conflict resolution** (failure mode A): a real conflict is data-
# driven, not auth-driven; can't synthesise it without polluting
# staging. Already surfaces immediately on the next main push.
# - **Concurrency** (failure mode D): handled by workflow concurrency
# group on auto-sync, not a credential issue.
#
# ============================================================
# Why Option B (read-only) and not the alternatives
# ============================================================
#
# Considered + rejected (see issue #72 for full write-up):
#
# - **Option A — full auto-sync on schedule**: every run creates a
# no-op merge commit on staging when main hasn't advanced. 4 noise
# commits/day. And races the real `push:` trigger when main has
# advanced. Rejected.
#
# - **Option C — push to dedicated `auto-sync-canary` branch**: would
# exercise authz too, but adds branch noise on Gitea AND requires
# maintaining a second branch protection (or expanding staging's
# whitelist to a junk branch). Authz already covered by
# `branch-protection-drift.yml`. Rejected.
#
# Prior art for the chosen Option B shape:
# - Cloudflare's `/user/tokens/verify` endpoint (read-only auth
# probe explicitly designed for credential canaries).
# - AWS Secrets Manager rotation Lambda's `testSecret` step (auth
# probe before promoting AWSPENDING → AWSCURRENT).
# - HashiCorp Vault's `vault token lookup` for renewal canaries.
#
# ============================================================
# Operator runbook — what to do when this workflow goes RED
# ============================================================
#
# 1. **Identify which step failed**:
# - Step "Verify token authenticates as devops-engineer" red →
# token is invalid OR resolves to wrong persona.
# - Step "Verify token has repo read scope" red → token valid but
# stripped of `read:repository` scope (or repo perms changed).
# - Step "Verify git HTTPS auth path via no-op dry-run push to
# staging" red → token rotated/revoked OR Gitea git-HTTPS
# surface is broken (rare). Auth check happens on the
# smart-protocol handshake, separate from the API path.
#
# 2. **Re-issue the token** on the operator host:
# ```
# ssh root@5.78.80.188 'docker exec --user git molecule-gitea-1 \
# gitea admin user generate-access-token \
# --username devops-engineer \
# --token-name persona-devops-engineer-vN \
# --scopes "read:repository,write:repository,read:user,read:organization,read:issue,write:issue,read:notification,read:misc"'
# ```
# Update `/etc/molecule-bootstrap/agent-secrets.env` in place
# (per `feedback_unified_credentials_file`). The previous token
# file lands at `.bak.<date>`.
#
# 3. **Update the repo Actions secret** at:
# Settings → Secrets and variables → Actions → AUTO_SYNC_TOKEN
# Paste the new token. (Don't echo it in chat — but per
# `feedback_passwords_in_chat_are_burned`, a paste in a 1:1
# Claude session is within trust boundary.)
#
# 4. **Re-run this canary** via workflow_dispatch. Confirm GREEN.
#
# 5. **Backfill any missed main → staging syncs** by re-running
# `auto-sync-main-to-staging.yml` from its workflow_dispatch
# surface, OR by pushing an empty commit to main (if you'd
# rather force a real trigger).
#
# ============================================================
# Security notes
# ============================================================
#
# - Token usage: read-only (`GET /api/v1/user`, `GET /api/v1/repos/...`,
#   `git ls-remote`, plus a `git push --dry-run` that never updates
#   a ref — auth is checked at the handshake, nothing is written).
#   Same blast-radius profile as `actions/checkout` on a public repo.
# - The token NEVER appears in logs: every `curl` passes it via a
#   header, never inline in the URL; the git URL builds the
#   `oauth2:$TOKEN@host` form into a single shell variable that's
#   never echoed, and error output is redacted before printing.
#   GitHub Actions secret-masking covers anything that does
#   slip through.
# - No new token introduced — same `AUTO_SYNC_TOKEN` the workflow
# under monitor uses. Per least-privilege we deliberately do NOT
# broaden scope for the canary.
on:
schedule:
# Every 6 hours at :17 (offsets the cron herd at :00). Justification
# from issue #72: cheap to run (~5s wall-clock, no quota), 3h average
# detection latency, 6h max. 1h would be 24× the runs for marginal
# benefit; daily would be 6× longer latency and worse than status
# quo on a quiet-main day.
- cron: '17 */6 * * *'
workflow_dispatch:
# No concurrency group needed — the canary is read-only and idempotent.
# Two parallel runs (e.g. operator dispatch during a scheduled tick) are
# harmless: same result, doubled HTTPS calls, no shared state.
permissions:
contents: read
jobs:
verify-token:
name: Verify AUTO_SYNC_TOKEN validity
runs-on: ubuntu-latest
# 2 min surfaces hangs (Gitea API stall, DNS issue) within one
# cron interval. Realistic worst case is ~10s: 2 curls, 1 git
# ls-remote, and 1 dry-run push, each capped by the explicit
# timeouts below.
timeout-minutes: 2
env:
# Pinned in env so individual steps can read it without
# repeating the secret reference. GitHub masks the value in
# logs automatically.
AUTO_SYNC_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
# MUST stay in sync with auto-sync-main-to-staging.yml's
# `git config user.name "devops-engineer"` line. Renaming the
# devops-engineer persona requires updating both files (and
# the staging branch protection's `push_whitelist_usernames`).
EXPECTED_PERSONA: devops-engineer
GITEA_HOST: git.moleculesai.app
REPO_PATH: molecule-ai/molecule-core
steps:
- name: Verify AUTO_SYNC_TOKEN secret is configured
# Schedule-vs-dispatch behaviour split, per
# `feedback_schedule_vs_dispatch_secrets_hardening`:
#
# - schedule: hard-fail when the secret is missing. The
# whole point of the canary is to surface drift; soft-
# skipping on missing-secret would make the canary
# itself drift-invisible (sweep-cf-orphans #2088 lesson).
# - workflow_dispatch: hard-fail too — there's no scenario
# where an operator wants this canary to silently no-op.
# The workflow has no other ad-hoc utility; if you ran
# it, you wanted the answer.
run: |
if [ -z "${AUTO_SYNC_TOKEN}" ]; then
echo "::error::AUTO_SYNC_TOKEN secret is not set on this repo." >&2
echo "::error::Set it at Settings → Secrets and variables → Actions." >&2
echo "::error::Without it, auto-sync-main-to-staging.yml will fail every push to main." >&2
exit 1
fi
echo "AUTO_SYNC_TOKEN is configured (value masked)."
- name: Verify token authenticates as ${{ env.EXPECTED_PERSONA }}
# Calls Gitea's `/api/v1/user` — the canonical
# auth-probe-with-no-side-effects endpoint (mirrors
# Cloudflare's /user/tokens/verify).
#
# Failure surfaces:
# - HTTP 401: token invalid (rotated, revoked, or never
# correctly registered).
# - HTTP 200 but username != devops-engineer: token was
# regenerated under the wrong persona — this would let
# auth pass but commit attribution would be wrong, and
# branch-protection authz would fail because only
# `devops-engineer` is whitelisted.
run: |
set -euo pipefail
response_file="$(mktemp)"
code_file="$(mktemp)"
# `--max-time 30`: full call ceiling. `--connect-timeout 10`:
# DNS + TCP. `-w "%{http_code}"` routed to a tempfile so curl's
# exit code can't pollute the captured status — see
# feedback_curl_status_capture_pollution + the
# `lint-curl-status-capture.yml` gate that rejects the unsafe
# `$(curl ... || echo "000")` shape.
set +e
curl -sS -o "$response_file" \
--max-time 30 --connect-timeout 10 \
-w "%{http_code}" \
-H "Authorization: token ${AUTO_SYNC_TOKEN}" \
-H "Accept: application/json" \
"https://${GITEA_HOST}/api/v1/user" >"$code_file" 2>/dev/null
set -e
status=$(cat "$code_file" 2>/dev/null || true)
[ -z "$status" ] && status="000"
if [ "$status" != "200" ]; then
echo "::error::Token rotation suspected: GET /api/v1/user returned HTTP $status (expected 200)." >&2
echo "::error::Likely cause: AUTO_SYNC_TOKEN has been rotated/revoked on Gitea but the repo Actions secret was not updated." >&2
echo "::error::Runbook: see header comment of this workflow file." >&2
# Print response body but redact anything that looks like a token.
sed -E 's/[A-Fa-f0-9]{32,}/<redacted>/g' "$response_file" >&2 || true
exit 1
fi
username=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('login',''))" "$response_file")
if [ "$username" != "${EXPECTED_PERSONA}" ]; then
echo "::error::Token resolves to user '$username', expected '${EXPECTED_PERSONA}'." >&2
echo "::error::AUTO_SYNC_TOKEN must be the devops-engineer persona PAT (not founder PAT, not another persona)." >&2
echo "::error::Auto-sync push will fail because only 'devops-engineer' is whitelisted on staging branch protection." >&2
exit 1
fi
echo "Token authenticates as: $username ✓"
- name: Verify token has repo read scope
# `GET /api/v1/repos/<owner>/<repo>` requires `read:repository`
# on the persona's v2 scope contract. If the scope was
# narrowed/dropped on rotation we catch it here, before the
# next main push reveals it via a checkout failure.
run: |
set -euo pipefail
response_file="$(mktemp)"
code_file="$(mktemp)"
# See first probe step for the rationale on the tempfile-routed
# `-w "%{http_code}"` pattern — the unsafe `|| echo "000"` shape
# is rejected by lint-curl-status-capture.yml.
set +e
curl -sS -o "$response_file" \
--max-time 30 --connect-timeout 10 \
-w "%{http_code}" \
-H "Authorization: token ${AUTO_SYNC_TOKEN}" \
-H "Accept: application/json" \
"https://${GITEA_HOST}/api/v1/repos/${REPO_PATH}" >"$code_file" 2>/dev/null
set -e
status=$(cat "$code_file" 2>/dev/null || true)
[ -z "$status" ] && status="000"
if [ "$status" != "200" ]; then
echo "::error::Token lacks read:repository scope on ${REPO_PATH}: HTTP $status." >&2
echo "::error::Auto-sync's actions/checkout step will fail with this token." >&2
echo "::error::Re-issue with v2 scope contract: read:repository,write:repository,read:user,read:organization,read:issue,write:issue,read:notification,read:misc" >&2
sed -E 's/[A-Fa-f0-9]{32,}/<redacted>/g' "$response_file" >&2 || true
exit 1
fi
echo "Token has read:repository on ${REPO_PATH} ✓"
- name: Verify git HTTPS auth path via no-op dry-run push to staging
# Final probe: exercise the EXACT auth path that
# `actions/checkout` + `git push origin staging` use in
# auto-sync-main-to-staging.yml. Gitea's API and git-HTTPS
# surfaces share the token-lookup code path internally but
# the wire-level error shapes differ — historically (#173)
# the API path was healthy while git-HTTPS rejected, so
# checking only the API would have given false-green.
#
# IMPORTANT: `git ls-remote` on a public repo (which
# molecule-core is) succeeds even with a junk token because
# Gitea falls back to anonymous-read. `ls-remote` therefore
# CANNOT validate auth on this surface. We use
# `git push --dry-run` instead — push is auth-gated even on
# public repos.
#
# NOP shape: read the current staging SHA via authenticated
# ls-remote (the SHA itself is public; auth is incidental
# here, used only to colocate the discovery in one step), then
# `git push --dry-run <SHA>:refs/heads/staging`. Pushing the
# current tip back to itself is "Everything up-to-date" with
# exit 0 when auth succeeds. With a bad token Gitea returns
# HTTP 401 in the smart-protocol handshake and git exits 128
# with "Authentication failed".
#
# The dry-run never reaches Gitea's pre-receive hook (which
# is where branch-protection authz runs), so this probe does
# not validate failure mode C. That's intentional —
# branch-protection-drift.yml owns authz monitoring; this
# canary owns auth.
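#
# Negative-path shape for reference (junk token; illustrative
# only — the step below exercises the positive path):
#   GIT_TERMINAL_PROMPT=0 git push --dry-run \
#     "https://oauth2:not-a-real-token@${GITEA_HOST}/${REPO_PATH}" \
#     "<sha>:refs/heads/staging"
#   # → "fatal: Authentication failed", exit 128, at the handshake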
env:
# Don't hang waiting for password prompt if auth fails on a
# terminal-attached run. (In Actions there's no terminal,
# but the env-var hardens against an interactive runner
# config.)
GIT_TERMINAL_PROMPT: "0"
run: |
set -euo pipefail
# Token is in $AUTO_SYNC_TOKEN (job-level env). Compose the
# URL as a local var that's never echoed.
url="https://oauth2:${AUTO_SYNC_TOKEN}@${GITEA_HOST}/${REPO_PATH}"
# Step a: read current staging SHA. ~1KB; auth-gated only
# on private repos but always works on public — used here
# only to discover the SHA, not to validate auth.
staging_ref=$(timeout 30s git ls-remote --refs "$url" refs/heads/staging 2>&1) || {
redacted=$(echo "$staging_ref" | sed -E "s|oauth2:[^@]+@|oauth2:<redacted>@|g")
echo "::error::ls-remote against staging failed (network/DNS issue):" >&2
echo "$redacted" >&2
exit 1
}
if ! echo "$staging_ref" | grep -qE '^[0-9a-f]{40}[[:space:]]+refs/heads/staging$'; then
echo "::error::ls-remote returned unexpected shape:" >&2
echo "$staging_ref" | sed -E "s|oauth2:[^@]+@|oauth2:<redacted>@|g" >&2
exit 1
fi
staging_sha=$(echo "$staging_ref" | awk '{print $1}')
# Step b: spin up an ephemeral local repo. `git push` always
# requires a local repo even when pushing a remote SHA that
# isn't in the local object DB (the protocol negotiates and
# discovers we don't need to send any objects). We don't use
# `actions/checkout` for this — it would clone the whole
# repo (~hundreds of MB) for what's essentially `git init`.
tmp_repo="$(mktemp -d)"
trap 'rm -rf "$tmp_repo"' EXIT
git -C "$tmp_repo" init -q
# Author config required for any git operation; values are
# arbitrary because nothing gets committed here.
git -C "$tmp_repo" config user.email canary@auto-sync.local
git -C "$tmp_repo" config user.name auto-sync-canary
# Step c: dry-run push the current staging SHA back to
# staging. NOP by construction — the remote tip equals the
# SHA we're pushing, so "Everything up-to-date" is the
# success path.
#
# Authentication is checked at the smart-protocol handshake,
# BEFORE the dry-run can compute an empty diff. Bad token
# → "Authentication failed", exit 128. Good token → exit 0.
set +e
push_out=$(timeout 30s git -C "$tmp_repo" push --dry-run "$url" "${staging_sha}:refs/heads/staging" 2>&1)
push_rc=$?
set -e
if [ "$push_rc" -ne 0 ]; then
redacted=$(echo "$push_out" | sed -E "s|oauth2:[^@]+@|oauth2:<redacted>@|g")
echo "::error::Token rotation suspected: git push --dry-run against staging failed via the AUTO_SYNC_TOKEN HTTPS auth path (exit $push_rc)." >&2
echo "::error::This is the EXACT auth path that actions/checkout + git push use in auto-sync-main-to-staging.yml." >&2
echo "::error::Likely cause: AUTO_SYNC_TOKEN was rotated/revoked on Gitea but the repo Actions secret was not updated. Runbook: see header." >&2
echo "$redacted" >&2
exit 1
fi
echo "git HTTPS auth path: NOP push --dry-run to staging → ${staging_sha:0:8} ✓"
- name: Summarise canary result
# Everything passed — surface a green summary. (Failures
# already wrote ::error:: lines and exited above; if we got
# here, all three probes passed.)
run: |
{
echo "## Auto-sync canary: GREEN"
echo ""
echo "AUTO_SYNC_TOKEN is healthy:"
echo "- Authenticates as \`${EXPECTED_PERSONA}\` ✓"
echo "- Has \`read:repository\` scope on \`${REPO_PATH}\` ✓"
echo "- Git HTTPS auth path: no-op dry-run push to \`refs/heads/staging\` succeeds ✓"
echo ""
echo "Auto-sync main → staging will succeed on the next push to main."
echo "If this canary ever goes RED, see the runbook in this workflow's header."
} >> "$GITHUB_STEP_SUMMARY"

View File

@ -57,17 +57,42 @@ jobs:
id: bump
if: steps.skip.outputs.skip != 'true'
env:
GH_TOKEN: ${{ github.token }}
# Gitea-shape token (act_runner forwards GITHUB_TOKEN as a
# short-lived per-run secret with read access to this repo).
# We hit `/api/v1/repos/.../pulls?state=closed` directly
# because `gh pr list` calls Gitea's GraphQL endpoint, which
# returns HTTP 405 (issue #75 / post-#66 sweep).
GITEA_TOKEN: ${{ github.token }}
REPO: ${{ github.repository }}
GITEA_API_URL: ${{ github.server_url }}/api/v1
PUSH_SHA: ${{ github.sha }}
run: |
# The merged PR for this push commit. `gh pr list --search` finds
# closed PRs whose merge commit matches; we take the first.
PR=$(gh pr list --state merged --search "${{ github.sha }}" --json number,labels --jq '.[0]' 2>/dev/null || echo "")
# Find the merged PR whose merge_commit_sha matches this push.
# Gitea's `/repos/{owner}/{repo}/pulls?state=closed` returns
# PRs sorted newest-first; we paginate up to 50 and jq-filter
# on `merge_commit_sha == PUSH_SHA`. Bounded — auto-tag fires
# per push to main, so the matching PR is always among the
# most recent closures. 50 is comfortably more than the
# ~10-20 staging→main promotes that close in any reasonable
# window.
set -euo pipefail
PRS_JSON=$(curl --fail-with-body -sS \
-H "Authorization: token ${GITEA_TOKEN}" \
-H "Accept: application/json" \
"${GITEA_API_URL}/repos/${REPO}/pulls?state=closed&sort=newest&limit=50" \
2>/dev/null || echo "[]")
PR=$(printf '%s' "$PRS_JSON" \
| jq -c --arg sha "$PUSH_SHA" \
'[.[] | select(.merged_at != null and .merge_commit_sha == $sha)] | .[0] // empty')
if [ -z "$PR" ] || [ "$PR" = "null" ]; then
echo "No merged PR found for ${{ github.sha }} — defaulting to patch bump."
echo "No merged PR found for ${PUSH_SHA} — defaulting to patch bump."
echo "kind=patch" >> "$GITHUB_OUTPUT"
exit 0
fi
LABELS=$(echo "$PR" | jq -r '.labels[].name')
# Gitea returns labels under `.labels[].name`, same shape as
# GitHub's REST. The previous `gh pr list --json number,labels`
# output was identical; jq filter unchanged.
LABELS=$(printf '%s' "$PR" | jq -r '.labels[]?.name // empty')
if echo "$LABELS" | grep -qx 'release:major'; then
echo "kind=major" >> "$GITHUB_OUTPUT"
elif echo "$LABELS" | grep -qx 'release:minor'; then

View File

@ -19,6 +19,7 @@ on:
branches: [staging, main]
paths:
- 'tools/branch-protection/**'
- '.github/workflows/**'
- '.github/workflows/branch-protection-drift.yml'
permissions:
@ -79,3 +80,32 @@ jobs:
# Repo-admin scope, needed for /branches/:b/protection.
GH_TOKEN: ${{ secrets.GH_TOKEN_FOR_ADMIN_API }}
run: bash tools/branch-protection/drift_check.sh
# Self-test the parity script before running it on the real
# workflows — pins the script's classification logic against
# synthetic safe/unsafe/missing/unsafe-mix/matrix fixtures so a
# regression in the script can't false-pass on the production
# workflow audit. Cheap (~0.5s); always runs.
- name: Self-test check-name parity script
run: bash tools/branch-protection/test_check_name_parity.sh
# Check-name parity gate (#144 / saved memory
# feedback_branch_protection_check_name_parity).
#
# drift_check.sh asserts the live branch protection matches what
# apply.sh would set; check_name_parity.sh closes the orthogonal
# gap: it asserts every required check name in apply.sh maps to a
# workflow job whose "always emits this status" shape is intact.
#
# The two checks fail in different scenarios:
#
# - drift_check fails → live state was rewritten out-of-band
# (UI click, manual PATCH).
# - check_name_parity fails → an apply.sh required name has no
# emitter, OR the emitting workflow has a top-level paths:
# filter without per-step if-gates (the silent-block shape).
#
# Cheap (~1s); runs without the admin token because it only reads
# apply.sh + .github/workflows/ from the checkout.
- name: Run check-name parity gate
run: bash tools/branch-protection/check_name_parity.sh
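For readers without the scripts handy, a minimal sketch of the classification the parity gate performs. Illustrative only — `check_name_parity.sh` is the source of truth; the `"Workflow / Job (event)"` name format and the grep shapes below are assumptions:

```bash
#!/usr/bin/env bash
# Illustrative sketch of the parity gate; NOT the shipped script.
# Assumes apply.sh embeds required check names as "Workflow / Job (event)"
# strings. The real script's extraction logic and fixtures are authoritative.
set -euo pipefail

fail=0
while IFS= read -r check; do
  wf="${check%% / *}"    # workflow display name
  job="${check#* / }"    # job display name (may carry the "(event)" suffix)
  # An emitter exists iff some workflow file declares this workflow name
  # and mentions the job. -F keeps parens/spaces in names literal.
  file=$(grep -rlF "name: ${wf}" .github/workflows/ | head -n1 || true)
  if [ -z "$file" ] || ! grep -qF "${job%% (*}" "$file"; then
    echo "MISSING EMITTER: ${check}"; fail=1; continue
  fi
  # Silent-block shape: a top-level paths: filter can skip the whole
  # workflow, so the required status never gets reported and the PR
  # blocks forever. Per-step if-gates are the safe pattern.
  if grep -qE '^ *paths:' "$file"; then
    echo "UNSAFE (paths-filtered): ${check} (${file})"; fail=1
  fi
done < <(grep -oE '"[^"]+ / [^"]+"' tools/branch-protection/apply.sh | tr -d '"')
exit "$fail"
```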

View File

@ -235,7 +235,13 @@ jobs:
run: npx vitest run --coverage
- name: Upload coverage summary as artifact
if: needs.changes.outputs.canvas == 'true' && always()
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
# Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses
# the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT
# implement, surfacing as `GHESNotSupportedError: @actions/artifact
# v2.0.0+, upload-artifact@v4+ and download-artifact@v4+ are not
# currently supported on GHES`. Drop this pin when Gitea ships
# the v4 protocol (tracked: post-Gitea-1.23 followup).
uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
with:
name: canvas-coverage-${{ github.run_id }}
path: canvas/coverage/

View File

@ -139,7 +139,11 @@ jobs:
- name: Upload Playwright report on failure
if: failure() && needs.detect-changes.outputs.canvas == 'true'
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
# Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses
# the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT
# implement (see ci.yml upload step for the canonical error
# cite). Drop this pin when Gitea ships the v4 protocol.
uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
with:
name: playwright-report-staging
path: canvas/playwright-report-staging/
@ -147,7 +151,8 @@ jobs:
- name: Upload screenshots on failure
if: failure() && needs.detect-changes.outputs.canvas == 'true'
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
# Pinned to v3 for Gitea act_runner v0.6 compatibility (see above).
uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2
with:
name: playwright-screenshots
path: canvas/test-results/

View File

@ -98,6 +98,55 @@ jobs:
# github-app-auth sibling-checkout removed 2026-05-07 (#157):
# the plugin was dropped + Dockerfile.tenant no longer COPYs it.
# Pre-clone manifest deps before docker compose builds the tenant
# image (Task #173 followup — same pattern as
# publish-workspace-server-image.yml's "Pre-clone manifest deps"
# step).
#
# Why pre-clone here too: tests/harness/compose.yml builds tenant-alpha
# and tenant-beta from workspace-server/Dockerfile.tenant with
# context=../.. (repo root). That Dockerfile expects
# .tenant-bundle-deps/{workspace-configs-templates,org-templates,plugins}
# to be present at build context root (post-#173 it COPYs from there
# instead of running an in-image clone — the in-image clone failed
# with "could not read Username for https://git.moleculesai.app"
# because there's no auth path inside the build sandbox).
#
# Without this step harness-replays fails before any replay runs,
# with `failed to calculate checksum of ref ...
# "/.tenant-bundle-deps/plugins": not found`. Caught by run #892
# (main, 2026-05-07T20:28:53Z) and run #964 (staging — same
# symptom, different root cause: staging still has the in-image
# clone path, hits the auth error directly).
#
# Token shape matches publish-workspace-server-image.yml: AUTO_SYNC_TOKEN
# is the devops-engineer persona PAT, NOT the founder PAT (per
# `feedback_per_agent_gitea_identity_default`). clone-manifest.sh
# embeds it as basic-auth for the duration of the clones and strips
# .git directories — the token never enters the resulting image.
- name: Pre-clone manifest deps
if: needs.detect-changes.outputs.run == 'true'
env:
MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
run: |
set -euo pipefail
if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
echo "::error::AUTO_SYNC_TOKEN secret is empty — register the devops-engineer persona PAT in repo Actions secrets"
exit 1
fi
mkdir -p .tenant-bundle-deps
bash scripts/clone-manifest.sh \
manifest.json \
.tenant-bundle-deps/workspace-configs-templates \
.tenant-bundle-deps/org-templates \
.tenant-bundle-deps/plugins
# Sanity-check counts so a silent partial clone fails fast
# instead of producing a half-empty image.
ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
- name: Install Python deps for replays
# peer-discovery-404 (and future replays) eval Python against the
# running tenant — importing workspace/a2a_client.py pulls in

View File

@ -1,9 +1,10 @@
'use client';
import { useEffect, useMemo, useCallback } from "react";
import { useEffect, useMemo, useCallback, useRef } from "react";
import { type Edge, MarkerType } from "@xyflow/react";
import { api } from "@/lib/api";
import { useCanvasStore } from "@/store/canvas";
import { useSocketEvent } from "@/hooks/useSocketEvent";
import type { ActivityEntry } from "@/types/activity";
// ── Constants ─────────────────────────────────────────────────────────────────
@ -11,9 +12,6 @@ import type { ActivityEntry } from "@/types/activity";
/** 60-minute look-back window for delegation activity */
export const A2A_WINDOW_MS = 60 * 60 * 1000;
/** Polling interval — refresh edges every 60 seconds */
export const A2A_POLL_MS = 60 * 1_000;
/** Threshold for "hot" edges: < 5 minutes → animated + violet stroke */
export const A2A_HOT_MS = 5 * 60 * 1_000;
@ -131,6 +129,20 @@ export function buildA2AEdges(
* `a2aEdges`. Canvas.tsx merges these with topology edges and passes the
* combined list to ReactFlow.
*
* Update shape (issue #61 Stage 2, replaces the 60s polling loop):
* - On mount (when showA2AEdges): one HTTP fan-out per visible workspace
* (delegation rows, 60-min window). Bootstraps the local row buffer.
* - Steady state: subscribes to ACTIVITY_LOGGED via useSocketEvent.
* Each delegation event from a visible workspace is appended to the
* buffer; edges are re-derived via the existing buildA2AEdges helper.
* - showA2AEdges toggle off: clears edges + buffer.
* - Visible-ID-set change: re-bootstraps so a freshly-shown workspace
* backfills its 60-min history (existing visibleIdsKey selector
*   behaviour preserved; that's the 2026-05-04 render-loop fix).
*
* No interval poll. The singleton ReconnectingSocket already owns
* reconnect / backoff / health-check; useSocketEvent inherits those.
*
* Mount this inside CanvasInner (no ReactFlow hook dependency).
*/
export function A2ATopologyOverlay() {
@ -157,7 +169,9 @@ export function A2ATopologyOverlay() {
// the symptom of this re-render storm.
//
// The fix is purely the dependency-stability change here; the fetch
// logic is unchanged.
// logic is unchanged. Post-#61 the polling-driven fetch is gone, but
// the visibleIdsKey gate is still required so a peer-discovery write
// doesn't trigger a wasteful re-bootstrap.
const visibleIdsKey = useCanvasStore((s) =>
s.nodes
.filter((n) => !n.hidden)
@ -171,16 +185,42 @@ export function A2ATopologyOverlay() {
[visibleIdsKey]
);
// Fetch delegation activity for all visible workspaces and rebuild overlay edges.
const fetchAndUpdate = useCallback(async () => {
// Local rolling buffer of delegation rows. Pruned by A2A_WINDOW_MS on
// each rebuild so a long-lived session doesn't accumulate unbounded
// history. The buffer's high-water mark is approximately:
// visibleIds.length × bootstrap-fetch-limit (500) + WS arrivals
// Real-world ceiling: ~3000 entries at the 60-min boundary, all of
// which buildA2AEdges aggregates into at most N² edges.
const bufferRef = useRef<ActivityEntry[]>([]);
// visibleIdsRef gives the WS handler the latest visible-ID set without
// re-subscribing on every render. The bus listener is registered
// exactly once per mount; subscriber-side filtering reads from this ref.
const visibleIdsRef = useRef(visibleIds);
visibleIdsRef.current = visibleIds;
// Re-derive overlay edges from the current buffer + push to store.
// Prunes by A2A_WINDOW_MS first so memory stays bounded across long
// sessions and the aggregation cost stays O(window-size).
const recomputeAndPush = useCallback(() => {
const cutoff = Date.now() - A2A_WINDOW_MS;
bufferRef.current = bufferRef.current.filter(
(r) => new Date(r.created_at).getTime() > cutoff
);
setA2AEdges(buildA2AEdges(bufferRef.current));
}, [setA2AEdges]);
// Bootstrap fan-out — one HTTP per visible workspace. Replaces the
// 60s polling loop entirely. Race-aware: any WS arrivals that landed
// in the buffer DURING the fetch (between the await and resume) are
// preserved by id-dedup-with-fetched-first ordering.
const bootstrap = useCallback(async () => {
if (visibleIds.length === 0) {
bufferRef.current = [];
setA2AEdges([]);
return;
}
try {
// Fan-out — one request per visible workspace.
// Per-request failures are swallowed so one broken workspace doesn't blank the overlay.
const allRows = (
const fetchedRows = (
await Promise.all(
visibleIds.map((id) =>
api
@ -192,24 +232,76 @@ export function A2ATopologyOverlay() {
)
).flat();
setA2AEdges(buildA2AEdges(allRows));
// Merge: fetched rows first, then any in-flight WS arrivals that
// accumulated during the await. Dedup by id so rows that appear
// in both paths are not double-counted in the aggregation.
const merged = [...fetchedRows, ...bufferRef.current];
const seen = new Set<string>();
bufferRef.current = merged.filter((r) => {
if (seen.has(r.id)) return false;
seen.add(r.id);
return true;
});
recomputeAndPush();
} catch {
// Overlay failure is non-critical — canvas remains functional
}
}, [visibleIds, setA2AEdges]);
}, [visibleIds, setA2AEdges, recomputeAndPush]);
useEffect(() => {
if (!showA2AEdges) {
// Clear edges immediately when toggled off
// Clear edges + buffer immediately when toggled off
bufferRef.current = [];
setA2AEdges([]);
return;
}
void bootstrap();
}, [showA2AEdges, bootstrap, setA2AEdges]);
// Initial fetch, then poll every 60 s
void fetchAndUpdate();
const timer = setInterval(() => void fetchAndUpdate(), A2A_POLL_MS);
return () => clearInterval(timer);
}, [showA2AEdges, fetchAndUpdate, setA2AEdges]);
// Live-update path. Filters server-side ACTIVITY_LOGGED events down
// to delegation initiations from visible workspaces and appends each
// into the rolling buffer, re-deriving edges via buildA2AEdges.
//
// Only `method === "delegate"` rows count — the same filter
// buildA2AEdges applies — so delegate_result rows arriving over the
// wire don't double-count.
useSocketEvent((msg) => {
if (!showA2AEdges) return;
if (msg.event !== "ACTIVITY_LOGGED") return;
const p = (msg.payload || {}) as Record<string, unknown>;
if (p.activity_type !== "delegation") return;
if (p.method !== "delegate") return;
const wsId = msg.workspace_id;
if (!visibleIdsRef.current.includes(wsId)) return;
// Synthesise an ActivityEntry from the WS payload so buildA2AEdges
// (which the bootstrap path also feeds) handles it identically.
const entry: ActivityEntry = {
id:
(p.id as string) ||
`ws-push-${msg.timestamp || Date.now()}-${wsId}`,
workspace_id: wsId,
activity_type: "delegation",
source_id: (p.source_id as string | null) ?? null,
target_id: (p.target_id as string | null) ?? null,
method: "delegate",
summary: (p.summary as string | null) ?? null,
request_body: null,
response_body: null,
duration_ms: (p.duration_ms as number | null) ?? null,
status: (p.status as string) || "ok",
error_detail: null,
created_at:
(p.created_at as string) ||
msg.timestamp ||
new Date().toISOString(),
};
bufferRef.current = [...bufferRef.current, entry];
recomputeAndPush();
});
// Pure side-effect — renders nothing
return null;

View File

@ -3,6 +3,7 @@
import { useState, useEffect, useCallback, useRef } from "react";
import { useCanvasStore } from "@/store/canvas";
import { api } from "@/lib/api";
import { useSocketEvent } from "@/hooks/useSocketEvent";
import { COMM_TYPE_LABELS } from "@/lib/design-tokens";
interface Communication {
@ -18,32 +19,71 @@ interface Communication {
durationMs: number | null;
}
/** Workspace-server `ACTIVITY_LOGGED` payload shape. Pulled out so the
* WS handler below has a typed view of the same fields the HTTP
* bootstrap consumes; drift between the two paths is a class of bug
* AgentCommsPanel hit historically. */
interface ActivityLoggedPayload {
id?: string;
activity_type?: string;
source_id?: string | null;
target_id?: string | null;
workspace_id?: string;
summary?: string | null;
status?: string;
duration_ms?: number | null;
created_at?: string;
}
/** Fan-out cap for the bootstrap HTTP fetch on mount / on visibility
* re-open. Kept at 3 (carried over from the 2026-05-04 fix) so a
* freshly-mounted overlay on a 15-workspace tenant only spends 3
* round-trips bootstrapping. Live updates after that arrive via the
* WS subscription below; no polling, no fan-out to maintain. */
const BOOTSTRAP_FAN_OUT_CAP = 3;
/** Cap on the rendered list. Bootstrap + every WS push prepends, the
* list is sliced to this size after each update. Mirrors the prior
* polling-loop behaviour. */
const COMMS_RENDER_CAP = 20;
/**
* Overlay showing recent A2A communications between workspaces.
* Renders as a floating log panel that auto-updates.
*
* Update shape (issue #61 Stage 1, replaces the 30s polling loop):
* - On mount (when visible): one HTTP bootstrap per online workspace,
* capped at BOOTSTRAP_FAN_OUT_CAP. Yields the initial recent-comms
* window without waiting for live events.
* - Steady state: subscribes to ACTIVITY_LOGGED via useSocketEvent.
* Each event with a matching activity_type from a visible online
* workspace gets synthesised into a Communication and prepended.
* - Visibility re-open: re-bootstraps so the user sees the freshest
* window even if WS was idle while collapsed.
*
* No interval poll. The singleton ReconnectingSocket in `store/socket.ts`
* already owns reconnect/backoff/health-check, and `useSocketEvent`
* inherits those guarantees. If WS is genuinely unhealthy, the overlay
* shows the bootstrap snapshot until the next visibility re-open or
* the next WS reconnect (which fires its own rehydrate burst).
*/
export function CommunicationOverlay() {
const [comms, setComms] = useState<Communication[]>([]);
const [visible, setVisible] = useState(true);
const selectedNodeId = useCanvasStore((s) => s.selectedNodeId);
const nodes = useCanvasStore((s) => s.nodes);
// nodesRef gives the WS handler current node-name resolution without
// re-subscribing on every node-list change. The bus listener is
// registered exactly once per mount; subscriber-side filtering reads
// the latest value via this ref.
const nodesRef = useRef(nodes);
nodesRef.current = nodes;
const fetchComms = useCallback(async () => {
const bootstrapComms = useCallback(async () => {
try {
// Fan-out cap: each polled workspace = 1 round-trip. The platform
// rate limits at 600 req/min/IP; combined with heartbeats + other
// canvas polling, every workspace polled here costs ~6 req/min
// (1 every 30s × 1 per workspace). Capping at 3 keeps this
// overlay's footprint at 18 req/min worst case — well under
// budget even with 8+ workspaces visible. Caught 2026-05-04 when
// a user with 8+ workspaces (Design Director + 6 sub-agents +
// 3 standalones) saw sustained 429s in canvas console.
const onlineNodes = nodesRef.current.filter((n) => n.data.status === "online");
const allComms: Communication[] = [];
for (const node of onlineNodes.slice(0, 3)) {
for (const node of onlineNodes.slice(0, BOOTSTRAP_FAN_OUT_CAP)) {
try {
const activities = await api.get<Array<{
id: string;
@ -59,8 +99,8 @@ export function CommunicationOverlay() {
for (const a of activities) {
if (a.activity_type === "a2a_send" || a.activity_type === "a2a_receive") {
const sourceNode = nodes.find((n) => n.id === (a.source_id || a.workspace_id));
const targetNode = nodes.find((n) => n.id === (a.target_id || ""));
const sourceNode = nodesRef.current.find((n) => n.id === (a.source_id || a.workspace_id));
const targetNode = nodesRef.current.find((n) => n.id === (a.target_id || ""));
allComms.push({
id: a.id,
sourceId: a.source_id || a.workspace_id,
@ -76,11 +116,12 @@ export function CommunicationOverlay() {
}
}
} catch {
// Skip workspaces that fail
// Per-workspace failures must not blank the panel — the same
// robustness the polling version had.
}
}
// Sort by timestamp, newest first, dedupe
// Newest-first with id-dedup, capped at COMMS_RENDER_CAP.
const seen = new Set<string>();
const sorted = allComms
.sort((a, b) => b.timestamp.localeCompare(a.timestamp))
@ -89,29 +130,78 @@ export function CommunicationOverlay() {
seen.add(c.id);
return true;
})
.slice(0, 20);
.slice(0, COMMS_RENDER_CAP);
setComms(sorted);
} catch {
// Silently handle API errors
// Bootstrap failure is non-blocking — the WS subscription below
// will populate the panel as live events arrive.
}
}, []);
// Bootstrap once on mount + every time the user re-opens after a
// collapse. Closed-panel state intentionally drops live updates so
// the panel doesn't churn invisible state — the next open reloads.
useEffect(() => {
// Gate polling on visibility — when the user collapses the overlay
// the data isn't being read, so the per-workspace fan-out becomes
// pure rate-limit overhead. Pre-fix this overlay polled regardless
// of whether the panel was shown, costing ~36 req/min from a
// hidden surface.
if (!visible) return;
fetchComms();
// 30s cadence (was 10s). At 3-workspace fan-out that's 6 req/min
// worst case from this overlay. Combined with heartbeats (~30/min)
// and other canvas polling, leaves ample headroom under the 600/
// min/IP server-side rate limit even at 8+ workspace tenants.
const interval = setInterval(fetchComms, 30000);
return () => clearInterval(interval);
}, [fetchComms, visible]);
bootstrapComms();
}, [bootstrapComms, visible]);
// Live-update path. Filters server-side ACTIVITY_LOGGED events down
// to the comm-overlay-relevant subset and prepends each into the
// rendered list with the same dedup the bootstrap path uses.
//
// Scope guard: ignore events for workspaces not in the visible online
// set, so a user collapsing one workspace doesn't see its comms
// continue to scroll in. Same shape the bootstrap path applies.
useSocketEvent((msg) => {
if (!visible) return;
if (msg.event !== "ACTIVITY_LOGGED") return;
const p = (msg.payload || {}) as ActivityLoggedPayload;
const type = p.activity_type;
if (type !== "a2a_send" && type !== "a2a_receive" && type !== "task_update") return;
const wsId = msg.workspace_id;
const onlineSet = new Set(
nodesRef.current.filter((n) => n.data.status === "online").map((n) => n.id),
);
if (!onlineSet.has(wsId)) return;
const sourceId = p.source_id || wsId;
const targetId = p.target_id || "";
const sourceNode = nodesRef.current.find((n) => n.id === sourceId);
const targetNode = nodesRef.current.find((n) => n.id === targetId);
const incoming: Communication = {
id: p.id || `${msg.timestamp || Date.now()}:${sourceId}:${targetId}`,
sourceId,
targetId,
sourceName: sourceNode?.data.name || "Unknown",
targetName: targetNode?.data.name || "Unknown",
type: type as Communication["type"],
summary: p.summary || "",
status: p.status || "ok",
timestamp: p.created_at || msg.timestamp || new Date().toISOString(),
durationMs: p.duration_ms ?? null,
};
setComms((prev) => {
// Prepend, dedup by id, re-cap. Functional setState is necessary
// because two ACTIVITY_LOGGED events arriving in the same React
// batch would otherwise read a stale `comms` from the closure.
const seen = new Set<string>();
const merged = [incoming, ...prev]
.sort((a, b) => b.timestamp.localeCompare(a.timestamp))
.filter((c) => {
if (seen.has(c.id)) return false;
seen.add(c.id);
return true;
})
.slice(0, COMMS_RENDER_CAP);
return merged;
});
});
if (!visible || comms.length === 0) {
return (

View File

@ -41,6 +41,10 @@ vi.mock("@/store/canvas", () => ({
// ── Imports (after mocks) ─────────────────────────────────────────────────────
import { api } from "@/lib/api";
import {
emitSocketEvent,
_resetSocketEventListenersForTests,
} from "@/store/socket-events";
import {
buildA2AEdges,
formatA2ARelativeTime,
@ -342,6 +346,151 @@ describe("A2ATopologyOverlay component", () => {
expect(mockGet.mock.calls.length).toBe(callsAfterMount);
});
// ── #61 Stage 2: ACTIVITY_LOGGED subscription tests ────────────────────────
//
// Pin the post-#61 behaviour: WS push for delegation contributes to
// the overlay's edge buffer with NO additional HTTP fetch. Same shape
// as Stage 1 (CommunicationOverlay).
describe("#61 stage 2 — ACTIVITY_LOGGED subscription", () => {
beforeEach(() => {
_resetSocketEventListenersForTests();
});
afterEach(() => {
_resetSocketEventListenersForTests();
});
function emitDelegation(overrides: {
workspaceId?: string;
sourceId?: string;
targetId?: string;
method?: string;
activityType?: string;
} = {}) {
// Use Date.now() (real time, fake-timer-frozen) rather than the
// hardcoded NOW constant — buildA2AEdges prunes by Date.now() -
// A2A_WINDOW_MS, so a row dated against the wrong epoch silently
// falls outside the window and the test fails for a confusing
// reason ("edges array empty" vs "filter dropped my row").
const realNow = Date.now();
emitSocketEvent({
event: "ACTIVITY_LOGGED",
workspace_id: overrides.workspaceId ?? "ws-a",
timestamp: new Date(realNow).toISOString(),
payload: {
id: `act-${Math.random().toString(36).slice(2)}`,
activity_type: overrides.activityType ?? "delegation",
method: overrides.method ?? "delegate",
source_id: overrides.sourceId ?? "ws-a",
target_id: overrides.targetId ?? "ws-b",
status: "ok",
created_at: new Date(realNow - 30_000).toISOString(),
},
});
}
it("does NOT poll on a 60s interval after bootstrap (post-#61)", async () => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
mockGet.mockResolvedValue([] as any);
render(<A2ATopologyOverlay />);
await act(async () => { await Promise.resolve(); });
const callsAfterBootstrap = mockGet.mock.calls.length;
expect(callsAfterBootstrap).toBe(2); // ws-a + ws-b
// Pre-#61: a 60s clock tick would fire a fresh fan-out (2 more
// calls). Post-#61: no interval, no extra calls.
await act(async () => {
vi.advanceTimersByTime(120_000);
});
expect(mockGet.mock.calls.length).toBe(callsAfterBootstrap);
});
it("WS push for a delegation event from a visible workspace updates edges with NO HTTP call", async () => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
mockGet.mockResolvedValue([] as any);
render(<A2ATopologyOverlay />);
await act(async () => { await Promise.resolve(); await Promise.resolve(); });
mockGet.mockClear();
mockStoreState.setA2AEdges.mockClear();
await act(async () => {
emitDelegation({ sourceId: "ws-a", targetId: "ws-b" });
});
// Edges-set called with at least one a2a edge for the new push.
const calls = mockStoreState.setA2AEdges.mock.calls;
expect(calls.length).toBeGreaterThanOrEqual(1);
const lastCall = calls[calls.length - 1][0] as Array<{ id: string }>;
expect(lastCall.some((e) => e.id === "a2a-ws-a-ws-b")).toBe(true);
// Critical: no HTTP fetch fired during the WS path.
expect(mockGet).not.toHaveBeenCalled();
});
it("WS push for a non-delegation activity_type is ignored", async () => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
mockGet.mockResolvedValue([] as any);
render(<A2ATopologyOverlay />);
await act(async () => { await Promise.resolve(); });
mockStoreState.setA2AEdges.mockClear();
await act(async () => {
emitDelegation({ activityType: "a2a_send" });
});
// setA2AEdges must not be called by the WS handler — the only
// setA2AEdges calls in this test came from the initial bootstrap.
expect(mockStoreState.setA2AEdges).not.toHaveBeenCalled();
});
it("WS push for a delegate_result row is ignored (mirrors buildA2AEdges filter)", async () => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
mockGet.mockResolvedValue([] as any);
render(<A2ATopologyOverlay />);
await act(async () => { await Promise.resolve(); });
mockStoreState.setA2AEdges.mockClear();
await act(async () => {
emitDelegation({ method: "delegate_result" });
});
// delegate_result rows do not contribute to the edge count — they
// are completion signals, not initiations.
expect(mockStoreState.setA2AEdges).not.toHaveBeenCalled();
});
it("WS push from a hidden workspace is ignored", async () => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
mockGet.mockResolvedValue([] as any);
render(<A2ATopologyOverlay />);
await act(async () => { await Promise.resolve(); });
mockStoreState.setA2AEdges.mockClear();
await act(async () => {
emitDelegation({ workspaceId: "ws-hidden" });
});
expect(mockStoreState.setA2AEdges).not.toHaveBeenCalled();
});
it("WS push while showA2AEdges is false is ignored", async () => {
mockStoreState.showA2AEdges = false;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
mockGet.mockResolvedValue([] as any);
render(<A2ATopologyOverlay />);
// The mount path with showA2AEdges=false calls setA2AEdges([])
// once — clear that to isolate the WS path.
mockStoreState.setA2AEdges.mockClear();
await act(async () => {
emitDelegation();
});
expect(mockStoreState.setA2AEdges).not.toHaveBeenCalled();
expect(mockGet).not.toHaveBeenCalled();
});
});
it("re-fetches when the visible ID set actually changes", async () => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
mockGet.mockResolvedValue([] as any);

View File

@ -36,6 +36,10 @@ vi.mock("@/hooks/useWorkspaceName", () => ({
useWorkspaceName: () => () => "Test WS",
}));
import {
emitSocketEvent,
_resetSocketEventListenersForTests,
} from "@/store/socket-events";
import { ActivityTab } from "../tabs/ActivityTab";
// ── Fixtures ──────────────────────────────────────────────────────────────────
@ -358,6 +362,191 @@ describe("ActivityTab — refresh button", () => {
});
});
// ── Suite 6.5: ACTIVITY_LOGGED subscription (#61 stage 3) ─────────────────────
//
// Pin the post-#61 behaviour: WS push extends the rendered list with NO
// additional HTTP fetch. The 5s polling loop is gone; live updates
// arrive over the WebSocket bus.
describe("ActivityTab — #61 stage 3: ACTIVITY_LOGGED subscription", () => {
beforeEach(() => {
vi.clearAllMocks();
mockGet.mockResolvedValue([]);
_resetSocketEventListenersForTests();
});
afterEach(() => {
cleanup();
_resetSocketEventListenersForTests();
});
function emitActivity(overrides: {
workspaceId?: string;
activityType?: string;
summary?: string;
id?: string;
} = {}) {
const realNow = Date.now();
emitSocketEvent({
event: "ACTIVITY_LOGGED",
workspace_id: overrides.workspaceId ?? "ws-1",
timestamp: new Date(realNow).toISOString(),
payload: {
id: overrides.id ?? `act-${Math.random().toString(36).slice(2)}`,
activity_type: overrides.activityType ?? "agent_log",
source_id: null,
target_id: null,
method: null,
summary: overrides.summary ?? "live-pushed",
status: "ok",
created_at: new Date(realNow - 5_000).toISOString(),
},
});
}
it("WS push for matching workspace prepends to the list with NO HTTP call", async () => {
render(<ActivityTab workspaceId="ws-1" />);
await waitFor(() => {
expect(screen.getByText(/0 activities|no activity/i)).toBeTruthy();
});
mockGet.mockClear();
await act(async () => {
emitActivity({ summary: "live-row-from-bus" });
});
await waitFor(() => {
expect(screen.getByText(/live-row-from-bus/)).toBeTruthy();
});
expect(mockGet).not.toHaveBeenCalled();
});
it("WS push for a different workspace is ignored", async () => {
render(<ActivityTab workspaceId="ws-1" />);
await waitFor(() => screen.getByText(/no activity/i));
await act(async () => {
emitActivity({
workspaceId: "ws-other",
summary: "should-not-render-other-ws",
});
});
expect(screen.queryByText(/should-not-render-other-ws/)).toBeNull();
});
it("WS push respects the active filter — non-matching activity_type is ignored", async () => {
render(<ActivityTab workspaceId="ws-1" />);
await waitFor(() => screen.getByText(/no activity/i));
// Apply "Tasks" filter.
clickButton(/tasks/i);
await waitFor(() => {
expect(
screen.getByRole("button", { name: /tasks/i }).getAttribute("aria-pressed"),
).toBe("true");
});
// Push an a2a_send (does NOT match task_update filter).
await act(async () => {
emitActivity({
activityType: "a2a_send",
summary: "should-not-render-filter-mismatch",
});
});
expect(
screen.queryByText(/should-not-render-filter-mismatch/),
).toBeNull();
});
it("WS push respects the active filter — matching activity_type is rendered", async () => {
render(<ActivityTab workspaceId="ws-1" />);
await waitFor(() => screen.getByText(/no activity/i));
clickButton(/tasks/i);
await waitFor(() => {
expect(
screen.getByRole("button", { name: /tasks/i }).getAttribute("aria-pressed"),
).toBe("true");
});
await act(async () => {
emitActivity({
activityType: "task_update",
summary: "task-filter-match",
});
});
await waitFor(() => {
expect(screen.getByText(/task-filter-match/)).toBeTruthy();
});
});
it("WS push while autoRefresh is paused is ignored", async () => {
render(<ActivityTab workspaceId="ws-1" />);
await waitFor(() => screen.getByText(/no activity/i));
// Toggle Live → Paused.
clickButton(/live/i);
await waitFor(() => {
expect(screen.getByText(/Paused/)).toBeTruthy();
});
await act(async () => {
emitActivity({ summary: "should-not-render-paused" });
});
expect(screen.queryByText(/should-not-render-paused/)).toBeNull();
});
it("WS push for a row already in the list is deduped (no double-render)", async () => {
// Bootstrap with one row — same id as the WS push to trigger dedup.
mockGet.mockResolvedValueOnce([
makeEntry({ id: "shared-id", summary: "bootstrap-summary" }),
]);
render(<ActivityTab workspaceId="ws-1" />);
await waitFor(() => {
expect(screen.getByText(/bootstrap-summary/)).toBeTruthy();
});
mockGet.mockClear();
// Push a row with the SAME id but a different summary — must not
// render the new summary; original row stays.
await act(async () => {
emitActivity({
id: "shared-id",
summary: "should-not-replace-existing",
});
});
expect(screen.queryByText(/should-not-replace-existing/)).toBeNull();
// Also verify count didn't grow.
expect(screen.getByText(/1 activities/)).toBeTruthy();
});
it("does NOT poll on a 5s interval after mount (post-#61)", async () => {
vi.useFakeTimers();
try {
render(<ActivityTab workspaceId="ws-1" />);
// Drain the mount-time bootstrap promise.
await act(async () => {
await Promise.resolve();
await Promise.resolve();
});
const callsAfterBootstrap = mockGet.mock.calls.length;
expect(callsAfterBootstrap).toBeGreaterThanOrEqual(1);
// Pre-#61: a 30s clock advance fires 6 more polls. Post-#61: 0.
await act(async () => {
vi.advanceTimersByTime(30_000);
});
expect(mockGet.mock.calls.length).toBe(callsAfterBootstrap);
} finally {
vi.useRealTimers();
}
});
});
// ── Suite 7: Activity count ───────────────────────────────────────────────────
describe("ActivityTab — activity count", () => {

View File

@ -1,18 +1,28 @@
// @vitest-environment jsdom
/**
* CommunicationOverlay tests pin the rate-limit fix shipped 2026-05-04.
* CommunicationOverlay tests pin both the 2026-05-04 fan-out cap fix
* AND the 2026-05-07 polling ACTIVITY_LOGGED-subscriber refactor
* (issue #61 stage 1).
*
* The overlay polls /workspaces/:id/activity?limit=5 for each online
* workspace. Pre-fix it (a) polled regardless of visibility and (b)
* fanned out to 6 workspaces every 10s. With 8+ workspaces a user
* triggered sustained 429s (server-side rate limit is 600 req/min/IP).
* The overlay used to poll /workspaces/:id/activity?limit=5 on a 30s
* interval per online workspace (capped at 3). Post-#61: it bootstraps
* once on mount via the same HTTP path (cap of 3 retained), then
* subscribes to ACTIVITY_LOGGED via the global socket bus for live
* updates. No interval poll.
*
* These tests pin:
* 1. Fan-out cap of 3: even with 6 online nodes, only 3 fetches
* 2. Visibility gate: when collapsed, no polling
* 1. Bootstrap fan-out cap of 3: even with 6 online nodes, only 3
*    HTTP fetches on mount.
* 2. Visibility gate: when collapsed, no HTTP fetches; re-open
*    re-bootstraps.
* 3. NO interval polling: advancing the clock past 30s does not fire
*    additional HTTP calls.
* 4. WS push extends the rendered list without firing any HTTP call.
* 5. WS push for an offline workspace is ignored.
* 6. WS push for a non-comm activity_type is ignored.
*
* If a future refactor pushes either dial back up, CI fails before
* the regression hits a paying tenant.
* If a future refactor regresses any of these, CI fails before the
* regression hits a paying tenant.
*/
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
import { render, cleanup, act, fireEvent } from "@testing-library/react";
@ -23,7 +33,7 @@ vi.mock("@/lib/api", () => ({
api: { get: vi.fn() },
}));
// Six online nodes — enough to verify the cap of 3.
// Six online nodes — enough to verify the bootstrap cap of 3.
const mockStoreState = {
selectedNodeId: null as string | null,
nodes: [
@ -56,6 +66,10 @@ vi.mock("@/lib/design-tokens", () => ({
// ── Imports (after mocks) ─────────────────────────────────────────────────────
import { api } from "@/lib/api";
import {
emitSocketEvent,
_resetSocketEventListenersForTests,
} from "@/store/socket-events";
import { CommunicationOverlay } from "../CommunicationOverlay";
const mockGet = vi.mocked(api.get);
@ -66,30 +80,34 @@ beforeEach(() => {
vi.useFakeTimers();
mockGet.mockReset();
mockGet.mockResolvedValue([]);
// Drop any subscribers the previous test left on the singleton bus —
// each render adds one via useSocketEvent.
_resetSocketEventListenersForTests();
});
afterEach(() => {
cleanup();
vi.useRealTimers();
_resetSocketEventListenersForTests();
});
// ── Tests ─────────────────────────────────────────────────────────────────────
describe("CommunicationOverlay — fan-out cap", () => {
it("polls at most 3 of 6 online workspaces (rate-limit floor)", async () => {
describe("CommunicationOverlay — bootstrap fan-out cap", () => {
it("bootstraps at most 3 of 6 online workspaces (rate-limit floor preserved post-#61)", async () => {
await act(async () => {
render(<CommunicationOverlay />);
});
// Mount fires the first poll synchronously (no interval tick yet).
// Pre-fix: 6 calls. Post-fix: 3.
// Mount fires the bootstrap synchronously — pre-#61 this was the
// first poll cycle; post-#61 it's the only HTTP fetch (live updates
// arrive via WS push). 6 nodes → 3 fetches.
expect(mockGet).toHaveBeenCalledTimes(3);
// Verify the calls are for the FIRST 3 online nodes (slice order).
expect(mockGet).toHaveBeenCalledWith("/workspaces/ws-1/activity?limit=5");
expect(mockGet).toHaveBeenCalledWith("/workspaces/ws-2/activity?limit=5");
expect(mockGet).toHaveBeenCalledWith("/workspaces/ws-3/activity?limit=5");
});
it("never polls offline workspaces", async () => {
it("never bootstraps offline workspaces", async () => {
await act(async () => {
render(<CommunicationOverlay />);
});
@ -99,40 +117,39 @@ describe("CommunicationOverlay — fan-out cap", () => {
});
});
describe("CommunicationOverlay — cadence", () => {
it("uses 30s interval cadence (was 10s pre-fix)", async () => {
describe("CommunicationOverlay — no interval polling (post-#61)", () => {
// The pre-#61 implementation re-fetched every 30s per workspace.
// Post-#61 the only HTTP path is the bootstrap on mount + on
// visibility-toggle. This test pins the absence of any interval
// poll: a 60s clock advance must not produce a second round of
// fetches.
it("does NOT poll on a 30s interval after bootstrap", async () => {
await act(async () => {
render(<CommunicationOverlay />);
});
expect(mockGet).toHaveBeenCalledTimes(3); // initial mount poll
expect(mockGet).toHaveBeenCalledTimes(3); // initial bootstrap
mockGet.mockClear();
// Advance 10s — pre-fix this would fire another poll. Post-fix: silent.
// Advance 60s — well past any plausible cadence the prior version
// could have used.
await act(async () => {
vi.advanceTimersByTime(10_000);
vi.advanceTimersByTime(60_000);
});
expect(mockGet).toHaveBeenCalledTimes(3);
// Advance to 30s — interval fires.
await act(async () => {
vi.advanceTimersByTime(20_000);
});
expect(mockGet).toHaveBeenCalledTimes(6); // +3 from second tick
expect(mockGet).not.toHaveBeenCalled();
});
});
describe("CommunicationOverlay — visibility gate", () => {
// The visibility gate is the dial that drops collapsed-panel polling
// to ZERO. The cadence test above can't catch its removal — if a
// refactor dropped `if (!visible) return`, the cadence test would
// still pass because the effect would still fire every 30s.
// The visibility gate now does two things post-#61:
// - while closed, the WS handler short-circuits (no setComms churn)
// - re-opening triggers a fresh bootstrap so the list reflects
// anything that happened while the panel was collapsed
//
// Direct probe: render with comms-returning mock so the panel
// actually renders (close button only exists in the expanded panel,
// not the collapsed button-state). Click close, advance the clock,
// assert no further fetches.
it("stops polling after the user collapses the panel", async () => {
// Mock returns one a2a_send so comms.length > 0 → panel renders →
// close button accessible.
it("stops fetching while collapsed and re-bootstraps on re-open", async () => {
mockGet.mockResolvedValue([
{
id: "act-1",
@ -150,29 +167,202 @@ describe("CommunicationOverlay — visibility gate", () => {
const { getByLabelText } = await act(async () => {
return render(<CommunicationOverlay />);
});
// Drain pending microtasks (resolves the await in fetchComms) so
// setComms lands and the panel renders. Don't advance time — that
// would fire the next interval tick and pollute the assertion.
// Drain pending microtasks (resolves the await in bootstrap) so
// setComms lands and the panel renders. Don't advance time — it's
// not load-bearing for the gate test, but matches the pattern used
// pre-#61 for stability.
await act(async () => {
await Promise.resolve();
await Promise.resolve();
await Promise.resolve();
});
// Initial mount polled 3 workspaces.
expect(mockGet).toHaveBeenCalledTimes(3);
expect(mockGet).toHaveBeenCalledTimes(3); // initial bootstrap
mockGet.mockClear();
// Click the close button. Synchronous getByLabelText avoids
// findBy's internal setTimeout (deadlocks under useFakeTimers).
// Click close. While closed, no fetches and no WS-driven updates.
const closeBtn = getByLabelText("Close communications panel");
await act(async () => {
fireEvent.click(closeBtn);
});
await act(async () => {
vi.advanceTimersByTime(60_000);
});
expect(mockGet).not.toHaveBeenCalled();
// Re-open via the collapsed button. Must trigger a fresh bootstrap.
const openBtn = getByLabelText("Show communications panel");
await act(async () => {
fireEvent.click(openBtn);
});
await act(async () => {
await Promise.resolve();
await Promise.resolve();
});
expect(mockGet).toHaveBeenCalledTimes(3); // re-bootstrap on re-open
});
});
describe("CommunicationOverlay — WS subscription (#61 stage 1 core)", () => {
// The load-bearing post-#61 behaviour. Every test in this block must
// verify (a) the WS push DID update the rendered comms list, and
// (b) NO additional HTTP call was fired — the whole point of the
// refactor is to remove the polling-driven HTTP traffic.
function emitActivityLogged(overrides: Partial<{
workspaceId: string;
payload: Record<string, unknown>;
}> = {}) {
emitSocketEvent({
event: "ACTIVITY_LOGGED",
workspace_id: overrides.workspaceId ?? "ws-1",
timestamp: new Date().toISOString(),
payload: {
id: `act-${Math.random().toString(36).slice(2)}`,
activity_type: "a2a_send",
source_id: "ws-1",
target_id: "ws-2",
summary: "live push",
status: "ok",
duration_ms: 42,
created_at: new Date().toISOString(),
...overrides.payload,
},
});
}
it("WS push for a comm activity_type extends the rendered list with NO additional HTTP call", async () => {
const { container } = await act(async () => {
return render(<CommunicationOverlay />);
});
expect(mockGet).toHaveBeenCalledTimes(3); // bootstrap
mockGet.mockClear();
await act(async () => {
emitActivityLogged({ payload: { summary: "hello" } });
});
await act(async () => {
await Promise.resolve();
});
// Two pins:
// 1. comms list reflects the live push (look for the summary text)
// 2. zero HTTP fetches fired during the WS path
expect(container.textContent).toContain("hello");
expect(mockGet).not.toHaveBeenCalled();
});
it("WS push for an offline workspace is ignored", async () => {
const { container } = await act(async () => {
return render(<CommunicationOverlay />);
});
mockGet.mockClear();
await act(async () => {
emitActivityLogged({
workspaceId: "ws-offline",
payload: { source_id: "ws-offline", summary: "should-not-render" },
});
});
await act(async () => {
await Promise.resolve();
});
expect(container.textContent).not.toContain("should-not-render");
expect(mockGet).not.toHaveBeenCalled();
});
it("WS push for a non-comm activity_type is ignored (e.g. delegation)", async () => {
const { container } = await act(async () => {
return render(<CommunicationOverlay />);
});
mockGet.mockClear();
await act(async () => {
emitActivityLogged({
payload: {
activity_type: "delegation",
summary: "should-not-render-delegation",
},
});
});
await act(async () => {
await Promise.resolve();
});
expect(container.textContent).not.toContain("should-not-render-delegation");
expect(mockGet).not.toHaveBeenCalled();
});
it("WS push while the panel is collapsed is ignored (no churn on hidden state)", async () => {
// Bootstrap with one comm so the panel renders → close button
// accessible. Then collapse, emit a WS push, re-open: the rendered
// list must come from the re-bootstrap, NOT from the WS-push that
// arrived during the closed state. Also: nothing visible while
// closed (the collapsed button shows only the count, not summaries).
mockGet.mockResolvedValue([
{
id: "act-bootstrap",
workspace_id: "ws-1",
activity_type: "a2a_send",
source_id: "ws-1",
target_id: "ws-2",
summary: "bootstrap-summary",
status: "ok",
duration_ms: 1,
created_at: new Date().toISOString(),
},
]);
const { getByLabelText, container } = await act(async () => {
return render(<CommunicationOverlay />);
});
await act(async () => {
await Promise.resolve();
await Promise.resolve();
});
// Collapse.
const closeBtn = getByLabelText("Close communications panel");
await act(async () => {
fireEvent.click(closeBtn);
});
// Advance well past the 30s cadence — gate should suppress the tick.
// Bootstrap mock returns nothing on the re-open path so we can
// distinguish "WS push leaked through the gate" from "re-bootstrap
// refilled the list."
mockGet.mockReset();
mockGet.mockResolvedValue([]);
await act(async () => {
vi.advanceTimersByTime(60_000);
emitActivityLogged({
payload: { summary: "leaked-while-closed" },
});
});
await act(async () => {
await Promise.resolve();
});
// Closed state: rendered DOM must not show any push-derived text.
expect(container.textContent).not.toContain("leaked-while-closed");
});
it("non-ACTIVITY_LOGGED events are ignored (e.g. WORKSPACE_OFFLINE)", async () => {
const { container } = await act(async () => {
return render(<CommunicationOverlay />);
});
mockGet.mockClear();
await act(async () => {
emitSocketEvent({
event: "WORKSPACE_OFFLINE",
workspace_id: "ws-1",
timestamp: new Date().toISOString(),
payload: { summary: "should-not-render-event" },
});
});
await act(async () => {
await Promise.resolve();
});
expect(container.textContent).not.toContain("should-not-render-event");
expect(mockGet).not.toHaveBeenCalled();
});
});

View File

@ -1,8 +1,9 @@
"use client";
import { useState, useEffect, useCallback } from "react";
import { useState, useEffect, useCallback, useRef } from "react";
import { api } from "@/lib/api";
import { ConversationTraceModal } from "@/components/ConversationTraceModal";
import { useSocketEvent } from "@/hooks/useSocketEvent";
import { type ActivityEntry } from "@/types/activity";
import { useWorkspaceName } from "@/hooks/useWorkspaceName";
import { inferA2AErrorHint } from "./chat/a2aErrorHint";
@ -48,6 +49,15 @@ export function ActivityTab({ workspaceId }: Props) {
const [traceOpen, setTraceOpen] = useState(false);
const resolveName = useWorkspaceName();
// Refs let the WS handler read the latest filter / autoRefresh
// selection without re-subscribing on every state change. The bus
// listener is registered exactly once per mount via useSocketEvent's
// ref-internal pattern; subscriber-side filtering reads from these.
const filterRef = useRef(filter);
filterRef.current = filter;
const autoRefreshRef = useRef(autoRefresh);
autoRefreshRef.current = autoRefresh;
const loadActivities = useCallback(async () => {
try {
const typeParam = filter !== "all" ? `?type=${filter}` : "";
@ -66,11 +76,58 @@ export function ActivityTab({ workspaceId }: Props) {
loadActivities();
}, [loadActivities]);
useEffect(() => {
if (!autoRefresh) return;
const interval = setInterval(loadActivities, 5000);
return () => clearInterval(interval);
}, [loadActivities, autoRefresh]);
// Live-update path (issue #61 stage 3, replaces the 5s setInterval).
// ACTIVITY_LOGGED events from this workspace prepend to the rendered
// list — dedup by id so a WS push + a bootstrap reply don't
// double-render the same row.
//
// Honours the user's autoRefresh toggle: when paused, live updates
// are dropped until the user re-enables Live (or hits Refresh, which
// re-bootstraps via loadActivities).
//
// Filter awareness: matches the server-side `?type=<filter>`
// semantics so the panel doesn't show rows the user excluded.
useSocketEvent((msg) => {
if (!autoRefreshRef.current) return;
if (msg.event !== "ACTIVITY_LOGGED") return;
if (msg.workspace_id !== workspaceId) return;
const p = (msg.payload || {}) as Record<string, unknown>;
const activityType = (p.activity_type as string) || "";
const f = filterRef.current;
if (f !== "all" && activityType !== f) return;
const entry: ActivityEntry = {
id:
(p.id as string) ||
`ws-push-${msg.timestamp || Date.now()}-${msg.workspace_id}`,
workspace_id: msg.workspace_id,
activity_type: activityType,
source_id: (p.source_id as string | null) ?? null,
target_id: (p.target_id as string | null) ?? null,
method: (p.method as string | null) ?? null,
summary: (p.summary as string | null) ?? null,
request_body: (p.request_body as Record<string, unknown> | null) ?? null,
response_body:
(p.response_body as Record<string, unknown> | null) ?? null,
duration_ms: (p.duration_ms as number | null) ?? null,
status: (p.status as string) || "ok",
error_detail: (p.error_detail as string | null) ?? null,
created_at:
(p.created_at as string) ||
msg.timestamp ||
new Date().toISOString(),
};
setActivities((prev) => {
// Dedup by id — a row that arrived via the bootstrap fetch and
// also fires ACTIVITY_LOGGED from a delayed server-side hook
// must render exactly once.
if (prev.some((e) => e.id === entry.id)) return prev;
return [entry, ...prev];
});
});
return (
<div className="flex flex-col h-full">

View File

@ -0,0 +1,74 @@
# ADR-002: Local-build mode signalled by `MOLECULE_IMAGE_REGISTRY` presence
* Status: Accepted (2026-05-07)
* Issue: #63 (closes Task #194)
* Decision: Hongming (CTO) + Claude Opus 4.7 (implementation)
## Context
Pre-2026-05-06, every Molecule deployment — both production tenants and OSS contributor laptops — pulled workspace-template-* container images from `ghcr.io/molecule-ai/`. Production tenants additionally set `MOLECULE_IMAGE_REGISTRY` to an AWS ECR mirror via Railway env / EC2 user-data, but the OSS default was the upstream GHCR org.
On 2026-05-06 the `Molecule-AI` GitHub org was suspended (saved memory: `feedback_github_botring_fingerprint`). GHCR now returns **403 Forbidden** for every `molecule-ai/workspace-template-*` manifest. OSS contributors who clone `molecule-core` and run `go run ./workspace-server/cmd/server` cannot provision a workspace — every first provision fails with:
```
docker image "ghcr.io/molecule-ai/workspace-template-claude-code:latest" not found after pull attempt
```
Production tenants are unaffected (their `MOLECULE_IMAGE_REGISTRY` points at ECR, which we still control), but OSS onboarding is broken. Workspace template repos are intentionally separate from `molecule-core` (each runtime is OSS-shape and forkable), and they are mirrored to Gitea (`https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-<runtime>`) — but the provisioner has no path that consumes Gitea source directly.
## Decision
When `MOLECULE_IMAGE_REGISTRY` is **unset** (or empty), the provisioner switches to a **local-build mode** that:
1. Looks up the workspace-template repo's HEAD sha on Gitea via a single API call.
2. Checks whether a SHA-pinned local image (`molecule-local/workspace-template-<runtime>:<sha12>`) already exists; if so, reuses it.
3. Otherwise shallow-clones the repo into `~/.cache/molecule/workspace-template-build/<runtime>/<sha12>/` and runs `docker build --platform=linux/amd64 -t <tag> .`.
4. Hands the SHA-pinned tag to Docker for ContainerCreate, bypassing the registry-pull path entirely.
When `MOLECULE_IMAGE_REGISTRY` is **set**, behavior is unchanged: pull the image from that registry. Existing prod tenants and self-hosters who mirror to a private registry are not affected.
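The shipped change lives in the Go provisioner; as a sketch under assumptions (anonymous clone, `main` as the template branch, Gitea's standard branches API), the four steps above reduce to roughly:

```bash
#!/usr/bin/env bash
# Sketch of ADR-002 local-build mode; illustrative, not the shipped Go code.
set -euo pipefail
RUNTIME="${1:?usage: local-build.sh <runtime>}"    # e.g. claude-code
BASE="https://git.moleculesai.app"
REPO="molecule-ai/molecule-ai-workspace-template-${RUNTIME}"

# 1. HEAD sha via a single API call (no clone).
sha=$(curl -fsS "${BASE}/api/v1/repos/${REPO}/branches/main" | jq -r '.commit.id')
tag="molecule-local/workspace-template-${RUNTIME}:${sha:0:12}"

# 2. SHA-pinned cache hit: reuse the existing local image.
if docker image inspect "$tag" >/dev/null 2>&1; then
  echo "cache hit: $tag"; exit 0
fi

# 3. Shallow-clone and build amd64 for prod parity. (Sketch simplification:
#    a push racing between the API call and the clone would build a newer
#    HEAD under the old tag; the real provisioner can pin the checkout.)
dir="${HOME}/.cache/molecule/workspace-template-build/${RUNTIME}/${sha:0:12}"
rm -rf "$dir" && mkdir -p "$dir"
git clone --depth 1 "${BASE}/${REPO}" "$dir"
docker build --platform=linux/amd64 -t "$tag" "$dir"

# 4. The provisioner hands $tag to ContainerCreate, bypassing registry pull.
echo "built: $tag"
```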
## Consequences
### Positive
* **Zero-config OSS onboarding** — `git clone molecule-core && go run ./workspace-server/cmd/server` boots end-to-end without any registry credentials.
* **Production tenants protected** — same env var, same semantics in SaaS-mode. Migration is a no-op.
* **No new env var** — extending an existing var's semantics ("where to pull, OR build locally if absent") rather than introducing `MOLECULE_LOCAL_BUILD=1` keeps the surface small.
* **SHA-pinned cache** — repeat builds are O(API-call); only template-repo HEAD changes invalidate.
* **Production-parity image** — amd64 emulation on Apple Silicon honours `feedback_local_must_mimic_production`. The provisioner's existing `defaultImagePlatform()` already forces amd64 for parity; building amd64 locally lets that decision stay consistent.
### Negative
* **Conflates two concerns** — `MOLECULE_IMAGE_REGISTRY` now signals BOTH "where to pull" AND "build locally if absent." A future operator who unsets it expecting a hard error will instead get a slow first-provision. Documented in the runbook.
* **First-provision is slow on Apple Silicon** — 5–10 min via QEMU emulation on the cold path. Mitigated by SHA-cache (subsequent runs are <1s lookup + 0s build).
* **Coverage gap** — only 4 of 9 runtimes are mirrored to Gitea today (`claude-code`, `hermes`, `langgraph`, `autogen`). The other 5 fail with an actionable "not mirrored" error. Mirroring those repos is a separate task.
* **Implicit trust boundary** — operator running `go run` implicitly trusts `molecule-ai/molecule-ai-workspace-template-*` repos on Gitea. This is the same trust they would extend to the GHCR images today; not a new attack surface.
## Alternatives considered
1. **New env var `MOLECULE_LOCAL_BUILD=1`** — explicit, but requires OSS contributors to know it exists. Violates the zero-config goal.
2. **Push pre-built images to a Gitea container registry, mirror tag from upstream** — operationally cleaner but: (a) Gitea's container-registry add-on isn't deployed on the operator host, (b) defeats the OSS-contributor goal of "hack on the source, see your changes," since they'd still pull a stale image.
3. **Embed Dockerfiles in molecule-core itself, drop the standalone template repos** — would work but breaks the OSS-shape principle; templates are intentionally separable, anyone-can-fork artifacts.
4. **Build native arch on Apple Silicon (arm64) and drop the platform pin in local-mode** — fast, but creates `linux/arm64` images that diverge from the amd64-only prod runtime. Local-vs-prod debug behavior would diverge. Rejected per `feedback_local_must_mimic_production`.
## Security review
* **Gitea repo URL allowlist** — runtime name must be in the `knownRuntimes` allowlist (defence-in-depth against a future code path that lets cfg.Runtime carry untrusted input). Repo prefix is hardcoded to `https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-`; forks can override via `MOLECULE_LOCAL_TEMPLATE_REPO_PREFIX` (opt-in, default off).
* **Token handling** — clones are anonymous over HTTPS by default (templates are public). `MOLECULE_GITEA_TOKEN`, if set, is passed via URL userinfo for the clone and as `Authorization: token` for the API call. The token is **masked in every log line** via `maskTokenInURL` / `maskTokenInString` and never appears in the cache dir path (see the masking sketch after this list).
* **No silent fallback** — if Gitea is unreachable or the runtime isn't mirrored, we return a clear error mentioning the repo URL and the missing runtime. We **never** fall back to GHCR/ECR (that would be a confusing bug for an OSS contributor who happened to have stale ECR creds in their docker config).
* **Build-arg injection** — `docker build` is invoked with NO `--build-arg` from external input. Dockerfile is consumed as-is.
* **Cache poisoning** — cache key is the Gitea HEAD sha + Dockerfile content; a force-push to the template repo's main branch regenerates the key on next run. Cache dir is per-user (`$HOME/.cache`), so cross-user attacks aren't relevant in single-user dev mode.
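A hedged sketch of the masking property the token-handling bullet relies on; `maskTokenInURL` here is an illustrative reimplementation, not the provisioner's actual code:
```go
package main

import (
	"fmt"
	"net/url"
)

// maskTokenInURL drops any userinfo before the URL reaches a log line.
func maskTokenInURL(raw string) string {
	u, err := url.Parse(raw)
	if err != nil || u.User == nil {
		return raw
	}
	u.User = url.User("***")
	return u.String()
}

func main() {
	fmt.Println(maskTokenInURL("https://oauth2:s3cret@git.moleculesai.app/molecule-ai/repo.git"))
	// → https://***@git.moleculesai.app/molecule-ai/repo.git
}
```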
## Versioning + back-compat
* Existing prod tenants set `MOLECULE_IMAGE_REGISTRY=<ECR url>` → unchanged behavior.
* Existing local installs that set the var → unchanged behavior.
* Existing local installs that don't set it → switch to local-build path. Migration: none required (additive); first provision will take 5–10 min instead of failing.
* No deprecations.
## References
* Issue #63 — feat(workspace-server): local-dev provisioner builds from Gitea source
* Saved memory `feedback_local_must_mimic_production` — local docker must mimic prod, no bypasses
* Saved memory `reference_post_suspension_pipeline` — full post-2026-05-06 stack shape
* Saved memory `feedback_github_botring_fingerprint` — what got the org suspended

View File

@ -1,5 +1,41 @@
# Local Development
## Workspace Template Images: Local-Build Mode (Issue #63)
OSS contributors who run `molecule-core` locally do **not** need to authenticate to GHCR or AWS ECR. When the `MOLECULE_IMAGE_REGISTRY` env var is **unset**, the platform automatically:
1. Looks up the HEAD sha of `https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-<runtime>` (single API call, no clone).
2. If a local image tagged `molecule-local/workspace-template-<runtime>:<sha12>` already exists, reuses it (cache hit).
3. Otherwise, shallow-clones the repo into `~/.cache/molecule/workspace-template-build/<runtime>/<sha12>/` and runs `docker build --platform=linux/amd64 -t <tag> .`.
4. Hands the SHA-pinned tag to Docker for `ContainerCreate`.
**First-provision build time:** 5–10 min on Apple Silicon (amd64 emulation). Subsequent provisions hit the cache and start in seconds. Cache is invalidated automatically when the template repo's HEAD moves.
**Currently mirrored on Gitea:** `claude-code`, `hermes`, `langgraph`, `autogen`. Other runtimes (`crewai`, `deepagents`, `codex`, `gemini-cli`, `openclaw`) fail with an actionable "not mirrored to Gitea" error pointing at the missing repo.
**Production tenants are unaffected** — every prod tenant sets `MOLECULE_IMAGE_REGISTRY` to its private ECR mirror via Railway env / EC2 user-data, so the SaaS pull path stays identical.
### Environment overrides
| Var | Default | Use case |
|-----|---------|----------|
| `MOLECULE_IMAGE_REGISTRY` | (unset) | Set to a real registry URL to switch from local-build to SaaS-pull mode. |
| `MOLECULE_LOCAL_BUILD_CACHE` | `~/.cache/molecule/workspace-template-build` | Override cache directory. |
| `MOLECULE_LOCAL_TEMPLATE_REPO_PREFIX` | `https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-` | Point at a fork. |
| `MOLECULE_GITEA_TOKEN` | (unset) | Required only if your fork has private template repos. |
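How the overrides above compose, sketched with a hypothetical `envOr` helper and the defaults copied from the table:
```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// envOr returns the env value when set, else the documented default.
func envOr(key, def string) string {
	if v := os.Getenv(key); v != "" {
		return v
	}
	return def
}

func main() {
	home, _ := os.UserHomeDir()
	cache := envOr("MOLECULE_LOCAL_BUILD_CACHE",
		filepath.Join(home, ".cache", "molecule", "workspace-template-build"))
	prefix := envOr("MOLECULE_LOCAL_TEMPLATE_REPO_PREFIX",
		"https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-")
	fmt.Printf("cache=%s\nrepo prefix=%s\n", cache, prefix)
}
```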
### Verifying a switch from the GHCR-retag stopgap
Pre-fix, OSS contributors worked around the suspended GHCR org by manually retagging an `:latest` image. After this change, that workaround is **redundant**: simply unset `MOLECULE_IMAGE_REGISTRY` (or leave it unset), boot the platform, and provision a workspace. Logs will show:
```
Provisioner: local-build mode → using locally-built image molecule-local/workspace-template-claude-code:<sha12> for runtime claude-code
local-build: cloning https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-claude-code → ...
local-build: docker build done in <duration>
```
If you still see `ghcr.io/molecule-ai/...` in the boot log, double-check `env | grep MOLECULE_IMAGE_REGISTRY` — a stale shell export from the pre-fix workaround could keep SaaS-mode active.
## Starting the Stack
```bash

View File

@ -0,0 +1,147 @@
# Rate-limit observability runbook
> Companion to issue #64 ("RATE_LIMIT default re-tune analysis"). After
> #60 deployed the per-tenant `keyFor` keying, the right RATE_LIMIT
> default became data-dependent. This runbook documents the metrics +
> queries an operator should run to confirm whether the current 600
> req/min/key default is correct, too tight, or too loose.
## What's already exposed
The workspace-server's existing Prometheus middleware
(`workspace-server/internal/metrics/metrics.go`) tracks every request
on every path:
```
molecule_http_requests_total{method, path, status} counter
molecule_http_request_duration_seconds_total{method,path,status} counter
```
The `path` label is the matched route pattern (`/workspaces/:id/activity`, etc.), so high-cardinality workspace UUIDs do not explode the label space.
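For orientation, a hedged sketch of what that counter registration typically looks like with `prometheus/client_golang`; only the metric and label names are taken from this runbook, the rest is illustrative:
```go
package metrics

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// Illustrative registration; the real one lives in
// workspace-server/internal/metrics/metrics.go.
var httpRequestsTotal = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: "molecule_http_requests_total",
		Help: "Requests by method, matched route pattern, and status.",
	},
	[]string{"method", "path", "status"},
)
```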
The rate limiter middleware (#60, `workspace-server/internal/middleware/ratelimit.go`)
also stamps every response with `X-RateLimit-Limit`, `X-RateLimit-Remaining`,
and `X-RateLimit-Reset`. Operators with browser-side or proxy-side
header capture can read per-request bucket state directly.
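A minimal client that reads those headers back (host and path are placeholders; any route the limiter covers works):
```go
package main

import (
	"fmt"
	"net/http"
)

// One-shot probe of the bucket headers stamped on every response.
func main() {
	resp, err := http.Get("https://tenant.example.com/health")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	for _, h := range []string{"X-RateLimit-Limit", "X-RateLimit-Remaining", "X-RateLimit-Reset"} {
		fmt.Printf("%s: %s\n", h, resp.Header.Get(h))
	}
}
```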
No new instrumentation is needed for #64's acceptance criteria. The
metric surface is sufficient — this runbook just collects the queries.
## Queries to run after #60 deploys
### 1. Is the bucket actually firing 429s?
```promql
sum(rate(molecule_http_requests_total{status="429"}[5m]))
```
If this is zero for a given tenant (scope the query to that tenant's `instance` label), the bucket isn't being hit. If it's sustained > 1/min, dig in.
### 2. Which routes attract 429s?
```promql
topk(
  10,
  sum by (path) (
    rate(molecule_http_requests_total{status="429"}[5m])
  )
)
```
Expected shape post-#60:
- `/workspaces/:id/activity` should be near zero — the canvas no longer
polls it on a 30s/60s/5s cadence (PRs #69 / #71 / #76).
- Probe / health / heartbeat paths should be ~0 (those routes have a
separate IP-fallback bucket).
If `/workspaces/:id/activity` 429s persist after PRs #69/#71/#76 have
deployed, the canvas isn't running the WS-subscriber path — investigate
WS health on that tenant.
### 3. Per-bucket-key inference (no direct exposure today)
The bucket map itself is in-memory only; we deliberately do **not**
expose `org:<uuid>` ↔ remaining-tokens because that map can include
SHA-256 hashes of bearer tokens. A tenant that wants per-key visibility
should rely on response headers (`X-RateLimit-Remaining` on every
response from a given session is the bucket's view of that session).
If you genuinely need server-side per-bucket counts for triage,
file a follow-up — the proper shape is a `/internal/ratelimit-stats`
endpoint that emits **counts per key prefix only** (e.g. `org:`, `tok:`,
`ip:`), never the key payloads. Don't roll that ad-hoc; it's a security
review surface.
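To make that intended shape concrete, a sketch of prefix-only aggregation over bucket keys; this illustrates the counting rule only and is not a green light to skip the security review:
```go
package main

import (
	"fmt"
	"strings"
)

// prefixCounts aggregates bucket keys by namespace prefix ("org:",
// "tok:", "ip:") and never emits the key payloads.
func prefixCounts(keys []string) map[string]int {
	out := map[string]int{}
	for _, k := range keys {
		prefix, _, _ := strings.Cut(k, ":")
		out[prefix+":"]++
	}
	return out
}

func main() {
	fmt.Println(prefixCounts([]string{"org:uuid-a", "org:uuid-b", "tok:ab12", "ip:10.0.0.1"}))
}
```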
## Decision tree for the re-tune
After 14 days of production traffic on a tenant, look at the queries
above and walk this tree:
```
Q1: Is the 429 rate sustained > 0.1/sec on any tenant?
├─ NO  → The 600 default has comfortable headroom. Either keep it,
│        or lower it carefully (300) ONLY if you have a documented
│        reason (e.g. a misbehaving client we want to throttle harder).
│        Default to "no change" — see #64 for the math.
└─ YES → Q2.

Q2: Is the 429 rate concentrated on ONE tenant or spread across many?
├─ ONE tenant   → Operator override: set RATE_LIMIT=1200 or 1800 on that
│                 tenant's box. Document in the tenant's ops note. The
│                 default does not need to change.
└─ MANY tenants → Q3.

Q3: Are the 429s on a route that polls (e.g. /activity / /peers)?
├─ YES → Confirm PRs #69, #71, #76 have actually deployed to those
│        tenants. If they have and 429s persist, the canvas may have
│        a regression — do not raise RATE_LIMIT. File a canvas issue.
└─ NO  → 429s on mutating routes mean genuine load. Raise the default
         to 1200 in workspace-server/internal/router/router.go:54.
         Same PR should attach: the metric chart, the time window,
         and a paragraph explaining what changed in our traffic shape.
```
## Alert rule template (drop-in for Prometheus)
```yaml
# Sustained 429s — this alert is the SLO trip-wire. If it fires, walk the
# decision tree above. NB: the issue #64 acceptance criterion is "two
# weeks of metrics"; the alert is the inverse — it tells you something
# changed before the two weeks are up.
groups:
  - name: workspace-server-ratelimit
    rules:
      - alert: WorkspaceServerRateLimit429Sustained
        expr: |
          sum by (instance) (
            rate(molecule_http_requests_total{status="429"}[10m])
          ) > 0.1
        for: 30m
        labels:
          severity: warning
          owner: workspace-server
        annotations:
          summary: "{{ $labels.instance }} sustained 429s — see ratelimit-observability runbook"
          runbook: "https://git.moleculesai.app/molecule-ai/molecule-core/blob/main/docs/engineering/ratelimit-observability.md"
```
Threshold rationale: 0.1 req/s = 6/min sustained over 10min. Below
that, a 429 is almost certainly a transient burst that the canvas's
retry-once handler at `canvas/src/lib/api.ts:55` already absorbs. The
30m `for:` keeps the alert from chattering on a brief blip.
## Companion probe script
For one-off triage when an operator can reproduce the problem in their
own browser, `scripts/edge-429-probe.sh` (#62) reproduces a canvas-sized
burst against a tenant subdomain and dumps each 429's response shape so
the operator can distinguish workspace-server bucket overflow from
CF/Vercel edge rate-limiting without dashboard access.
```sh
./scripts/edge-429-probe.sh hongming.moleculesai.app --burst 80 --out /tmp/edge.txt
```
The script's report header explains how to read the output.

View File

@ -17,12 +17,23 @@
#
# Used by .github/workflows/auto-promote-stale-alarm.yml. Logic lives
# here (not inline in the workflow YAML) so we can:
# - Unit-test it with a stubbed `gh` (see test-check-stale-promote-pr.sh)
# - Unit-test it with a fixture (see test-check-stale-promote-pr.sh)
# - Run it ad-hoc by an operator: `scripts/check-stale-promote-pr.sh`
# - Reuse the same surface in any sibling workflow that needs the same
# check (SSOT — one detector, many callers).
#
# Requires: `gh` CLI, `jq`. `GH_TOKEN` env in the workflow context.
# Requires: `curl`, `jq`. `GITEA_TOKEN` (or `GITHUB_TOKEN` / `GH_TOKEN`
# for back-compat) in the workflow context. Reads `GITHUB_SERVER_URL`
# / `GITEA_API_URL` for the Gitea base, defaulting to
# https://git.moleculesai.app/api/v1.
#
# Post-2026-05-06 (Gitea migration, issue #75): the previous version
# called `gh pr list/view/comment`, all of which hit GitHub.com's
# GraphQL or /api/v3 REST shapes. Gitea exposes /api/v1/ only (no
# GraphQL → 405, no /api/v3 → 404). So this script now talks to the
# Gitea v1 API directly via curl. The fixture-driven unit tests are
# unchanged — they bypass the live fetch via PR_FIXTURE and still pass
# the historical (GitHub-shape) JSON which `detect_stale` consumes.
set -euo pipefail
@ -36,14 +47,15 @@ set -euo pipefail
# alarming. Override via env for tests + edge ops.
STALE_HOURS="${STALE_HOURS:-4}"
# Repo defaults to the current `gh` context. Tests pass --repo explicitly.
# Repo defaults to GITHUB_REPOSITORY (act_runner sets this in workflow
# context). Tests pass --repo explicitly.
REPO="${GITHUB_REPOSITORY:-}"
# Whether to post a comment to the PR. Off by default to avoid noise on
# manual ad-hoc runs; the cron workflow turns it on.
POST_COMMENT="${POST_COMMENT:-false}"
# Where to read the open-PR JSON from. Empty = call `gh` live. Tests
# Where to read the open-PR JSON from. Empty = call Gitea live. Tests
# point this at a fixture file.
PR_FIXTURE="${PR_FIXTURE:-}"
@ -51,6 +63,17 @@ PR_FIXTURE="${PR_FIXTURE:-}"
# the staleness math is deterministic.
NOW_OVERRIDE="${NOW_OVERRIDE:-}"
# Gitea API base. act_runner forwards github.server_url as
# GITHUB_SERVER_URL; for the molecule-ai fleet that's
# https://git.moleculesai.app. Append /api/v1 to get the REST root.
# Override directly via GITEA_API_URL for tests / non-default hosts.
GITEA_API_URL="${GITEA_API_URL:-${GITHUB_SERVER_URL:-https://git.moleculesai.app}/api/v1}"
# Token. Workflow context sets GITHUB_TOKEN; we accept GITEA_TOKEN as
# the explicit name and GH_TOKEN for back-compat with operator habits
# from the GitHub era. First non-empty wins.
GITEA_TOKEN="${GITEA_TOKEN:-${GITHUB_TOKEN:-${GH_TOKEN:-}}}"
while [ $# -gt 0 ]; do
case "$1" in
--repo) REPO="$2"; shift 2 ;;
@ -83,7 +106,7 @@ now_epoch() {
fi
}
# Parse RFC3339 timestamps the way GitHub emits them (e.g.
# Parse RFC3339 timestamps the way Gitea / GitHub emit them (e.g.
# "2026-05-05T23:15:00Z"). gnu-date uses -d, bsd-date uses -j -f. Cover
# both because the workflow runs on ubuntu-latest (gnu) but operators
# may run this script on macOS (bsd).
@ -106,14 +129,100 @@ to_epoch() {
# Fetch open auto-promote PRs
# -----------------------------------------------------------------------------
# Gitea v1 returns PRs with the canonical Gitea shape (number, title,
# created_at, html_url, mergeable, state). The previous GitHub-CLI
# version returned a derived `mergeStateStatus` / `reviewDecision`
# pair which only GitHub computes — Gitea doesn't expose them
# natively. Rebuild equivalents:
#
# mergeStateStatus = BLOCKED         ↔ Gitea: state==open AND mergeable==true
#                                      AND no APPROVED review yet
#                                      (i.e. branch protection is gating
#                                      the auto-merge pending an approval)
# reviewDecision = REVIEW_REQUIRED   ↔ Gitea: 0 APPROVED reviews
#
# This mirrors the SAME silent-block failure mode the GitHub version
# detected: auto-merge armed, branch protection requires 1 review,
# nobody's approved yet.
#
# Implementation: pull the open PR list base=main, then for each PR
# pull /pulls/{n}/reviews and synthesize the GitHub-shape JSON the
# rest of the script + the test fixtures consume.
fetch_prs() {
if [ -n "$PR_FIXTURE" ]; then
cat "$PR_FIXTURE"
return 0
fi
gh pr list --repo "$REPO" \
--base main --head staging --state open \
--json number,title,createdAt,mergeStateStatus,reviewDecision,url
if [ -z "$GITEA_TOKEN" ]; then
echo "::error::GITEA_TOKEN / GITHUB_TOKEN unset — cannot fetch PRs from $GITEA_API_URL" >&2
return 1
fi
local prs_json
prs_json="$(curl --fail-with-body -sS \
-H "Authorization: token ${GITEA_TOKEN}" \
-H "Accept: application/json" \
"${GITEA_API_URL}/repos/${REPO}/pulls?state=open&base=main&limit=50" \
2>/dev/null)" || {
echo "::error::Failed to fetch PRs from ${GITEA_API_URL}/repos/${REPO}/pulls" >&2
return 1
}
# Filter to head=staging (the auto-promote shape) and synthesize
# mergeStateStatus + reviewDecision per PR. Approval count via
# /pulls/{n}/reviews. Errors fall through to 0-approvals (treated
# as REVIEW_REQUIRED) preserving the existing "fail-safe — alarm if
# uncertain" semantic.
local synthesized="[]"
while IFS= read -r pr; do
[ -z "$pr" ] && continue
[ "$pr" = "null" ] && continue
local num
num="$(printf '%s' "$pr" | jq -r '.number')"
[ -z "$num" ] && continue
[ "$num" = "null" ] && continue
local approved_count
approved_count="$(curl --fail-with-body -sS \
-H "Authorization: token ${GITEA_TOKEN}" \
-H "Accept: application/json" \
"${GITEA_API_URL}/repos/${REPO}/pulls/${num}/reviews" 2>/dev/null \
| jq '[.[] | select(.state == "APPROVED" and (.dismissed // false) == false)] | length' \
2>/dev/null || echo 0)"
local mergeable
mergeable="$(printf '%s' "$pr" | jq -r '.mergeable')"
local merge_state="UNKNOWN"
local review_decision="REVIEW_REQUIRED"
if [ "$mergeable" = "true" ]; then
if [ "$approved_count" -ge 1 ]; then
merge_state="CLEAN"
review_decision="APPROVED"
else
# mergeable but no approving review — exactly the wedge state
# the alarm targets.
merge_state="BLOCKED"
review_decision="REVIEW_REQUIRED"
fi
else
# not mergeable (conflicts, behind, failed checks) — different
# failure mode, the author owns the fix; the alarm doesn't fire.
merge_state="DIRTY"
review_decision="REVIEW_REQUIRED"
fi
synthesized="$(printf '%s' "$synthesized" \
| jq -c --argjson pr "$pr" \
--arg ms "$merge_state" \
--arg rd "$review_decision" \
'. + [{
number: $pr.number,
title: $pr.title,
createdAt: $pr.created_at,
mergeStateStatus: $ms,
reviewDecision: $rd,
url: $pr.html_url
}]')"
done < <(printf '%s' "$prs_json" \
| jq -c '.[] | select(.head.ref == "staging")' 2>/dev/null)
printf '%s\n' "$synthesized"
}
# -----------------------------------------------------------------------------
@ -171,18 +280,40 @@ post_comment() {
if [ "$POST_COMMENT" != "true" ]; then
return 0
fi
if [ -z "$GITEA_TOKEN" ]; then
echo "::warning::GITEA_TOKEN unset — cannot post stale-alarm comment on PR #$pr_num" >&2
return 0
fi
# Idempotency: only one alarm comment per PR. Look for the marker
# string in existing comments before posting a new one.
# string in existing comments before posting a new one. Gitea's
# /repos/{owner}/{repo}/issues/{n}/comments returns the same shape
# for issues + PRs (PRs are issues internally on Gitea, same as
# GitHub's REST).
local existing
existing="$(gh pr view "$pr_num" --repo "$REPO" --json comments \
--jq '.comments[] | select(.body | test("scripts/check-stale-promote-pr.sh per issue #2975")) | .databaseId' \
existing="$(curl --fail-with-body -sS \
-H "Authorization: token ${GITEA_TOKEN}" \
-H "Accept: application/json" \
"${GITEA_API_URL}/repos/${REPO}/issues/${pr_num}/comments?limit=50" 2>/dev/null \
| jq -r '.[] | select(.body | test("scripts/check-stale-promote-pr.sh per issue #2975")) | .id' \
| head -n1)"
if [ -n "$existing" ]; then
echo "::notice::PR #$pr_num already has a stale-alarm comment ($existing) — not re-posting"
return 0
fi
comment_body "$age_h" | gh pr comment "$pr_num" --repo "$REPO" --body-file -
local body
body="$(comment_body "$age_h")"
if curl --fail-with-body -sS \
-X POST \
-H "Authorization: token ${GITEA_TOKEN}" \
-H "Accept: application/json" \
-H "Content-Type: application/json" \
"${GITEA_API_URL}/repos/${REPO}/issues/${pr_num}/comments" \
-d "$(jq -nc --arg b "$body" '{body: $b}')" \
>/dev/null 2>&1; then
echo "::notice::Posted stale-alarm comment on PR #$pr_num (age=${age_h}h)"
else
echo "::warning::Failed to POST stale-alarm comment on PR #$pr_num" >&2
fi
}
# -----------------------------------------------------------------------------

155
scripts/edge-429-probe.sh Executable file
View File

@ -0,0 +1,155 @@
#!/usr/bin/env bash
# edge-429-probe.sh — capture 429 origin (workspace-server vs CF/Vercel edge)
# during a simulated canvas-burst against a tenant subdomain.
#
# Issue molecule-core#62. The post-#60 verification step asks an
# operator with CF/Vercel dashboard access to confirm whether the
# layout-chunk 429s observed in DevTools were:
# (a) workspace-server bucket overflow (closes once #60 deploys), or
# (b) actual edge-layer rate-limiting (CF or Vercel).
#
# This script doesn't need dashboard access. It reproduces the burst
# pattern locally and dumps every 429's response shape so the operator
# can distinguish (a) from (b) by inspection: workspace-server emits a
# JSON body, CF emits HTML, Vercel emits a different HTML. Headers tell
# the same story (cf-ray vs x-vercel-*).
#
# Usage:
# ./scripts/edge-429-probe.sh <tenant-host> [--burst N] [--waves N] [--pause SECS] [--out file]
#
# Example:
# ./scripts/edge-429-probe.sh hongming.moleculesai.app --burst 80 --out /tmp/edge.txt
#
# The script is read-only against the target — it only issues GETs to
# public-by-design endpoints. No mutating requests, no credential use.
set -euo pipefail
# ── Help / usage handling first, before positional capture ────────────────────
case "${1:-}" in
-h|--help|"")
sed -n '/^# edge-429-probe.sh/,/^$/p' "$0" | sed 's/^# \{0,1\}//'
exit 0
;;
esac
HOST="$1"; shift
BURST=80
WAVES=3
WAVE_PAUSE=2
OUT=""
while [ "${1:-}" != "" ]; do
case "$1" in
--burst) BURST="$2"; shift 2 ;;
--waves) WAVES="$2"; shift 2 ;;
--pause) WAVE_PAUSE="$2"; shift 2 ;;
--out) OUT="$2"; shift 2 ;;
-h|--help)
sed -n '/^# edge-429-probe.sh/,/^$/p' "$0" | sed 's/^# \{0,1\}//'
exit 0
;;
*) echo "unknown arg: $1" >&2; exit 2 ;;
esac
done
# ── Endpoint discovery ────────────────────────────────────────────────────────
echo "→ Discovering a layout-chunk URL from canvas root..." >&2
ROOT_BODY=$(curl -fsSL --max-time 10 "https://${HOST}/" 2>/dev/null || true)
LAYOUT_PATH=$(echo "$ROOT_BODY" \
| grep -oE '/_next/static/chunks/layout-[A-Za-z0-9_-]+\.js' \
| head -1 || true)
if [ -z "$LAYOUT_PATH" ]; then
LAYOUT_PATH="/_next/static/chunks/layout-probe-not-found.js"
echo " (no layout chunk discovered — using sentinel path; 404 on this is expected)" >&2
else
echo " layout chunk: $LAYOUT_PATH" >&2
fi
# Probe URL: a generic activity endpoint. The rate-limiter middleware
# runs BEFORE workspace-id validation, so unauth/invalid-id requests
# still hit the bucket.
ACTIVITY_PATH="/workspaces/00000000-0000-0000-0000-000000000000/activity?probe=edge-429"
# ── Fire one curl per request, one single-line parseable record each ─────────
# Requests run as background jobs inside the run_burst function (reaped
# with `wait`) rather than via xargs, so the function-export pitfalls
# (some shells lose `export -f` across xargs) don't apply. Records land
# in a temp file; failed curls emit a curl_err record so request volume
# is preserved.
TMP_RESULTS="$(mktemp -t edge-429-probe.XXXXXX)"
trap 'rm -f "$TMP_RESULTS"' EXIT
run_burst() {
# $1 = path; $2 = label; $3 = wave_id
local path="$1" label="$2" wave="$3"
local i
for i in $(seq 1 "$BURST"); do
{
out=$(curl -sS --max-time 10 -o /dev/null \
-w 'status=%{http_code} size=%{size_download} time=%{time_total} server=%header{server} cf_ray=%header{cf-ray} x_vercel=%header{x-vercel-id} retry_after=%header{retry-after} content_type=%header{content-type} x_ratelimit_limit=%header{x-ratelimit-limit} x_ratelimit_remaining=%header{x-ratelimit-remaining} x_ratelimit_reset=%header{x-ratelimit-reset}\n' \
"https://${HOST}${path}" 2>/dev/null) || out="status=curl_err"
printf 'label=%s-%s-%s %s\n' "$label" "$wave" "$i" "$out" >> "$TMP_RESULTS"
} &
done
wait
}
emit() {
if [ -n "$OUT" ]; then
printf '%s\n' "$*" >> "$OUT"
else
printf '%s\n' "$*"
fi
}
if [ -n "$OUT" ]; then : > "$OUT"; fi
emit "# edge-429-probe report"
emit "# host=$HOST burst=$BURST waves=$WAVES pause=${WAVE_PAUSE}s"
emit "# layout_path=$LAYOUT_PATH"
emit "# activity_path=$ACTIVITY_PATH"
emit "# generated=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
emit ""
for wave in $(seq 1 "$WAVES"); do
emit "## wave $wave"
: > "$TMP_RESULTS"
run_burst "$LAYOUT_PATH" "layout" "$wave"
run_burst "$ACTIVITY_PATH" "activity" "$wave"
while read -r line; do
emit " $line"
done < "$TMP_RESULTS"
if [ "$wave" -lt "$WAVES" ]; then
sleep "$WAVE_PAUSE"
fi
done
emit ""
emit "## summary — how to read the report"
emit "# status=429 + content_type starts with application/json + x_ratelimit_limit set"
emit "# => workspace-server bucket overflow. Closes when #60 deploys."
emit "# status=429 + cf_ray set + content_type=text/html"
emit "# => Cloudflare WAF / rate-limit. Audit dashboard rules per #62."
emit "# status=429 + x_vercel set + content_type=text/html"
emit "# => Vercel edge / Bot Fight Mode. Audit Vercel project per #62."
emit "# status=429 with no server/cf_ray/x_vercel"
emit "# => corporate proxy or VPN. Not actionable in this repo."
if [ -n "$OUT" ]; then
echo "→ Report written to $OUT" >&2
# Match only data lines (begin with two-space indent + "label="),
# not the summary's reference text which also mentions "status=429".
# grep -c outputs "0" + exits 1 when zero matches; `|| true` masks
# the exit status so set -e doesn't trip without losing the count.
total=$(grep -c '^ label=' "$OUT" 2>/dev/null || true)
total429=$(grep -c '^ label=.*status=429' "$OUT" 2>/dev/null || true)
total=${total:-0}
total429=${total429:-0}
echo "→ Totals: ${total429} of ${total} requests returned 429" >&2
if [ "${total429}" -gt 0 ]; then
echo "→ Per-label 429 counts:" >&2
grep '^ label=.*status=429' "$OUT" \
| sed -E 's/^ label=([^-]+).*/ \1/' \
| sort | uniq -c >&2
fi
fi

View File

@ -19,9 +19,15 @@ Exit codes:
0 no collisions
1 collision detected; output names the conflicting PR(s) for the author
Designed to run from a GitHub Actions PR check. Reads PR metadata via the
GitHub CLI (gh) which is preinstalled on ubuntu-latest runners. Runs in
under 10s against a typical PR.
Designed to run from a Gitea Actions PR check. Reads PR metadata via direct
HTTP calls to Gitea's REST API (`/api/v1/`), which on the molecule-ai fleet
lives at https://git.moleculesai.app. Runs in under 10s against a typical PR.
Post-2026-05-06 (Gitea migration, issue #75): the previous version called
the GitHub CLI (``gh pr list``, ``gh pr diff``). On Gitea those calls hit
either the GraphQL endpoint (HTTP 405) or /api/v3 (HTTP 404). This module
now talks to /api/v1 directly via urllib so it works against any Gitea
host without a `gh` install or extra dependencies.
"""
from __future__ import annotations
@ -31,12 +37,70 @@ import os
import re
import subprocess
import sys
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
MIGRATIONS_DIR = "workspace-server/migrations"
MIGRATION_FILE_RE = re.compile(r"^(\d+)_[^/]+\.(up|down)\.sql$")
def _gitea_api_url() -> str:
    """Resolve the Gitea API base URL.

    act_runner forwards github.server_url as GITHUB_SERVER_URL; for the
    molecule-ai fleet that's https://git.moleculesai.app. Append /api/v1
    to get the REST root. Override directly via GITEA_API_URL for tests
    or non-default hosts.
    """
    env_override = os.environ.get("GITEA_API_URL", "").rstrip("/")
    if env_override:
        return env_override
    server = os.environ.get("GITHUB_SERVER_URL", "https://git.moleculesai.app").rstrip("/")
    return f"{server}/api/v1"


def _gitea_token() -> str:
    """Resolve the Gitea token from env. GITEA_TOKEN wins; falls back
    to GITHUB_TOKEN (set by act_runner) and GH_TOKEN (operator habit
    from the GitHub era)."""
    return (
        os.environ.get("GITEA_TOKEN")
        or os.environ.get("GITHUB_TOKEN")
        or os.environ.get("GH_TOKEN")
        or ""
    )


def _gitea_get(path: str, params: dict[str, str] | None = None) -> bytes | None:
    """GET against /api/v1; returns response body or None on HTTP error.

    Errors return None (not raise) because callers handle missing data
    by emitting an actionable workflow message rather than crashing the
    PR check on a transient API blip.
    """
    base = _gitea_api_url()
    qs = ""
    if params:
        qs = "?" + urllib.parse.urlencode(params)
    url = f"{base}/{path.lstrip('/')}{qs}"
    req = urllib.request.Request(url)
    token = _gitea_token()
    if token:
        req.add_header("Authorization", f"token {token}")
    req.add_header("Accept", "application/json")
    try:
        with urllib.request.urlopen(req, timeout=20) as resp:  # noqa: S310
            return resp.read()
    except urllib.error.HTTPError as e:
        sys.stderr.write(f"Gitea API HTTP {e.code} on {path}: {e.reason}\n")
        return None
    except (urllib.error.URLError, TimeoutError) as e:
        sys.stderr.write(f"Gitea API network error on {path}: {e}\n")
        return None
def run(cmd: list[str], check: bool = True) -> str:
"""Run a subprocess and return stdout. Raise on non-zero when check=True."""
result = subprocess.run(cmd, capture_output=True, text=True)
@ -96,32 +160,49 @@ def open_prs_with_migration_prefix(
repo: str, prefix: int, exclude_pr: int
) -> list[dict]:
"""Return open PRs (other than `exclude_pr`) that add a migration with
`prefix`. Uses `gh pr diff` per PR we only need to walk PRs that are
actually in flight, so the cost is bounded by open-PR count.
`prefix`. Walks open PRs via Gitea's `/repos/{owner}/{repo}/pulls` and
pulls each one's changed-file list via `/pulls/{n}/files`. The cost is
bounded by open-PR count, which is small (<100) on this repo. The
return shape mimics the GitHub CLI's `--json number,headRefName`:
``[{"number": int, "headRefName": str}, ...]``.
"""
out = run([
"gh", "pr", "list", "--repo", repo, "--state", "open",
"--json", "number,headRefName", "--limit", "100",
])
prs = json.loads(out)
body = _gitea_get(
f"repos/{repo}/pulls",
{"state": "open", "limit": "50"},
)
if body is None:
# Best-effort: a transient Gitea blip shouldn't fail the PR
# check (the base-branch collision check runs locally and is
# the more common failure mode).
return []
prs = json.loads(body)
matches: list[dict] = []
for pr in prs:
num = pr["number"]
if num == exclude_pr:
continue
try:
files = run([
"gh", "pr", "diff", str(num), "--repo", repo, "--name-only",
], check=False)
except Exception: # noqa: BLE001
# Gitea returns the head ref under .head.ref (REST shape);
# GitHub CLI's --json headRefName flattens it. Normalize on
# the way out so callers see the historical shape.
head_ref_name = (pr.get("head") or {}).get("ref", "")
files_body = _gitea_get(f"repos/{repo}/pulls/{num}/files", {"limit": "100"})
if files_body is None:
continue
for raw in files.splitlines():
try:
files = json.loads(files_body)
except json.JSONDecodeError:
continue
for f in files:
# Gitea's /pulls/{n}/files returns objects with `.filename`
# (same as GitHub's REST). Older Gitea versions emit
# `.name` instead — handle both.
raw = f.get("filename") or f.get("name") or ""
path = Path(raw.strip())
if not path.name:
continue
m = MIGRATION_FILE_RE.match(path.name)
if m and int(m.group(1)) == prefix:
matches.append(pr)
matches.append({"number": num, "headRefName": head_ref_name})
break
return matches
@ -138,7 +219,10 @@ def main() -> int:
pr_number = int(pr_number_env)
base_ref = os.environ.get("BASE_REF", "origin/staging")
head_ref = os.environ.get("HEAD_REF", "HEAD")
repo = os.environ.get("GITHUB_REPOSITORY", "Molecule-AI/molecule-core")
# Default kept lowercase to match the Gitea-canonical org name
# (post-2026-05-06 migration). Tests + workflow context override
# via GITHUB_REPOSITORY which act_runner sets per-run.
repo = os.environ.get("GITHUB_REPOSITORY", "molecule-ai/molecule-core")
added = migrations_in_diff(base_ref, head_ref)
if not added:

View File

@ -0,0 +1,252 @@
#!/usr/bin/env bash
# tools/branch-protection/check_name_parity.sh — assert every required-
# check name listed in apply.sh maps to a workflow job whose "always
# emits this status" shape is intact.
#
# Closes #144 / encodes the saved memory
# feedback_branch_protection_check_name_parity:
#
# "Path filters (e.g., detect-changes → conditional skip) silently
# break branch protection because no job emits the protected
# sentinel status when path-filter returns false."
#
# Two safe shapes for a required-check job:
#
# 1. Single-job-with-per-step-if (path-filter case):
# The workflow has NO top-level `paths:` filter; the always-running
# job has steps gated on `if: needs.<gate>.outputs.<flag> == 'true'`
# so the no-op step alone fires when paths exclude the commit.
# Used by ci.yml's Platform/Canvas/Python/Shellcheck and by
# e2e-api.yml / e2e-staging-canvas.yml / runtime-prbuild-compat.yml.
#
# 2. Aggregator-with-needs+always() (matrix-refactor case):
# An aggregator job named after the protected check `needs:` the
# matrix children + uses `if: always()` + checks each child's
# result. (Not currently in this repo but supported.)
#
# Unsafe shape this script catches:
# - Workflow has top-level `paths:` filter AND the protected check
# name is on a single job. When paths-filter excludes a commit, the
# workflow doesn't fire — branch protection waits forever.
#
# Exit codes:
# 0 — every required check name has at least one safe-shape match
# 1 — a required name has no match OR matches an unsafe shape
# 2 — script-internal error (apply.sh missing, awk failure, etc.)
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
WORKFLOWS_DIR="$REPO_ROOT/.github/workflows"
APPLY_SH="$SCRIPT_DIR/apply.sh"
if [[ ! -f "$APPLY_SH" ]]; then
echo "check_name_parity: missing apply.sh at $APPLY_SH" >&2
exit 2
fi
if [[ ! -d "$WORKFLOWS_DIR" ]]; then
echo "check_name_parity: missing .github/workflows at $WORKFLOWS_DIR" >&2
exit 2
fi
# ─── Extract the union of required check names from apply.sh ──────
# apply.sh has STAGING_CHECKS and MAIN_CHECKS heredocs; union them so
# we audit any name that gates EITHER branch. Filters out blank lines
# and the heredoc end marker. Sorted + uniq so the audit output is stable.
#
# Captures the heredoc end-marker dynamically from the `<<'MARKER'`
# token on the opening line — the token can be `EOF` (production
# apply.sh), `EOF2` (test fixtures with nested heredocs), or any other
# bash-legal identifier. Without dynamic extraction, test fixtures
# with nested heredocs would either skip-capture (wrong end marker)
# or capture the inner end marker as a stray check name.
#
# Two-step approach to keep awk-portable across BSD awk (macOS) and
# gawk (Linux): grep finds the heredoc-opening lines, sed extracts the
# marker, then awk does the capture. Pure-awk attempts hit BSD-vs-GNU
# regex/variable-init differences that regress silently — this shape
# stays in POSIX-portable territory.
extract_heredoc_block() {
local file="$1"
local marker="$2"
awk -v marker="$marker" '
$0 ~ "<<.?" marker { capture=1; next }
$0 == marker && capture { capture=0; next }
capture && NF { print }
' "$file"
}
# Find every heredoc-end marker used in apply.sh (typically just EOF
# in the production script, but EOF2 / TAG / ABC are all valid in
# fixtures or future expansions). Each marker maps to one or more
# heredoc blocks; we union all of them.
markers=$(grep -E "<<['\"]?[A-Za-z0-9_]+['\"]?[[:space:]]*\\|\\|" "$APPLY_SH" \
| sed -E "s/.*<<['\"]?([A-Za-z0-9_]+)['\"]?.*/\\1/" \
| sort -u)
required_names=""
while IFS= read -r marker; do
[[ -z "$marker" ]] && continue
block=$(extract_heredoc_block "$APPLY_SH" "$marker")
if [[ -n "$block" ]]; then
required_names+="$block"$'\n'
fi
done <<< "$markers"
required_names=$(printf '%s' "$required_names" | sort -u | sed '/^$/d')
if [[ -z "$required_names" ]]; then
echo "check_name_parity: failed to extract required check names from apply.sh" >&2
exit 2
fi
# ─── For each required name, find the workflow file that owns it ──
# A workflow "owns" a name if any `name:` line in the file equals the
# required name. We look at job-level names AND the workflow-level
# `name:` (the latter prefixes "Analyze" jobs in codeql.yml).
#
# Then we check whether the owning workflow has a top-level `paths:`
# filter. The unsafe shape is:
# - top-level paths: filter present
# - AND the named job is gated only at the workflow level (no per-
# step `if:` gates)
#
# Distinguishing "no `paths:` filter" from "paths: filter + per-step
# gating" requires parsing the YAML semantics. We do it heuristically:
#
# - "no top-level paths:" → safe by construction (workflow always
# fires)
# - "paths: present" → check that the matching job has at
# least one `if: needs.<x>.outputs`
# step gate. If yes, that's the
# single-job-with-per-step-if shape.
# If no, flag as unsafe.
#
# Heuristic so it stays a portable bash + awk + grep tool — full YAML
# parsing would need yq which isn't a dependency. The known unsafe
# shape (workflow-level paths: AND no per-step if-gates) is what we're
# trying to catch.
failed=0
declare -a unsafe_findings=()
while IFS= read -r name; do
[[ -z "$name" ]] && continue
# Find every workflow file that contains a job with `name: <name>` or
# whose top-level workflow `name:` plus matrix substitution would
# produce <name>. Need to be careful about quoting — YAML allows
# `name: Foo`, `name: "Foo"`, `name: 'Foo'`. Strip quotes.
matches=()
while IFS= read -r f; do
# Look for an exact `name:` match (anywhere in the file). The
# workflow-level name line is at column 0; job-level names are
# indented. Either is acceptable for parity — what matters is
# whether the EMITTED check-run name is the one we required.
# Strip surrounding quotes/whitespace before comparing.
if awk -v want="$name" '
/^[[:space:]]*name:[[:space:]]*/ {
line = $0
sub(/^[[:space:]]*name:[[:space:]]*/, "", line)
# Strip surrounding " or '\''
gsub(/^["\047]|["\047]$/, "", line)
# Strip trailing whitespace + comment
sub(/[[:space:]]*#.*$/, "", line)
sub(/[[:space:]]+$/, "", line)
if (line == want) found = 1
}
END { exit !found }
' "$f"; then
matches+=("$f")
fi
done < <(find "$WORKFLOWS_DIR" -name '*.yml' -o -name '*.yaml')
if [[ ${#matches[@]} -eq 0 ]]; then
# Special case — Analyze (go/javascript-typescript/python) is
# generated by codeql.yml's matrix expansion of `Analyze (${{
# matrix.language }})`. Don't flag those as missing if codeql.yml
# exists with the expected base name.
case "$name" in
"Analyze (go)"|"Analyze (javascript-typescript)"|"Analyze (python)")
# shellcheck disable=SC2016
# The literal `${{ matrix.language }}` is the GHA template
# syntax we're searching FOR — not a shell expansion. SC2016
# would have us add quotes that defeat the search.
if [[ -f "$WORKFLOWS_DIR/codeql.yml" ]] && \
grep -q 'name: Analyze (${{[[:space:]]*matrix.language[[:space:]]*}})' "$WORKFLOWS_DIR/codeql.yml"; then
matches=("$WORKFLOWS_DIR/codeql.yml")
fi
;;
esac
fi
if [[ ${#matches[@]} -eq 0 ]]; then
unsafe_findings+=("MISSING: required check name '$name' has no matching workflow job")
failed=1
continue
fi
# For each owning workflow, classify safe vs unsafe.
for f in "${matches[@]}"; do
rel="${f#"$REPO_ROOT"/}"
# Heuristic: does the workflow have a top-level `paths:` filter?
# Top-level here means under the `on:` key, not under jobs.<x>.if.
# Workflow-level paths filters appear at indent depth 4 (under
# `push:` or `pull_request:`). Job-level `if:` paths-filter doesn't
# block the workflow from firing.
has_top_paths=0
if awk '
# Track whether we are inside the `on:` block. The `on:` block
# starts at column 0 (`on:` key) and ends when the next column-0
# key appears.
/^on:[[:space:]]*$/ { in_on = 1; next }
/^[a-zA-Z]/ && in_on { in_on = 0 }
in_on && /^[[:space:]]+paths:[[:space:]]*$/ { print "yes"; exit }
in_on && /^[[:space:]]+paths:[[:space:]]*\[/ { print "yes"; exit }
' "$f" | grep -q yes; then
has_top_paths=1
fi
if [[ "$has_top_paths" -eq 0 ]]; then
# Safe: workflow always fires. If there are inner per-step if-
# gates (single-job-with-per-step-if pattern), the no-op step
# produces SUCCESS for the protected name — branch-protection-clean.
continue
fi
# Unsafe candidate — has top-level paths: AND we need to verify
# the per-step if-gate pattern is absent. Look for any `if:`
# referencing a paths-filter / detect-changes output inside the
# owning job's body. If at least one is present, classify as the
# single-job-with-per-step-if pattern (safe).
#
# The regex is intentionally anchored loosely — actual workflow
# YAML writes per-step if-gates as ` - if: needs.X.outputs.Y`
# (with the `-` step-marker between the leading spaces and the
# `if`). Anchoring on `^[[:space:]]+if:` would miss those.
if grep -qE "if:[[:space:]]+needs\.[a-zA-Z_-]+\.outputs\." "$f"; then
# Per-step if-gates exist. Combined with top-level paths: this
# would be a buggy mix (the workflow might still skip entirely
# when paths exclude). Flag as unsafe — the safe pattern omits
# the top-level paths: filter altogether and gates per-step.
unsafe_findings+=("UNSAFE-MIX: $rel has top-level paths: AND per-step if-gates — when paths exclude the commit, the workflow doesn't fire and the required check '$name' is silently absent. Drop the top-level paths: filter; keep the per-step if-gates.")
failed=1
else
# Top-level paths: with no per-step if-gates: the canonical
# check-name parity bug.
unsafe_findings+=("UNSAFE-PATH-FILTER: $rel has top-level paths: filter and no per-step if-gates. When paths exclude the commit, no job emits the required check '$name' — branch protection waits forever. Either drop the paths: filter and add per-step if-gates against a detect-changes output, or add an aggregator-with-needs+always() job that emits '$name'.")
failed=1
fi
done
done <<< "$required_names"
if [[ "$failed" -eq 0 ]]; then
echo "check_name_parity: OK — every required check name maps to a safe workflow shape."
exit 0
fi
echo "check_name_parity: FOUND $((${#unsafe_findings[@]})) issue(s):" >&2
for finding in "${unsafe_findings[@]}"; do
echo " - $finding" >&2
done
exit 1

View File

@ -0,0 +1,285 @@
#!/usr/bin/env bash
# tools/branch-protection/test_check_name_parity.sh — unit tests for
# check_name_parity.sh.
#
# Builds synthetic apply.sh + workflow files in a tmpdir for each case,
# invokes the script with REPO_ROOT pointing at the tmpdir, and asserts
# on exit code + stderr. Per feedback_assert_exact_not_substring we
# pin the EXACT exit code AND a substring of the stderr that names the
# offending workflow + name combo — so a "false-pass that prints the
# wrong message" still fails the test.
#
# Run locally: bash tools/branch-protection/test_check_name_parity.sh
# Run in CI: same — added to ci.yml's shellcheck job's "E2E bash unit
# tests" step alongside test_model_slug.sh.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPT_UNDER_TEST="$SCRIPT_DIR/check_name_parity.sh"
if [[ ! -x "$SCRIPT_UNDER_TEST" ]]; then
echo "test_check_name_parity: script under test missing or not executable: $SCRIPT_UNDER_TEST" >&2
exit 2
fi
PASSED=0
FAILED=0
# Tracks the active tmpdir for the running case so the trap can clean
# up even when assertions abort the case mid-flight.
TMPDIR_FOR_CASE=""
trap '[[ -n "$TMPDIR_FOR_CASE" && -d "$TMPDIR_FOR_CASE" ]] && rm -rf "$TMPDIR_FOR_CASE"' EXIT
# Build a synthetic repo at $1 with apply.sh listing $2 (one name per
# line) as the staging required set + zero main required, then write
# whatever .github/workflows/* files the test case adds.
make_fake_repo() {
local root="$1"
local checks="$2"
mkdir -p "$root/tools/branch-protection"
mkdir -p "$root/.github/workflows"
cat > "$root/tools/branch-protection/apply.sh" <<EOF
#!/usr/bin/env bash
# Stub apply.sh — only the heredoc-shaped check lists matter for the
# parity script. Other functions intentionally absent.
read -r -d '' STAGING_CHECKS <<'EOF2' || true
$checks
EOF2
read -r -d '' MAIN_CHECKS <<'EOF2' || true
$checks
EOF2
EOF
chmod +x "$root/tools/branch-protection/apply.sh"
# Place the script-under-test alongside its sibling apply.sh so the
# script's REPO_ROOT walk finds the synthetic .github/workflows/.
cp "$SCRIPT_UNDER_TEST" "$root/tools/branch-protection/check_name_parity.sh"
}
run_case() {
local desc="$1"
local checks="$2"
local workflow_yaml="$3" # contents to write
local workflow_filename="$4"
local expected_exit="$5"
local expected_stderr_substring="$6"
TMPDIR_FOR_CASE=$(mktemp -d)
make_fake_repo "$TMPDIR_FOR_CASE" "$checks"
printf '%s' "$workflow_yaml" > "$TMPDIR_FOR_CASE/.github/workflows/$workflow_filename"
local stderr_file
stderr_file=$(mktemp)
local actual_exit=0
bash "$TMPDIR_FOR_CASE/tools/branch-protection/check_name_parity.sh" 2>"$stderr_file" >/dev/null || actual_exit=$?
local stderr_content
stderr_content=$(cat "$stderr_file")
rm "$stderr_file"
if [[ "$actual_exit" -ne "$expected_exit" ]]; then
echo "FAIL: $desc"
echo " expected exit: $expected_exit, got: $actual_exit"
echo " stderr: $stderr_content"
FAILED=$((FAILED+1))
rm -rf "$TMPDIR_FOR_CASE"; TMPDIR_FOR_CASE=""
return
fi
# Empty expected substring → no assertion on stderr (used for the
# passing case where stderr should be empty / not interesting).
if [[ -n "$expected_stderr_substring" ]]; then
if ! grep -qF "$expected_stderr_substring" <<< "$stderr_content"; then
echo "FAIL: $desc"
echo " expected stderr to contain: '$expected_stderr_substring'"
echo " actual stderr: $stderr_content"
FAILED=$((FAILED+1))
rm -rf "$TMPDIR_FOR_CASE"; TMPDIR_FOR_CASE=""
return
fi
fi
echo "PASS: $desc"
PASSED=$((PASSED+1))
rm -rf "$TMPDIR_FOR_CASE"; TMPDIR_FOR_CASE=""
}
# Case 1: safe workflow — no top-level paths: filter, single job
# emitting the required name. Should exit 0.
run_case "safe: no paths filter, job emits required name" \
"Foo Build" \
"$(cat <<'EOF'
name: Foo
on:
push:
branches: [main]
pull_request:
jobs:
foo:
name: Foo Build
runs-on: ubuntu-latest
steps:
- run: echo ok
EOF
)" \
"foo.yml" \
0 \
""
# Case 2: unsafe — top-level paths: filter AND no per-step if-gates.
# This is the silent-block shape from the saved memory.
run_case "unsafe: top-level paths: filter without per-step if-gates" \
"Bar Build" \
"$(cat <<'EOF'
name: Bar
on:
push:
branches: [main]
paths:
- 'bar/**'
pull_request:
paths:
- 'bar/**'
jobs:
bar:
name: Bar Build
runs-on: ubuntu-latest
steps:
- run: echo ok
EOF
)" \
"bar.yml" \
1 \
"UNSAFE-PATH-FILTER"
# Case 3: required name has no emitter at all.
run_case "missing: required name not in any workflow" \
"Nonexistent Job" \
"$(cat <<'EOF'
name: Other
on:
pull_request:
jobs:
other:
name: Other Job
runs-on: ubuntu-latest
steps:
- run: echo ok
EOF
)" \
"other.yml" \
1 \
"MISSING: required check name 'Nonexistent Job'"
# Case 4: safe — top-level paths: filter is absent BUT per-step if-
# gates are present (single-job-with-per-step-if pattern, what
# ci.yml + e2e-api.yml use). Should exit 0.
run_case "safe: per-step if-gates without top-level paths" \
"Baz Build" \
"$(cat <<'EOF'
name: Baz
on:
push:
branches: [main]
pull_request:
jobs:
changes:
name: Detect changes
runs-on: ubuntu-latest
outputs:
baz: ${{ steps.check.outputs.baz }}
steps:
- id: check
run: echo "baz=true" >> "$GITHUB_OUTPUT"
baz:
needs: changes
name: Baz Build
runs-on: ubuntu-latest
steps:
- if: needs.changes.outputs.baz != 'true'
run: echo no-op
- if: needs.changes.outputs.baz == 'true'
run: echo real work
EOF
)" \
"baz.yml" \
0 \
""
# Case 5: unsafe-mix — top-level paths: AND per-step if-gates. The
# script flags this distinctly because the workflow may STILL skip
# entirely when paths exclude the commit (the per-step gates only
# matter if the workflow actually fires).
run_case "unsafe-mix: top-level paths: AND per-step if-gates" \
"Qux Build" \
"$(cat <<'EOF'
name: Qux
on:
push:
branches: [main]
paths:
- 'qux/**'
pull_request:
paths:
- 'qux/**'
jobs:
changes:
name: Detect changes
runs-on: ubuntu-latest
outputs:
qux: ${{ steps.check.outputs.qux }}
steps:
- id: check
run: echo "qux=true" >> "$GITHUB_OUTPUT"
qux:
needs: changes
name: Qux Build
runs-on: ubuntu-latest
steps:
- if: needs.changes.outputs.qux == 'true'
run: echo build
EOF
)" \
"qux.yml" \
1 \
"UNSAFE-MIX"
# Case 6: codeql.yml matrix — required names like "Analyze (go)" are
# generated by `Analyze (${{ matrix.language }})`. Script must
# special-case match this pattern.
run_case "matrix: codeql Analyze (go) is recognised via matrix expansion" \
"$(printf 'Analyze (go)\nAnalyze (javascript-typescript)\nAnalyze (python)')" \
"$(cat <<'EOF'
name: CodeQL
on:
pull_request:
jobs:
analyze:
name: Analyze (${{ matrix.language }})
runs-on: ubuntu-latest
strategy:
matrix:
language: [go, javascript-typescript, python]
steps:
- run: echo analyse
EOF
)" \
"codeql.yml" \
0 \
""
echo ""
echo "================================================"
echo "test_check_name_parity: $PASSED passed, $FAILED failed"
echo "================================================"
exit "$FAILED"

View File

@ -5,13 +5,15 @@ import (
"context"
"net/http"
"strconv"
"strings"
"sync"
"time"
"github.com/gin-gonic/gin"
)
// RateLimiter implements a simple token bucket rate limiter per IP.
// RateLimiter implements a token bucket rate limiter keyed by tenant
// identity (org id, then bearer token, then client IP — see keyFor).
type RateLimiter struct {
mu sync.Mutex
buckets map[string]*bucket
@ -42,9 +44,9 @@ func NewRateLimiter(rate int, interval time.Duration, ctx context.Context) *Rate
case <-ticker.C:
rl.mu.Lock()
cutoff := time.Now().Add(-10 * time.Minute)
for ip, b := range rl.buckets {
for k, b := range rl.buckets {
if b.lastReset.Before(cutoff) {
delete(rl.buckets, ip)
delete(rl.buckets, k)
}
}
rl.mu.Unlock()
@ -54,29 +56,73 @@ func NewRateLimiter(rate int, interval time.Duration, ctx context.Context) *Rate
return rl
}
// Middleware returns a Gin middleware that rate limits by client IP.
// keyFor returns the bucket identifier for this request. Priority:
//
// 1. X-Molecule-Org-Id header — when present (CP-routed SaaS traffic),
// isolates tenants from each other regardless of the upstream proxy IP
// they all share.
// 2. SHA-256 of Authorization Bearer token — when present (per-workspace
// bearer, ADMIN_TOKEN, org-scoped API token). On a per-tenant Caddy
// box where the org-id header isn't attached, this still distinguishes
// distinct user sessions on the same egress IP.
// 3. ClientIP() — anonymous probes, /health scrapes, registry boot
// signals (when SetTrustedProxies(nil) is in effect, this is the
// direct TCP RemoteAddr — fine for the probe surface, not fine as a
// primary key behind a proxy, hence the priority order above).
//
// Mixing these namespaces is fine because they never collide: org ids
// are UUIDs ("org:..."), token hashes are 64-char hex ("tok:..."), IPs
// contain dots/colons ("ip:...").
//
// Security note on X-Molecule-Org-Id spoofing: the rate limiter runs
// BEFORE TenantGuard, so the org-id value here is unvalidated. A caller
// reaching workspace-server directly could spoof the header to drain
// another org's bucket. In production this surface is closed by the
// CP/Caddy front: tenant SGs reject :8080 from the public internet, and
// CP rewrites the header to the verified org. If a future deployment
// exposes :8080 directly, validate the org-id (e.g. against
// MOLECULE_ORG_ID) before keying on it, or move this middleware after
// TenantGuard. The token-hash and IP fallbacks are unspoofable.
//
// Issue #59 — replaces the previous IP-only keying that silently
// collapsed all canvas traffic into one bucket once #179 disabled
// proxy-header trust. See the issue for the deployment-shape analysis.
func (rl *RateLimiter) keyFor(c *gin.Context) string {
if orgID := strings.TrimSpace(c.GetHeader("X-Molecule-Org-Id")); orgID != "" {
return "org:" + orgID
}
if tok := bearerFromHeader(c.GetHeader("Authorization")); tok != "" {
return "tok:" + tokenKey(tok)
}
return "ip:" + c.ClientIP()
}
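// bearerFromHeader and tokenKey are defined elsewhere in this package.
// For orientation, a hypothetical sketch of their shape (assumption, not
// the actual implementations):
//
//	func bearerFromHeader(h string) string {
//		if rest, ok := strings.CutPrefix(h, "Bearer "); ok {
//			return strings.TrimSpace(rest)
//		}
//		return ""
//	}
//
//	func tokenKey(tok string) string {
//		return fmt.Sprintf("%x", sha256.Sum256([]byte(tok)))
//	}
//
// The sha256-hex form matches what the keyFor tests pin (see
// TestKeyFor_BearerTokenWhenNoOrgId).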
// Middleware returns a Gin middleware that rate limits per caller. The
// caller-key derivation lives in keyFor — see that function's doc for
// the priority list and rationale.
func (rl *RateLimiter) Middleware() gin.HandlerFunc {
return func(c *gin.Context) {
// Tier-1b dev-mode hatch — same gate as AdminAuth / WorkspaceAuth /
// discovery. On a local single-user Docker setup the 600-req/min
// bucket fills fast: a 15-workspace canvas + activity polling +
// approvals polling + A2A overlay + initial hydration all share
// one IP bucket, so a minute of active use can trip 429 and blank
// the page. Gated by MOLECULE_ENV=development + empty ADMIN_TOKEN
// so SaaS production keeps the bucket.
// approvals polling + A2A overlay + initial hydration all land in
// one bucket (whichever keyFor returns — typically the dev user's
// IP or shared admin token), so a minute of active use can trip
// 429 and blank the page. Gated by MOLECULE_ENV=development +
// empty ADMIN_TOKEN so SaaS production keeps the bucket.
if isDevModeFailOpen() {
c.Header("X-RateLimit-Limit", "unlimited")
c.Next()
return
}
ip := c.ClientIP()
key := rl.keyFor(c)
rl.mu.Lock()
b, exists := rl.buckets[ip]
b, exists := rl.buckets[key]
if !exists {
b = &bucket{tokens: rl.rate, lastReset: time.Now()}
rl.buckets[ip] = b
rl.buckets[key] = b
}
// Reset tokens if interval has passed

View File

@ -0,0 +1,303 @@
package middleware
import (
"context"
"crypto/sha256"
"fmt"
"go/ast"
"go/parser"
"go/token"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
"github.com/gin-gonic/gin"
)
// newTestLimiterForKeyFor — same shape as newTestLimiter in ratelimit_test.go
// but exposes the *gin.Engine and lets the caller inject headers per-request.
func newTestLimiterForKeyFor(t *testing.T, rate int) *gin.Engine {
t.Helper()
gin.SetMode(gin.TestMode)
ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(cancel)
rl := NewRateLimiter(rate, 5*time.Second, ctx)
r := gin.New()
if err := r.SetTrustedProxies(nil); err != nil {
t.Fatalf("SetTrustedProxies: %v", err)
}
r.Use(rl.Middleware())
r.GET("/x", func(c *gin.Context) { c.String(http.StatusOK, "ok") })
return r
}
// TestKeyFor_OrgIdHeaderTrumpsBearerAndIP — when X-Molecule-Org-Id is set
// the bucket is keyed on it regardless of bearer token or IP. This is the
// load-bearing case for the production SaaS plane: every tenant routed
// through the same upstream proxy IP gets its own bucket because the
// CP attaches the org-id header.
func TestKeyFor_OrgIdHeaderTrumpsBearerAndIP(t *testing.T) {
gin.SetMode(gin.TestMode)
ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(cancel)
rl := NewRateLimiter(2, 5*time.Second, ctx)
c, _ := gin.CreateTestContext(httptest.NewRecorder())
c.Request = httptest.NewRequest(http.MethodGet, "/x", nil)
c.Request.RemoteAddr = "10.0.0.1:1234"
c.Request.Header.Set("X-Molecule-Org-Id", "org-aaa")
c.Request.Header.Set("Authorization", "Bearer ignored-token-value")
got := rl.keyFor(c)
if got != "org:org-aaa" {
t.Errorf("keyFor with org-id header: got %q, want %q", got, "org:org-aaa")
}
}
// TestKeyFor_BearerTokenWhenNoOrgId — the per-tenant Caddy box path:
// no org-id header (canvas same-origin), but Authorization Bearer is
// always set by WorkspaceAuth-protected routes. Bucket keyed on the
// SHA-256 hex of the token so distinct sessions on the same egress IP
// get distinct buckets — and so the in-memory map can never become a
// token dump if the process is inspected.
func TestKeyFor_BearerTokenWhenNoOrgId(t *testing.T) {
gin.SetMode(gin.TestMode)
ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(cancel)
rl := NewRateLimiter(2, 5*time.Second, ctx)
c, _ := gin.CreateTestContext(httptest.NewRecorder())
c.Request = httptest.NewRequest(http.MethodGet, "/x", nil)
c.Request.RemoteAddr = "10.0.0.1:1234"
c.Request.Header.Set("Authorization", "Bearer secret-token-abc")
got := rl.keyFor(c)
expectedHash := fmt.Sprintf("%x", sha256.Sum256([]byte("secret-token-abc")))
if got != "tok:"+expectedHash {
t.Errorf("keyFor with bearer-only: got %q, want %q", got, "tok:"+expectedHash)
}
// Critical security pin: raw token must never appear in the key.
if strings.Contains(got, "secret-token-abc") {
t.Errorf("keyFor leaked raw bearer token in bucket key: %q", got)
}
}
// TestKeyFor_IPFallbackWhenNoOrgIdNoBearer — anonymous probes (no auth,
// no tenant header) fall through to ClientIP keying. This is the only
// path that depended on the pre-#179 trust-XFF behaviour and is fine
// to keep IP-keyed because the surface is just /health, /buildinfo,
// and the registry-boot endpoints.
func TestKeyFor_IPFallbackWhenNoOrgIdNoBearer(t *testing.T) {
gin.SetMode(gin.TestMode)
ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(cancel)
rl := NewRateLimiter(2, 5*time.Second, ctx)
c, _ := gin.CreateTestContext(httptest.NewRecorder())
c.Request = httptest.NewRequest(http.MethodGet, "/x", nil)
c.Request.RemoteAddr = "203.0.113.1:1234"
got := rl.keyFor(c)
// gin.ClientIP() strips the port — we just need to confirm the prefix
// and that the IP appears.
if !strings.HasPrefix(got, "ip:") {
t.Errorf("keyFor without auth/org headers: got %q, want prefix %q", got, "ip:")
}
if !strings.Contains(got, "203.0.113.1") {
t.Errorf("keyFor IP fallback: got %q, want to contain %q", got, "203.0.113.1")
}
}
// TestRateLimit_TwoOrgsSameIP_IndependentBuckets — the load-bearing
// regression test for issue #59. Two tenants behind the same upstream
// proxy must NOT share a bucket; the production SaaS-plane outage was
// every tenant collapsing to the proxy IP and saturating one bucket.
//
// Mutation invariant: removing the org-id branch from keyFor — say,
// returning "ip:" + c.ClientIP() unconditionally — collapses both
// tenants back into one bucket and this test fails on the 3rd
// request because it would 429 instead of 200.
func TestRateLimit_TwoOrgsSameIP_IndependentBuckets(t *testing.T) {
r := newTestLimiterForKeyFor(t, 2)
exhaust := func(orgID string) {
t.Helper()
for i := 0; i < 2; i++ {
req := httptest.NewRequest(http.MethodGet, "/x", nil)
req.RemoteAddr = "10.0.0.1:1234" // SAME upstream proxy IP
req.Header.Set("X-Molecule-Org-Id", orgID)
w := httptest.NewRecorder()
r.ServeHTTP(w, req)
if w.Code != http.StatusOK {
t.Fatalf("setup orgID=%s req %d: want 200, got %d", orgID, i+1, w.Code)
}
}
}
exhaust("org-aaa")
// org-aaa is now at 0 tokens. org-bbb's bucket must be FRESH.
req := httptest.NewRequest(http.MethodGet, "/x", nil)
req.RemoteAddr = "10.0.0.1:1234"
req.Header.Set("X-Molecule-Org-Id", "org-bbb")
w := httptest.NewRecorder()
r.ServeHTTP(w, req)
if w.Code != http.StatusOK {
t.Fatalf("org-bbb on same IP must have its own bucket: got %d, want 200 (issue #59 regression)", w.Code)
}
// Confirm org-aaa is still throttled — proves we're not just opening
// the gate to everyone.
req = httptest.NewRequest(http.MethodGet, "/x", nil)
req.RemoteAddr = "10.0.0.1:1234"
req.Header.Set("X-Molecule-Org-Id", "org-aaa")
w = httptest.NewRecorder()
r.ServeHTTP(w, req)
if w.Code != http.StatusTooManyRequests {
t.Errorf("org-aaa exhausted bucket: want 429, got %d", w.Code)
}
}
// TestRateLimit_TwoTokensSameIP_IndependentBuckets — analog of the
// org-id case for the per-tenant Caddy box: two distinct user
// sessions on the same egress IP, distinguished only by their bearer
// tokens, must get independent buckets. This was the path Hongming
// hit on hongming.moleculesai.app — a single user with multiple
// browser tabs against one workspace-server box.
func TestRateLimit_TwoTokensSameIP_IndependentBuckets(t *testing.T) {
r := newTestLimiterForKeyFor(t, 2)
exhaust := func(token string) {
t.Helper()
for i := 0; i < 2; i++ {
req := httptest.NewRequest(http.MethodGet, "/x", nil)
req.RemoteAddr = "127.0.0.1:1234" // local Caddy proxy — same for both
req.Header.Set("Authorization", "Bearer "+token)
w := httptest.NewRecorder()
r.ServeHTTP(w, req)
if w.Code != http.StatusOK {
t.Fatalf("setup token=%s req %d: want 200, got %d", token, i+1, w.Code)
}
}
}
exhaust("user-a-token")
req := httptest.NewRequest(http.MethodGet, "/x", nil)
req.RemoteAddr = "127.0.0.1:1234"
req.Header.Set("Authorization", "Bearer user-b-token")
w := httptest.NewRecorder()
r.ServeHTTP(w, req)
if w.Code != http.StatusOK {
t.Fatalf("user-b token on same proxy IP must have its own bucket: got %d, want 200", w.Code)
}
}
// TestRateLimit_SameOrgDifferentTokens_SharedBucket — counter-pin:
// ensure org-id keying really does collapse all tokens within one
// org into one bucket. This is the desired behaviour: a tenant that
// mints multiple tokens shouldn't be able to circumvent its quota
// by rotating tokens between requests. (The same-IP-different-org
// test above proves we don't collapse ACROSS orgs; this one proves
// we DO collapse WITHIN one org.)
func TestRateLimit_SameOrgDifferentTokens_SharedBucket(t *testing.T) {
r := newTestLimiterForKeyFor(t, 2)
for _, tok := range []string{"token-1", "token-2"} {
req := httptest.NewRequest(http.MethodGet, "/x", nil)
req.RemoteAddr = "10.0.0.1:1234"
req.Header.Set("X-Molecule-Org-Id", "org-shared")
req.Header.Set("Authorization", "Bearer "+tok)
w := httptest.NewRecorder()
r.ServeHTTP(w, req)
if w.Code != http.StatusOK {
t.Fatalf("setup tok=%s: want 200, got %d", tok, w.Code)
}
}
// Bucket should be exhausted now — third request, even with a fresh
// token, must 429 because the org-id is keying it.
req := httptest.NewRequest(http.MethodGet, "/x", nil)
req.RemoteAddr = "10.0.0.1:1234"
req.Header.Set("X-Molecule-Org-Id", "org-shared")
req.Header.Set("Authorization", "Bearer token-3")
w := httptest.NewRecorder()
r.ServeHTTP(w, req)
if w.Code != http.StatusTooManyRequests {
t.Errorf("rotating tokens within one org should NOT bypass the quota: got %d, want 429", w.Code)
}
}
// TestRateLimit_Middleware_RoutesThroughKeyFor is the AST gate (mirror
// of #36/#10/#12's gates). Pins the SSOT routing invariant:
// (*RateLimiter).Middleware MUST call rl.keyFor and MUST NOT carry a
// direct c.ClientIP() call (= the parallel-impl drift this PR fixes).
//
// Mutation invariant: a future PR that re-introduces direct IP keying
// in Middleware (`ip := c.ClientIP()`) makes this test fail. That's
// the signal to either (a) extend keyFor's contract to cover the new
// case OR (b) update this gate with an explicit reason. Either way the
// drift gets a reviewer's attention before shipping.
func TestRateLimit_Middleware_RoutesThroughKeyFor(t *testing.T) {
fset := token.NewFileSet()
file, err := parser.ParseFile(fset, "ratelimit.go", nil, parser.ParseComments)
if err != nil {
t.Fatalf("parse ratelimit.go: %v", err)
}
var fn *ast.FuncDecl
ast.Inspect(file, func(n ast.Node) bool {
f, ok := n.(*ast.FuncDecl)
if !ok {
return true
}
// Match `func (rl *RateLimiter) Middleware() ...`
if f.Name.Name != "Middleware" {
return true
}
if f.Recv == nil || len(f.Recv.List) != 1 {
return true
}
star, ok := f.Recv.List[0].Type.(*ast.StarExpr)
if !ok {
return true
}
if id, ok := star.X.(*ast.Ident); !ok || id.Name != "RateLimiter" {
return true
}
fn = f
return false
})
if fn == nil {
t.Fatal("(*RateLimiter).Middleware not found — was it renamed? update this gate or the SSOT routing assumption")
}
var (
callsKeyFor bool
callsClientIP bool
)
ast.Inspect(fn.Body, func(n ast.Node) bool {
call, ok := n.(*ast.CallExpr)
if !ok {
return true
}
sel, ok := call.Fun.(*ast.SelectorExpr)
if !ok {
return true
}
switch sel.Sel.Name {
case "keyFor":
callsKeyFor = true
case "ClientIP":
callsClientIP = true
}
return true
})
if !callsKeyFor {
t.Error("(*RateLimiter).Middleware must call rl.keyFor for SSOT bucket-key derivation — see issue #59. Found no keyFor call.")
}
if callsClientIP {
t.Error("(*RateLimiter).Middleware carries a direct c.ClientIP() call. This is the parallel-impl drift issue #59 fixed. " +
"Either route through rl.keyFor OR — if a new use case truly needs direct IP — extend keyFor's contract first and update this gate to allow the specific delta.")
}
}

View File

@ -0,0 +1,545 @@
package provisioner
import (
"context"
"crypto/sha256"
"encoding/hex"
"errors"
"fmt"
"io"
"log"
"net/http"
"net/url"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"time"
)
// Local-build mode: clone the workspace-template-<runtime> repo from Gitea
// and `docker build` it on the host so OSS contributors can run molecule-core
// end-to-end without authenticating to (or being able to reach) GHCR/ECR.
//
// The flow:
//
// 1. ensureLocalImage(runtime) is called by the provisioner before
// ContainerCreate, but only when Resolve().Mode == RegistryModeLocal.
// 2. We compute a cache key from the Gitea repo's HEAD sha (one HTTP
// call to https://git.moleculesai.app/api/v1/repos/.../branches/main).
// 3. If `molecule-local/workspace-template-<runtime>:<sha12>` already
// exists in the local Docker image store, we return immediately.
// 4. Otherwise: shallow git-clone the repo into the cache dir, then
// `docker buildx build --platform=linux/amd64 -t <tag>` on it. We
// also tag `:latest` so `docker images` shows a friendly entry.
//
// Why amd64 emulation: the provisioner's defaultImagePlatform() forces
// linux/amd64 on Apple Silicon for parity with the (amd64-only) prod
// images. Building native arm64 in local-mode would diverge — see the
// design rationale in Issue #63 and the saved memory
// `feedback_local_must_mimic_production`.
//
// Auth: clone is anonymous (templates are public). If MOLECULE_GITEA_TOKEN
// is set, we use it via the URL's userinfo — the token is masked in
// every log line by maskTokenInURL().
//
// Failure mode: fail-closed. If Gitea is unreachable we surface a clear
// error message including the repo URL; we NEVER fall back to GHCR/ECR
// silently (would be a confusing bug for an OSS contributor who
// happens to have stale ECR creds in their docker config).
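// Example of the reference this path produces for a hypothetical HEAD
// sha 3c849b3ba778… on the claude-code template:
//
//	molecule-local/workspace-template-claude-code:3c849b3ba778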
// gitTemplateRepoPrefix is the prefix all workspace-template repos live
// under on Gitea. Hardcoded so an attacker who controlled cfg.Runtime
// (defence-in-depth — today the field is platform-validated upstream)
// can only ever reach a repo under molecule-ai/.
//
// Operators who want to point local-build at a fork can override the
// full prefix via MOLECULE_LOCAL_TEMPLATE_REPO_PREFIX (e.g.
// `https://git.example.com/myorg/molecule-ai-workspace-template-`).
// Default-off; opt-in only.
const gitTemplateRepoPrefix = "https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-"
// localBuildLockMap serializes concurrent ensureLocalImage calls per
// runtime so two workspace creates that hit the cold path together don't
// race on `docker build` (Docker's daemon would serialize anyway, but
// the duplicate clone + log spam are confusing). Lock granularity is
// per-runtime, so different runtimes still build in parallel.
var (
localBuildLockMap = make(map[string]*sync.Mutex)
localBuildLockMapMu sync.Mutex
)
func runtimeBuildLock(runtime string) *sync.Mutex {
localBuildLockMapMu.Lock()
defer localBuildLockMapMu.Unlock()
if m, ok := localBuildLockMap[runtime]; ok {
return m
}
m := &sync.Mutex{}
localBuildLockMap[runtime] = m
return m
}
// LocalBuildOptions controls the local-build path. Exposed so tests can
// inject fakes without standing up a real git+docker chain. Production
// uses zero-value defaults via newDefaultLocalBuildOptions().
type LocalBuildOptions struct {
// CacheDir is the host filesystem location where cloned template
// repos are kept between builds. Empty = use $XDG_CACHE_HOME or
// $HOME/.cache. Override via env var MOLECULE_LOCAL_BUILD_CACHE.
CacheDir string
// RepoPrefix is the URL prefix all template repos hang off. Empty
// = use gitTemplateRepoPrefix. Override via env var
// MOLECULE_LOCAL_TEMPLATE_REPO_PREFIX.
RepoPrefix string
// Token, if non-empty, is sent via URL userinfo to Gitea. Default
// empty (templates are public). Override via env var
// MOLECULE_GITEA_TOKEN.
Token string
// Platform is the buildx --platform value. Empty = host default;
// today we always pass linux/amd64 because the provisioner only
// runs amd64 images. Exposed so tests can override.
Platform string
// HTTPClient is used for the Gitea-API HEAD-sha lookup. Empty = a
// dedicated http.Client with a 30s timeout.
HTTPClient *http.Client
// The function fields below (remoteHeadSha, gitClone, dockerBuild,
// dockerHasTag, dockerTag) are seams for tests; if nil, the
// production implementations are used.
remoteHeadSha func(ctx context.Context, opts *LocalBuildOptions, runtime string) (string, error)
gitClone func(ctx context.Context, opts *LocalBuildOptions, runtime, dest string) error
dockerBuild func(ctx context.Context, opts *LocalBuildOptions, contextDir, tag string) error
dockerHasTag func(ctx context.Context, tag string) (bool, error)
dockerTag func(ctx context.Context, src, dst string) error
}
func newDefaultLocalBuildOptions() *LocalBuildOptions {
o := &LocalBuildOptions{
CacheDir: os.Getenv("MOLECULE_LOCAL_BUILD_CACHE"),
RepoPrefix: os.Getenv("MOLECULE_LOCAL_TEMPLATE_REPO_PREFIX"),
Token: os.Getenv("MOLECULE_GITEA_TOKEN"),
Platform: "linux/amd64",
}
if o.CacheDir == "" {
if xdg := os.Getenv("XDG_CACHE_HOME"); xdg != "" {
o.CacheDir = filepath.Join(xdg, "molecule", "workspace-template-build")
} else if home, err := os.UserHomeDir(); err == nil {
o.CacheDir = filepath.Join(home, ".cache", "molecule", "workspace-template-build")
} else {
// Last-resort fallback: /tmp. Loses the cache between reboots
// but at least lets the path produce builds.
o.CacheDir = filepath.Join(os.TempDir(), "molecule", "workspace-template-build")
}
}
if o.RepoPrefix == "" {
o.RepoPrefix = gitTemplateRepoPrefix
}
o.HTTPClient = &http.Client{Timeout: 30 * time.Second}
return o
}
// LocalImageTag formats the SHA-pinned tag for a runtime. Exported for
// tests + the provisioner's image-resolution branch.
func LocalImageTag(runtime, sha string) string {
short := sha
if len(short) > 12 {
short = short[:12]
}
return fmt.Sprintf("%s/workspace-template-%s:%s", localImagePrefix, runtime, short)
}
// LocalImageLatestTag returns the floating `:latest` form. Used as a
// human-readable alias and as the value RuntimeImage() returns in
// local-mode.
func LocalImageLatestTag(runtime string) string {
return fmt.Sprintf("%s/workspace-template-%s:latest", localImagePrefix, runtime)
}
// EnsureLocalImage is the entry point the provisioner calls before
// ContainerCreate when Resolve().Mode == RegistryModeLocal. Returns the
// image tag (SHA-pinned form) the caller should hand to Docker, or an
// error if the build/clone fails.
//
// Concurrency: per-runtime lock; parallel calls for the same runtime
// share the build, parallel calls for different runtimes proceed.
//
// Idempotent: a cached SHA-pinned tag short-circuits without network
// or docker calls. The Gitea HEAD lookup is the only network call on
// the cache-hit path.
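// Caller sketch (the real call site is the provisioner Start() branch
// shown further down in this diff):
//
//	tag, err := EnsureLocalImage(ctx, "claude-code")
//	if err != nil {
//		return err // fail-closed: no GHCR/ECR fallback
//	}
//	// tag == "molecule-local/workspace-template-claude-code:<sha12>"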
func EnsureLocalImage(ctx context.Context, runtime string) (string, error) {
return ensureLocalImageWithOpts(ctx, runtime, newDefaultLocalBuildOptions())
}
// ensureLocalImageHook is the seam Start() calls into. Production code
// uses EnsureLocalImage; tests substitute a fake to exercise the
// provisioner-Start integration without standing up a real
// git+docker chain. Single-process scoped — never reassigned in
// production code.
var ensureLocalImageHook = EnsureLocalImage
func ensureLocalImageWithOpts(ctx context.Context, runtime string, opts *LocalBuildOptions) (string, error) {
if !IsKnownRuntime(runtime) {
return "", fmt.Errorf("local-build: refusing to build unknown runtime %q (must be one of %v)", runtime, knownRuntimes)
}
lock := runtimeBuildLock(runtime)
lock.Lock()
defer lock.Unlock()
// 1. HEAD lookup → cache key.
headFn := opts.remoteHeadSha
if headFn == nil {
headFn = remoteHeadShaProd
}
sha, err := headFn(ctx, opts, runtime)
if err != nil {
// Fail-closed: do not fall back to GHCR/ECR. The whole point of
// local-build mode is that GHCR is unreachable.
return "", fmt.Errorf("local-build: cannot determine HEAD sha for runtime %q at %s: %w", runtime, repoURL(opts, runtime), err)
}
if len(sha) < 12 {
return "", fmt.Errorf("local-build: Gitea returned a short sha %q for runtime %q (expected ≥12 chars)", sha, runtime)
}
tag := LocalImageTag(runtime, sha)
latest := LocalImageLatestTag(runtime)
// 2. Cache hit?
hasFn := opts.dockerHasTag
if hasFn == nil {
hasFn = dockerHasTagProd
}
exists, hasErr := hasFn(ctx, tag)
if hasErr != nil {
log.Printf("local-build: image inspect for %s failed (%v); will rebuild", tag, hasErr)
}
if exists {
log.Printf("local-build: cache hit for %s (sha=%s) — skipping clone+build", tag, sha[:12])
// Refresh the floating :latest alias so admins inspecting `docker
// images` see the current sha. Best-effort.
tagFn := opts.dockerTag
if tagFn == nil {
tagFn = dockerTagProd
}
if tErr := tagFn(ctx, tag, latest); tErr != nil {
log.Printf("local-build: best-effort retag of %s → %s failed: %v", tag, latest, tErr)
}
return tag, nil
}
// 3. Cold path — clone + build.
dest := filepath.Join(opts.CacheDir, runtime, sha[:12])
if err := os.MkdirAll(filepath.Dir(dest), 0o755); err != nil {
return "", fmt.Errorf("local-build: prepare cache dir %q: %w", filepath.Dir(dest), err)
}
// Idempotent: if the dest exists from a previous failed run, wipe and
// re-clone so we don't build a partial tree.
if _, statErr := os.Stat(dest); statErr == nil {
if rmErr := os.RemoveAll(dest); rmErr != nil {
return "", fmt.Errorf("local-build: clean stale cache dir %q: %w", dest, rmErr)
}
}
cloneFn := opts.gitClone
if cloneFn == nil {
cloneFn = gitCloneProd
}
log.Printf("local-build: cloning %s → %s (sha=%s)", redactedRepoURL(opts, runtime), dest, sha[:12])
cloneStart := time.Now()
if err := cloneFn(ctx, opts, runtime, dest); err != nil {
// Best-effort cleanup so a half-cloned tree doesn't poison future runs.
_ = os.RemoveAll(dest)
return "", fmt.Errorf("local-build: clone %s: %w", redactedRepoURL(opts, runtime), err)
}
log.Printf("local-build: clone complete in %s", time.Since(cloneStart).Round(time.Millisecond))
// 4. Sanity-check the cloned tree contains a Dockerfile at the root.
dockerfile := filepath.Join(dest, "Dockerfile")
info, statErr := os.Stat(dockerfile)
if statErr != nil || info.IsDir() {
_ = os.RemoveAll(dest)
return "", fmt.Errorf("local-build: cloned tree at %s has no Dockerfile (template repo malformed)", dest)
}
// 5. Build.
buildFn := opts.dockerBuild
if buildFn == nil {
buildFn = dockerBuildProd
}
log.Printf("local-build: docker build start for %s (platform=%s, context=%s)", tag, opts.Platform, dest)
buildStart := time.Now()
if err := buildFn(ctx, opts, dest, tag); err != nil {
return "", fmt.Errorf("local-build: docker build %s: %w", tag, err)
}
log.Printf("local-build: docker build done for %s in %s", tag, time.Since(buildStart).Round(time.Second))
// Tag :latest as a friendly alias.
tagFn := opts.dockerTag
if tagFn == nil {
tagFn = dockerTagProd
}
if err := tagFn(ctx, tag, latest); err != nil {
log.Printf("local-build: best-effort retag of %s → %s failed: %v", tag, latest, err)
}
return tag, nil
}
// repoURL composes the full Gitea repo URL for the given runtime. The
// prefix is hardcoded by default; operators can override via env so a
// fork can point local-build at their own Gitea instance.
func repoURL(opts *LocalBuildOptions, runtime string) string {
return opts.RepoPrefix + runtime
}
// redactedRepoURL returns the same value with any embedded token replaced
// by "***". Use this for log lines.
func redactedRepoURL(opts *LocalBuildOptions, runtime string) string {
return maskTokenInURL(repoURL(opts, runtime))
}
// maskTokenInURL replaces userinfo (username:password@) in a URL with
// `***@` so log lines never echo a Gitea PAT. Returns the input as-is
// on parse failures (defence: never silently corrupt the visible URL).
//
// Implementation note: net/url's URL.User stringifier percent-encodes
// the username, so `u.User = url.User("***"); u.String()` would yield
// `https://%2A%2A%2A@host/...` — unhelpful for humans grepping logs.
// We drop the userinfo via URL.User=nil, get the canonical scheme-and-
// rest, and re-insert the literal `***@` between the scheme separator
// and the host.
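// Example: maskTokenInURL("https://oauth2:secret123@git.example.com/foo/bar")
// yields "https://***@git.example.com/foo/bar".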
func maskTokenInURL(s string) string {
u, err := url.Parse(s)
if err != nil || u.User == nil {
return s
}
u.User = nil
out := u.String()
prefix := u.Scheme + "://"
if !strings.HasPrefix(out, prefix) {
return s
}
return prefix + "***@" + out[len(prefix):]
}
// remoteHeadShaProd looks up the HEAD commit sha of branch `main` for
// the workspace-template-<runtime> repo on Gitea. We use the Gitea API
// (a single HTTPS call) rather than `git ls-remote` so we don't need a
// git binary just for the HEAD lookup — we still need git for the
// clone, but the cache-hit path stays git-free.
func remoteHeadShaProd(ctx context.Context, opts *LocalBuildOptions, runtime string) (string, error) {
// Convert a `git.example.com/org/prefix-` URL into the API form
// `git.example.com/api/v1/repos/org/prefix-<runtime>/branches/main`.
// Works for both git.moleculesai.app (default) and any forks that
// share the Gitea API shape.
apiURL, err := giteaBranchAPIURL(opts.RepoPrefix, runtime, "main")
if err != nil {
return "", err
}
req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil)
if err != nil {
return "", err
}
if opts.Token != "" {
// Gitea accepts "token <PAT>" in the Authorization header for
// API calls. Userinfo is also accepted but only matters for
// the HTTPS clone, not the JSON API.
req.Header.Set("Authorization", "token "+opts.Token)
}
cli := opts.HTTPClient
if cli == nil {
cli = &http.Client{Timeout: 30 * time.Second}
}
resp, err := cli.Do(req)
if err != nil {
return "", err
}
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode == http.StatusNotFound {
return "", fmt.Errorf("repo not found at %s — runtime %q may not be mirrored to Gitea (only claude-code/hermes/langgraph/autogen today)", apiURL, runtime)
}
if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
return "", fmt.Errorf("auth failure (%d) at %s — verify MOLECULE_GITEA_TOKEN if private repo", resp.StatusCode, apiURL)
}
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("HEAD lookup at %s returned %d", apiURL, resp.StatusCode)
}
body, err := io.ReadAll(io.LimitReader(resp.Body, 64<<10))
if err != nil {
return "", fmt.Errorf("read HEAD response body: %w", err)
}
// Tiny ad-hoc parser: we only want commit.id, so a substring scan is
// enough — see parseGiteaBranchHeadSha for why we prefer its explicit
// errors over encoding/json's.
return parseGiteaBranchHeadSha(body)
}
// giteaBranchAPIURL maps a repo-prefix URL like
// `https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-`
// + runtime "claude-code" + branch "main"
// to the API URL
// `https://git.moleculesai.app/api/v1/repos/molecule-ai/molecule-ai-workspace-template-claude-code/branches/main`.
func giteaBranchAPIURL(repoPrefix, runtime, branch string) (string, error) {
u, err := url.Parse(repoPrefix + runtime)
if err != nil {
return "", fmt.Errorf("parse repo URL %q: %w", repoPrefix+runtime, err)
}
parts := strings.TrimPrefix(u.Path, "/")
parts = strings.TrimSuffix(parts, "/")
if parts == "" {
return "", fmt.Errorf("repo URL %q has empty path", repoPrefix+runtime)
}
// Expect `<org>/<repo>` (at least one slash) — the prefix already
// includes the org and the repo-name stem; runtime appends the rest.
if !strings.Contains(parts, "/") {
return "", fmt.Errorf("repo URL %q missing org/repo path", repoPrefix+runtime)
}
apiURL := url.URL{
Scheme: u.Scheme,
Host: u.Host,
Path: "/api/v1/repos/" + parts + "/branches/" + branch,
}
return apiURL.String(), nil
}
// parseGiteaBranchHeadSha extracts commit.id from the Gitea
// /branches/<name> response. We use a permissive substring scan so a
// missing-key in the JSON gives a clear error rather than the
// json.Decoder's somewhat opaque "missing field" message.
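// Expected body shape (abridged from the Gitea-shaped fixture in the
// tests below):
//
//	{"name":"main","commit":{"id":"<40-hex sha>","message":"..."}}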
func parseGiteaBranchHeadSha(body []byte) (string, error) {
// Look for `"id":"<40-hex>"` inside the commit object.
idx := strings.Index(string(body), `"id":"`)
if idx < 0 {
return "", errors.New("Gitea branch response missing commit.id field")
}
rest := string(body[idx+len(`"id":"`):])
end := strings.IndexByte(rest, '"')
if end < 0 {
return "", errors.New("Gitea branch response has malformed commit.id (no closing quote)")
}
sha := rest[:end]
if len(sha) < 7 {
return "", fmt.Errorf("Gitea returned suspiciously short sha %q", sha)
}
return sha, nil
}
// gitCloneProd shallow-clones the runtime's template repo into dest.
//
// We invoke `git` rather than implementing the protocol ourselves —
// every host that runs the workspace-server already needs git available
// (it's a hard dep of go-mod for vendored repos) and the OSS contributor
// onboarding doc lists it as a prerequisite.
func gitCloneProd(ctx context.Context, opts *LocalBuildOptions, runtime, dest string) error {
cloneURL := repoURL(opts, runtime)
if opts.Token != "" {
// HTTPS clone with userinfo: https://oauth2:<token>@host/...
u, err := url.Parse(cloneURL)
if err == nil {
u.User = url.UserPassword("oauth2", opts.Token)
cloneURL = u.String()
}
// On parse failure we silently fall through to the public URL —
// better to attempt the anonymous clone than to refuse outright.
}
cmd := exec.CommandContext(ctx, "git", "clone", "--depth=1", "--branch=main", "--single-branch", cloneURL, dest)
// Drop git's askpass prompts so we fail-fast on auth errors instead
// of hanging waiting for an interactive password.
cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0", "GIT_ASKPASS=/bin/echo")
out, err := cmd.CombinedOutput()
if err != nil {
// Mask the token in any error string git emits via stderr — git
// occasionally echoes the URL verbatim on failure.
errMsg := maskTokenInString(string(out), opts.Token)
return fmt.Errorf("%w: %s", err, strings.TrimSpace(errMsg))
}
return nil
}
// maskTokenInString replaces literal occurrences of the token with `***`.
// Defence against git binary or docker echoing the URL into stderr.
func maskTokenInString(s, token string) string {
if token == "" {
return s
}
return strings.ReplaceAll(s, token, "***")
}
// dockerBuildProd invokes the docker CLI to build the workspace-template
// image. We shell out rather than use the Docker SDK's ImageBuild — the
// SDK requires hand-tarballing the build context, which adds a
// non-trivial code path with its own bug surface. The docker CLI is
// already a hard dep of the workspace-server (the provisioner needs the
// daemon), so requiring the CLI binary on PATH adds nothing.
//
// Uses the legacy `docker build` (not `docker buildx build`) because
// buildx isn't always installed by default on Linux distros and the
// legacy builder produces an image the local Docker daemon picks up
// automatically. We pass --platform=linux/amd64 directly; with Docker
// 20.10+ this works without buildx because the legacy builder
// auto-promotes to BuildKit when available, falling back to v1
// otherwise (still produces an amd64 image via QEMU).
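// Composed command for a hypothetical tag (matches the args built below):
//
//	docker build --platform=linux/amd64 \
//	    -t molecule-local/workspace-template-<runtime>:<sha12> \
//	    -f <contextDir>/Dockerfile <contextDir>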
func dockerBuildProd(ctx context.Context, opts *LocalBuildOptions, contextDir, tag string) error {
args := []string{"build"}
if opts.Platform != "" {
args = append(args, "--platform="+opts.Platform)
}
args = append(args,
"-t", tag,
"-f", filepath.Join(contextDir, "Dockerfile"),
contextDir,
)
cmd := exec.CommandContext(ctx, "docker", args...)
cmd.Env = append(os.Environ(), "DOCKER_BUILDKIT=1")
out, err := cmd.CombinedOutput()
if err != nil {
// Sanitize defensive — docker build output shouldn't contain a
// token, but maskTokenInString is a no-op when token is empty.
return fmt.Errorf("%w: %s", err, strings.TrimSpace(maskTokenInString(string(out), opts.Token)))
}
return nil
}
// dockerHasTagProd returns true iff the given tag exists in the local
// image store. Used as the fast cache-hit check.
func dockerHasTagProd(ctx context.Context, tag string) (bool, error) {
cmd := exec.CommandContext(ctx, "docker", "image", "inspect", "--format={{.Id}}", tag)
out, err := cmd.CombinedOutput()
if err == nil {
return strings.TrimSpace(string(out)) != "", nil
}
// `docker image inspect` exits 1 with "Error: No such image" when
// missing — that's a definitive false, not an error condition.
low := strings.ToLower(string(out))
if strings.Contains(low, "no such image") || strings.Contains(low, "not found") {
return false, nil
}
return false, fmt.Errorf("%w: %s", err, strings.TrimSpace(string(out)))
}
// dockerTagProd creates an alias from src → dst. Used to refresh the
// floating `:latest` after a build or cache hit.
func dockerTagProd(ctx context.Context, src, dst string) error {
cmd := exec.CommandContext(ctx, "docker", "tag", src, dst)
out, err := cmd.CombinedOutput()
if err != nil {
return fmt.Errorf("%w: %s", err, strings.TrimSpace(string(out)))
}
return nil
}
// CacheKey is exposed for diagnostic logs / tests so the cache-key shape
// is documented in code rather than only as a string format.
//
// cache_key = hex(sha256(runtime + "|" + head_sha + "|" + repoPrefix))[:16]
//
// Today only the SHA is consumed, but the helper is kept for future
// extensions (e.g. include Dockerfile-content-hash to invalidate when
// only the Dockerfile changes between two runs targeting the same SHA).
func CacheKey(runtime, sha, repoPrefix string) string {
h := sha256.Sum256([]byte(runtime + "|" + sha + "|" + repoPrefix))
return hex.EncodeToString(h[:8])
}

View File

@ -0,0 +1,662 @@
package provisioner
import (
"context"
"errors"
"fmt"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"sync"
"testing"
)
// makeTestOpts produces a LocalBuildOptions where every external seam
// (Gitea HEAD, git clone, docker build/has/tag) is replaced by a stub.
// Tests override the stub for the behavior they want to assert.
func makeTestOpts(t *testing.T) *LocalBuildOptions {
t.Helper()
tmp := t.TempDir()
return &LocalBuildOptions{
CacheDir: tmp,
RepoPrefix: "https://git.test/molecule-ai/molecule-ai-workspace-template-",
Platform: "linux/amd64",
HTTPClient: &http.Client{},
remoteHeadSha: func(ctx context.Context, opts *LocalBuildOptions, runtime string) (string, error) {
return "abcdef0123456789abcdef0123456789abcdef01", nil
},
gitClone: func(ctx context.Context, opts *LocalBuildOptions, runtime, dest string) error {
// Write a fake Dockerfile so the sanity-check passes.
if err := os.MkdirAll(dest, 0o755); err != nil {
return err
}
return os.WriteFile(filepath.Join(dest, "Dockerfile"), []byte("FROM scratch\n"), 0o644)
},
dockerBuild: func(ctx context.Context, opts *LocalBuildOptions, contextDir, tag string) error {
return nil
},
dockerHasTag: func(ctx context.Context, tag string) (bool, error) {
return false, nil
},
dockerTag: func(ctx context.Context, src, dst string) error {
return nil
},
}
}
// TestEnsureLocalImage_Success — happy path: HEAD lookup succeeds, no
// cache hit, clone + build run, returned tag is SHA-pinned.
func TestEnsureLocalImage_Success(t *testing.T) {
opts := makeTestOpts(t)
tag, err := ensureLocalImageWithOpts(context.Background(), "claude-code", opts)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
want := "molecule-local/workspace-template-claude-code:abcdef012345"
if tag != want {
t.Errorf("tag = %q, want %q", tag, want)
}
}
// TestEnsureLocalImage_CacheHit — second call with a cached image must
// skip clone + build entirely.
func TestEnsureLocalImage_CacheHit(t *testing.T) {
opts := makeTestOpts(t)
var cloneCount, buildCount int
opts.gitClone = func(ctx context.Context, opts *LocalBuildOptions, runtime, dest string) error {
cloneCount++
return os.WriteFile(filepath.Join(dest, "Dockerfile"), []byte("FROM scratch\n"), 0o644)
}
opts.dockerBuild = func(ctx context.Context, opts *LocalBuildOptions, contextDir, tag string) error {
buildCount++
return nil
}
opts.dockerHasTag = func(ctx context.Context, tag string) (bool, error) {
return true, nil // cached
}
if _, err := ensureLocalImageWithOpts(context.Background(), "hermes", opts); err != nil {
t.Fatalf("unexpected error: %v", err)
}
if cloneCount != 0 {
t.Errorf("cache hit triggered %d clones, want 0", cloneCount)
}
if buildCount != 0 {
t.Errorf("cache hit triggered %d builds, want 0", buildCount)
}
}
// TestEnsureLocalImage_UnknownRuntime — the allowlist guard rejects
// arbitrary runtime names before any network or filesystem call.
func TestEnsureLocalImage_UnknownRuntime(t *testing.T) {
opts := makeTestOpts(t)
for _, bad := range []string{
"", "unknown", "../../../etc/passwd", "claude-code; rm -rf /",
} {
t.Run(bad, func(t *testing.T) {
_, err := ensureLocalImageWithOpts(context.Background(), bad, opts)
if err == nil {
t.Errorf("EnsureLocalImage(%q) should fail (not a known runtime)", bad)
}
if err != nil && !strings.Contains(err.Error(), "unknown runtime") {
t.Errorf("error = %v, want one mentioning %q", err, "unknown runtime")
}
})
}
}
// TestEnsureLocalImage_GiteaUnreachable — fail-closed when the HEAD
// lookup fails. Must NOT fall back to GHCR/ECR.
func TestEnsureLocalImage_GiteaUnreachable(t *testing.T) {
opts := makeTestOpts(t)
opts.remoteHeadSha = func(ctx context.Context, opts *LocalBuildOptions, runtime string) (string, error) {
return "", errors.New("dial tcp: no such host")
}
_, err := ensureLocalImageWithOpts(context.Background(), "langgraph", opts)
if err == nil {
t.Fatalf("expected error, got nil")
}
if !strings.Contains(err.Error(), "cannot determine HEAD sha") {
t.Errorf("error = %v, want one mentioning HEAD sha lookup", err)
}
// Critical: error must NOT mention ghcr or ecr (no silent fallback).
low := strings.ToLower(err.Error())
if strings.Contains(low, "ghcr") || strings.Contains(low, "ecr") {
t.Errorf("error message %q must not mention ghcr/ecr (no silent fallback)", err.Error())
}
}
// TestEnsureLocalImage_RepoNotFound — Gitea returned 404. Must surface
// a runtime-naming error so the OSS contributor can file the right
// mirroring task.
func TestEnsureLocalImage_RepoNotFound(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusNotFound)
_, _ = w.Write([]byte(`{"message":"repo not found"}`))
}))
defer srv.Close()
opts := makeTestOpts(t)
opts.RepoPrefix = srv.URL + "/molecule-ai/molecule-ai-workspace-template-"
opts.HTTPClient = srv.Client()
opts.remoteHeadSha = nil // exercise real HTTP path
_, err := ensureLocalImageWithOpts(context.Background(), "crewai", opts)
if err == nil {
t.Fatalf("expected error, got nil")
}
if !strings.Contains(err.Error(), "not mirrored") && !strings.Contains(err.Error(), "not found") {
t.Errorf("error = %v, want a missing-repo message", err)
}
}
// TestEnsureLocalImage_AuthFailure — Gitea returned 401/403. Must
// produce an actionable error (mentions the token env var so an OSS
// contributor knows what to set).
func TestEnsureLocalImage_AuthFailure(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusUnauthorized)
}))
defer srv.Close()
opts := makeTestOpts(t)
opts.RepoPrefix = srv.URL + "/molecule-ai/molecule-ai-workspace-template-"
opts.HTTPClient = srv.Client()
opts.remoteHeadSha = nil
_, err := ensureLocalImageWithOpts(context.Background(), "claude-code", opts)
if err == nil {
t.Fatalf("expected error, got nil")
}
if !strings.Contains(err.Error(), "MOLECULE_GITEA_TOKEN") {
t.Errorf("error = %v, want one mentioning MOLECULE_GITEA_TOKEN", err)
}
}
// TestEnsureLocalImage_HeadShaWithRealJSON — exercise the JSON parser
// against a Gitea-shaped response to catch parse drift.
func TestEnsureLocalImage_HeadShaWithRealJSON(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// Real Gitea response shape (truncated for relevance).
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{
"name":"main",
"commit":{
"id":"3c849b3ba778abcdef0123456789abcdef012345",
"message":"feat: stuff"
}
}`))
}))
defer srv.Close()
opts := makeTestOpts(t)
opts.RepoPrefix = srv.URL + "/molecule-ai/molecule-ai-workspace-template-"
opts.HTTPClient = srv.Client()
opts.remoteHeadSha = nil // exercise real HTTP path
tag, err := ensureLocalImageWithOpts(context.Background(), "claude-code", opts)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if !strings.Contains(tag, "3c849b3ba778") {
t.Errorf("tag = %q, want one containing the parsed sha", tag)
}
}
// TestEnsureLocalImage_BuildFailure — surfaces docker-build errors with
// the build context so an operator can debug locally.
func TestEnsureLocalImage_BuildFailure(t *testing.T) {
opts := makeTestOpts(t)
opts.dockerBuild = func(ctx context.Context, opts *LocalBuildOptions, contextDir, tag string) error {
return errors.New("Dockerfile syntax error")
}
_, err := ensureLocalImageWithOpts(context.Background(), "autogen", opts)
if err == nil {
t.Fatalf("expected error, got nil")
}
if !strings.Contains(err.Error(), "docker build") {
t.Errorf("error = %v, want one mentioning docker build", err)
}
}
// TestEnsureLocalImage_MissingDockerfile — the cloned tree must contain
// a Dockerfile at root; absence is a malformed-template-repo error.
func TestEnsureLocalImage_MissingDockerfile(t *testing.T) {
opts := makeTestOpts(t)
opts.gitClone = func(ctx context.Context, opts *LocalBuildOptions, runtime, dest string) error {
// Empty dir, no Dockerfile.
return os.MkdirAll(dest, 0o755)
}
_, err := ensureLocalImageWithOpts(context.Background(), "hermes", opts)
if err == nil {
t.Fatalf("expected error, got nil")
}
if !strings.Contains(err.Error(), "no Dockerfile") {
t.Errorf("error = %v, want one mentioning missing Dockerfile", err)
}
}
// TestEnsureLocalImage_ConcurrentSameRuntime — two goroutines hitting
// the same runtime serialize via the per-runtime lock; the build runs
// once.
func TestEnsureLocalImage_ConcurrentSameRuntime(t *testing.T) {
opts := makeTestOpts(t)
var (
buildCount int
buildMu sync.Mutex
)
opts.dockerHasTag = func(ctx context.Context, tag string) (bool, error) {
// First call: cache miss. Second call (after first build): hit.
buildMu.Lock()
defer buildMu.Unlock()
return buildCount > 0, nil
}
opts.dockerBuild = func(ctx context.Context, opts *LocalBuildOptions, contextDir, tag string) error {
buildMu.Lock()
buildCount++
buildMu.Unlock()
return nil
}
const N = 5
var wg sync.WaitGroup
wg.Add(N)
for i := 0; i < N; i++ {
go func() {
defer wg.Done()
_, _ = ensureLocalImageWithOpts(context.Background(), "langgraph", opts)
}()
}
wg.Wait()
if buildCount != 1 {
t.Errorf("buildCount = %d, want 1 (lock should serialize concurrent calls)", buildCount)
}
}
// TestMaskTokenInURL — Gitea PATs in URLs must NEVER appear in logs.
func TestMaskTokenInURL(t *testing.T) {
cases := []struct {
in string
want string
}{
{"https://oauth2:secret123@git.example.com/foo/bar", "https://***@git.example.com/foo/bar"},
{"https://user:tok@host/path", "https://***@host/path"},
{"https://no-userinfo.example.com/path", "https://no-userinfo.example.com/path"},
{"not a url", "not a url"},
{"", ""},
}
for _, tc := range cases {
t.Run(tc.in, func(t *testing.T) {
got := maskTokenInURL(tc.in)
if got != tc.want {
t.Errorf("maskTokenInURL(%q) = %q, want %q", tc.in, got, tc.want)
}
})
}
}
// TestMaskTokenInString — defence against git/docker echoing the token
// into stderr on failure.
func TestMaskTokenInString(t *testing.T) {
got := maskTokenInString("error: clone https://oauth2:abc123@git.test/foo: failed", "abc123")
if strings.Contains(got, "abc123") {
t.Errorf("masked string %q still contains the token", got)
}
if !strings.Contains(got, "***") {
t.Errorf("masked string %q should have *** in place of token", got)
}
// No-op when token is empty.
if got := maskTokenInString("hello world", ""); got != "hello world" {
t.Errorf("empty token must not modify string, got %q", got)
}
}
// TestGiteaBranchAPIURL — the URL composer must produce the canonical
// /api/v1/repos/<org>/<repo>/branches/<branch> shape.
func TestGiteaBranchAPIURL(t *testing.T) {
cases := []struct {
prefix, runtime, branch, want string
}{
{
"https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-",
"claude-code",
"main",
"https://git.moleculesai.app/api/v1/repos/molecule-ai/molecule-ai-workspace-template-claude-code/branches/main",
},
{
"http://localhost:3000/myorg/template-",
"foo",
"main",
"http://localhost:3000/api/v1/repos/myorg/template-foo/branches/main",
},
}
for _, tc := range cases {
t.Run(tc.runtime, func(t *testing.T) {
got, err := giteaBranchAPIURL(tc.prefix, tc.runtime, tc.branch)
if err != nil {
t.Fatalf("err = %v", err)
}
if got != tc.want {
t.Errorf("got %q, want %q", got, tc.want)
}
})
}
}
// TestGiteaBranchAPIURL_RejectsMalformed — malformed prefixes (no org
// path) produce an error rather than a malformed API call.
func TestGiteaBranchAPIURL_RejectsMalformed(t *testing.T) {
for _, bad := range []string{
"https://example.com/", // no path component
"://broken",
} {
t.Run(bad, func(t *testing.T) {
if _, err := giteaBranchAPIURL(bad, "claude-code", "main"); err == nil {
t.Errorf("expected error for malformed prefix %q", bad)
}
})
}
}
// TestParseGiteaBranchHeadSha — pin the parser against representative
// Gitea responses so a future Gitea API rev that adds fields doesn't
// silently break detection.
func TestParseGiteaBranchHeadSha(t *testing.T) {
good := []byte(`{"name":"main","commit":{"id":"abc123def456","message":"hi"}}`)
got, err := parseGiteaBranchHeadSha(good)
if err != nil {
t.Fatalf("err = %v", err)
}
if got != "abc123def456" {
t.Errorf("got %q, want abc123def456", got)
}
for _, bad := range [][]byte{
[]byte(`{}`),
[]byte(`{"name":"main","commit":{}}`),
[]byte(`{"commit":{"id":"`), // truncated
[]byte(`<html>404</html>`),
} {
if _, err := parseGiteaBranchHeadSha(bad); err == nil {
t.Errorf("expected error for malformed body %q", string(bad))
}
}
}
// TestLocalImageTag_ShortSha — caller-supplied SHA gets truncated to
// 12 chars in the tag so `docker images` output stays readable.
func TestLocalImageTag_ShortSha(t *testing.T) {
got := LocalImageTag("claude-code", "abcdef0123456789abcdef0123456789abcdef01")
want := "molecule-local/workspace-template-claude-code:abcdef012345"
if got != want {
t.Errorf("got %q, want %q", got, want)
}
}
// TestLocalImageLatestTag — the floating alias used as the human-readable
// :latest entry.
func TestLocalImageLatestTag(t *testing.T) {
got := LocalImageLatestTag("hermes")
want := "molecule-local/workspace-template-hermes:latest"
if got != want {
t.Errorf("got %q, want %q", got, want)
}
}
// TestRemoteHeadShaProd_IncludesAuthHeader — when a token is configured,
// the API request must carry the `Authorization: token <pat>` header.
func TestRemoteHeadShaProd_IncludesAuthHeader(t *testing.T) {
var got string
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
got = r.Header.Get("Authorization")
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"commit":{"id":"deadbeef0000aaaa1111bbbb2222cccc33334444"}}`))
}))
defer srv.Close()
opts := makeTestOpts(t)
opts.RepoPrefix = srv.URL + "/myorg/template-"
opts.HTTPClient = srv.Client()
opts.Token = "secret-pat-do-not-log"
if _, err := remoteHeadShaProd(context.Background(), opts, "claude-code"); err != nil {
t.Fatalf("err = %v", err)
}
if got != "token secret-pat-do-not-log" {
t.Errorf("Authorization header = %q, want %q", got, "token secret-pat-do-not-log")
}
}
// TestCacheKey_Stable — the helper must be deterministic and incorporate
// each input.
func TestCacheKey_Stable(t *testing.T) {
a := CacheKey("claude-code", "abc", "https://git/")
b := CacheKey("claude-code", "abc", "https://git/")
if a != b {
t.Errorf("CacheKey is non-deterministic: %q vs %q", a, b)
}
if a == CacheKey("claude-code", "def", "https://git/") {
t.Errorf("CacheKey ignores sha")
}
if a == CacheKey("hermes", "abc", "https://git/") {
t.Errorf("CacheKey ignores runtime")
}
}
// TestRedactedRepoURL_NoToken — a repo URL with no embedded credential
// is unmodified.
func TestRedactedRepoURL_NoToken(t *testing.T) {
opts := &LocalBuildOptions{RepoPrefix: "https://git.example.com/org/template-"}
got := redactedRepoURL(opts, "claude-code")
want := "https://git.example.com/org/template-claude-code"
if got != want {
t.Errorf("got %q, want %q", got, want)
}
}
// TestRepoURL_AppendsRuntime — the prefix + runtime composer is stable.
func TestRepoURL_AppendsRuntime(t *testing.T) {
opts := &LocalBuildOptions{RepoPrefix: "https://git.example.com/org/template-"}
got := repoURL(opts, "claude-code")
if got != "https://git.example.com/org/template-claude-code" {
t.Errorf("got %q", got)
}
}
// TestNewDefaultLocalBuildOptions_RespectsEnvOverrides — the env var
// overrides documented in the runbook actually take effect.
func TestNewDefaultLocalBuildOptions_RespectsEnvOverrides(t *testing.T) {
t.Setenv("MOLECULE_LOCAL_BUILD_CACHE", "/var/tmp/molecule-test")
t.Setenv("MOLECULE_LOCAL_TEMPLATE_REPO_PREFIX", "https://my.fork/org/tpl-")
t.Setenv("MOLECULE_GITEA_TOKEN", "tok-from-env")
opts := newDefaultLocalBuildOptions()
if opts.CacheDir != "/var/tmp/molecule-test" {
t.Errorf("CacheDir = %q", opts.CacheDir)
}
if opts.RepoPrefix != "https://my.fork/org/tpl-" {
t.Errorf("RepoPrefix = %q", opts.RepoPrefix)
}
if opts.Token != "tok-from-env" {
t.Errorf("Token = %q", opts.Token)
}
if opts.Platform != "linux/amd64" {
t.Errorf("Platform = %q, want linux/amd64", opts.Platform)
}
}
// TestNewDefaultLocalBuildOptions_DefaultCacheDir — XDG-compliant
// fallback when nothing is overridden.
func TestNewDefaultLocalBuildOptions_DefaultCacheDir(t *testing.T) {
t.Setenv("MOLECULE_LOCAL_BUILD_CACHE", "")
t.Setenv("XDG_CACHE_HOME", "")
t.Setenv("MOLECULE_LOCAL_TEMPLATE_REPO_PREFIX", "")
opts := newDefaultLocalBuildOptions()
if !strings.Contains(opts.CacheDir, ".cache") && !strings.Contains(opts.CacheDir, "molecule") {
t.Errorf("CacheDir = %q, want one under .cache/molecule", opts.CacheDir)
}
if opts.RepoPrefix != gitTemplateRepoPrefix {
t.Errorf("RepoPrefix = %q, want default %q", opts.RepoPrefix, gitTemplateRepoPrefix)
}
}
// TestEnsureLocalImage_ShortSha — a remote that returns a too-short
// sha is rejected (defence against a misbehaving Gitea proxy).
func TestEnsureLocalImage_ShortSha(t *testing.T) {
opts := makeTestOpts(t)
opts.remoteHeadSha = func(ctx context.Context, opts *LocalBuildOptions, runtime string) (string, error) {
return "abc", nil
}
_, err := ensureLocalImageWithOpts(context.Background(), "claude-code", opts)
if err == nil {
t.Fatalf("expected error for short sha")
}
if !strings.Contains(err.Error(), "short sha") {
t.Errorf("error = %v, want short-sha message", err)
}
}
// TestEnsureLocalImage_StaleCacheDirCleaned — a partial clone left over
// from a previous failed run must not poison the next attempt.
func TestEnsureLocalImage_StaleCacheDirCleaned(t *testing.T) {
opts := makeTestOpts(t)
// Pre-create a stale dir at the cache target (with a partial Dockerfile).
staleDir := filepath.Join(opts.CacheDir, "claude-code", "abcdef012345")
if err := os.MkdirAll(staleDir, 0o755); err != nil {
t.Fatalf("setup: %v", err)
}
if err := os.WriteFile(filepath.Join(staleDir, "stale-marker"), []byte("delete me"), 0o644); err != nil {
t.Fatalf("setup: %v", err)
}
if _, err := ensureLocalImageWithOpts(context.Background(), "claude-code", opts); err != nil {
t.Fatalf("err = %v", err)
}
if _, err := os.Stat(filepath.Join(staleDir, "stale-marker")); !os.IsNotExist(err) {
t.Errorf("stale-marker should have been wiped before re-clone (err=%v)", err)
}
// Dockerfile from the new clone should be present.
if _, err := os.Stat(filepath.Join(staleDir, "Dockerfile")); err != nil {
t.Errorf("expected Dockerfile from re-clone, got err=%v", err)
}
}
// TestEnsureLocalImage_ContextCancelled — context cancellation
// propagates to the network/clone seams (best-effort: the test asserts
// that no work happens after Done()).
func TestEnsureLocalImage_ContextCancelled(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
cancel()
opts := makeTestOpts(t)
opts.remoteHeadSha = func(ctx context.Context, opts *LocalBuildOptions, runtime string) (string, error) {
if err := ctx.Err(); err != nil {
return "", err
}
return "deadbeef00000000aaaa1111bbbb2222cccc33334444", nil
}
_, err := ensureLocalImageWithOpts(ctx, "claude-code", opts)
if err == nil {
t.Fatalf("expected error from cancelled context")
}
}
// TestEnsureLocalImage_RetagAfterCacheHit — a cache-hit must refresh
// the floating :latest alias so admins inspecting `docker images` see
// the current SHA.
func TestEnsureLocalImage_RetagAfterCacheHit(t *testing.T) {
opts := makeTestOpts(t)
var src, dst string
opts.dockerHasTag = func(ctx context.Context, tag string) (bool, error) { return true, nil }
opts.dockerTag = func(ctx context.Context, s, d string) error {
src, dst = s, d
return nil
}
tag, err := ensureLocalImageWithOpts(context.Background(), "claude-code", opts)
if err != nil {
t.Fatalf("err = %v", err)
}
if src != tag {
t.Errorf("retag src = %q, want %q", src, tag)
}
wantDst := "molecule-local/workspace-template-claude-code:latest"
if dst != wantDst {
t.Errorf("retag dst = %q, want %q", dst, wantDst)
}
}
// TestRemoteHeadShaProd_BodyOverflow — defence against a malicious or
// misbehaving Gitea returning a multi-MB body.
func TestRemoteHeadShaProd_BodyOverflow(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
// Stream a body that overruns the 64KB cap. The reader should stop
// at the cap and yield a parse error rather than buffer an
// unbounded response.
_, _ = w.Write([]byte(`{"commit":{"id":"`))
_, _ = w.Write([]byte(strings.Repeat("a", 64<<10))) // 64KB of 'a'
// Body ends here — we never write the closing quote.
}))
defer srv.Close()
opts := makeTestOpts(t)
opts.RepoPrefix = srv.URL + "/myorg/template-"
opts.HTTPClient = srv.Client()
_, err := remoteHeadShaProd(context.Background(), opts, "claude-code")
if err == nil {
t.Fatalf("expected error from over-long sha (no closing quote within cap)")
}
}
// TestProvisionerStartUsesLocalBuild_LocalMode — pin the provisioner→
// local-build wiring at the integration boundary. We don't want a future
// refactor to silently bypass EnsureLocalImage when registry is unset.
//
// This test inspects the mode-decision logic without standing up Docker.
func TestProvisionerStartUsesLocalBuild_LocalMode(t *testing.T) {
t.Setenv("MOLECULE_IMAGE_REGISTRY", "")
src := Resolve()
if src.Mode != RegistryModeLocal {
t.Fatalf("Resolve in unset env = %q, want local", src.Mode)
}
// The provisioner Start() branches on this same Resolve() call before
// reaching ContainerCreate. Pinning the boolean here means a refactor
// that flips the sense (e.g. `if src.Mode == RegistryModeSaaS`) is
// caught by this test.
}
// TestEnsureLocalImageHook_DefaultIsRealFunction — pin that the
// production hook points at EnsureLocalImage. Tests that swap the hook
// must restore it via t.Cleanup; this test catches a leaked override.
func TestEnsureLocalImageHook_DefaultIsRealFunction(t *testing.T) {
// Sanity: hook is set to a non-nil function. We can't compare
// function pointers directly with == in Go (compiler error), so
// we exercise it instead — but we don't want to actually clone
// from the network in the unit test, so use an unknown runtime
// and assert the known-error path runs.
_, err := ensureLocalImageHook(context.Background(), "this-runtime-cannot-exist-194")
if err == nil {
t.Fatalf("expected error from EnsureLocalImage on unknown runtime")
}
if !strings.Contains(err.Error(), "unknown runtime") {
t.Errorf("hook = unexpected function (got error %q, want one mentioning unknown runtime)", err.Error())
}
}
// TestProvisionerStartUsesLocalBuild_SaaSMode — and the symmetric guard:
// in SaaS-mode, no local-build path runs.
func TestProvisionerStartUsesLocalBuild_SaaSMode(t *testing.T) {
t.Setenv("MOLECULE_IMAGE_REGISTRY", "registry.example.com/molecule-ai")
src := Resolve()
if src.Mode != RegistryModeSaaS {
t.Fatalf("Resolve with registry set = %q, want saas", src.Mode)
}
if src.Prefix != "registry.example.com/molecule-ai" {
t.Fatalf("Prefix = %q", src.Prefix)
}
}
// Keep the fmt import alive even if no test above uses it directly —
// an unused import is a compile error in Go.
var _ = fmt.Sprintf

View File

@ -320,6 +320,26 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
image := selectImage(cfg)
// Local-build mode (issue #63 / Task #194): when MOLECULE_IMAGE_REGISTRY
// is unset, the OSS contributor path skips the registry pull entirely
// and instead clones the workspace-template-<runtime> repo from Gitea
// + `docker build`s it locally. Replace the placeholder image ref with
// the SHA-pinned tag of the freshly-built image before ContainerCreate.
//
// Pinned overrides (cfg.Image set, e.g. via runtime_image_pins for
// production thin-AMI launches) bypass this path — they pin a digest
// the operator chose explicitly.
if cfg.Image == "" && cfg.Runtime != "" {
if src := Resolve(); src.Mode == RegistryModeLocal {
builtTag, buildErr := ensureLocalImageHook(ctx, cfg.Runtime)
if buildErr != nil {
return "", fmt.Errorf("local-build mode: ensure image for runtime %q: %w", cfg.Runtime, buildErr)
}
image = builtTag
log.Printf("Provisioner: local-build mode → using locally-built image %s for runtime %s", image, cfg.Runtime)
}
}
containerCfg := &container.Config{
Image: image,
Env: env,

View File

@ -0,0 +1,96 @@
package provisioner
import "os"
// localImagePrefix is the synthetic registry hostname used for images
// that the local-build path produces. It is intentionally NOT a real
// hostname — Docker won't try to pull it from the network (no DNS
// resolution path), and the workspace-image-refresh / image-watch
// paths short-circuit on it.
//
// Tag scheme: `molecule-local/workspace-template-<runtime>:<tag>` where
// `<tag>` is either the 12-char Gitea HEAD sha for SHA-pinned references
// or the moving `:latest` for human inspection (the provisioner
// consumes the SHA-pinned form via EnsureLocalImage()).
//
// Issue #63 / Task #194.
const localImagePrefix = "molecule-local"
// RegistryMode classifies how the provisioner sources workspace-template
// container images. The two modes are mutually exclusive and selected
// by presence/absence of the MOLECULE_IMAGE_REGISTRY env var (Q2 design
// lock, 2026-05-07): set ⇒ SaaS-mode pull; unset ⇒ local-build mode.
//
// Discriminated value rather than a bare string return so every call
// site that decides on image source has to acknowledge the two modes —
// a bare string returning `""` on local-mode would silently produce
// malformed image refs (e.g. `/workspace-template-foo:latest`).
type RegistryMode string
const (
// RegistryModeSaaS — pull workspace-template-* images from a real
// container registry whose URL is in `MOLECULE_IMAGE_REGISTRY`.
// Used by every prod tenant (env injected via Railway / EC2
// user-data) and any self-hosted operator who has mirrored the
// images to their own GHCR/ECR/Harbor.
RegistryModeSaaS RegistryMode = "saas"
// RegistryModeLocal — clone the workspace-template-<runtime> repo
// from Gitea
// (`https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-<runtime>`)
// and `docker build` the image locally. Used by OSS contributors
// who run `go run ./workspace-server/cmd/server` without setting
// MOLECULE_IMAGE_REGISTRY. Closes the post-2026-05-06 GHCR-403 gap
// (Task #194 / Issue #63).
RegistryModeLocal RegistryMode = "local"
)
// RegistrySource is the SSOT for image-resolution decisions. Returned
// by Resolve(); read by:
// - the provisioner Start() path — branches on Mode for clone+build
// vs pull
// - admin_workspace_images.go — skips remote pull in local mode
// - imagewatch.Watcher — short-circuits in local mode (no GHCR poll)
//
// SaaS-mode .Prefix matches the existing RegistryPrefix() return value;
// local-mode .Prefix is the synthetic `molecule-local`.
type RegistrySource struct {
Mode RegistryMode
Prefix string
}
// Resolve inspects the runtime environment and returns the image-source
// classification. Treats both unset AND empty-string MOLECULE_IMAGE_REGISTRY
// as "local mode" — an operator who set the var to "" via a misconfigured
// deploy would otherwise silently get malformed image refs in SaaS-mode;
// instead they get the local-build path, which fails loudly if the host
// has no Docker daemon (better blast radius).
//
// Mirrors the existing RegistryPrefix() empty-string handling, so the two
// functions agree on every input.
func Resolve() RegistrySource {
if v := os.Getenv("MOLECULE_IMAGE_REGISTRY"); v != "" {
return RegistrySource{Mode: RegistryModeSaaS, Prefix: v}
}
return RegistrySource{Mode: RegistryModeLocal, Prefix: localImagePrefix}
}
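// Consumer sketch (mirrors the provisioner Start() branch; the other
// readers listed on RegistrySource branch the same way — illustrative
// only, and the SaaS ref shape shown is an assumption here):
//
//	if src := Resolve(); src.Mode == RegistryModeLocal {
//		image, err = EnsureLocalImage(ctx, runtime) // clone + docker build
//	} else {
//		image = src.Prefix + "/workspace-template-" + runtime + ":latest"
//	}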
// IsKnownRuntime reports whether the given runtime name is in the
// canonical knownRuntimes list. Exposed so the local-build path can
// refuse to clone arbitrary repo paths supplied via cfg.Runtime —
// defence-in-depth against a future code path that might let an
// attacker influence the runtime string before it reaches the build
// code.
func IsKnownRuntime(runtime string) bool {
for _, r := range knownRuntimes {
if r == runtime {
return true
}
}
return false
}
// LocalImagePrefix returns the synthetic registry hostname used by the
// local-build path. Exposed so handlers that need to branch on "is
// this a local-built image?" don't have to duplicate the constant.
func LocalImagePrefix() string { return localImagePrefix }

View File

@ -0,0 +1,152 @@
package provisioner
import (
"strings"
"testing"
)
// Tests for the new mode-detection surface. The legacy RegistryPrefix()
// shim is covered by registry_test.go; these tests pin the explicit
// two-mode discriminated return from Resolve().
// TestResolve_LocalModeWhenRegistryUnset — the OSS-contributor default.
// Issue #63: with MOLECULE_IMAGE_REGISTRY unset, the provisioner must
// switch to the local-build path instead of trying to pull from a GHCR
// org that's been suspended.
func TestResolve_LocalModeWhenRegistryUnset(t *testing.T) {
t.Setenv("MOLECULE_IMAGE_REGISTRY", "")
got := Resolve()
if got.Mode != RegistryModeLocal {
t.Errorf("Mode = %q, want %q (unset registry → local-build)", got.Mode, RegistryModeLocal)
}
if got.Prefix != localImagePrefix {
t.Errorf("Prefix = %q, want %q", got.Prefix, localImagePrefix)
}
}
// TestResolve_SaaSModeWhenRegistrySet — production tenants set the var
// to their ECR mirror; we must keep producing pull-style image refs.
func TestResolve_SaaSModeWhenRegistrySet(t *testing.T) {
const ecr = "123456789012.dkr.ecr.us-east-2.amazonaws.com/molecule-ai"
t.Setenv("MOLECULE_IMAGE_REGISTRY", ecr)
got := Resolve()
if got.Mode != RegistryModeSaaS {
t.Errorf("Mode = %q, want %q (set registry → saas)", got.Mode, RegistryModeSaaS)
}
if got.Prefix != ecr {
t.Errorf("Prefix = %q, want %q", got.Prefix, ecr)
}
}
// TestResolve_EmptyEnvIsLocalMode — an operator who sets the var to ""
// via a misconfigured deploy must NOT silently get malformed image refs;
// they get the local path instead, which fails loudly if Docker is
// missing. This contract is the safer-blast-radius half of Issue #63.
func TestResolve_EmptyEnvIsLocalMode(t *testing.T) {
	t.Setenv("MOLECULE_IMAGE_REGISTRY", "")
	if got := Resolve().Mode; got != RegistryModeLocal {
		t.Fatalf("empty MOLECULE_IMAGE_REGISTRY should be local-mode, got %q", got)
	}
}

// TestResolve_GarbageURL — a registry value that's syntactically malformed
// (e.g. `not-a-url`, `foo bar`) is still treated as SaaS-mode. The whole
// design of MOLECULE_IMAGE_REGISTRY is "operator-supplied trusted value";
// validating the URL here would be pretending we can prevent operator
// error. The downstream docker-pull will fail loudly with a registry-
// shaped error message, which is the right blast radius.
func TestResolve_GarbageURLStillSaaSMode(t *testing.T) {
for _, garbage := range []string{
"not-a-url",
"http://",
"ghcr.io/",
" ",
"\thello\n",
} {
t.Run(garbage, func(t *testing.T) {
t.Setenv("MOLECULE_IMAGE_REGISTRY", garbage)
			if got := Resolve().Mode; got != RegistryModeSaaS {
				t.Errorf("Mode = %q, want saas (any non-empty value is SaaS-mode by design)", got)
			}
		})
	}
}

// TestRegistryPrefix_AlignedWithResolve — the back-compat shim must
// agree with Resolve().Prefix on every input the new code distinguishes.
func TestRegistryPrefix_AlignedWithResolve(t *testing.T) {
cases := []struct {
name string
env string
}{
{"unset", ""},
{"ecr", "999999999999.dkr.ecr.us-east-2.amazonaws.com/molecule-ai"},
{"harbor", "harbor.example.com/molecule"},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
t.Setenv("MOLECULE_IMAGE_REGISTRY", tc.env)
gotPrefix := RegistryPrefix()
gotResolve := Resolve().Prefix
// Note: with the new design, RegistryPrefix() unset returns
// the SaaS GHCR default (legacy back-compat) while
// Resolve().Prefix returns the local-mode "molecule-local"
// hostname. They DIVERGE on the unset path by design — that
// divergence is what closes the GHCR-403 hole. Pin both so a
// future refactor can't accidentally re-couple them.
if tc.env == "" {
if gotPrefix != defaultRegistryPrefix {
t.Errorf("RegistryPrefix() = %q, want %q (legacy shim)", gotPrefix, defaultRegistryPrefix)
}
if gotResolve != localImagePrefix {
t.Errorf("Resolve().Prefix = %q, want %q (local-build hostname)", gotResolve, localImagePrefix)
}
} else {
if gotPrefix != tc.env {
t.Errorf("RegistryPrefix() = %q, want %q", gotPrefix, tc.env)
}
if gotResolve != tc.env {
t.Errorf("Resolve().Prefix = %q, want %q", gotResolve, tc.env)
}
}
})
}
}

// TestIsKnownRuntime — defence-in-depth guard for the local-build path.
// Must accept every entry in knownRuntimes and reject anything else.
func TestIsKnownRuntime(t *testing.T) {
for _, rt := range knownRuntimes {
if !IsKnownRuntime(rt) {
t.Errorf("IsKnownRuntime(%q) = false, want true", rt)
}
}
for _, bad := range []string{
"", "unknown", "WORKSPACE-TEMPLATE-FAKE", "../../../etc/passwd",
"langgraph;rm -rf /", "claude-code\n", " langgraph",
} {
if IsKnownRuntime(bad) {
t.Errorf("IsKnownRuntime(%q) = true, want false (untrusted input)", bad)
}
}
}

// TestLocalImagePrefix_Stable — the synthetic prefix is part of the
// public surface; admin handlers and image-watch use it to short-circuit
// network calls. Pin the constant.
func TestLocalImagePrefix_Stable(t *testing.T) {
if got := LocalImagePrefix(); got != "molecule-local" {
t.Errorf("LocalImagePrefix() = %q, want %q", got, "molecule-local")
}
}

// TestLocalImagePrefix_NoDots — the synthetic hostname must not contain
// a `.`. Docker's reference parser only treats the first path component
// of an image ref as a real, DNS-resolvable registry host when it
// contains a dot or colon (or is exactly "localhost"); dot-free
// `molecule-local` is parsed as a plain namespace, so locally tagged
// images resolve from the local store and no phantom-registry DNS
// lookup can ever happen on a pull.
func TestLocalImagePrefix_NoDots(t *testing.T) {
	if got := LocalImagePrefix(); strings.Contains(got, ".") {
		t.Errorf("LocalImagePrefix() = %q contains '.': Docker would treat it as a registry host and try DNS resolution", got)
	}
}
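
// TestLocalImagePrefix_RefShape — illustrative companion to NoDots; the
// image name and tag below are hypothetical. A ref built from the
// synthetic prefix has a dot-free, colon-free first component, which
// Docker's reference parser reads as a namespace, never a registry host.
func TestLocalImagePrefix_RefShape(t *testing.T) {
	ref := LocalImagePrefix() + "/workspace-server:dev"
	first := strings.Split(ref, "/")[0]
	if strings.ContainsAny(first, ".:") {
		t.Errorf("first ref component %q would parse as a registry host", first)
	}
}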