diff --git a/.gitea/workflows/sop-tier-check.yml b/.gitea/workflows/sop-tier-check.yml new file mode 100644 index 00000000..db720f2f --- /dev/null +++ b/.gitea/workflows/sop-tier-check.yml @@ -0,0 +1,170 @@ +# sop-tier-check — canonical Gitea Actions workflow for §SOP-6 enforcement. +# +# Copy this file to `.gitea/workflows/sop-tier-check.yml` in any repo that +# wants the §SOP-6 PR gate enforced. Pair with branch protection on the +# protected branch: +# required_status_checks: ["sop-tier-check"] +# required_approving_reviews: 1 +# approving_review_teams: ["ceo", "managers", "engineers"] +# +# What it does: +# 1. Reads the PR's `tier:*` label (low | medium | high). Fails if absent +# or ambiguous. +# 2. Reads every approving review on the PR. +# 3. For each approver, queries Gitea team membership. +# 4. Marks the check success only if at least one approver is in a team +# whose tier-tag covers the PR's tier label, AND the approver is not +# the author. +# +# Tier → eligible-team mapping (mirror of dev-sop §SOP-6): +# tier:low → engineers, managers, ceo +# tier:medium → managers, ceo +# tier:high → ceo +# +# Author identity is excluded automatically; Gitea's review system already +# rejects self-reviews, but this workflow re-checks defensively in case the +# native rule is bypassed (admin override, branch-protection edit, etc.). +# +# Force-merge: Owners-team override remains available out-of-band via the +# Gitea merge API; force-merge writes `incident.force_merge` to +# structure_events per §Persistent structured logging gate (Phase 3). + +name: sop-tier-check + +on: + pull_request: + types: [opened, edited, synchronize, reopened, labeled, unlabeled] + pull_request_review: + types: [submitted, dismissed, edited] + +jobs: + tier-check: + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + steps: + - name: Verify tier label + reviewer team membership + env: + # SOP_TIER_CHECK_TOKEN is the read-only `sop-tier-bot` PAT, + # provisioned with read:org scope and added to ceo/managers/ + # engineers teams (a Gitea team-membership probe requires the + # caller to be a member of the team being probed). The auto- + # injected GITHUB_TOKEN's scope is repo-level only and cannot + # query org team membership, hence the dedicated secret. + # Falls back to GITHUB_TOKEN so the workflow at least starts and + # surfaces a clear error when the secret is missing. + GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }} + GITEA_HOST: git.moleculesai.app + REPO: ${{ github.repository }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PR_AUTHOR: ${{ github.event.pull_request.user.login }} + run: | + set -euo pipefail + if [ -z "${GITEA_TOKEN:-}" ]; then + echo "::error::Neither GITEA_TOKEN nor GITHUB_TOKEN is available. Add a GITEA_TOKEN secret with org-membership read scope to enable team-based approval gating." + exit 1 + fi + OWNER="${REPO%%/*}" + NAME="${REPO##*/}" + API="https://${GITEA_HOST}/api/v1" + AUTH="Authorization: token ${GITEA_TOKEN}" + echo "::notice::tier-check start: repo=$OWNER/$NAME pr=$PR_NUMBER author=$PR_AUTHOR" + # Sanity-check the token resolves a user; surfaces token-scope problems + # early instead of failing on a downstream call with no context. + WHOAMI=$(curl -sS -H "$AUTH" "${API}/user" | jq -r '.login // ""') + if [ -z "$WHOAMI" ]; then + echo "::error::GITEA_TOKEN cannot resolve a user via /api/v1/user — check the token scope and that the secret is wired correctly." 
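#
# The same sanity probe works from any workstation before the secret is
# wired in — a minimal sketch (the PAT value is a placeholder; jq assumed):
# ```
# TOKEN="<candidate-sop-tier-bot-pat>"
# curl -sS -H "Authorization: token ${TOKEN}" \
#   "https://git.moleculesai.app/api/v1/user" | jq -r '.login // "rejected"'
# ```
# A healthy token should print `sop-tier-bot`; a rotated or revoked one
# prints `rejected` (Gitea's error body carries no `.login` field).
#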
+ exit 1 + fi + echo "::notice::token resolves to user: $WHOAMI" + + # 1. Read tier label + LABELS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/issues/${PR_NUMBER}/labels" | jq -r '.[].name') + TIER="" + for L in $LABELS; do + case "$L" in + tier:low|tier:medium|tier:high) + if [ -n "$TIER" ]; then + echo "::error::Multiple tier labels: $TIER + $L. Apply exactly one." + exit 1 + fi + TIER="$L" + ;; + esac + done + if [ -z "$TIER" ]; then + echo "::error::PR has no tier:low|tier:medium|tier:high label. Apply one before merge." + exit 1 + fi + echo "tier=$TIER" + + # 2. Tier → eligible teams + case "$TIER" in + tier:low) ELIGIBLE="engineers managers ceo" ;; + tier:medium) ELIGIBLE="managers ceo" ;; + tier:high) ELIGIBLE="ceo" ;; + esac + echo "eligible_teams=$ELIGIBLE" + + # Resolve team-name → team-id once. The /orgs/{org}/teams/{slug}/... + # endpoints don't exist on Gitea 1.22; we have to use /teams/{id}. + # Fail loud on missing team rather than treating it as "user not in + # team" — that'd mask a misconfigured deployment. + ORG_TEAMS_FILE=$(mktemp) + HTTP_CODE=$(curl -sS -o "$ORG_TEAMS_FILE" -w '%{http_code}' -H "$AUTH" \ + "${API}/orgs/${OWNER}/teams") + echo "teams-list HTTP=$HTTP_CODE size=$(wc -c <"$ORG_TEAMS_FILE")" + echo "teams-list body (first 300 chars):" + head -c 300 "$ORG_TEAMS_FILE"; echo + if [ "$HTTP_CODE" != "200" ]; then + echo "::error::GET /orgs/${OWNER}/teams returned HTTP $HTTP_CODE — token likely lacks read:org scope. Add a SOP_TIER_CHECK_TOKEN secret with read:organization scope." + exit 1 + fi + declare -A TEAM_ID + for T in $ELIGIBLE; do + ID=$(jq -r --arg t "$T" '.[] | select(.name==$t) | .id' <"$ORG_TEAMS_FILE" | head -1) + if [ -z "$ID" ] || [ "$ID" = "null" ]; then + VISIBLE=$(jq -r '.[]?.name? // empty' <"$ORG_TEAMS_FILE" 2>/dev/null | tr '\n' ' ') + echo "::error::Team \"$T\" not found in org $OWNER. Teams visible: $VISIBLE" + exit 1 + fi + TEAM_ID[$T]="$ID" + echo "team-id: $T → $ID" + done + + # 3. Read approving reviewers + REVIEWS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}/reviews") + APPROVERS=$(echo "$REVIEWS" | jq -r '[.[] | select(.state=="APPROVED") | .user.login] | unique | .[]') + if [ -z "$APPROVERS" ]; then + echo "::error::No approving reviews. Tier $TIER requires approval from {$ELIGIBLE} (non-author)." + exit 1 + fi + echo "approvers: $(echo $APPROVERS | tr '\n' ' ')" + + # 4. For each approver: check non-author + team membership (by id) + OK="" + for U in $APPROVERS; do + if [ "$U" = "$PR_AUTHOR" ]; then + echo "skip self-review by $U" + continue + fi + for T in $ELIGIBLE; do + ID="${TEAM_ID[$T]}" + CODE=$(curl -sS -o /dev/null -w '%{http_code}' -H "$AUTH" \ + "${API}/teams/${ID}/members/${U}") + echo " probe: $U in team $T (id=$ID) → HTTP $CODE" + if [ "$CODE" = "200" ] || [ "$CODE" = "204" ]; then + echo "::notice::approver $U is in team $T (eligible for $TIER)" + OK="yes" + break + fi + done + [ -n "$OK" ] && break + done + + if [ -z "$OK" ]; then + echo "::error::Tier $TIER requires approval from a non-author member of {$ELIGIBLE}. Got approvers: $APPROVERS — none of them satisfied team membership (probe HTTP codes above)." 
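#
# When this branch fires unexpectedly, the membership probe can be
# replayed by hand — a sketch with a placeholder team id and username:
# ```
# API="https://git.moleculesai.app/api/v1"
# curl -sS -o /dev/null -w '%{http_code}\n' \
#   -H "Authorization: token ${GITEA_TOKEN}" \
#   "${API}/teams/42/members/alice"        # 42 / alice are hypothetical
# ```
# Gitea should answer 200/204 for a member and 404 for a non-member —
# the same codes the probe loop above treats as eligible vs. not.
#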
+            exit 1
+          fi
+          echo "::notice::sop-tier-check passed: $TIER, approver in {$ELIGIBLE}"
diff --git a/.github/workflows/auto-promote-on-e2e.yml b/.github/workflows/auto-promote-on-e2e.yml
deleted file mode 100644
index a4daef2b..00000000
--- a/.github/workflows/auto-promote-on-e2e.yml
+++ /dev/null
@@ -1,467 +0,0 @@
-name: Auto-promote :latest after main image build
-
-# Retags `ghcr.io/molecule-ai/{platform,platform-tenant}:staging-<short-sha>`
-# → `:latest` after either the image build or E2E completes on a `main`
-# push, gated on E2E Staging SaaS not being red for that SHA.
-#
-# Why two triggers:
-#
-# `publish-workspace-server-image` and `e2e-staging-saas` are both
-# paths-filtered, but with DIFFERENT path sets:
-#
-#   publish-workspace-server-image:
-#     workspace-server/**, canvas/**, manifest.json
-#
-#   e2e-staging-saas (full lifecycle):
-#     workspace-server/internal/handlers/{registry,workspace_provision,
-#     a2a_proxy}.go, workspace-server/internal/middleware/**,
-#     workspace-server/internal/provisioner/**, tests/e2e/test_staging_full_saas.sh
-#
-# The E2E set is a strict SUBSET of the publish set. So:
-#   - canvas/** changes → publish fires, E2E does not
-#   - workspace-server/cmd/** changes → publish fires, E2E does not
-#   - workspace-server/internal/sweep/** → publish fires, E2E does not
-#
-# The previous version triggered ONLY on E2E completion, which meant
-# non-E2E-path changes (canvas, cmd, sweep, etc.) rebuilt the image
-# but never advanced `:latest`. Result: as of 2026-04-28 this workflow
-# had run zero times since merge despite eight main pushes — `:latest`
-# was ~7 hours / 9 PRs behind main with no human realising. See
-# `molecule-core` Slack discussion 2026-04-28.
-#
-# Adding `publish-workspace-server-image` as a second trigger closes
-# the gap: any image rebuild on main becomes eligible to advance `:latest`.
-#
-# Why E2E remains a kill-switch (not the trigger):
-#
-# When E2E DID run for this SHA and ended red, we abort — `:latest`
-# stays on the prior known-good digest. When E2E didn't run (paths
-# filtered out), we proceed: pre-merge gates already validated this
-# SHA on staging via auto-promote-staging requiring CI + E2E Canvas +
-# E2E API + CodeQL all green. Image content for non-E2E-paths
-# (canvas, cmd, sweep) is exercised by those staging gates.
-#
-# Why `main` only:
-#
-# `:latest` is what prod tenants pull. We only want SHAs that have
-# reached main (via auto-promote-staging) to advance `:latest`.
-# Triggering on staging would let a staging-only revert advance
-# `:latest` to a SHA that never reaches main, breaking the "production
-# runs what's on main" invariant.
-#
-# Idempotency:
-#
-# When a SHA touches paths that match BOTH publish and E2E, both
-# workflows fire and complete. Both trigger this workflow on
-# completion → two runs race. Both retag `:staging-<short-sha>` →
-# `:latest`. crane tag is idempotent (re-tagging the same digest is a
-# no-op), so the second run is harmless. concurrency group serializes
-# them anyway.
-
-on:
-  workflow_run:
-    workflows:
-      - 'E2E Staging SaaS (full lifecycle)'
-      - 'publish-workspace-server-image'
-    types: [completed]
-    branches: [main]
-  workflow_dispatch:
-    inputs:
-      sha:
-        description: 'Short sha to promote (override; defaults to upstream workflow_run head_sha)'
-        required: false
-        type: string
-
-permissions:
-  contents: read
-  packages: write
-
-concurrency:
-  # Serialize promotes per-SHA so the publish+E2E both-fired race lands
-  # cleanly. Different SHAs can promote in parallel.
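#
# The idempotency note above is cheap to verify with crane (image ref
# and short sha are illustrative):
# ```
# IMG="ghcr.io/molecule-ai/platform"
# crane tag "${IMG}:staging-abc1234" latest   # first promote run
# d1=$(crane digest "${IMG}:latest")
# crane tag "${IMG}:staging-abc1234" latest   # racing duplicate run
# d2=$(crane digest "${IMG}:latest")
# [ "$d1" = "$d2" ] && echo "second retag was a no-op"
# ```
#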
- group: auto-promote-latest-${{ github.event.workflow_run.head_sha || github.event.inputs.sha || github.sha }} - cancel-in-progress: false - -env: - IMAGE_NAME: ghcr.io/molecule-ai/platform - TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant - -jobs: - promote: - # Proceed if upstream succeeded OR manual dispatch. Upstream-failure - # paths are filtered here; the E2E-was-red kill-switch lives in the - # gate-check step below (covers the case where upstream is publish - # success but E2E for the same SHA failed). - if: | - github.event_name == 'workflow_dispatch' || - (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') - runs-on: ubuntu-latest - steps: - - name: Compute short sha - id: sha - run: | - set -euo pipefail - if [ -n "${{ github.event.inputs.sha }}" ]; then - FULL="${{ github.event.inputs.sha }}" - else - FULL="${{ github.event.workflow_run.head_sha }}" - fi - echo "short=${FULL:0:7}" >> "$GITHUB_OUTPUT" - echo "full=${FULL}" >> "$GITHUB_OUTPUT" - - - name: Gate — E2E Staging SaaS state for this SHA - # When upstream IS E2E success, we know it's green (filtered by - # the job-level `if` already). When upstream is publish, look up - # E2E state for the same SHA. Four buckets: - # - # - completed/success: E2E confirmed safe → proceed - # - completed/failure|cancelled|timed_out: E2E found a - # regression → ABORT (exit 1), `:latest` stays put - # - in_progress|queued|requested: E2E is RACING with publish - # for a runtime-touching SHA. publish typically completes - # ~5-10min before E2E (~10-15min). If we promote on the - # publish signal here, a later E2E failure can't roll back - # `:latest` — it'd already be wrongly advanced. So we DEFER: - # skip subsequent steps (proceed=false) and let E2E's own - # completion event re-fire this workflow, which then takes - # the upstream-is-E2E path. exit 0 so the run shows as - # success rather than a noisy fake-failure. - # - none/none: E2E was paths-filtered out for this SHA (the - # change touched canvas/cmd/sweep/etc. — paths covered by - # publish but not by E2E). pre-merge gates on staging - # already validated this SHA → proceed. - # - # Manual dispatch skips this check — operator override. - id: gate - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPO: ${{ github.repository }} - SHA: ${{ steps.sha.outputs.full }} - UPSTREAM_NAME: ${{ github.event.workflow_run.name }} - EVENT_NAME: ${{ github.event_name }} - run: | - set -euo pipefail - - if [ "$EVENT_NAME" = "workflow_dispatch" ]; then - echo "proceed=true" >> "$GITHUB_OUTPUT" - echo "::notice::Manual dispatch — skipping E2E gate (operator override)" - exit 0 - fi - - if [ "$UPSTREAM_NAME" = "E2E Staging SaaS (full lifecycle)" ]; then - echo "proceed=true" >> "$GITHUB_OUTPUT" - echo "::notice::Upstream is E2E itself (success per job-level if) — gate trivially satisfied" - exit 0 - fi - - # Upstream is publish-workspace-server-image. Check E2E state - # for the same SHA via Gitea's commit-status API. - # - # GitHub-era this was `gh run list --workflow=X --commit=SHA - # --json status,conclusion` returning either `[]` (no run on - # this SHA) or `[{status, conclusion}]` (the run's state). - # Gitea has NO workflow-runs API at all — `/api/v1/repos/.../ - # actions/runs` returns 404 (verified 2026-05-07, issue #75). 
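#
# The missing-endpoint claim is easy to re-verify against a Gitea 1.22
# host (repo path illustrative; token env assumed):
# ```
# R="https://git.moleculesai.app/api/v1/repos/molecule-ai/molecule-core"
# H="Authorization: token ${GH_TOKEN}"
# curl -sS -o /dev/null -w 'actions/runs    → %{http_code}\n' -H "$H" "$R/actions/runs"
# curl -sS -o /dev/null -w 'commit statuses → %{http_code}\n' -H "$H" \
#   "$R/commits/$(git rev-parse HEAD)/statuses?limit=1"
# ```
# The first call should print 404, the second 200.
#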
-          # However Gitea Actions DOES emit a commit status per workflow
-          # job, with `context = "<workflow name> / <job name> (<event>)"`,
-          # which is exactly what we need: each E2E run leg becomes one
-          # status row on the SHA, and the aggregate state encodes the
-          # run's outcome.
-          #
-          # Mapping:
-          #   0 matched contexts          → "none/none"         (E2E paths-filtered out — same semantic as before)
-          #   any context = pending       → "in_progress/none"  (defer)
-          #   any context = error|failure → "completed/failure" (abort)
-          #   all contexts = success      → "completed/success" (proceed)
-          #
-          # The "completed/cancelled" and "completed/timed_out" buckets
-          # don't have direct Gitea analogs (Gitea statuses are
-          # success / failure / error / pending / warning). Per-SHA
-          # concurrency cancellation surfaces as `error` on Gitea, which
-          # we map to "completed/failure" rather than "completed/cancelled"
-          # — losing the soft-defer semantic of the cancelled bucket on
-          # this fleet. Tradeoff: the staleness alarm (auto-promote-stale-
-          # alarm.yml) still catches a stuck :latest within 4h, and a
-          # legitimate cancel is rare enough that aborting + manual
-          # re-dispatch is acceptable. If we measure cancel frequency
-          # > 1/week, revisit by reading the run-step-summary text via
-          # a follow-up script.
-          #
-          # Network or auth blips collapse to "none/none" via the curl
-          # `|| echo "[]"` fallback, matching the pre-Gitea behaviour
-          # where an empty list also degenerated to none/none.
-          GITEA_API_URL="${GITHUB_SERVER_URL:-https://git.moleculesai.app}/api/v1"
-          STATUSES_JSON=$(curl --fail-with-body -sS \
-            -H "Authorization: token ${GH_TOKEN}" \
-            -H "Accept: application/json" \
-            "${GITEA_API_URL}/repos/${REPO}/commits/${SHA}/statuses?limit=100" \
-            2>/dev/null || echo "[]")
-          RESULT=$(printf '%s' "$STATUSES_JSON" | jq -r '
-            # Filter to E2E Staging SaaS (full lifecycle) statuses.
-            # Match by leading workflow-name prefix so the "<job name>
-            # (<event>)" tail is irrelevant. Gitea emits the workflow
-            # name verbatim from the YAML `name:` field.
-            [.[] | select(.context | startswith("E2E Staging SaaS (full lifecycle) /"))] as $rows
-            | if ($rows | length) == 0 then
-                "none/none"
-              elif any($rows[]; .status == "pending") then
-                "in_progress/none"
-              elif any($rows[]; .status == "failure" or .status == "error") then
-                "completed/failure"
-              elif all($rows[]; .status == "success") then
-                "completed/success"
-              else
-                # Mixed / unknown — fall through to *) bucket below.
-                "completed/" + ($rows[0].status // "unknown")
-              end
-          ' 2>/dev/null || echo "none/none")
-
-          echo "E2E Staging SaaS for ${SHA:0:7}: $RESULT"
-
-          case "$RESULT" in
-            completed/success)
-              echo "proceed=true" >> "$GITHUB_OUTPUT"
-              echo "::notice::E2E green for this SHA — proceeding with promote"
-              ;;
-            completed/failure|completed/timed_out)
-              echo "proceed=false" >> "$GITHUB_OUTPUT"
-              {
-                echo "## ❌ Auto-promote aborted — E2E Staging SaaS failed"
-                echo
-                echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\`"
-                echo "\`:latest\` stays on the prior known-good digest."
-                echo
-                echo "If the failure was a flake, manually dispatch this workflow with the same sha to override."
-              } >> "$GITHUB_STEP_SUMMARY"
-              exit 1
-              ;;
-            completed/cancelled)
-              # GitHub-era only: cancelled ≠ failure. Gitea statuses
-              # don't expose a "cancelled" state — a per-SHA concurrency
-              # cancellation surfaces as `failure` or `error` on Gitea
-              # and is now handled by the failure branch above.
-              # This arm is kept for backwards compatibility / dual-host
-              # operation (if we ever add a non-Gitea fallback) but
-              # under the post-#75 flow it's unreachable.
-              echo "proceed=false" >> "$GITHUB_OUTPUT"
-              {
-                echo "## ⏭ Auto-promote deferred — E2E Staging SaaS was cancelled"
-                echo
-                echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\`"
-                echo "Likely per-SHA concurrency (newer push superseded this E2E run)."
-                echo "The newer SHA's E2E will fire its own promote when it lands."
-                echo "If you need this specific SHA promoted, manually dispatch."
-              } >> "$GITHUB_STEP_SUMMARY"
-              ;;
-            in_progress/*|queued/*|requested/*|waiting/*|pending/*)
-              echo "proceed=false" >> "$GITHUB_OUTPUT"
-              {
-                echo "## ⏳ Auto-promote deferred — E2E Staging SaaS still running"
-                echo
-                echo "Publish completed before E2E for \`${SHA:0:7}\` (state: \`$RESULT\`)."
-                echo "Skipping retag here — E2E's own completion event will re-fire this workflow."
-                echo "If E2E ends green, that run promotes \`:latest\`. If red, it aborts."
-              } >> "$GITHUB_STEP_SUMMARY"
-              ;;
-            none/none)
-              echo "proceed=true" >> "$GITHUB_OUTPUT"
-              echo "::notice::E2E paths-filtered out for this SHA — pre-merge staging gates carry"
-              ;;
-            *)
-              echo "proceed=false" >> "$GITHUB_OUTPUT"
-              {
-                echo "## ❓ Auto-promote aborted — unexpected E2E state"
-                echo
-                echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\` (unhandled)"
-                echo "Manual investigation needed; re-dispatch with the same sha once resolved."
-              } >> "$GITHUB_STEP_SUMMARY"
-              exit 1
-              ;;
-          esac
-
-      - if: steps.gate.outputs.proceed == 'true'
-        uses: imjasonh/setup-crane@6da1ae018866400525525ce74ff892880c099987 # v0.5
-
-      - name: GHCR login
-        if: steps.gate.outputs.proceed == 'true'
-        run: |
-          echo "${{ secrets.GITHUB_TOKEN }}" | \
-            crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin
-
-      - name: Verify :staging-<short-sha> exists for both images
-        # Better to fail fast with a clear message than to half-tag
-        # (platform retagged but platform-tenant missing → tenants pull
-        # a stale image).
-        if: steps.gate.outputs.proceed == 'true'
-        run: |
-          set -euo pipefail
-          for img in "${IMAGE_NAME}" "${TENANT_IMAGE_NAME}"; do
-            tag="${img}:staging-${{ steps.sha.outputs.short }}"
-            if ! crane manifest "$tag" >/dev/null 2>&1; then
-              echo "::error::Missing tag: $tag"
-              echo "::error::publish-workspace-server-image must complete on this SHA before auto-promote can retag :latest."
-              exit 1
-            fi
-            echo "  ok: $tag exists"
-          done
-
-      - name: Ancestry check — refuse to promote :latest backwards
-        # #2244: workflow_run completions arrive in arbitrary order. If
-        # SHA-A and SHA-B both reach main within ~10 min and SHA-B's E2E
-        # completes before SHA-A's, this workflow can fire for SHA-A
-        # AFTER it already promoted SHA-B → :latest goes backwards. The
-        # orphan-reconciler "next run corrects it" doesn't apply: there's
-        # no auto-corrective re-promote, :latest stays wrong until the
-        # next main push lands.
-        #
-        # Detection: read current :latest's `org.opencontainers.image.revision`
-        # label (set by publish-workspace-server-image.yml at build time)
-        # and ask the GitHub compare API whether the candidate SHA is
-        # ahead-of / identical-to / behind / diverged-from current.
-        # Hard-fail on `behind` and `diverged` per the approved design —
-        # silent-bypass is the class we're moving away from. Workflow
-        # goes red, oncall sees it, operator decides how to recover
-        # (manual dispatch with the right SHA, force-promote, etc.).
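#
# Sketch of the four compare outcomes the step below switches on, plus
# a pure-git equivalent for offline debugging (SHAs are placeholders;
# the git form needs a full clone):
# ```
# gh api "repos/${REPO}/compare/${CURRENT}...${TARGET}" --jq '.status'
# # → ahead | identical | behind | diverged
#
# if git merge-base --is-ancestor "$CURRENT" "$TARGET"; then
#   echo "ahead-or-identical: safe to retag"
# elif git merge-base --is-ancestor "$TARGET" "$CURRENT"; then
#   echo "behind: retag would move :latest backwards"
# else
#   echo "diverged: history rewritten, needs a human"
# fi
# ```
#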
- # - # Manual dispatch skips this check — operator override semantics - # match the gate-check step above. - # - # Backward-compat: when current :latest carries no revision - # label (legacy image pre-publish-with-label), skip-with-warning. - # All :latest images on main are post-label as of 2026-04-29, so - # this branch will be dead within 90 days; remove then. - if: steps.gate.outputs.proceed == 'true' && github.event_name != 'workflow_dispatch' - id: ancestry - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPO: ${{ github.repository }} - TARGET_SHA: ${{ steps.sha.outputs.full }} - run: | - set -euo pipefail - - # Read the current :latest config and pull the revision label. - # `crane config` returns the OCI image config blob (not the manifest); - # labels live under `.config.Labels`. `// empty` makes jq return "" - # rather than the literal "null" so the test below works. - CURRENT_REVISION=$(crane config "${IMAGE_NAME}:latest" 2>/dev/null \ - | jq -r '.config.Labels["org.opencontainers.image.revision"] // empty' \ - || true) - - if [ -z "$CURRENT_REVISION" ]; then - echo "decision=skip-no-label" >> "$GITHUB_OUTPUT" - { - echo "## ⚠ Ancestry check skipped — current :latest has no revision label" - echo - echo "Likely a legacy image built before \`org.opencontainers.image.revision\` was set." - echo "Falling through to retag. After all \`:latest\` images are post-label (TODO 90 days), this branch is dead and should be removed." - } >> "$GITHUB_STEP_SUMMARY" - echo "::warning::Current :latest carries no revision label — skipping ancestry check (legacy image)" - exit 0 - fi - - if [ "$CURRENT_REVISION" = "$TARGET_SHA" ]; then - echo "decision=identical" >> "$GITHUB_OUTPUT" - echo "::notice:::latest already at ${TARGET_SHA:0:7} — retag will be a no-op" - exit 0 - fi - - # Ask GitHub which side of the merge graph TARGET_SHA sits on - # relative to CURRENT_REVISION. Returns one of: ahead | identical - # | behind | diverged. Network or auth errors collapse to "error" - # via the explicit fallback so the case below always matches. - STATUS=$(gh api \ - "repos/${REPO}/compare/${CURRENT_REVISION}...${TARGET_SHA}" \ - --jq '.status' 2>/dev/null || echo "error") - - echo "ancestry compare ${CURRENT_REVISION:0:7} → ${TARGET_SHA:0:7}: $STATUS" - - case "$STATUS" in - ahead) - echo "decision=ahead" >> "$GITHUB_OUTPUT" - echo "::notice::Target ${TARGET_SHA:0:7} is ahead of current :latest (${CURRENT_REVISION:0:7}) — proceeding with retag" - ;; - identical) - echo "decision=identical" >> "$GITHUB_OUTPUT" - echo "::notice::Target identical to :latest — retag will be a no-op" - ;; - behind) - echo "decision=behind" >> "$GITHUB_OUTPUT" - { - echo "## ❌ Auto-promote refused — target is BEHIND current :latest" - echo - echo "| Field | Value |" - echo "|---|---|" - echo "| Target SHA | \`$TARGET_SHA\` |" - echo "| Current :latest revision | \`$CURRENT_REVISION\` |" - echo "| GitHub compare status | \`behind\` |" - echo - echo "This guard catches the workflow_run-completion-order race (#2244):" - echo "two rapid main pushes whose E2Es complete out-of-order can otherwise" - echo "promote \`:latest\` backwards. \`:latest\` stays on \`${CURRENT_REVISION:0:7}\`." - echo - echo "**Recovery:** if this is a legitimate revert that should land on \`:latest\`," - echo "manually dispatch this workflow with the target sha as input — the manual-dispatch" - echo "path skips the ancestry check (operator override)." 
-              } >> "$GITHUB_STEP_SUMMARY"
-              exit 1
-              ;;
-            diverged)
-              echo "decision=diverged" >> "$GITHUB_OUTPUT"
-              {
-                echo "## ❓ Auto-promote refused — history diverged"
-                echo
-                echo "| Field | Value |"
-                echo "|---|---|"
-                echo "| Target SHA | \`$TARGET_SHA\` |"
-                echo "| Current :latest revision | \`$CURRENT_REVISION\` |"
-                echo "| GitHub compare status | \`diverged\` |"
-                echo
-                echo "Likely cause: force-push rewrote main's history, leaving the previous"
-                echo "\`:latest\` revision orphaned. Needs human review before \`:latest\` advances."
-              } >> "$GITHUB_STEP_SUMMARY"
-              exit 1
-              ;;
-            error|*)
-              echo "decision=error" >> "$GITHUB_OUTPUT"
-              {
-                echo "## ❌ Auto-promote aborted — ancestry-check API error"
-                echo
-                echo "\`gh api repos/${REPO}/compare/${CURRENT_REVISION}...${TARGET_SHA}\` returned unexpected status: \`$STATUS\`"
-                echo
-                echo "Manual dispatch with the target sha bypasses this check."
-              } >> "$GITHUB_STEP_SUMMARY"
-              exit 1
-              ;;
-          esac
-
-      - name: Retag platform :staging-<short-sha> → :latest
-        if: steps.gate.outputs.proceed == 'true'
-        run: |
-          crane tag "${IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest
-
-      - name: Retag tenant :staging-<short-sha> → :latest
-        if: steps.gate.outputs.proceed == 'true'
-        run: |
-          crane tag "${TENANT_IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest
-
-      - name: Summary
-        if: steps.gate.outputs.proceed == 'true'
-        run: |
-          {
-            echo "## :latest promoted to ${{ steps.sha.outputs.short }}"
-            echo
-            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
-              echo "- Trigger: manual dispatch"
-            else
-              echo "- Upstream: \`${{ github.event.workflow_run.name }}\` ([run](${{ github.event.workflow_run.html_url }}))"
-            fi
-            echo "- platform:staging-${{ steps.sha.outputs.short }} → :latest"
-            echo "- platform-tenant:staging-${{ steps.sha.outputs.short }} → :latest"
-            echo
-            echo "Tenant fleet auto-pulls within 5 min via IMAGE_AUTO_REFRESH=true."
-            echo "Force immediate fanout: dispatch redeploy-tenants-on-main.yml."
-          } >> "$GITHUB_STEP_SUMMARY"
diff --git a/.github/workflows/auto-promote-staging.yml b/.github/workflows/auto-promote-staging.yml
deleted file mode 100644
index 7d2ce310..00000000
--- a/.github/workflows/auto-promote-staging.yml
+++ /dev/null
@@ -1,492 +0,0 @@
-name: Auto-promote staging → main
-
-# Fires after any of the staging-branch quality gates complete. When ALL
-# required gates are green on the same staging SHA, opens (or re-uses)
-# a PR `staging → main` and schedules Gitea auto-merge so the PR lands
-# automatically once approval + status checks are satisfied.
-#
-# ============================================================
-# What this workflow does
-# ============================================================
-#
-# 1. On a workflow_run completion event for one of the staging gate
-#    workflows (CI, E2E Staging Canvas, E2E API Smoke, CodeQL),
-#    checks if the combined status on the staging head SHA is green.
-# 2. If green, opens (or re-uses) a PR `head: staging → base: main`
-#    via Gitea REST `POST /api/v1/repos/.../pulls`.
-# 3. Schedules auto-merge via `POST /api/v1/repos/.../pulls/{index}/merge`
-#    with `merge_when_checks_succeed: true`. Gitea waits for the
-#    approval requirement on `main` (`required_approvals: 1`) and
-#    the status-check gates, then merges.
-# 4. The merge commit lands on `main` and fires
-#    `publish-workspace-server-image.yml` naturally via its
-#    `on: push: branches: [main]` trigger — no explicit dispatch
-#    needed (see "Why no workflow_dispatch tail" below).
-# -# `auto-sync-main-to-staging.yml` is the reverse-direction -# counterpart (main → staging, fast-forward push). Together they -# keep the staging-superset-of-main invariant tight. -# -# ============================================================ -# Why Gitea REST (and not `gh pr create`) -# ============================================================ -# -# Pre-2026-05-06 this workflow used `gh pr create`, `gh pr merge --auto`, -# `gh run list`, and `gh workflow run` against GitHub. After the -# GitHub→Gitea cutover those calls fail because: -# -# - `gh pr create / merge / view / list` route to GitHub GraphQL -# (`/api/graphql`). Gitea does not expose a GraphQL endpoint; -# every call returns `HTTP 405 Method Not Allowed` — same root -# cause as #65 (auto-sync) which PR #66 fixed by dropping `gh` -# entirely. -# - `gh run list --workflow=...` GitHub-shape; Gitea has the -# simpler `GET /repos/.../commits/{ref}/status` combined-status -# endpoint instead. -# - `gh workflow run X.yml` calls `POST /repos/.../actions/workflows/{id}/dispatches`, -# which does NOT exist on Gitea 1.22.6 (verified via swagger.v1.json). -# -# So this workflow uses direct `curl` calls to Gitea REST. No `gh` -# CLI dependency, no GraphQL, no missing-endpoint footgun. -# -# ============================================================ -# Why no workflow_dispatch tail (was load-bearing on GitHub, dead on Gitea) -# ============================================================ -# -# The GitHub-era version had a 60-line polling step that waited for -# the promote PR to merge, then explicitly dispatched -# `publish-workspace-server-image.yml` on `--ref main`. That step -# existed because GitHub's GITHUB_TOKEN-initiated merges suppress -# downstream `on: push` workflows (the documented "no recursion" rule -# — https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow). -# The explicit dispatch was the workaround. -# -# Gitea Actions does NOT have this no-recursion rule. PR #66's auto- -# sync merge to main fired `auto-promote-staging` on the next push -# trigger naturally. So the cascade fires on the natural push event; -# the explicit dispatch is dead code. (And even if we wanted to -# preserve it, Gitea has no `workflow_dispatch` REST endpoint.) -# -# Removed in this rewrite. If we ever observe the cascade misfire, -# operator can push an empty commit to `main` to wake it. -# -# ============================================================ -# Why open a PR (and not direct push) -# ============================================================ -# -# `main` branch protection has `enable_push: false` with NO -# `push_whitelist_usernames`. Direct push is impossible for any -# persona, including admins. PR-mediated merge is the only path, -# which is intentional: prod state mutations (and staging→main IS a -# prod mutation, since the next deploy fans out to tenants) require -# Hongming's approval per `feedback_prod_apply_needs_hongming_chat_go`. -# -# The auto-merge schedule preserves this gate: `merge_when_checks_succeed` -# does NOT bypass `required_approvals: 1`. Gitea waits for BOTH -# approval AND green checks before merging. Hongming reviews via the -# canvas/chat-handle of the PR notification, approves, and Gitea -# auto-merges within seconds. 
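#
# Shape of the two REST calls this boils down to (PR index 123 and the
# title are illustrative; field names follow the header's description of
# Gitea's pulls / merge endpoints):
# ```
# API="https://git.moleculesai.app/api/v1/repos/molecule-ai/molecule-core"
# H="Authorization: token ${AUTO_SYNC_TOKEN}"
# curl -sS -X POST -H "$H" -H "Content-Type: application/json" \
#   -d '{"head":"staging","base":"main","title":"staging → main"}' \
#   "${API}/pulls"
# curl -sS -X POST -H "$H" -H "Content-Type: application/json" \
#   -d '{"Do":"merge","merge_when_checks_succeed":true}' \
#   "${API}/pulls/123/merge"
# ```
#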
-# -# ============================================================ -# Identity + token (anti-bot-ring per saved-memory -# `feedback_per_agent_gitea_identity_default`) -# ============================================================ -# -# This workflow uses `secrets.AUTO_SYNC_TOKEN` — a personal access -# token issued to the `devops-engineer` Gitea persona. NOT the -# founder PAT. The bot-ring fingerprint that triggered the GitHub -# org suspension on 2026-05-06 was characterised by founder PAT -# acting as CI at machine speed. -# -# Token scope: `push: true` (read+write) on this repo. The persona -# can: open PRs, comment on PRs, schedule auto-merge. The persona -# CANNOT bypass main's branch protection (`required_approvals: 1` -# still applies — only Hongming's review unblocks merge). -# -# Authorship: the PR is opened by `devops-engineer`; the merge -# commit credits Hongming-as-approver and `devops-engineer` as -# the merger. -# -# ============================================================ -# Failure modes & operational notes -# ============================================================ -# -# A — staging gates not all green at trigger time: -# - The combined-status check returns `state: pending|failure`. -# Workflow exits 0 with a step-summary "not all green; staying -# on current main". Re-fires on the next gate completion. -# -# B — Gitea PR-create returns non-201 (e.g. 422 already-exists): -# - Idempotent: the workflow first GETs the existing open -# staging→main PR. If found, reuse it; if not, POST a new one. -# 422 should never surface; if it does (race), step summary -# captures the body and the next workflow_run picks up. -# -# C — `merge_when_checks_succeed` schedule fails: -# - 422 with "Pull request is not mergeable" if there are -# conflicts or stale base. Step summary surfaces it; operator -# (or `auto-sync-main-to-staging`) needs to bring staging up -# to date with main first. Workflow exits 1 to surface red. -# -# D — `AUTO_SYNC_TOKEN` rotated / wrong scope: -# - 401/403 on first REST call. Step summary surfaces it. -# Re-issue the token from `~/.molecule-ai/personas/` on the -# operator host and update the repo Actions secret. -# -# ============================================================ -# Loop safety -# ============================================================ -# -# When the promote PR merges to main, `auto-sync-main-to-staging.yml` -# fires (on:push:main) and pushes the merge commit back to staging. -# That push to staging is by `devops-engineer`, NOT this workflow's -# token, and triggers the staging gate workflows. When they all -# complete, we end up back here — but the tree-diff guard catches -# it: staging tree == main tree (the merge commit changes nothing), -# so we skip and the cycle terminates. - -on: - workflow_run: - workflows: - - CI - - E2E Staging Canvas (Playwright) - - E2E API Smoke Test - - CodeQL - types: [completed] - workflow_dispatch: - inputs: - force: - description: "Force promote even when AUTO_PROMOTE_ENABLED is unset (manual override)" - required: false - default: "false" - -permissions: - contents: read - pull-requests: write - -# Serialize auto-promote runs. Multiple staging gate completions can land -# in quick succession (CI + E2E + CodeQL all finish within seconds of -# each other on a green PR) — without this, two parallel runs both: -# 1. Would race the GET-or-POST PR step. -# 2. Would both call merge-schedule (idempotent — fine on Gitea). 
-# cancel-in-progress: false because the second run on a fresh staging
-# tip should NOT kill the first which has already opened the PR.
-concurrency:
-  group: auto-promote-staging
-  cancel-in-progress: false
-
-jobs:
-  check-all-gates-green:
-    # Only consider staging pushes. PRs into staging don't promote.
-    if: >
-      (github.event_name == 'workflow_run' &&
-       github.event.workflow_run.head_branch == 'staging' &&
-       github.event.workflow_run.event == 'push')
-      || github.event_name == 'workflow_dispatch'
-    runs-on: ubuntu-latest
-    outputs:
-      all_green: ${{ steps.gates.outputs.all_green }}
-      head_sha: ${{ steps.gates.outputs.head_sha }}
-    steps:
-      # Skip empty-tree promotes (the perpetual auto-promote↔auto-sync
-      # cycle observed pre-cutover on GitHub). On Gitea the cycle shape
-      # is different (auto-sync uses fast-forward, no merge commit),
-      # but the tree-diff guard is cheap insurance and protects against
-      # any future merge-style regression.
-      - name: Checkout for tree-diff check
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
-        with:
-          fetch-depth: 0
-          ref: staging
-
-      - name: Skip if staging tree == main tree (cycle-break safety)
-        id: tree-diff
-        env:
-          HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
-        run: |
-          set -eu
-          git fetch origin main --depth=50 || { echo "::warning::git fetch main failed — proceeding (fail-open)"; exit 0; }
-          if git diff --quiet origin/main "$HEAD_SHA" -- 2>/dev/null; then
-            {
-              echo "## Skipped — no code to promote"
-              echo
-              echo "staging tip (\`${HEAD_SHA:0:8}\`) and \`main\` have identical trees."
-              echo "Skipping to avoid opening an empty promote PR."
-            } >> "$GITHUB_STEP_SUMMARY"
-            echo "::notice::auto-promote: staging tree == main tree — no code to promote, skipping"
-            echo "skip=true" >> "$GITHUB_OUTPUT"
-          else
-            echo "skip=false" >> "$GITHUB_OUTPUT"
-          fi
-
-      - name: Check combined status on staging head
-        if: steps.tree-diff.outputs.skip != 'true'
-        id: gates
-        env:
-          GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
-          HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
-          REPO: ${{ github.repository }}
-          GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }}
-        run: |
-          set -euo pipefail
-
-          # Gitea-native combined-status endpoint aggregates every
-          # check context attached to a SHA. This is structurally
-          # cleaner than the GitHub-era per-workflow `gh run list`
-          # loop because:
-          #
-          # 1. There's no risk of "workflow name collision" (the
-          #    GitHub-era code had to switch from `--workflow=NAME`
-          #    to `--workflow=FILE.YML` to disambiguate "CodeQL"
-          #    between the explicit workflow and GitHub's UI-
-          #    configured default setup; Gitea has no such
-          #    duplicate-name surface).
-          # 2. Gitea's combined state already encodes the AND
-          #    across all contexts: success only if EVERY context
-          #    is success. Pending or failure on any context
-          #    produces non-success state.
-          #
-          # See https://docs.gitea.com/api/1.22 for the schema —
-          # `state` is one of: success, pending, failure, error.
-
-          echo "head_sha=${HEAD_SHA}" >> "$GITHUB_OUTPUT"
-          echo "Checking combined status on SHA ${HEAD_SHA}"
-
-          # `set +e` for the http-code capture pattern; restore
-          # immediately. Pattern hardened per `feedback_curl_status_capture_pollution`.
-          BODY_FILE=$(mktemp)
-          set +e
-          STATUS=$(curl -sS \
-            -H "Authorization: token ${GITEA_TOKEN}" \
-            -H "Accept: application/json" \
-            -o "${BODY_FILE}" \
-            -w "%{http_code}" \
-            "${GITEA_HOST}/api/v1/repos/${REPO}/commits/${HEAD_SHA}/status")
-          CURL_RC=$?
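#
# Why this bracket matters — contrast with the shape the lint gate
# rejects (URL and file names illustrative):
# ```
# # UNSAFE: the fallback eats curl's real exit code, so a DNS failure
# # masquerades as "HTTP 000" and flows down the success path:
# STATUS=$(curl -sS -o "$F" -w "%{http_code}" "$URL" || echo "000")
# # SAFE (the shape used here): transport failure and HTTP code stay
# # separate, and set -e can't abort mid-capture:
# set +e
# STATUS=$(curl -sS -o "$F" -w "%{http_code}" "$URL"); RC=$?
# set -e
# ```
#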
-          set -e
-
-          if [ "${CURL_RC}" -ne 0 ] || [ "${STATUS}" != "200" ]; then
-            echo "::error::combined-status fetch failed: curl=${CURL_RC} http=${STATUS}"
-            cat "${BODY_FILE}" | head -c 500 || true
-            rm -f "${BODY_FILE}"
-            echo "all_green=false" >> "$GITHUB_OUTPUT"
-            exit 0
-          fi
-
-          STATE=$(jq -r '.state // "missing"' < "${BODY_FILE}")
-          TOTAL=$(jq -r '.total_count // 0' < "${BODY_FILE}")
-          rm -f "${BODY_FILE}"
-
-          echo "Combined status: state=${STATE} total_count=${TOTAL}"
-
-          if [ "${STATE}" = "success" ] && [ "${TOTAL}" -gt 0 ]; then
-            echo "all_green=true" >> "$GITHUB_OUTPUT"
-            echo "::notice::All gates green on ${HEAD_SHA} (${TOTAL} contexts)"
-          else
-            echo "all_green=false" >> "$GITHUB_OUTPUT"
-            {
-              echo "## Not promoting — combined status not green"
-              echo
-              echo "- SHA: \`${HEAD_SHA:0:8}\`"
-              echo "- Combined state: \`${STATE}\`"
-              echo "- Context count: ${TOTAL}"
-              echo
-              echo "Will re-fire on the next gate completion. Investigate any red gate via the Actions UI."
-            } >> "$GITHUB_STEP_SUMMARY"
-            echo "::notice::auto-promote: combined status is ${STATE} on ${HEAD_SHA} — staying on current main"
-          fi
-
-  promote:
-    needs: check-all-gates-green
-    if: needs.check-all-gates-green.outputs.all_green == 'true'
-    runs-on: ubuntu-latest
-    steps:
-      - name: Check rollout gate
-        env:
-          AUTO_PROMOTE_ENABLED: ${{ vars.AUTO_PROMOTE_ENABLED }}
-          FORCE_INPUT: ${{ github.event.inputs.force }}
-        run: |
-          set -eu
-          # Repo variable AUTO_PROMOTE_ENABLED=true flips this on. While
-          # it's unset, the workflow dry-runs (logs what it would have
-          # done) but doesn't open the promote PR. Set the variable in
-          # Settings → Actions → Variables.
-          if [ "${AUTO_PROMOTE_ENABLED:-}" != "true" ] && [ "${FORCE_INPUT:-false}" != "true" ]; then
-            {
-              echo "## Auto-promote disabled"
-              echo
-              echo "Repo variable \`AUTO_PROMOTE_ENABLED\` is not set to \`true\`."
-              echo "All gates are green on staging; would have opened a promote PR to \`main\`."
-              echo
-              echo "To enable: Settings → Actions → Variables → \`AUTO_PROMOTE_ENABLED=true\`."
-              echo "To test once manually: workflow_dispatch with \`force=true\`."
-            } >> "$GITHUB_STEP_SUMMARY"
-            echo "::notice::auto-promote disabled — dry run only"
-            exit 0
-          fi
-
-      - name: Open or reuse promote PR + schedule auto-merge
-        if: ${{ vars.AUTO_PROMOTE_ENABLED == 'true' || github.event.inputs.force == 'true' }}
-        env:
-          GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
-          REPO: ${{ github.repository }}
-          TARGET_SHA: ${{ needs.check-all-gates-green.outputs.head_sha }}
-          GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }}
-        run: |
-          set -euo pipefail
-
-          API="${GITEA_HOST}/api/v1/repos/${REPO}"
-          AUTH=(-H "Authorization: token ${GITEA_TOKEN}" -H "Accept: application/json")
-
-          # http_get BODY_FILE URL — writes the response body to
-          # BODY_FILE and echoes the HTTP status code. Curl status
-          # capture pattern per `feedback_curl_status_capture_pollution`:
-          # the http_code comes out via -w, the body goes to its own
-          # tempfile, and the set +e/-e bracket keeps a curl failure
-          # from killing the script mid-capture.
-          http_get() {
-            local body_file="$1"; shift
-            local url="$1"; shift
-            set +e
-            local code
-            code=$(curl -sS "${AUTH[@]}" -o "${body_file}" -w "%{http_code}" "${url}")
-            local rc=$?
- set -e - if [ "${rc}" -ne 0 ]; then - echo "::error::curl GET failed (rc=${rc}) on ${url}" - return 99 - fi - echo "${code}" - } - http_post_json() { - local body_file="$1"; shift - local data="$1"; shift - local url="$1"; shift - set +e - local code - code=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \ - -X POST -d "${data}" -o "${body_file}" -w "%{http_code}" "${url}") - local rc=$? - set -e - if [ "${rc}" -ne 0 ]; then - echo "::error::curl POST failed (rc=${rc}) on ${url}" - return 99 - fi - echo "${code}" - } - - # Step 1: look for an existing open staging→main promote PR - # (idempotent on workflow re-run). Gitea doesn't have a - # head/base filter on the list endpoint that's as ergonomic - # as gh's, but the dedicated `/pulls/{base}/{head}` lookup - # works. - BODY=$(mktemp) - STATUS=$(http_get "${BODY}" "${API}/pulls/main/staging") || true - - PR_NUM="" - if [ "${STATUS}" = "200" ]; then - STATE=$(jq -r '.state // "missing"' < "${BODY}") - if [ "${STATE}" = "open" ]; then - PR_NUM=$(jq -r '.number // ""' < "${BODY}") - echo "::notice::Re-using existing open promote PR #${PR_NUM}" - fi - fi - rm -f "${BODY}" - - # Step 2: if no open PR, create one. - if [ -z "${PR_NUM}" ]; then - TITLE="staging → main: auto-promote ${TARGET_SHA:0:7}" - BODY_TEXT=$(cat <> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/auto-promote-stale-alarm.yml b/.github/workflows/auto-promote-stale-alarm.yml deleted file mode 100644 index 58667c6f..00000000 --- a/.github/workflows/auto-promote-stale-alarm.yml +++ /dev/null @@ -1,83 +0,0 @@ -name: auto-promote-stale-alarm - -# Hourly cron + on-demand alarm for the silent-block failure mode that -# motivated issue #2975: -# - The auto-promote-staging.yml workflow opened a PR + armed -# auto-merge, but main's branch protection requires a human review -# (reviewDecision=REVIEW_REQUIRED). The PR sat BLOCKED with no -# surface-up-the-stack for 12+ hours, holding 25 commits hostage -# including the Memory v2 redesign and a reno-stars data-loss fix. -# -# This workflow runs `scripts/check-stale-promote-pr.sh` against the -# repo's open auto-promote PRs (base=main head=staging). When a PR has -# been BLOCKED on REVIEW_REQUIRED for >4h, it: -# 1. Emits a workflow-level warning (visible in run summary + the -# Actions UI feed). -# 2. Posts a comment on the PR (idempotent — one alarm per PR). -# -# The detection logic lives in scripts/check-stale-promote-pr.sh so -# it's unit-testable with stubbed `gh` (see test-check-stale-promote-pr.sh). -# This file is the schedule + invocation surface only — SSOT for the -# detector itself. - -on: - schedule: - # Hourly. Cheap (one `gh pr list` + jq), and 1h granularity is - # plenty for a 4h staleness threshold — operators see the alarm - # within at most 1h of crossing the threshold. - - cron: "27 * * * *" # at :27 to dodge the cron herd at :00 - workflow_dispatch: - inputs: - stale_hours: - description: "Hours after which a BLOCKED+REVIEW_REQUIRED PR is stale (default 4)" - required: false - default: "4" - post_comment: - description: "Post a comment on stale PRs (default true)" - required: false - default: "true" - -permissions: - contents: read - pull-requests: write # post comments on stale PRs - -# Serialize so the on-demand and scheduled runs don't double-comment -# the same PR. cancel-in-progress=false because the script is idempotent -# (existing comment marker prevents dupes), but a scheduled run firing -# while a manual one runs would just re-list the same PR set. 
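#
# The detector can be exercised from a laptop too — interface assumed
# from the invocation below (env-driven, exit code = stale-PR count):
# ```
# export GH_TOKEN="<pat>" GITHUB_REPOSITORY="molecule-ai/molecule-core"
# export STALE_HOURS=4 POST_COMMENT=false   # don't comment from a laptop
# bash scripts/check-stale-promote-pr.sh; echo "stale PRs: $?"
# ```
#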
-concurrency: - group: auto-promote-stale-alarm - cancel-in-progress: false - -jobs: - scan: - runs-on: ubuntu-latest - steps: - - name: Checkout (need scripts/ only) - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - sparse-checkout: | - scripts/check-stale-promote-pr.sh - sparse-checkout-cone-mode: false - - name: Run stale-PR detector - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_REPOSITORY: ${{ github.repository }} - STALE_HOURS: ${{ inputs.stale_hours || '4' }} - POST_COMMENT: ${{ inputs.post_comment || 'true' }} - run: | - # The script's exit code reflects the count of stale PRs. - # We don't want a stale finding to fail the workflow run — - # the warning + comment are the signal, the green/red is - # noise. So convert any non-zero exit to a workflow notice - # and exit 0. - set +e - bash scripts/check-stale-promote-pr.sh - rc=$? - set -e - if [ "$rc" -ne 0 ]; then - echo "::notice::Stale PR detector found $rc PR(s) needing attention. See warnings above + comments on the PRs." - fi - # Always succeed — operator-facing surface is the warning, - # not the workflow status. - exit 0 diff --git a/.github/workflows/auto-sync-canary.yml b/.github/workflows/auto-sync-canary.yml deleted file mode 100644 index f6b0437b..00000000 --- a/.github/workflows/auto-sync-canary.yml +++ /dev/null @@ -1,404 +0,0 @@ -name: Auto-sync canary — AUTO_SYNC_TOKEN rotation drift - -# Synthetic health check for the AUTO_SYNC_TOKEN secret consumed by -# auto-sync-main-to-staging.yml (PR #66) and publish-workspace-server-image.yml. -# -# ============================================================ -# Why this workflow exists -# ============================================================ -# -# PR #66 fixed auto-sync (replaced GitHub-era `gh pr create` — which -# 405s on Gitea's GraphQL endpoint — with a direct git push from the -# `devops-engineer` persona's `AUTO_SYNC_TOKEN`). Hostile self-review -# weakest spot #3 of that PR: -# -# "Token rotation silently breaks auto-sync. If AUTO_SYNC_TOKEN is -# rotated without updating the repo secret, every push to main -# fails red on the auto-sync push step. The workflow surfaces the -# failure mode in the step summary (failure mode B in the header), -# but there's no proactive monitoring." -# -# Detection latency under the status quo: rotation is only caught on -# the next push to `main`. During quiet periods (no main push for -# many hours) the staging-superset-of-main invariant silently breaks. -# -# This workflow closes the gap: every 6 hours, it fires the auth -# surface that auto-sync depends on and emits a red workflow status -# if AUTO_SYNC_TOKEN has drifted out of validity. -# -# ============================================================ -# What this checks (Option B — read-only verify) -# ============================================================ -# -# 1. `GET /api/v1/user` against Gitea with the token → validates the -# token authenticates AND resolves to `devops-engineer` (catches -# the case where the token was regenerated under a different -# persona by mistake). -# 2. `GET /api/v1/repos/molecule-ai/molecule-core` with the token → -# validates the token has `read:repository` scope on this repo -# (the v2 scope contract — see saved memory -# `reference_persona_token_v2_scope`). -# 3. 
`git push --dry-run` of the current staging SHA back to
-#    `refs/heads/staging` via `https://oauth2:<token>@<host>/...`
-#    → validates the EXACT HTTPS basic-auth path that
-#    `actions/checkout` + `git push origin staging` use inside
-#    auto-sync-main-to-staging.yml. NOP by construction (push the
-#    current tip to itself = "Everything up-to-date"); auth is
-#    checked at the smart-protocol handshake BEFORE the empty-diff
-#    computation, so bad token → exit 128 with "Authentication
-#    failed". `git ls-remote` is NOT used here because Gitea
-#    falls back to anonymous read on public repos and would
-#    silently green-light a rotated token.
-#
-# Each step exits non-zero with an actionable error message if it
-# fails. The workflow status itself is the operator-facing surface.
-#
-# ============================================================
-# What this does NOT check (intentional)
-# ============================================================
-#
-# - **Branch-protection authz** (failure mode C in auto-sync header):
-#   would require an actual write to staging. Already monitored by
-#   `branch-protection-drift.yml` daily. Don't duplicate.
-# - **Conflict resolution** (failure mode A): a real conflict is data-
-#   driven, not auth-driven; can't synthesise it without polluting
-#   staging. Already surfaces immediately on the next main push.
-# - **Concurrency** (failure mode D): handled by workflow concurrency
-#   group on auto-sync, not a credential issue.
-#
-# ============================================================
-# Why Option B (read-only) and not the alternatives
-# ============================================================
-#
-# Considered + rejected (see issue #72 for full write-up):
-#
-# - **Option A — full auto-sync on schedule**: every run creates a
-#   no-op merge commit on staging when main hasn't advanced. 4 noise
-#   commits/day. And races the real `push:` trigger when main has
-#   advanced. Rejected.
-#
-# - **Option C — push to dedicated `auto-sync-canary` branch**: would
-#   exercise authz too, but adds branch noise on Gitea AND requires
-#   maintaining a second branch protection (or expanding staging's
-#   whitelist to a junk branch). Authz already covered by
-#   `branch-protection-drift.yml`. Rejected.
-#
-# Prior art for the chosen Option B shape:
-# - Cloudflare's `/user/tokens/verify` endpoint (read-only auth
-#   probe explicitly designed for credential canaries).
-# - AWS Secrets Manager rotation Lambda's `testSecret` step (auth
-#   probe before promoting AWSPENDING → AWSCURRENT).
-# - HashiCorp Vault's `vault token lookup` for renewal canaries.
-#
-# ============================================================
-# Operator runbook — what to do when this workflow goes RED
-# ============================================================
-#
-# 1. **Identify which step failed**:
-#    - Step "Verify token authenticates as devops-engineer" red →
-#      token is invalid OR resolves to wrong persona.
-#    - Step "Verify token has repo read scope" red → token valid but
-#      stripped of `read:repository` scope (or repo perms changed).
-#    - Step "Verify git HTTPS auth path via no-op dry-run push to
-#      staging" red → token rotated/revoked OR Gitea git-HTTPS
-#      surface is broken (rare). Auth check happens on the
-#      smart-protocol handshake, separate from the API path.
-#
-# 2. **Re-issue the token** on the operator host:
-#    ```
-#    ssh root@5.78.80.188 'docker exec --user git molecule-gitea-1 \
-#      gitea admin user generate-access-token \
-#      --username devops-engineer \
-#      --token-name persona-devops-engineer-vN \
-#      --scopes "read:repository,write:repository,read:user,read:organization,read:issue,write:issue,read:notification,read:misc"'
-#    ```
-#    Update `/etc/molecule-bootstrap/agent-secrets.env` in place
-#    (per `feedback_unified_credentials_file`). The previous token
-#    file lands at `.bak.<timestamp>`.
-#
-# 3. **Update the repo Actions secret** at:
-#    Settings → Secrets and variables → Actions → AUTO_SYNC_TOKEN
-#    Paste the new token. (Don't echo it in chat — but per
-#    `feedback_passwords_in_chat_are_burned`, a paste in a 1:1
-#    Claude session is within trust boundary.)
-#
-# 4. **Re-run this canary** via workflow_dispatch. Confirm GREEN.
-#
-# 5. **Backfill any missed main → staging syncs** by re-running
-#    `auto-sync-main-to-staging.yml` from its workflow_dispatch
-#    surface, OR by pushing an empty commit to main (if you'd
-#    rather force a real trigger).
-#
-# ============================================================
-# Security notes
-# ============================================================
-#
-# - Token usage: read-only plus one transfer-free probe (`GET /api/v1/user`,
-#   `GET /api/v1/repos/...`, `git ls-remote`, and a no-op `git push
-#   --dry-run` that never mutates refs). No write paths. Same
-#   blast-radius profile as `actions/checkout` on a public repo.
-# - The token NEVER appears in logs: every `curl` uses a header
-#   variable, never inline; the git URL builds the
-#   `oauth2:$TOKEN@host` form into a single env var that's not
-#   echoed. GitHub Actions secret-masking covers anything that does
-#   slip through.
-# - No new token introduced — same `AUTO_SYNC_TOKEN` the workflow
-#   under monitor uses. Per least-privilege we deliberately do NOT
-#   broaden scope for the canary.
-
-on:
-  schedule:
-    # Every 6 hours at :17 (offsets the cron herd at :00). Justification
-    # from issue #72: cheap to run (~5s wall-clock, no quota), 3h average
-    # detection latency, 6h max. Hourly would be 6× the runs for marginal
-    # benefit; daily would be 4× longer latency and worse than status
-    # quo on a quiet-main day.
-    - cron: '17 */6 * * *'
-  workflow_dispatch:
-
-# No concurrency group needed — the canary is read-only and idempotent.
-# Two parallel runs (e.g. operator dispatch during a scheduled tick) are
-# harmless: same result, doubled HTTPS calls, no shared state.
-
-permissions:
-  contents: read
-
-jobs:
-  verify-token:
-    name: Verify AUTO_SYNC_TOKEN validity
-    runs-on: ubuntu-latest
-    # 2 min surfaces hangs (Gitea API stall, DNS issue) within one
-    # cron interval. Realistic worst case is ~10s: 2 curls + 1 git
-    # ls-remote + 1 dry-run push, each capped by the explicit
-    # timeouts below.
-    timeout-minutes: 2
-
-    env:
-      # Pinned in env so individual steps can read it without
-      # repeating the secret reference. GitHub masks the value in
-      # logs automatically.
-      AUTO_SYNC_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
-      # MUST stay in sync with auto-sync-main-to-staging.yml's
-      # `git config user.name "devops-engineer"` line. Renaming the
-      # devops-engineer persona requires updating both files (and
-      # the staging branch protection's `push_whitelist_usernames`).
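#
# A hypothetical grep guard for this stay-in-sync constraint (file
# paths assumed; wire it into CI only if the noise is acceptable):
# ```
# for f in .gitea/workflows/auto-sync-canary.yml \
#          .gitea/workflows/auto-sync-main-to-staging.yml; do
#   grep -q 'devops-engineer' "$f" || echo "persona name drifted in $f"
# done
# ```
#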
- EXPECTED_PERSONA: devops-engineer - GITEA_HOST: git.moleculesai.app - REPO_PATH: molecule-ai/molecule-core - - steps: - - name: Verify AUTO_SYNC_TOKEN secret is configured - # Schedule-vs-dispatch behaviour split, per - # `feedback_schedule_vs_dispatch_secrets_hardening`: - # - # - schedule: hard-fail when the secret is missing. The - # whole point of the canary is to surface drift; soft- - # skipping on missing-secret would make the canary - # itself drift-invisible (sweep-cf-orphans #2088 lesson). - # - workflow_dispatch: hard-fail too — there's no scenario - # where an operator wants this canary to silently no-op. - # The workflow has no other ad-hoc utility; if you ran - # it, you wanted the answer. - run: | - if [ -z "${AUTO_SYNC_TOKEN}" ]; then - echo "::error::AUTO_SYNC_TOKEN secret is not set on this repo." >&2 - echo "::error::Set it at Settings → Secrets and variables → Actions." >&2 - echo "::error::Without it, auto-sync-main-to-staging.yml will fail every push to main." >&2 - exit 1 - fi - echo "AUTO_SYNC_TOKEN is configured (value masked)." - - - name: Verify token authenticates as ${{ env.EXPECTED_PERSONA }} - # Calls Gitea's `/api/v1/user` — the canonical - # auth-probe-with-no-side-effects endpoint (mirrors - # Cloudflare's /user/tokens/verify). - # - # Failure surfaces: - # - HTTP 401: token invalid (rotated, revoked, or never - # correctly registered). - # - HTTP 200 but username != devops-engineer: token was - # regenerated under the wrong persona — this would let - # auth pass but commit attribution would be wrong, and - # branch-protection authz would fail because only - # `devops-engineer` is whitelisted. - run: | - set -euo pipefail - response_file="$(mktemp)" - code_file="$(mktemp)" - # `--max-time 30`: full call ceiling. `--connect-timeout 10`: - # DNS + TCP. `-w "%{http_code}"` routed to a tempfile so curl's - # exit code can't pollute the captured status — see - # feedback_curl_status_capture_pollution + the - # `lint-curl-status-capture.yml` gate that rejects the unsafe - # `$(curl ... || echo "000")` shape. - set +e - curl -sS -o "$response_file" \ - --max-time 30 --connect-timeout 10 \ - -w "%{http_code}" \ - -H "Authorization: token ${AUTO_SYNC_TOKEN}" \ - -H "Accept: application/json" \ - "https://${GITEA_HOST}/api/v1/user" >"$code_file" 2>/dev/null - set -e - status=$(cat "$code_file" 2>/dev/null || true) - [ -z "$status" ] && status="000" - - if [ "$status" != "200" ]; then - echo "::error::Token rotation suspected: GET /api/v1/user returned HTTP $status (expected 200)." >&2 - echo "::error::Likely cause: AUTO_SYNC_TOKEN has been rotated/revoked on Gitea but the repo Actions secret was not updated." >&2 - echo "::error::Runbook: see header comment of this workflow file." >&2 - # Print response body but redact anything that looks like a token. - sed -E 's/[A-Fa-f0-9]{32,}//g' "$response_file" >&2 || true - exit 1 - fi - - username=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('login',''))" "$response_file") - if [ "$username" != "${EXPECTED_PERSONA}" ]; then - echo "::error::Token resolves to user '$username', expected '${EXPECTED_PERSONA}'." >&2 - echo "::error::AUTO_SYNC_TOKEN must be the devops-engineer persona PAT (not founder PAT, not another persona)." >&2 - echo "::error::Auto-sync push will fail because only 'devops-engineer' is whitelisted on staging branch protection." 
>&2
-            exit 1
-          fi
-          echo "Token authenticates as: $username ✓"
-
-      - name: Verify token has repo read scope
-        # `GET /api/v1/repos/{owner}/{repo}` requires `read:repository`
-        # on the persona's v2 scope contract. If the scope was
-        # narrowed/dropped on rotation we catch it here, before the
-        # next main push reveals it via a checkout failure.
-        run: |
-          set -euo pipefail
-          response_file="$(mktemp)"
-          code_file="$(mktemp)"
-          # See first probe step for the rationale on the tempfile-routed
-          # `-w "%{http_code}"` pattern — the unsafe `|| echo "000"` shape
-          # is rejected by lint-curl-status-capture.yml.
-          set +e
-          curl -sS -o "$response_file" \
-            --max-time 30 --connect-timeout 10 \
-            -w "%{http_code}" \
-            -H "Authorization: token ${AUTO_SYNC_TOKEN}" \
-            -H "Accept: application/json" \
-            "https://${GITEA_HOST}/api/v1/repos/${REPO_PATH}" >"$code_file" 2>/dev/null
-          set -e
-          status=$(cat "$code_file" 2>/dev/null || true)
-          [ -z "$status" ] && status="000"
-
-          if [ "$status" != "200" ]; then
-            echo "::error::Token lacks read:repository scope on ${REPO_PATH}: HTTP $status." >&2
-            echo "::error::Auto-sync's actions/checkout step will fail with this token." >&2
-            echo "::error::Re-issue with v2 scope contract: read:repository,write:repository,read:user,read:organization,read:issue,write:issue,read:notification,read:misc" >&2
-            sed -E 's/[A-Fa-f0-9]{32,}//g' "$response_file" >&2 || true
-            exit 1
-          fi
-          echo "Token has read:repository on ${REPO_PATH} ✓"
-
-      - name: Verify git HTTPS auth path via no-op dry-run push to staging
-        # Final probe: exercise the EXACT auth path that
-        # `actions/checkout` + `git push origin staging` use in
-        # auto-sync-main-to-staging.yml. Gitea's API and git-HTTPS
-        # surfaces share the token-lookup code path internally but
-        # the wire-level error shapes differ — historically (#173)
-        # the API path was healthy while git-HTTPS rejected, so
-        # checking only the API would have given false-green.
-        #
-        # IMPORTANT: `git ls-remote` on a public repo (which
-        # molecule-core is) succeeds even with a junk token because
-        # Gitea falls back to anonymous-read. `ls-remote` therefore
-        # CANNOT validate auth on this surface. We use
-        # `git push --dry-run` instead — push is auth-gated even on
-        # public repos.
-        #
-        # NOP shape: read the current staging SHA via authenticated
-        # ls-remote (the SHA itself is public; auth is incidental
-        # here, used only to colocate the discovery in one step), then
-        # `git push --dry-run <sha>:refs/heads/staging`. Pushing the
-        # current tip back to itself is "Everything up-to-date" with
-        # exit 0 when auth succeeds. With a bad token Gitea returns
-        # HTTP 401 in the smart-protocol handshake and git exits 128
-        # with "Authentication failed".
-        #
-        # The dry-run never reaches Gitea's pre-receive hook (which
-        # is where branch-protection authz runs), so this probe does
-        # not validate failure mode C. That's intentional —
-        # branch-protection-drift.yml owns authz monitoring; this
-        # canary owns auth.
-        env:
-          # Don't hang waiting for password prompt if auth fails on a
-          # terminal-attached run. (In Actions there's no terminal,
-          # but the env-var hardens against an interactive runner
-          # config.)
-          GIT_TERMINAL_PROMPT: "0"
-        run: |
-          set -euo pipefail
-          # Token is in $AUTO_SYNC_TOKEN (job-level env). Compose the
-          # URL as a local var that's never echoed.
-          url="https://oauth2:${AUTO_SYNC_TOKEN}@${GITEA_HOST}/${REPO_PATH}"
-
-          # Step a: read current staging SHA.
~1KB; auth-gated only - # on private repos but always works on public — used here - # only to discover the SHA, not to validate auth. - staging_ref=$(timeout 30s git ls-remote --refs "$url" refs/heads/staging 2>&1) || { - redacted=$(echo "$staging_ref" | sed -E "s|oauth2:[^@]+@|oauth2:@|g") - echo "::error::ls-remote against staging failed (network/DNS issue):" >&2 - echo "$redacted" >&2 - exit 1 - } - if ! echo "$staging_ref" | grep -qE '^[0-9a-f]{40}[[:space:]]+refs/heads/staging$'; then - echo "::error::ls-remote returned unexpected shape:" >&2 - echo "$staging_ref" | sed -E "s|oauth2:[^@]+@|oauth2:@|g" >&2 - exit 1 - fi - staging_sha=$(echo "$staging_ref" | awk '{print $1}') - - # Step b: spin up an ephemeral local repo. `git push` always - # requires a local repo even when pushing a remote SHA that - # isn't in the local object DB (the protocol negotiates and - # discovers we don't need to send any objects). We don't use - # `actions/checkout` for this — it would clone the whole - # repo (~hundreds of MB) for what's essentially `git init`. - tmp_repo="$(mktemp -d)" - trap 'rm -rf "$tmp_repo"' EXIT - git -C "$tmp_repo" init -q - # Author config required for any git operation; values are - # arbitrary because nothing gets committed here. - git -C "$tmp_repo" config user.email canary@auto-sync.local - git -C "$tmp_repo" config user.name auto-sync-canary - - # Step c: dry-run push the current staging SHA back to - # staging. NOP by construction — the remote tip equals the - # SHA we're pushing, so "Everything up-to-date" is the - # success path. - # - # Authentication is checked at the smart-protocol handshake, - # BEFORE the dry-run can compute an empty diff. Bad token - # → "Authentication failed", exit 128. Good token → exit 0. - set +e - push_out=$(timeout 30s git -C "$tmp_repo" push --dry-run "$url" "${staging_sha}:refs/heads/staging" 2>&1) - push_rc=$? - set -e - - if [ "$push_rc" -ne 0 ]; then - redacted=$(echo "$push_out" | sed -E "s|oauth2:[^@]+@|oauth2:@|g") - echo "::error::Token rotation suspected: git push --dry-run against staging failed via the AUTO_SYNC_TOKEN HTTPS auth path (exit $push_rc)." >&2 - echo "::error::This is the EXACT auth path that actions/checkout + git push use in auto-sync-main-to-staging.yml." >&2 - echo "::error::Likely cause: AUTO_SYNC_TOKEN was rotated/revoked on Gitea but the repo Actions secret was not updated. Runbook: see header." >&2 - echo "$redacted" >&2 - exit 1 - fi - - echo "git HTTPS auth path: NOP push --dry-run to staging → ${staging_sha:0:8} ✓" - - - name: Summarise canary result - # Everything passed — surface a green summary. (Failures - # already wrote ::error:: lines and exited above; if we got - # here, all three probes passed.) - run: | - { - echo "## Auto-sync canary: GREEN" - echo "" - echo "AUTO_SYNC_TOKEN is healthy:" - echo "- Authenticates as \`${EXPECTED_PERSONA}\` ✓" - echo "- Has \`read:repository\` scope on \`${REPO_PATH}\` ✓" - echo "- Git HTTPS auth path: no-op dry-run push to \`refs/heads/staging\` succeeds ✓" - echo "" - echo "Auto-sync main → staging will succeed on the next push to main." - echo "If this canary ever goes RED, see the runbook in this workflow's header." 
- } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/auto-sync-main-to-staging.yml b/.github/workflows/auto-sync-main-to-staging.yml deleted file mode 100644 index c0173a3d..00000000 --- a/.github/workflows/auto-sync-main-to-staging.yml +++ /dev/null @@ -1,255 +0,0 @@ -name: Auto-sync main → staging - -# Reflects every push to `main` back onto `staging` so the -# staging-as-superset-of-main invariant holds. -# -# ============================================================ -# What this workflow does -# ============================================================ -# -# On every push to `main`: -# 1. Checks if staging already contains main → no-op. -# 2. Fetches both branches, merges main into staging in the -# runner workspace (fast-forward if possible, else -# `--no-ff` merge commit). -# 3. Pushes staging directly to origin via the -# `devops-engineer` persona's `AUTO_SYNC_TOKEN`. -# -# Authoritative path: a single `git push origin staging` from -# inside this workflow is the SSOT for advancing staging after -# a main push. No PR, no merge queue, no human approval — -# staging is mechanically maintained as a superset of main. -# -# `auto-promote-staging.yml` is the reverse-direction -# counterpart (staging → main, gated on green CI). Together -# they keep the staging-superset-of-main invariant tight. -# -# ============================================================ -# Why direct push (and not "open a PR") -# ============================================================ -# -# Pre-2026-05-06 the canonical SCM was GitHub.com, where: -# - The `staging` branch had a `merge_queue` ruleset that -# blocked ALL direct pushes (no bypass even for org -# admins or the GitHub Actions integration). -# - Therefore this workflow opened a PR via `gh pr create` -# and let auto-merge land it through the queue. -# -# Post-2026-05-06 the canonical SCM is Gitea -# (`git.moleculesai.app/molecule-ai/molecule-core`). Gitea: -# - Has no `merge_queue` concept. -# - Allows direct push to protected branches via per-user -# `push_whitelist_usernames` on the branch protection. -# - Does not expose a GraphQL endpoint, so `gh pr create` -# returns `HTTP 405 Method Not Allowed -# (https://git.moleculesai.app/api/graphql)` — the -# pre-suspension architecture cannot work on Gitea. -# -# The molecule-ai/molecule-core staging branch protection -# (verified via `GET /api/v1/repos/.../branch_protections`) -# whitelists `devops-engineer` for direct push. So the -# correct Gitea-shape architecture is: authenticate as -# `devops-engineer`, merge locally, push staging directly. -# -# This is structurally simpler than the GitHub-era PR dance -# and removes the dependence on `gh` CLI / GraphQL entirely. -# -# ============================================================ -# Identity + token (anti-bot-ring per saved-memory -# `feedback_per_agent_gitea_identity_default`) -# ============================================================ -# -# This workflow uses `secrets.AUTO_SYNC_TOKEN`, which is a -# personal access token issued to the `devops-engineer` -# persona on Gitea — NOT the founder PAT. The bot-ring -# fingerprint that triggered the GitHub org suspension on -# 2026-05-06 was characterised by founder PAT acting as CI -# at machine speed; per-persona identities split the -# attribution honestly. -# -# Token scope on Gitea: repo write. Push target restricted -# to `staging` (this workflow is the only writer; main is -# untouched). Compromise blast radius: bounded to staging -# branch + this repo's read surface. 
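-#
-# (Reproducing the branch-protection check mentioned above is a
-# one-liner; a sketch — the jq filter is illustrative, not a recorded
-# command:
-# ```
-# curl -sS -H "Authorization: token $AUTO_SYNC_TOKEN" \
-#   "https://git.moleculesai.app/api/v1/repos/molecule-ai/molecule-core/branch_protections" \
-#   | jq -r '.[] | select(.branch_name == "staging") | .push_whitelist_usernames[]'
-# ```
-# should list devops-engineer.)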
-# -# Commits are authored by the persona email -# `devops-engineer@agents.moleculesai.app` so commit history -# reflects which automation produced the merge. -# -# ============================================================ -# Failure modes & operational notes -# ============================================================ -# -# A — staging has commits main doesn't, and the merge -# conflicts: -# - The `--no-ff` merge step exits non-zero. Workflow -# fails red. Operator (devops-engineer or human) -# resolves manually: -# git fetch origin -# git checkout staging -# git merge --no-ff origin/main -# # resolve conflicts -# git push origin staging -# - Step summary surfaces the conflict so the failed run -# is self-explanatory. -# -# B — `AUTO_SYNC_TOKEN` rotated / wrong scope: -# - `git push` step exits non-zero with `HTTP 401` / -# `403`. Step summary surfaces the failed push. -# - Re-issue the token from `~/.molecule-ai/personas/` -# on the operator host and update the repo Actions -# secret. Re-run the workflow. -# -# C — staging branch protection no longer whitelists -# `devops-engineer`: -# - `git push` exits non-zero with a Gitea protected- -# branch rejection. Step summary surfaces it. -# - Re-add `devops-engineer` to -# `push_whitelist_usernames` on the staging -# protection (Settings → Branches → staging). -# -# D — concurrent push to main while a sync is in flight: -# - The `concurrency` group below serialises runs. -# The second waits for the first; if main advances -# again while we're syncing, the second run picks -# up the new tip on its own fetch. -# -# ============================================================ -# Loop safety -# ============================================================ -# -# The push to staging from this workflow does NOT itself -# fire a `push: branches: [main]` event (different branch), -# so there's no risk of self-recursion. `auto-promote-staging.yml` -# fires on `workflow_run` of CI etc. — it sees the new -# staging tip on its next gate-completion event, NOT on this -# push directly. No loop. - -on: - push: - branches: [main] - # workflow_dispatch lets operators manually backfill a - # missed sync (e.g. if AUTO_SYNC_TOKEN was rotated and a - # main push slipped through while the secret was stale). - workflow_dispatch: - -permissions: - contents: write - -concurrency: - group: auto-sync-main-to-staging - cancel-in-progress: false - -jobs: - sync-staging: - runs-on: ubuntu-latest - steps: - - name: Checkout staging (with devops-engineer push token) - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - fetch-depth: 0 - ref: staging - # AUTO_SYNC_TOKEN authenticates as the - # `devops-engineer` Gitea persona — the only - # identity whitelisted for direct push to - # staging. See header comment for context. - token: ${{ secrets.AUTO_SYNC_TOKEN }} - - - name: Configure git author - run: | - # Per-persona identity, NOT founder PAT. - # `feedback_per_agent_gitea_identity_default`. - git config user.name "devops-engineer" - git config user.email "devops-engineer@agents.moleculesai.app" - - - name: Check if staging already contains main - id: check - run: | - set -euo pipefail - git fetch origin main - if git merge-base --is-ancestor origin/main HEAD; then - echo "needs_sync=false" >> "$GITHUB_OUTPUT" - { - echo "## No-op" - echo - echo "staging already contains \`origin/main\` ($(git rev-parse --short=8 origin/main))." 
- } >> "$GITHUB_STEP_SUMMARY" - else - echo "needs_sync=true" >> "$GITHUB_OUTPUT" - MAIN_SHORT=$(git rev-parse --short=8 origin/main) - echo "main_short=${MAIN_SHORT}" >> "$GITHUB_OUTPUT" - echo "::notice::staging is missing main's tip (${MAIN_SHORT}) — merging in-runner and pushing" - fi - - - name: Merge main into staging (in-runner) - if: steps.check.outputs.needs_sync == 'true' - id: merge - run: | - set -euo pipefail - # Already on staging from checkout. Try fast-forward - # first (cleanest history); fall back to merge commit - # if staging has commits main doesn't. - if git merge --ff-only origin/main; then - echo "did_ff=true" >> "$GITHUB_OUTPUT" - echo "::notice::Fast-forwarded staging to origin/main" - else - echo "did_ff=false" >> "$GITHUB_OUTPUT" - if ! git merge --no-ff origin/main \ - -m "chore: sync main → staging (auto, ${{ steps.check.outputs.main_short }})"; then - # Hygiene: leave the work tree clean before failing. - git merge --abort || true - { - echo "## Conflict" - echo - echo "Auto-merge \`main → staging\` failed with conflicts." - echo "A human (or devops-engineer persona) needs to resolve manually:" - echo - echo '```' - echo "git fetch origin" - echo "git checkout staging" - echo "git merge --no-ff origin/main" - echo "# resolve conflicts" - echo "git push origin staging" - echo '```' - } >> "$GITHUB_STEP_SUMMARY" - exit 1 - fi - fi - - - name: Push staging to origin - if: steps.check.outputs.needs_sync == 'true' - run: | - set -euo pipefail - # Direct push to staging. devops-engineer persona is - # whitelisted for direct push on the staging branch - # protection (Settings → Branches → staging). - # - # No --force / --force-with-lease: a fast-forward or - # legitimate merge commit on top of current staging - # is the only thing we'd ever push. If origin/staging - # advanced under us (concurrent merge), the push - # legitimately rejects and the next run picks up the - # new state. - if ! git push origin staging; then - { - echo "## Push rejected" - echo - echo "Direct push to \`staging\` failed. Likely causes:" - echo "- \`AUTO_SYNC_TOKEN\` rotated / wrong scope (HTTP 401/403)" - echo "- \`devops-engineer\` no longer in" - echo " \`push_whitelist_usernames\` on the staging" - echo " branch protection (HTTP 422)" - echo "- staging advanced concurrently — re-running this" - echo " workflow on the new main tip will pick it up" - } >> "$GITHUB_STEP_SUMMARY" - exit 1 - fi - - { - echo "## Auto-sync succeeded" - echo - echo "- staging advanced to: \`$(git rev-parse --short=8 HEAD)\`" - echo "- main tip: \`${{ steps.check.outputs.main_short }}\`" - echo "- Strategy: $([ "${{ steps.merge.outputs.did_ff }}" = "true" ] && echo "fast-forward" || echo "merge commit")" - echo "- Pushed by: \`devops-engineer\` (per-agent persona, anti-bot-ring)" - } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/canary-staging.yml b/.github/workflows/canary-staging.yml index b040b196..bf75c57f 100644 --- a/.github/workflows/canary-staging.yml +++ b/.github/workflows/canary-staging.yml @@ -20,6 +20,19 @@ on: # a few minutes under load — that's fine for a canary. - cron: '*/30 * * * *' workflow_dispatch: + inputs: + keep_on_failure: + description: >- + Skip teardown when the canary fails (debugging only). The + tenant org + EC2 + CF tunnel + DNS stay alive so an operator + can SSM into the workspace EC2 and capture docker logs of the + failing claude-code container. REMEMBER to manually delete + via DELETE /cp/admin/tenants/ when done so the org + doesn't accumulate cost. 
Only honored on workflow_dispatch; + cron runs always tear down (we don't want unattended cron + to leak resources). + type: boolean + default: false # Serialise with the full-SaaS workflow so they don't contend for the # same org-create quota on staging. Different group key from @@ -80,6 +93,14 @@ jobs: # is "Token Plan only" but cheap-per-token and fast. E2E_MODEL_SLUG: MiniMax-M2.7-highspeed E2E_RUN_ID: "canary-${{ github.run_id }}" + # Debug-only: when an operator dispatches with keep_on_failure=true, + # the canary script's E2E_KEEP_ORG=1 path skips teardown so the + # tenant org + EC2 stay alive for SSM-based log capture. Cron runs + # never set this (the input only exists on workflow_dispatch) so + # unattended cron always tears down. See molecule-core#129 + # failure mode #1 — capturing the actual exception requires + # docker logs from the live container. + E2E_KEEP_ORG: ${{ github.event.inputs.keep_on_failure == 'true' && '1' || '0' }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -137,27 +158,28 @@ jobs: id: canary run: bash tests/e2e/test_staging_full_saas.sh - # Alerting: open an issue only after THREE consecutive failures so - # transient flakes (Cloudflare DNS hiccup, AWS API blip) don't spam - # the issue list. If an issue is already open, we still comment on - # every failure so ops sees the streak. Auto-close on next green. + # Alerting: open a sticky issue on the FIRST failure; comment on + # subsequent failures; auto-close on next green. Comment-on-existing + # de-duplicates so a single open issue accumulates the streak — + # ops sees one issue with N comments rather than N issues. # - # Threshold rationale: canary fires every 30 min, so 3 failures = - # ~90 min of consecutive red — well past any single-run flake but - # still tight enough that a real outage gets surfaced before the - # next deploy window. + # Why no consecutive-failures threshold (e.g., wait 3 runs before + # filing): the prior threshold check used + # `github.rest.actions.listWorkflowRuns()` which Gitea 1.22.6 does + # not expose (returns 404). On Gitea Actions the threshold call + # ALWAYS failed, breaking the entire alerting step and going days + # silent on real regressions (38h+ chronic red on 2026-05-07/08 + # before this fix; tracked in molecule-core#129). Filing on first + # failure is also better UX — we want to know about the first red, + # not wait 90 min for it to "count." Real flakes get one issue + + # a quick close-on-green; persistent reds accumulate comments. - name: Open issue on failure if: failure() uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 - env: - # Inject the workflow path explicitly — context.workflow is - # the *name*, not the file path the actions API needs. - WORKFLOW_PATH: '.github/workflows/canary-staging.yml' - CONSECUTIVE_THRESHOLD: '3' with: script: | const title = '🔴 Canary failing: staging SaaS smoke'; - const runURL = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; + const runURL = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; // Find an existing open canary issue (stable title match). // If one exists, this isn't a "first failure" — comment and exit. @@ -177,32 +199,12 @@ jobs: return; } - // No open issue yet — check the last N-1 runs' conclusions. - // We open the issue only if the last (THRESHOLD-1) runs ALSO - // failed (so this is the 3rd consecutive red). 
- const threshold = parseInt(process.env.CONSECUTIVE_THRESHOLD, 10); - const { data: runs } = await github.rest.actions.listWorkflowRuns({ - owner: context.repo.owner, repo: context.repo.repo, - workflow_id: process.env.WORKFLOW_PATH, - status: 'completed', - per_page: threshold, - // Skip the current in-progress run; it isn't 'completed' yet. - }); - // listWorkflowRuns returns recent first. We need (threshold-1) - // prior failures (current run is the threshold-th). - const priorFailures = (runs.workflow_runs || []) - .slice(0, threshold - 1) - .filter(r => r.id !== context.runId) - .filter(r => r.conclusion === 'failure') - .length; - if (priorFailures < threshold - 1) { - core.info(`Below threshold: ${priorFailures + 1}/${threshold} consecutive failures — not filing yet`); - return; - } - + // No open issue yet — file one on this first failure. The + // comment-on-existing branch above means subsequent failures + // accumulate as comments on this same issue, so we don't + // spam new issues per run. const body = - `Canary run failed at ${new Date().toISOString()}, ` + - `${threshold} consecutive runs red.\n\n` + + `Canary run failed at ${new Date().toISOString()}.\n\n` + `Run: ${runURL}\n\n` + `This issue auto-closes on the next green canary run. ` + `Consecutive failures add a comment here rather than a new issue.`; @@ -211,7 +213,7 @@ jobs: title, body, labels: ['canary-staging', 'bug'], }); - core.info(`Opened canary failure issue (${threshold} consecutive reds)`); + core.info('Opened canary failure issue (first red)'); - name: Auto-close canary issue on success if: success() diff --git a/.github/workflows/e2e-staging-canvas.yml b/.github/workflows/e2e-staging-canvas.yml index 30a38e5f..67fe1d6d 100644 --- a/.github/workflows/e2e-staging-canvas.yml +++ b/.github/workflows/e2e-staging-canvas.yml @@ -22,9 +22,9 @@ on: # spending CI cycles. See e2e-api.yml for the rationale on why this # is a single job rather than two-jobs-sharing-name. push: - branches: [main, staging] + branches: [main] pull_request: - branches: [main, staging] + branches: [main] workflow_dispatch: schedule: # Weekly on Sunday 08:00 UTC — catches Chrome / Playwright / Next.js diff --git a/.github/workflows/e2e-staging-external.yml b/.github/workflows/e2e-staging-external.yml index acd9cef2..5b8d4a9c 100644 --- a/.github/workflows/e2e-staging-external.yml +++ b/.github/workflows/e2e-staging-external.yml @@ -32,7 +32,7 @@ name: E2E Staging External Runtime on: push: - branches: [staging, main] + branches: [main] paths: - 'workspace-server/internal/handlers/workspace.go' - 'workspace-server/internal/handlers/registry.go' @@ -44,7 +44,7 @@ on: - 'tests/e2e/test_staging_external_runtime.sh' - '.github/workflows/e2e-staging-external.yml' pull_request: - branches: [staging, main] + branches: [main] paths: - 'workspace-server/internal/handlers/workspace.go' - 'workspace-server/internal/handlers/registry.go' diff --git a/.github/workflows/e2e-staging-saas.yml b/.github/workflows/e2e-staging-saas.yml index bab88409..43e81aba 100644 --- a/.github/workflows/e2e-staging-saas.yml +++ b/.github/workflows/e2e-staging-saas.yml @@ -20,13 +20,12 @@ name: E2E Staging SaaS (full lifecycle) # via the same paths watcher that e2e-api.yml uses) on: - # Fire on staging push too — previously this only ran on main, which - # meant the most thorough end-to-end test caught regressions AFTER - # they shipped to staging (and then to the auto-promote PR). 
Running - # on staging push catches them BEFORE the staging→main promotion - # opens, so a green canary into auto-promote is more meaningful. + # Trunk-based (Phase 3 of internal#81): main is the only branch. + # Previously this fired on staging push too because staging was a + # superset of main and ran the gate ahead of auto-promote; with no + # staging branch, main is where E2E gates the deploy. push: - branches: [staging, main] + branches: [main] paths: - 'workspace-server/internal/handlers/registry.go' - 'workspace-server/internal/handlers/workspace_provision.go' @@ -36,7 +35,7 @@ on: - 'tests/e2e/test_staging_full_saas.sh' - '.github/workflows/e2e-staging-saas.yml' pull_request: - branches: [staging, main] + branches: [main] paths: - 'workspace-server/internal/handlers/registry.go' - 'workspace-server/internal/handlers/workspace_provision.go' diff --git a/.github/workflows/harness-replays.yml b/.github/workflows/harness-replays.yml index b53d0b3f..23681ff7 100644 --- a/.github/workflows/harness-replays.yml +++ b/.github/workflows/harness-replays.yml @@ -119,6 +119,17 @@ jobs: # symptom, different root cause: staging still has the in-image # clone path, hits the auth error directly). # + # 2026-05-08 sub-finding (#192): the clone step ALSO fails when + # any referenced workspace-template repo is private and the + # AUTO_SYNC_TOKEN bearer (devops-engineer persona) lacks read + # access. Root cause: 5 of 9 workspace-template repos + # (openclaw, codex, crewai, deepagents, gemini-cli) had been + # marked private with no team grant. Resolution: flipped them + # to public per `feedback_oss_first_repo_visibility_default` + # (the OSS surface should be public). Layer-3 (customer-private + + # marketplace third-party repos) tracked separately in + # internal#102. + # # Token shape matches publish-workspace-server-image.yml: AUTO_SYNC_TOKEN # is the devops-engineer persona PAT, NOT the founder PAT (per # `feedback_per_agent_gitea_identity_default`). clone-manifest.sh diff --git a/.github/workflows/redeploy-tenants-on-staging.yml b/.github/workflows/redeploy-tenants-on-staging.yml index 2726db9e..695f6643 100644 --- a/.github/workflows/redeploy-tenants-on-staging.yml +++ b/.github/workflows/redeploy-tenants-on-staging.yml @@ -36,7 +36,7 @@ on: workflow_run: workflows: ['publish-workspace-server-image'] types: [completed] - branches: [staging] + branches: [main] workflow_dispatch: inputs: target_tag: diff --git a/.github/workflows/retarget-main-to-staging.yml b/.github/workflows/retarget-main-to-staging.yml deleted file mode 100644 index 5c5d81f8..00000000 --- a/.github/workflows/retarget-main-to-staging.yml +++ /dev/null @@ -1,276 +0,0 @@ -name: Retarget main PRs to staging - -# Mechanical enforcement of SHARED_RULES rule 8 ("Staging-first -# workflow, no exceptions"). When a bot opens a PR against `main`, -# retarget it to `staging` automatically and leave an explanatory -# comment. Human / CEO-authored PRs (the staging→main promotion -# PRs, etc.) are left alone — they're the authorised exception -# to the rule. -# -# ============================================================ -# What this workflow does -# ============================================================ -# -# On `pull_request_target` opened/reopened against `main`: -# 1. If the PR head is `staging`, skip (the auto-promote PRs -# MUST stay base=main). -# 2. If the PR author is a bot, retarget the PR base to -# `staging` via Gitea REST `PATCH /pulls/{N}` body -# `{"base":"staging"}`. -# 3. 
If the retarget returns 422 "pull request already exists -# for base branch 'staging'" (issue #1884 case: another PR -# on the same head already targets staging), close the -# now-redundant main-PR via Gitea REST instead of failing -# red. -# 4. Post an explainer comment on the retargeted PR via -# Gitea REST `POST /issues/{N}/comments`. -# -# ============================================================ -# Why Gitea REST (and not `gh api / gh pr close / gh pr comment`) -# ============================================================ -# -# Pre-2026-05-06 this workflow used `gh api -X PATCH "repos/{owner}/{repo}/pulls/{N}" -f base=staging` -# plus `gh pr close` and `gh pr comment`. After the GitHub→Gitea -# cutover those calls fail because: -# -# - `gh` CLI defaults to `api.github.com`. Even with `GH_HOST` -# pointing at Gitea, `gh pr close / comment` route through -# GraphQL (`/api/graphql`) which Gitea does not expose. -# Empirical: every `gh pr *` call returns -# `HTTP 405 Method Not Allowed (https://git.moleculesai.app/api/graphql)` -# — same root cause as #65 (auto-sync, fixed in PR #66) and -# #73/#195 (auto-promote, fixed in PR #78). -# - `gh api -X PATCH /pulls/{N}` happens to use a REST path -# that Gitea also has, but the `gh` host-resolution layer -# and pagination/retry logic don't always hit Gitea cleanly, -# and the cost of switching to direct `curl` is one extra -# line of code. -# -# So this workflow uses direct `curl` calls to Gitea REST. No -# `gh` CLI dependency, no GraphQL, no flaky host-resolution. -# -# ============================================================ -# Identity + token (anti-bot-ring per saved-memory -# `feedback_per_agent_gitea_identity_default`) -# ============================================================ -# -# Pre-fix this workflow used the per-job ephemeral -# `secrets.GITHUB_TOKEN`. On Gitea Actions that token has -# narrow scope and unpredictable cross-PR write capability. -# -# Post-fix: `secrets.AUTO_SYNC_TOKEN` (the `devops-engineer` -# Gitea persona). Same persona used by `auto-sync-main-to-staging.yml` -# (PR #66) and `auto-promote-staging.yml` (PR #78). Token scope: -# `push: true` repo write, sufficient for PR-edit + close + comment. -# -# Why this token does NOT need branch-protection bypass: -# patching a PR's base ref is a PR-level operation that does not -# require push perms on either branch (the PR's own commits stay -# put; only the metadata changes). -# -# ============================================================ -# Failure modes & operational notes -# ============================================================ -# -# A — PATCH base→staging returns 422 "pull request already exists" -# (issue #1884 case): -# - Detected by string-match on response body. Workflow -# falls through to closing the now-redundant main-PR -# (Gitea REST `PATCH /pulls/{N}` with `state: closed`) -# and posts an explanation comment. Step summary surfaces. -# -# B — `AUTO_SYNC_TOKEN` rotated / wrong scope: -# - First REST call returns 401/403. Step summary surfaces. -# Re-issue token from `~/.molecule-ai/personas/` on the -# operator host and update repo Actions secret. -# -# C — PR was deleted between trigger and run: -# - REST call returns 404. Workflow exits 0 with a notice -# (the rule was already enforced or the PR is gone). -# -# D — author is not actually a bot but the filter mis-fires: -# - Filter is conservative: only triggers on -# `user.type == 'Bot'`, `login` ends with `[bot]`, or -# known bot logins (`molecule-ai[bot]`, `app/molecule-ai`). 
-# Human PRs slip through unaffected. If a NEW bot login -# starts shipping main-PRs, add it to the filter. - -on: - pull_request_target: - types: [opened, reopened] - branches: [main] - -permissions: - pull-requests: write - -jobs: - retarget: - name: Retarget to staging - runs-on: ubuntu-latest - # Only fire for bot-authored PRs. Human CEO PRs (staging→main - # promotion) are intentional and pass through. - # - # Head-ref guard: never retarget a PR whose head IS `staging` - # — those are the auto-promote staging→main PRs (opened by - # `devops-engineer` since PR #78 / #195 fix). Retargeting - # head=staging onto base=staging fails with HTTP 422 "no new - # commits between base 'staging' and head 'staging'", which - # would surface as a noisy red workflow run on every - # auto-promote (caught 2026-05-03 on the GitHub-era PR #2588). - if: >- - github.event.pull_request.head.ref != 'staging' - && ( - github.event.pull_request.user.type == 'Bot' - || endsWith(github.event.pull_request.user.login, '[bot]') - || github.event.pull_request.user.login == 'app/molecule-ai' - || github.event.pull_request.user.login == 'molecule-ai[bot]' - || github.event.pull_request.user.login == 'devops-engineer' - ) - steps: - - name: Retarget PR base to staging via Gitea REST - id: retarget - env: - GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }} - GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }} - REPO: ${{ github.repository }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PR_AUTHOR: ${{ github.event.pull_request.user.login }} - # Issue #1884 case: when the bot opens a PR against main - # and there's already another PR on the same head branch - # targeting staging, Gitea's PATCH returns 422 with a - # body mentioning "pull request already exists for base - # branch 'staging'" (the Gitea message wording is - # slightly different from GitHub's; the substring match - # below covers both for forward/back compat). - # The retarget can't proceed — but the right response is - # to close the now-redundant main-PR, not to fail the - # workflow noisily. Detect that specific 422 and close - # instead. - run: | - set -euo pipefail - - API="${GITEA_HOST}/api/v1/repos/${REPO}" - AUTH=(-H "Authorization: token ${GITEA_TOKEN}" -H "Accept: application/json") - - echo "Retargeting PR #${PR_NUMBER} (author: ${PR_AUTHOR}) from main → staging" - - # Curl-status-capture pattern per `feedback_curl_status_capture_pollution`: - # http_code via -w to its own scalar, body to a tempfile, set +e/-e - # bracket so curl's non-zero-on-4xx doesn't pollute the script's exit chain. - BODY_FILE=$(mktemp) - REQ='{"base":"staging"}' - - set +e - STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \ - -X PATCH -d "${REQ}" \ - -o "${BODY_FILE}" -w "%{http_code}" \ - "${API}/pulls/${PR_NUMBER}") - CURL_RC=$? - set -e - - if [ "${CURL_RC}" -ne 0 ]; then - echo "::error::curl PATCH failed (rc=${CURL_RC})" - rm -f "${BODY_FILE}" - exit 1 - fi - - if [ "${STATUS}" = "201" ] || [ "${STATUS}" = "200" ]; then - NEW_BASE=$(jq -r '.base.ref // "?"' < "${BODY_FILE}") - rm -f "${BODY_FILE}" - if [ "${NEW_BASE}" = "staging" ]; then - echo "::notice::Retargeted PR #${PR_NUMBER} → staging" - echo "outcome=retargeted" >> "$GITHUB_OUTPUT" - exit 0 - fi - echo "::error::PATCH returned ${STATUS} but base.ref is '${NEW_BASE}', not 'staging'" - exit 1 - fi - - # Specifically match the 422 duplicate-base/head error so - # any OTHER PATCH failure (auth, deleted PR, etc.) still - # surfaces as a real workflow failure. 
- BODY=$(cat "${BODY_FILE}" || true) - rm -f "${BODY_FILE}" - - if [ "${STATUS}" = "422" ] && echo "${BODY}" | grep -qE "(pull request already exists for base branch 'staging'|already exists.*base.*staging)"; then - echo "::notice::PR #${PR_NUMBER}: duplicate target-staging PR exists on same head — closing this main-PR as redundant." - - # Close the now-redundant main-PR via Gitea REST - # (PATCH state=closed). Post comment explaining - # rationale BEFORE close so the comment lands on the - # PR (commenting on a closed PR works on Gitea, but - # historically caused notification ordering surprises). - - CLOSE_BODY_FILE=$(mktemp) - CMT_REQ=$(jq -n '{body:"[retarget-bot] Closing — another PR on the same head branch already targets `staging`. This PR is redundant. See issue #1884 for the rationale."}') - set +e - CMT_STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \ - -X POST -d "${CMT_REQ}" \ - -o "${CLOSE_BODY_FILE}" -w "%{http_code}" \ - "${API}/issues/${PR_NUMBER}/comments") - set -e - if [ "${CMT_STATUS}" != "201" ]; then - echo "::warning::dup-close comment POST returned ${CMT_STATUS}; continuing to close anyway" - cat "${CLOSE_BODY_FILE}" | head -c 300 || true - fi - rm -f "${CLOSE_BODY_FILE}" - - CLOSE_REQ='{"state":"closed"}' - CLOSE_RESP=$(mktemp) - set +e - CL_STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \ - -X PATCH -d "${CLOSE_REQ}" \ - -o "${CLOSE_RESP}" -w "%{http_code}" \ - "${API}/pulls/${PR_NUMBER}") - set -e - if [ "${CL_STATUS}" = "201" ] || [ "${CL_STATUS}" = "200" ]; then - echo "::notice::Closed PR #${PR_NUMBER} as redundant" - echo "outcome=closed-as-duplicate" >> "$GITHUB_OUTPUT" - rm -f "${CLOSE_RESP}" - exit 0 - fi - echo "::error::Failed to close redundant PR: HTTP ${CL_STATUS}" - cat "${CLOSE_RESP}" | head -c 300 || true - rm -f "${CLOSE_RESP}" - exit 1 - fi - - echo "::error::Retarget PATCH failed and was NOT a duplicate-base error: HTTP ${STATUS}" - echo "${BODY}" | head -c 500 >&2 - exit 1 - - - name: Post explainer comment - if: steps.retarget.outputs.outcome == 'retargeted' - env: - GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }} - GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }} - REPO: ${{ github.repository }} - PR_NUMBER: ${{ github.event.pull_request.number }} - run: | - set -euo pipefail - - API="${GITEA_HOST}/api/v1/repos/${REPO}" - AUTH=(-H "Authorization: token ${GITEA_TOKEN}" -H "Accept: application/json") - - # PR comments live on the issue endpoint in Gitea - # (PRs ARE issues — same endpoint, different sub-resources - # for diffs/files/etc.). The body uses jq to safely - # encode the multi-line markdown without shell-quote - # nightmares. - REQ=$(jq -n '{body:"[retarget-bot] This PR was opened against `main` and has been retargeted to `staging` automatically.\n\n**Why:** per [SHARED_RULES rule 8](https://git.moleculesai.app/molecule-ai/molecule-ai-org-template-molecule-dev/src/branch/main/SHARED_RULES.md), all feature work targets `staging` first; the CEO promotes `staging → main` separately.\n\n**What changed:** just the base branch — no code change. CI will re-run against `staging`. If you get merge conflicts, rebase on `staging`.\n\n**If this PR is the CEO`s staging→main promotion:** the Action skipped you (only bot-authored PRs are retargeted, head=staging is also exempted). 
If you see this comment on your CEO PR, that`s a bug — please tag @hongmingwang."}') - - BODY_FILE=$(mktemp) - set +e - STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \ - -X POST -d "${REQ}" \ - -o "${BODY_FILE}" -w "%{http_code}" \ - "${API}/issues/${PR_NUMBER}/comments") - set -e - - if [ "${STATUS}" = "201" ]; then - echo "::notice::Posted explainer comment on PR #${PR_NUMBER}" - else - echo "::warning::Failed to post explainer (HTTP ${STATUS}) — retarget itself succeeded" - cat "${BODY_FILE}" | head -c 300 || true - fi - rm -f "${BODY_FILE}" diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..847a85ce --- /dev/null +++ b/Makefile @@ -0,0 +1,28 @@ +# Top-level Makefile — convenience wrappers around docker compose. +# +# Most molecule-core dev work happens via these shortcuts. CI doesn't +# use this Makefile; CI calls docker compose / go test directly so the +# Makefile can evolve without breaking the build. + +.PHONY: help dev up down logs build test + +help: ## Show this help. + @grep -E '^[a-zA-Z_-]+:.*?## ' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-12s\033[0m %s\n", $$1, $$2}' + +dev: ## Start the full stack with air hot-reload for the platform service. + docker compose -f docker-compose.yml -f docker-compose.dev.yml up + +up: ## Start the full stack in production-shape mode (no air, normal Dockerfile). + docker compose up + +down: ## Stop the stack and remove containers (volumes preserved). + docker compose down + +logs: ## Tail logs from all services (Ctrl-C to detach). + docker compose logs -f + +build: ## Force a fresh build of the platform image (no cache). + docker compose build --no-cache platform + +test: ## Run Go unit tests in workspace-server/. + cd workspace-server && go test -race ./... diff --git a/canvas/.dockerignore b/canvas/.dockerignore new file mode 100644 index 00000000..f859f851 --- /dev/null +++ b/canvas/.dockerignore @@ -0,0 +1,10 @@ +# Excluded from `docker build` context. Without this, the COPY . . step in +# canvas/Dockerfile clobbers the freshly-installed node_modules with the +# host's (potentially broken / wrong-arch) copy — the @tailwindcss/oxide +# native binary disagreed and broke `next build`. +node_modules +.next +.git +*.log +.env* +!.env.example diff --git a/canvas/Dockerfile b/canvas/Dockerfile index e834b7a5..6aa8f446 100644 --- a/canvas/Dockerfile +++ b/canvas/Dockerfile @@ -1,7 +1,11 @@ FROM node:22-alpine AS builder WORKDIR /app COPY package.json package-lock.json* ./ -RUN npm install +# `npm ci` (not `install`) for lockfile-exact reproducibility. +# `--include=optional` ensures the platform-specific @tailwindcss/oxide +# native binary lands — without it, postcss fails with "Cannot read +# properties of undefined (reading 'All')" at build time. +RUN npm ci --include=optional COPY . . ARG NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080 ARG NEXT_PUBLIC_WS_URL=ws://localhost:8080/ws diff --git a/canvas/src/components/tabs/chat/AttachmentAudio.tsx b/canvas/src/components/tabs/chat/AttachmentAudio.tsx index b696125d..d81ee80a 100644 --- a/canvas/src/components/tabs/chat/AttachmentAudio.tsx +++ b/canvas/src/components/tabs/chat/AttachmentAudio.tsx @@ -9,6 +9,7 @@ // AttachmentLightbox). 
 import { useState, useEffect, useRef } from "react";
+import { platformAuthHeaders } from "@/lib/api";
 import type { ChatAttachment } from "./types";
 import { isPlatformAttachment, resolveAttachmentHref } from "./uploads";
 import { AttachmentChip } from "./AttachmentViews";
@@ -43,13 +44,8 @@ export function AttachmentAudio({ workspaceId, attachment, onDownload, tone }: P
     void (async () => {
       try {
         const href = resolveAttachmentHref(workspaceId, attachment.uri);
-        const headers: Record<string, string> = {};
-        const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
-        if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
-        const slug = getTenantSlug();
-        if (slug) headers["X-Molecule-Org-Slug"] = slug;
         const res = await fetch(href, {
-          headers,
+          headers: platformAuthHeaders(),
           credentials: "include",
           signal: AbortSignal.timeout(60_000),
         });
@@ -116,9 +112,5 @@ export function AttachmentAudio({ workspaceId, attachment, onDownload, tone }: P
   );
 }
-function getTenantSlug(): string | null {
-  if (typeof window === "undefined") return null;
-  const host = window.location.hostname;
-  const m = host.match(/^([^.]+)\.moleculesai\.app$/);
-  return m ? m[1] : null;
-}
+// Local getTenantSlug() removed — auth-header construction now goes
+// through platformAuthHeaders() from @/lib/api (#178).
diff --git a/canvas/src/components/tabs/chat/AttachmentImage.tsx b/canvas/src/components/tabs/chat/AttachmentImage.tsx
index a219cb80..ca4df242 100644
--- a/canvas/src/components/tabs/chat/AttachmentImage.tsx
+++ b/canvas/src/components/tabs/chat/AttachmentImage.tsx
@@ -35,6 +35,7 @@
 // downscale via canvas, but defer that to v2.
 import { useState, useEffect, useRef } from "react";
+import { platformAuthHeaders } from "@/lib/api";
 import type { ChatAttachment } from "./types";
 import { isPlatformAttachment, resolveAttachmentHref } from "./uploads";
 import { AttachmentLightbox } from "./AttachmentLightbox";
@@ -75,22 +76,14 @@ export function AttachmentImage({ workspaceId, attachment, onDownload, tone }: P
     }
     // Platform-auth path: identical to downloadChatFile but we keep
-    // the blob (don't trigger a Save-As). Use the same headers it does
-    // by going through it indirectly — no, downloadChatFile triggers a
-    // Save-As. Need a separate fetch.
+    // the blob (don't trigger a Save-As). Auth headers come from the
+    // shared `platformAuthHeaders()` helper — one source of truth for
+    // every authenticated raw fetch in the canvas (#178).
     void (async () => {
       try {
         const href = resolveAttachmentHref(workspaceId, attachment.uri);
-        const headers: Record<string, string> = {};
-        // Read the same env var downloadChatFile reads — single source
-        // of truth would be cleaner; refactor opportunity for PR-2 if
-        // we add the same path to AttachmentVideo.
-        const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
-        if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
-        const slug = getTenantSlug();
-        if (slug) headers["X-Molecule-Org-Slug"] = slug;
         const res = await fetch(href, {
-          headers,
+          headers: platformAuthHeaders(),
           credentials: "include",
           signal: AbortSignal.timeout(30_000),
         });
@@ -184,15 +177,7 @@ export function AttachmentImage({ workspaceId, attachment, onDownload, tone }: P
   );
 }
-// Internal helper — duplicated from uploads.ts (it's not exported
-// there). Kept local so this component doesn't reach into private
-// surface; if AttachmentVideo / AttachmentPDF in PR-2/PR-3 also need
-// it, lift to an exported helper at that point (the third-caller
-// rule).
-function getTenantSlug(): string | null {
-  if (typeof window === "undefined") return null;
-  const host = window.location.hostname;
-  // Tenant subdomain shape: <slug>.moleculesai.app
-  const m = host.match(/^([^.]+)\.moleculesai\.app$/);
-  return m ? m[1] : null;
-}
+// Local getTenantSlug() removed — auth-header construction now goes
+// through platformAuthHeaders() from @/lib/api which uses the canonical
+// getTenantSlug() from @/lib/tenant. This eliminates the duplicate
+// hostname-regex + the duplicate bearer-token-attach pattern (#178).
diff --git a/canvas/src/components/tabs/chat/AttachmentPDF.tsx b/canvas/src/components/tabs/chat/AttachmentPDF.tsx
index efe5fc90..784aa444 100644
--- a/canvas/src/components/tabs/chat/AttachmentPDF.tsx
+++ b/canvas/src/components/tabs/chat/AttachmentPDF.tsx
@@ -33,6 +33,7 @@
 // timeout, swap to chip. Implemented as a 3-second watchdog.
 import { useState, useEffect, useRef } from "react";
+import { platformAuthHeaders } from "@/lib/api";
 import type { ChatAttachment } from "./types";
 import { isPlatformAttachment, resolveAttachmentHref } from "./uploads";
 import { AttachmentLightbox } from "./AttachmentLightbox";
@@ -69,13 +70,8 @@ export function AttachmentPDF({ workspaceId, attachment, onDownload, tone }: Pro
     void (async () => {
       try {
         const href = resolveAttachmentHref(workspaceId, attachment.uri);
-        const headers: Record<string, string> = {};
-        const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
-        if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
-        const slug = getTenantSlug();
-        if (slug) headers["X-Molecule-Org-Slug"] = slug;
         const res = await fetch(href, {
-          headers,
+          headers: platformAuthHeaders(),
           credentials: "include",
           signal: AbortSignal.timeout(60_000),
         });
@@ -189,9 +185,5 @@ function PdfGlyph() {
   );
 }
-function getTenantSlug(): string | null {
-  if (typeof window === "undefined") return null;
-  const host = window.location.hostname;
-  const m = host.match(/^([^.]+)\.moleculesai\.app$/);
-  return m ? m[1] : null;
-}
+// Local getTenantSlug() removed — auth-header construction now goes
+// through platformAuthHeaders() from @/lib/api (#178).
diff --git a/canvas/src/components/tabs/chat/AttachmentTextPreview.tsx b/canvas/src/components/tabs/chat/AttachmentTextPreview.tsx
index 41a92a45..c10c3098 100644
--- a/canvas/src/components/tabs/chat/AttachmentTextPreview.tsx
+++ b/canvas/src/components/tabs/chat/AttachmentTextPreview.tsx
@@ -26,6 +26,7 @@
 // to download the full file.
 import { useState, useEffect } from "react";
+import { platformAuthHeaders } from "@/lib/api";
 import type { ChatAttachment } from "./types";
 import { isPlatformAttachment, resolveAttachmentHref } from "./uploads";
 import { AttachmentChip } from "./AttachmentViews";
@@ -57,13 +58,13 @@ export function AttachmentTextPreview({ workspaceId, attachment, onDownload, ton
     void (async () => {
       try {
         const href = resolveAttachmentHref(workspaceId, attachment.uri);
-        const headers: Record<string, string> = {};
-        if (isPlatformAttachment(attachment.uri)) {
-          const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
-          if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
-          const slug = getTenantSlug();
-          if (slug) headers["X-Molecule-Org-Slug"] = slug;
-        }
+        // Only attach platform auth headers for in-platform URIs —
+        // off-platform URLs (HTTP/HTTPS attachments) MUST NOT receive
+        // our bearer token (it would leak the admin token to a third
+        // party). The branch is preserved with the new shared helper.
+        const headers: Record<string, string> = isPlatformAttachment(attachment.uri)
+          ? platformAuthHeaders()
+          : {};
         const res = await fetch(href, {
           headers,
           credentials: "include",
@@ -182,9 +183,5 @@ export function AttachmentTextPreview({ workspaceId, attachment, onDownload, ton
   );
 }
-function getTenantSlug(): string | null {
-  if (typeof window === "undefined") return null;
-  const host = window.location.hostname;
-  const m = host.match(/^([^.]+)\.moleculesai\.app$/);
-  return m ? m[1] : null;
-}
+// Local getTenantSlug() removed — auth-header construction now goes
+// through platformAuthHeaders() from @/lib/api (#178).
diff --git a/canvas/src/components/tabs/chat/AttachmentVideo.tsx b/canvas/src/components/tabs/chat/AttachmentVideo.tsx
index 0691006d..83d25a02 100644
--- a/canvas/src/components/tabs/chat/AttachmentVideo.tsx
+++ b/canvas/src/components/tabs/chat/AttachmentVideo.tsx
@@ -25,6 +25,7 @@
 // fetch via service worker. v2 if measured-needed.
 import { useState, useEffect, useRef } from "react";
+import { platformAuthHeaders } from "@/lib/api";
 import type { ChatAttachment } from "./types";
 import { isPlatformAttachment, resolveAttachmentHref } from "./uploads";
 import { AttachmentChip } from "./AttachmentViews";
@@ -61,13 +62,8 @@ export function AttachmentVideo({ workspaceId, attachment, onDownload, tone }: P
     void (async () => {
       try {
         const href = resolveAttachmentHref(workspaceId, attachment.uri);
-        const headers: Record<string, string> = {};
-        const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
-        if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
-        const slug = getTenantSlug();
-        if (slug) headers["X-Molecule-Org-Slug"] = slug;
         const res = await fetch(href, {
-          headers,
+          headers: platformAuthHeaders(),
           credentials: "include",
           // Videos are larger than images on average; give the request
           // more headroom. The server's per-request body cap (50MB) is
@@ -147,11 +143,5 @@ export function AttachmentVideo({ workspaceId, attachment, onDownload, tone }: P
   );
 }
-// Internal helper — same shape as AttachmentImage's. Lifted to a
-// shared util in PR-2.5 if a third caller needs it (PDF, audio).
-function getTenantSlug(): string | null {
-  if (typeof window === "undefined") return null;
-  const host = window.location.hostname;
-  const m = host.match(/^([^.]+)\.moleculesai\.app$/);
-  return m ? m[1] : null;
-}
+// Local getTenantSlug() removed — auth-header construction now goes
+// through platformAuthHeaders() from @/lib/api (#178).
diff --git a/canvas/src/components/tabs/chat/uploads.ts b/canvas/src/components/tabs/chat/uploads.ts
index 76fbdba9..d9cdde31 100644
--- a/canvas/src/components/tabs/chat/uploads.ts
+++ b/canvas/src/components/tabs/chat/uploads.ts
@@ -1,12 +1,16 @@
-import { PLATFORM_URL } from "@/lib/api";
-import { getTenantSlug } from "@/lib/tenant";
+import { PLATFORM_URL, platformAuthHeaders } from "@/lib/api";
 import type { ChatAttachment } from "./types";
 /** Chat attachments are intentionally uploaded via a direct fetch()
  * instead of the `api.post` helper — `api.post` JSON-stringifies the
- * body, which would 500 on a Blob. Mirrors the header plumbing
- * (tenant slug, admin token, credentials) so SaaS + self-hosted
- * callers work the same way. */
+ * body, which would 500 on a Blob. Auth headers (tenant slug, admin
+ * token, credentials) come from `platformAuthHeaders()` — the same
+ * helper `request()` uses, so a missing bearer surfaces as a single
+ * fix site instead of N copies.
+ * We deliberately do NOT set Content-Type so the browser writes the
+ * multipart boundary into the header; setting it manually would yield
+ * a multipart body the server can't parse. See lib/api.ts
+ * platformAuthHeaders() for the full rationale on why this pair must
+ * stay matched. */
 export async function uploadChatFiles(
   workspaceId: string,
   files: File[],
@@ -16,18 +20,12 @@
   const form = new FormData();
   for (const f of files) form.append("files", f, f.name);
-  const headers: Record<string, string> = {};
-  const slug = getTenantSlug();
-  if (slug) headers["X-Molecule-Org-Slug"] = slug;
-  const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
-  if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
-
   // Uploads legitimately take a while on cold cache (tar write +
   // docker cp into the container). 60s is comfortable for the 25MB/
   // 50MB caps the server enforces.
   const res = await fetch(`${PLATFORM_URL}/workspaces/${workspaceId}/chat/uploads`, {
     method: "POST",
-    headers,
+    headers: platformAuthHeaders(),
     body: form,
     credentials: "include",
     signal: AbortSignal.timeout(60_000),
   });
@@ -143,14 +141,8 @@ export async function downloadChatFile(
     return;
   }
-  const headers: Record<string, string> = {};
-  const slug = getTenantSlug();
-  if (slug) headers["X-Molecule-Org-Slug"] = slug;
-  const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
-  if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
-
   const res = await fetch(href, {
-    headers,
+    headers: platformAuthHeaders(),
     credentials: "include",
     signal: AbortSignal.timeout(60_000),
   });
diff --git a/canvas/src/lib/__tests__/platform-auth-headers.test.ts b/canvas/src/lib/__tests__/platform-auth-headers.test.ts
new file mode 100644
index 00000000..b750f226
--- /dev/null
+++ b/canvas/src/lib/__tests__/platform-auth-headers.test.ts
@@ -0,0 +1,97 @@
+// @vitest-environment jsdom
+import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
+
+// Tests for platformAuthHeaders — the shared helper extracted in #178
+// to consolidate the bearer-token-attach + tenant-slug-attach pattern
+// that was previously duplicated across 7 raw-fetch callsites in the
+// canvas (uploads + 5 Attachment* components + the api.ts request()
+// function).
+//
+// What we pin here:
+// - Returns a fresh object each call (so callers can mutate without
+//   leaking into each other).
+// - Empty result on a non-tenant host with no admin token (the
+//   localhost / self-hosted shape).
+// - Bearer attached when NEXT_PUBLIC_ADMIN_TOKEN is set.
+// - X-Molecule-Org-Slug attached when window.location.hostname is a
+//   tenant subdomain (<slug>.moleculesai.app).
+// - Both attached when both apply (the production SaaS shape).
+//
+// Why jsdom: getTenantSlug() reads window.location.hostname. Node-only
+// environment yields no window and getTenantSlug returns null
+// unconditionally — wouldn't exercise the slug branch.
+
+import { platformAuthHeaders } from "../api";
+
+describe("platformAuthHeaders", () => {
+  let originalAdminToken: string | undefined;
+
+  beforeEach(() => {
+    originalAdminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
+    delete process.env.NEXT_PUBLIC_ADMIN_TOKEN;
+  });
+
+  afterEach(() => {
+    if (originalAdminToken === undefined) delete process.env.NEXT_PUBLIC_ADMIN_TOKEN;
+    else process.env.NEXT_PUBLIC_ADMIN_TOKEN = originalAdminToken;
+    // jsdom resets hostname between tests via the @vitest-environment
+    // pragma's per-test isolation. No explicit reset needed.
+  });
+
+  it("returns an empty object on a non-tenant host with no admin token", () => {
+    // jsdom default hostname is "localhost" — not a tenant slug, so
+    // getTenantSlug() returns null and no X-Molecule-Org-Slug is added.
+    const headers = platformAuthHeaders();
+    expect(headers).toEqual({});
+  });
+
+  it("attaches Authorization when NEXT_PUBLIC_ADMIN_TOKEN is set", () => {
+    process.env.NEXT_PUBLIC_ADMIN_TOKEN = "local-dev-admin";
+    const headers = platformAuthHeaders();
+    expect(headers).toEqual({ Authorization: "Bearer local-dev-admin" });
+  });
+
+  it("does NOT attach Authorization when NEXT_PUBLIC_ADMIN_TOKEN is empty string", () => {
+    // Empty-string env is the JS-side shape of `KEY=` in .env.
+    // Treating it as unset matches the matched-pair guard in
+    // next.config.ts (admin-token-pair.test.ts) — symmetric semantics.
+    process.env.NEXT_PUBLIC_ADMIN_TOKEN = "";
+    const headers = platformAuthHeaders();
+    expect(headers).toEqual({});
+  });
+
+  it("attaches X-Molecule-Org-Slug on a tenant subdomain", () => {
+    Object.defineProperty(window, "location", {
+      value: { hostname: "reno-stars.moleculesai.app" },
+      writable: true,
+    });
+    const headers = platformAuthHeaders();
+    expect(headers).toEqual({ "X-Molecule-Org-Slug": "reno-stars" });
+  });
+
+  it("attaches both when both apply (production SaaS shape)", () => {
+    Object.defineProperty(window, "location", {
+      value: { hostname: "reno-stars.moleculesai.app" },
+      writable: true,
+    });
+    process.env.NEXT_PUBLIC_ADMIN_TOKEN = "tenant-bearer";
+    const headers = platformAuthHeaders();
+    // Pin exact-equality on the full shape — substring/contains
+    // assertions would also pass for an extra-header bug.
+    expect(headers).toEqual({
+      "X-Molecule-Org-Slug": "reno-stars",
+      Authorization: "Bearer tenant-bearer",
+    });
+  });
+
+  it("returns a fresh object each call (callers can mutate safely)", () => {
+    process.env.NEXT_PUBLIC_ADMIN_TOKEN = "tok";
+    const a = platformAuthHeaders();
+    const b = platformAuthHeaders();
+    expect(a).not.toBe(b); // distinct refs
+    expect(a).toEqual(b);  // same content
+    a["Content-Type"] = "application/json";
+    // Mutation on `a` does not leak into `b`.
+    expect(b["Content-Type"]).toBeUndefined();
+  });
+});
diff --git a/canvas/src/lib/api.ts b/canvas/src/lib/api.ts
index dae1152b..3ae5f413 100644
--- a/canvas/src/lib/api.ts
+++ b/canvas/src/lib/api.ts
@@ -21,6 +21,45 @@ export interface RequestOptions {
   timeoutMs?: number;
 }
+/**
+ * Build the platform auth header set used by every authenticated fetch
+ * from the canvas. Returns a fresh object so callers can mutate (e.g.
+ * append `Content-Type` for JSON requests, omit it for FormData).
+ *
+ * SaaS cross-origin shape:
+ * - `X-Molecule-Org-Slug` — derived from `window.location.hostname`
+ *   by `getTenantSlug()`. Control plane uses it for fly-replay
+ *   routing. Empty on localhost / non-tenant hosts — safe to omit.
+ * - `Authorization: Bearer <token>` — `NEXT_PUBLIC_ADMIN_TOKEN` baked
+ *   into the canvas build (see canvas/Dockerfile L8/L11). Required by
+ *   the workspace-server when `ADMIN_TOKEN` is set on the server side
+ *   (Tier-2b AdminAuth gate, wsauth_middleware.go ~L245). Empty when
+ *   no admin token was provisioned — the Tier-1 session-cookie path
+ *   handles that case via `credentials:"include"`.
+ *
+ * Why a shared helper: the two-line "read env, attach bearer; read
+ * slug, attach header" pattern was duplicated across `request()` and
+ * 7 raw-fetch callsites (chat uploads/download + 5 Attachment*
+ * components) before this consolidation.
A new poller or raw fetch
+ * that forgets one of the two headers silently 401s against
+ * workspace-server when ADMIN_TOKEN is set — the exact bug shape
+ * called out in #178 / closes the post-#176 self-review gap.
+ *
+ * Callers that want JSON Content-Type should spread this and add it
+ * themselves; FormData callers should NOT add Content-Type (the
+ * browser sets the multipart boundary). Centralizing the auth pair
+ * but leaving Content-Type up to the caller is the minimum viable
+ * shared shape.
+ */
+export function platformAuthHeaders(): Record<string, string> {
+  const headers: Record<string, string> = {};
+  const slug = getTenantSlug();
+  if (slug) headers["X-Molecule-Org-Slug"] = slug;
+  const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
+  if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
+  return headers;
+}
+
 async function request<T>(
   method: string,
   path: string,
@@ -28,17 +67,16 @@
   retryCount = 0,
   options?: RequestOptions,
 ): Promise<T> {
-  // SaaS cross-origin shape:
-  // - X-Molecule-Org-Slug: derived from window.location.hostname by
-  //   getTenantSlug(). Control plane uses it for fly-replay routing.
-  //   Empty on localhost / non-tenant hosts — safe to omit.
-  // - credentials:"include": sends the session cookie cross-origin.
-  //   Cookie's Domain=.moleculesai.app attribute + cp's CORS allow this.
-  const headers: Record<string, string> = { "Content-Type": "application/json" };
+  // JSON-bodied request — Content-Type is JSON. Auth pair comes from
+  // the shared helper; see its doc comment for the SaaS-shape rationale.
+  const headers: Record<string, string> = {
+    "Content-Type": "application/json",
+    ...platformAuthHeaders(),
+  };
+  // Re-read slug locally for the 401 handler below — `headers` already
+  // has it, but the 401 branch needs the bare value to gate the
+  // session-probe + redirect logic on tenant context.
   const slug = getTenantSlug();
-  if (slug) headers["X-Molecule-Org-Slug"] = slug;
-  const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
-  if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
 
   const res = await fetch(`${PLATFORM_URL}${path}`, {
     method,
diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml
new file mode 100644
index 00000000..ac668dfd
--- /dev/null
+++ b/docker-compose.dev.yml
@@ -0,0 +1,43 @@
+# docker-compose.dev.yml — overlay over docker-compose.yml for local dev
+# with air-driven live reload of the platform (workspace-server) service.
+#
+# Usage:
+#   docker compose -f docker-compose.yml -f docker-compose.dev.yml up
+#   (or `make dev` shorthand from repo root)
+#
+# What this overlay changes vs docker-compose.yml alone:
+# - Platform service uses workspace-server/Dockerfile.dev (air on top of
+#   golang:1.25-alpine) instead of the multi-stage prod Dockerfile.
+# - Platform service bind-mounts the host's workspace-server/ source
+#   into /app/workspace-server so air sees source edits live.
+# - Other services (postgres, redis, langfuse, etc.) inherit unchanged
+#   from docker-compose.yml.
+#
+# What stays the same:
+# - All env vars, volumes, depends_on, healthchecks from docker-compose.yml.
+# - Network topology + ports.
+# - Postgres/Redis as service containers (no in-process replacements).
+
+services:
+  platform:
+    build:
+      context: .
+      dockerfile: workspace-server/Dockerfile.dev
+    # Rebind source: edits under host's workspace-server/ propagate live.
+    # The named volume on go-build-cache speeds up first build per container.
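+    # (The bind-mount target below must stay in sync with WORKDIR in
+    # workspace-server/Dockerfile.dev — /app/workspace-server — since air
+    # runs there and loads .air.toml from that directory.)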
+    volumes:
+      - ./workspace-server:/app/workspace-server
+      - go-build-cache:/root/.cache/go-build
+      - go-mod-cache:/go/pkg/mod
+    # Air signals the running binary on rebuild; init: true provides a real
+    # PID 1 that forwards those signals and reaps the old process cleanly.
+    init: true
+    # Mark the service as dev-mode so the platform can short-circuit any
+    # behavior that's incompatible with hot-reload (e.g. background
+    # cron-style watchers that don't survive process restart). No-op
+    # today; reserved for future flag use.
+    environment:
+      MOLECULE_DEV_HOT_RELOAD: "1"
+
+volumes:
+  go-build-cache:
+  go-mod-cache:
diff --git a/docker-compose.yml b/docker-compose.yml
index 00e5804e..0bcb4a5d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -13,6 +13,7 @@ services:
       - pgdata:/var/lib/postgresql/data
     networks:
       - molecule-monorepo-net
+    restart: unless-stopped
     healthcheck:
       test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-dev}"]
       interval: 2s
@@ -50,6 +51,7 @@ services:
       - redisdata:/data
     networks:
       - molecule-monorepo-net
+    restart: unless-stopped
     healthcheck:
       test: ["CMD", "redis-cli", "ping"]
       interval: 2s
@@ -126,6 +128,10 @@ services:
       REDIS_URL: redis://redis:6379
       PORT: "${PLATFORM_PORT:-8080}"
       PLATFORM_URL: "http://platform:${PLATFORM_PORT:-8080}"
+      # Container network namespace is already isolated; "all interfaces"
+      # inside the container = the bridge interface only. The fail-closed
+      # default (127.0.0.1) would block host-to-container access.
+      BIND_ADDR: "${BIND_ADDR:-0.0.0.0}"
       # Default MOLECULE_ENV=development so the WorkspaceAuth / AdminAuth
       # middleware fail-open path activates when ADMIN_TOKEN is unset —
       # otherwise the canvas (which runs without a bearer in pure local
@@ -195,12 +201,28 @@ services:
       # App private key — read-only bind-mount. The host-side path is
       # gitignored per .gitignore rules (/.secrets/ + *.pem).
       - ./.secrets/github-app.pem:/secrets/github-app.pem:ro
+      # Per-role persona credentials (molecule-core#242 local surface).
+      # Sourced at workspace creation time by org_import.go::loadPersonaEnvFile
+      # when a workspace.yaml carries `role: <role>`. The host-side dir is
+      # populated by the operator-host bootstrap kit (28 dev-tree personas);
+      # /etc/molecule-bootstrap/personas is the in-container path the
+      # platform expects (matches the prod tenant-EC2 path so the same code
+      # works in both modes).
+      #
+      # Read-only mount — workspace-server only reads, never writes here.
+      # If the host dir is empty/missing the platform's loadPersonaEnvFile
+      # silently no-ops per its existing semantics, so this mount is safe
+      # even on a fresh machine that hasn't run the bootstrap kit yet.
+      - ${MOLECULE_PERSONA_ROOT_HOST:-${HOME}/.molecule-ai/personas}:/etc/molecule-bootstrap/personas:ro
     ports:
       - "${PLATFORM_PUBLISH_PORT:-8080}:${PLATFORM_PORT:-8080}"
     networks:
       - molecule-monorepo-net
+    restart: unless-stopped
     healthcheck:
-      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:${PLATFORM_PORT:-8080}/health || exit 1"]
+      # Plain GET — `--spider` would issue HEAD, which returns 404 because
+      # /health is registered as GET only.
+ test: ["CMD-SHELL", "wget -qO /dev/null --tries=1 http://localhost:${PLATFORM_PORT:-8080}/health || exit 1"] interval: 5s timeout: 5s retries: 10 @@ -238,7 +260,7 @@ services: networks: - molecule-monorepo-net healthcheck: - test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://127.0.0.1:${CANVAS_PORT:-3000} || exit 1"] + test: ["CMD-SHELL", "wget -qO /dev/null --tries=1 http://127.0.0.1:${CANVAS_PORT:-3000} || exit 1"] interval: 10s timeout: 5s retries: 10 diff --git a/manifest.json b/manifest.json index e7b69ef7..2ac2f462 100644 --- a/manifest.json +++ b/manifest.json @@ -1,5 +1,5 @@ { - "_comment": "Pin refs to release tags for reproducible builds. 'main' is OK while all repos are internal.", + "_comment": "OSS surface registry — every repo listed here MUST be public on git.moleculesai.app. Layer-3 customer/private templates are NOT registered here; they are handled at provision-time via the per-tenant credential resolver (see internal#102 RFC). 'main' refs are pinned to tags before broad rollout.", "version": 1, "plugins": [ {"name": "browser-automation", "repo": "molecule-ai/molecule-ai-plugin-browser-automation", "ref": "main"}, @@ -40,7 +40,6 @@ {"name": "free-beats-all", "repo": "molecule-ai/molecule-ai-org-template-free-beats-all", "ref": "main"}, {"name": "medo-smoke", "repo": "molecule-ai/molecule-ai-org-template-medo-smoke", "ref": "main"}, {"name": "molecule-worker-gemini", "repo": "molecule-ai/molecule-ai-org-template-molecule-worker-gemini", "ref": "main"}, - {"name": "reno-stars", "repo": "molecule-ai/molecule-ai-org-template-reno-stars", "ref": "main"}, {"name": "ux-ab-lab", "repo": "molecule-ai/molecule-ai-org-template-ux-ab-lab", "ref": "main"}, {"name": "mock-bigorg", "repo": "molecule-ai/molecule-ai-org-template-mock-bigorg", "ref": "main"} ] diff --git a/scripts/clone-manifest.sh b/scripts/clone-manifest.sh index 3ad98580..4e9e5d99 100755 --- a/scripts/clone-manifest.sh +++ b/scripts/clone-manifest.sh @@ -8,27 +8,24 @@ # Requires: git, jq (lighter than python3 — ~2MB vs ~50MB in Alpine) # # Auth (optional): -# When MOLECULE_GITEA_TOKEN is set, embed it as the basic-auth password so -# private Gitea repos clone successfully. When unset, clone anonymously -# (works only for repos that are public on git.moleculesai.app). +# Post-2026-05-08 (#192): every repo in manifest.json is public on +# git.moleculesai.app. Anonymous clone works for the entire registered +# set. The OSS-surface contract is recorded in manifest.json's _comment +# — Layer-3 customer/private templates (e.g. reno-stars) are NOT in the +# manifest; they are handled at provision-time via the per-tenant +# credential resolver (internal#102 RFC). # -# This is the path the publish-workspace-server-image.yml workflow uses: -# it injects AUTO_SYNC_TOKEN (devops-engineer persona PAT, repo:read on -# the molecule-ai org) so the in-CI pre-clone step succeeds for ALL -# manifest entries — including the 5 private workspace-template-* repos -# (codex, crewai, deepagents, gemini-cli, langgraph) and all 7 -# org-template-* repos. +# MOLECULE_GITEA_TOKEN is therefore optional today. Kept supported for +# two reasons: (a) historical CI configs that still inject +# AUTO_SYNC_TOKEN remain harmless, (b) reserved for the case where a +# private internal-only template is later registered via a ci-readonly +# team grant — review must explicitly sign off on that, since it +# violates the public-OSS-surface contract. 
# -# The token never enters the Docker image: this script runs in the -# trusted CI context BEFORE `docker buildx build`, populates +# The token (when set) never enters the Docker image: this script runs +# in the trusted CI context BEFORE `docker buildx build`, populates # .tenant-bundle-deps/, then `Dockerfile.tenant` COPYs from there with # the .git directories already stripped (see line ~67 below). -# -# For backward compatibility — and so a fresh clone works without -# secrets when (eventually) the workspace-template-* repos flip public — -# the unset path remains a plain anonymous HTTPS clone. That path will -# FAIL with "could not read Username" on private repos today; CI MUST -# set MOLECULE_GITEA_TOKEN. set -euo pipefail diff --git a/workspace-server/.air.toml b/workspace-server/.air.toml new file mode 100644 index 00000000..6e365f3c --- /dev/null +++ b/workspace-server/.air.toml @@ -0,0 +1,49 @@ +# air.toml — live-reload config for local docker-compose dev mode. +# +# Active when the platform service runs from workspace-server/Dockerfile.dev +# (selected via docker-compose.dev.yml overlay). In production, the regular +# Dockerfile builds a static binary; air is dev-only. +# +# Reference: https://github.com/air-verse/air + +root = "." +testdata_dir = "testdata" +tmp_dir = "tmp" + +[build] + # Same build invocation as Dockerfile's builder stage minus the + # CGO_ENABLED=0 toggle (CGO ok in dev for richer race detector output). + cmd = "go build -o ./tmp/server ./cmd/server" + bin = "tmp/server" + full_bin = "" + args_bin = [] + # Watch every .go and .yaml file under workspace-server/. + include_ext = ["go", "yaml", "tmpl"] + # Don't watch tests, build artifacts, vendored deps, or migration .sql + # (migrations need a clean DB anyway — handled by docker-compose down/up). + exclude_dir = ["assets", "tmp", "vendor", "testdata", "node_modules"] + exclude_file = [] + # _test.go and *_mock.go shouldn't trigger a rebuild — saves cycles. + exclude_regex = ["_test\\.go$", "_mock\\.go$"] + exclude_unchanged = true + follow_symlink = false + log = "build-errors.log" + # Kill running binary 1s before starting new one. + kill_delay = "1s" + send_interrupt = true + stop_on_error = true + # Debounce: wait this long after last change before triggering rebuild. + delay = 500 + +[log] + time = false + +[color] + main = "magenta" + watcher = "cyan" + build = "yellow" + runner = "green" + +[misc] + # Don't keep the tmp/ dir around between runs. + clean_on_exit = true diff --git a/workspace-server/.gitignore b/workspace-server/.gitignore index 3f67c92f..47fc4dc0 100644 --- a/workspace-server/.gitignore +++ b/workspace-server/.gitignore @@ -1,2 +1,5 @@ # The compiled binary, not the cmd/server package. /server + +# air live-reload build cache (Dockerfile.dev + docker-compose.dev.yml). +/tmp/ diff --git a/workspace-server/Dockerfile.dev b/workspace-server/Dockerfile.dev new file mode 100644 index 00000000..a1efe00b --- /dev/null +++ b/workspace-server/Dockerfile.dev @@ -0,0 +1,44 @@ +# Dockerfile.dev — local-development image with air-driven live reload. +# +# Selected by docker-compose.dev.yml (overlay over docker-compose.yml). +# Production stays on workspace-server/Dockerfile (static binary, no air). +# +# Workflow: +# 1. docker compose -f docker-compose.yml -f docker-compose.dev.yml up +# 2. Edit any .go file under workspace-server/ +# 3. air detects, rebuilds, kills old binary, starts new one (~3-5s) +# 4. 
No `docker compose up --build` needed +# +# Templates + plugins are NOT pre-cloned here — air-mode assumes the +# developer's filesystem has the workspace-configs-templates/ + plugins/ +# dirs available, mounted at runtime via docker-compose.dev.yml. + +FROM golang:1.25-alpine + +# air + git (for go mod) + ca-certs (for TLS) + tzdata (for time-zone DB) +# + docker-cli + docker-cli-buildx so the platform binary can shell out to +# /var/run/docker.sock (bind-mounted from host) for local-build provisioning. +# docker-cli alone is insufficient: alpine's docker-cli enables BuildKit by +# default but ships without buildx, producing +# `ERROR: BuildKit is enabled but the buildx component is missing or broken` +# on every `docker build`. docker-cli-buildx provides the buildx subcommand. +RUN apk add --no-cache git ca-certificates tzdata wget docker-cli docker-cli-buildx \ + && go install github.com/air-verse/air@latest + +WORKDIR /app/workspace-server + +# Pre-fetch deps so the first `air` rebuild on a fresh container is fast. +# These are bind-mount-overridden at runtime, so the COPY here is just +# to warm the module cache. +COPY workspace-server/go.mod workspace-server/go.sum ./ +RUN go mod download + +# Source is bind-mounted at runtime (see docker-compose.dev.yml volumes +# block) so the Dockerfile doesn't need to COPY it. air watches the +# bind-mounted dir for changes. + +ENV CGO_ENABLED=0 +ENV GOFLAGS="-buildvcs=false" + +# Run air with the .air.toml in the bind-mounted source dir. +CMD ["air", "-c", ".air.toml"] diff --git a/workspace-server/internal/handlers/handlers_extended_test.go b/workspace-server/internal/handlers/handlers_extended_test.go index ae35fca4..1cb779ca 100644 --- a/workspace-server/internal/handlers/handlers_extended_test.go +++ b/workspace-server/internal/handlers/handlers_extended_test.go @@ -26,6 +26,14 @@ func TestExtended_WorkspaceDelete(t *testing.T) { WithArgs(wsDelID). WillReturnRows(sqlmock.NewRows([]string{"id", "name"})) + // CascadeDelete walks descendants unconditionally (the 0-children + // optimization in the old inline path was dropped during the + // CascadeDelete extraction — descendant CTE returns 0 rows here, + // same end state, one extra cheap query). + mock.ExpectQuery("WITH RECURSIVE descendants"). + WithArgs(wsDelID). + WillReturnRows(sqlmock.NewRows([]string{"id"})) + // #73: batch UPDATE happens BEFORE any container teardown. // Uses ANY($1::uuid[]) even with a single ID for consistency. mock.ExpectExec("UPDATE workspaces SET status ="). diff --git a/workspace-server/internal/handlers/local_e2e_dev_dept_test.go b/workspace-server/internal/handlers/local_e2e_dev_dept_test.go new file mode 100644 index 00000000..85473141 --- /dev/null +++ b/workspace-server/internal/handlers/local_e2e_dev_dept_test.go @@ -0,0 +1,375 @@ +package handlers + +import ( + "archive/tar" + "bytes" + "net" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" + + "gopkg.in/yaml.v3" +) + +// Local E2E for the dev-department extraction (RFC internal#77). +// +// Pre-conditions: both repos cloned as siblings under +// /tmp/local-e2e-deploy/{molecule-dev, molecule-dev-department}. +// (Set up by the orchestrator before running this test.) +// +// What this proves end-to-end through real platform code: +// 1. resolveYAMLIncludes follows the dev-lead symlink at the parent's +// template root and pulls in the dev-department subtree. +// 2. 
Recursive !include's inside the symlinked subtree resolve +// correctly via the chain dev-lead/workspace.yaml → +// ./core-lead/workspace.yaml → ./core-be/workspace.yaml etc. +// 3. The resolved YAML unmarshals into a complete OrgTemplate with the +// expected count of workspaces (parent's PM+Marketing+Research + +// dev-department's atomized 28 workspaces). +// +// Skipped if the local-e2e-deploy fixture isn't present — won't block +// CI on hosts that haven't set it up. +func TestLocalE2E_DevDepartmentExtraction(t *testing.T) { + parent := "/tmp/local-e2e-deploy/molecule-dev" + if _, err := os.Stat(filepath.Join(parent, "org.yaml")); err != nil { + t.Skipf("local-e2e fixture not present at %s: %v", parent, err) + } + + orgYAML, err := os.ReadFile(filepath.Join(parent, "org.yaml")) + if err != nil { + t.Fatalf("read org.yaml: %v", err) + } + + expanded, err := resolveYAMLIncludes(orgYAML, parent) + if err != nil { + t.Fatalf("resolveYAMLIncludes failed: %v", err) + } + + var tmpl OrgTemplate + if err := yaml.Unmarshal(expanded, &tmpl); err != nil { + t.Fatalf("unmarshal expanded OrgTemplate: %v", err) + } + + // Walk the full workspace tree, collect names. + names := []string{} + var walk func([]OrgWorkspace) + walk = func(ws []OrgWorkspace) { + for _, w := range ws { + names = append(names, w.Name) + walk(w.Children) + } + } + walk(tmpl.Workspaces) + + t.Logf("org name: %q", tmpl.Name) + t.Logf("total workspaces (recursive): %d", len(names)) + for _, n := range names { + t.Logf(" - %q", n) + } + + // Expected: PM + Marketing Lead + Dev Lead at top level, plus the + // full sub-trees under each. After atomization, we expect: + // - PM tree: PM + Research Lead + 3 research roles = 5 + // - Marketing tree: Marketing Lead + 5 marketing roles = 6 + // - Dev Lead tree: Dev Lead + (5 sub-team leads × ~6 each) + + // 3 floaters + Triage Operator = ~32 + // Roughly ~43 total. Be liberal; just assert a floor. + if len(names) < 30 { + t.Errorf("workspace count too low (%d) — expected ~40+ (PM+Marketing+Dev tree)", len(names)) + } + + // Specific sentinel names we expect to find: + expected := []string{ + "PM", + "Marketing Lead", + "Dev Lead", + "Core Platform Lead", + "Controlplane Lead", + "App & Docs Lead", + "Infra Lead", + "SDK Lead", + "Documentation Specialist", // Q1 — should be under app-lead + "Triage Operator", // Q2 — should be under dev-lead + } + found := map[string]bool{} + for _, n := range names { + found[n] = true + } + for _, want := range expected { + if !found[want] { + t.Errorf("missing expected workspace %q", want) + } + } +} + +// Stage-2 of the local e2e: prove every resolved workspace's `files_dir` +// path actually consumes correctly through the rest of the import chain. +// resolveYAMLIncludes returning a populated OrgTemplate is necessary but +// not sufficient — `POST /org/import` then does: +// +// 1. resolveInsideRoot(orgBaseDir, ws.FilesDir) → must return a path +// that exists and stat-resolves to a directory (org_import.go:313-317). +// 2. CopyTemplateToContainer(ctx, containerID, templatePath) → walks +// the dir with filepath.Walk and tars its contents into the +// workspace's /configs/ mount (provisioner.go:766-820). +// +// This stage-2 test exercises both #1 and #2 against every workspace in +// the resolved tree, mimicking what the platform does post-include- +// resolution. 
Catches: files_dir paths that don't resolve through the
+// symlink, paths that exist but are empty (silently produces empty
+// /configs/), or filepath.Walk failing to descend through cross-repo
+// symlink boundaries.
+func TestLocalE2E_FilesDirConsumption(t *testing.T) {
+	parent := "/tmp/local-e2e-deploy/molecule-dev"
+	if _, err := os.Stat(filepath.Join(parent, "org.yaml")); err != nil {
+		t.Skipf("local-e2e fixture not present at %s: %v", parent, err)
+	}
+
+	orgYAML, err := os.ReadFile(filepath.Join(parent, "org.yaml"))
+	if err != nil {
+		t.Fatalf("read org.yaml: %v", err)
+	}
+	expanded, err := resolveYAMLIncludes(orgYAML, parent)
+	if err != nil {
+		t.Fatalf("resolveYAMLIncludes: %v", err)
+	}
+	var tmpl OrgTemplate
+	if err := yaml.Unmarshal(expanded, &tmpl); err != nil {
+		t.Fatalf("unmarshal: %v", err)
+	}
+
+	// Flatten every workspace — including children, grandchildren, etc.
+	flat := []OrgWorkspace{}
+	var walk func([]OrgWorkspace)
+	walk = func(ws []OrgWorkspace) {
+		for _, w := range ws {
+			flat = append(flat, w)
+			walk(w.Children)
+		}
+	}
+	walk(tmpl.Workspaces)
+
+	checked := 0
+	for _, w := range flat {
+		if w.FilesDir == "" {
+			continue // workspace declared inline (no files_dir) — skip
+		}
+		checked++
+		t.Run(w.Name+"/"+w.FilesDir, func(t *testing.T) {
+			// Step 1: resolveInsideRoot returns a path that's-inside-root.
+			abs, err := resolveInsideRoot(parent, w.FilesDir)
+			if err != nil {
+				t.Fatalf("resolveInsideRoot(%q, %q): %v", parent, w.FilesDir, err)
+			}
+			info, err := os.Stat(abs)
+			if err != nil {
+				t.Fatalf("stat %q (resolved from files_dir %q): %v", abs, w.FilesDir, err)
+			}
+			if !info.IsDir() {
+				t.Fatalf("files_dir %q resolved to %q which is not a directory", w.FilesDir, abs)
+			}
+
+			// Step 2: walk the dir like CopyTemplateToContainer does.
+			// Mirror the platform's symlink-resolution at the root —
+			// filepath.Walk doesn't descend into a symlink leaf, so
+			// CopyTemplateToContainer (provisioner.go) calls
+			// EvalSymlinks on templatePath first. Replicate exactly.
+			if resolved, err := filepath.EvalSymlinks(abs); err == nil {
+				abs = resolved
+			}
+			var buf bytes.Buffer
+			tw := tar.NewWriter(&buf)
+			fileCount := 0
+			fileNames := []string{}
+			err = filepath.Walk(abs, func(path string, info os.FileInfo, err error) error {
+				if err != nil {
+					return err
+				}
+				rel, err := filepath.Rel(abs, path)
+				if err != nil {
+					return err
+				}
+				if rel == "." {
+					return nil
+				}
+				// FileInfoHeader populates header.Size from info; the
+				// header must be complete BEFORE WriteHeader (mutating
+				// it afterwards is a silent no-op).
+				header, err := tar.FileInfoHeader(info, "")
+				if err != nil {
+					return err
+				}
+				header.Name = rel
+				if err := tw.WriteHeader(header); err != nil {
+					return err
+				}
+				if !info.IsDir() {
+					fileCount++
+					fileNames = append(fileNames, rel)
+					data, err := os.ReadFile(path)
+					if err != nil {
+						return err
+					}
+					if _, err := tw.Write(data); err != nil {
+						return err
+					}
+				}
+				return nil
+			})
+			if err != nil {
+				t.Fatalf("filepath.Walk %q (mimics CopyTemplateToContainer): %v", abs, err)
+			}
+			tw.Close()
+
+			if fileCount == 0 {
+				t.Errorf("files_dir %q at %q is empty — CopyTemplateToContainer would produce empty /configs/",
+					w.FilesDir, abs)
+			}
+
+			// Sanity: every workspace folder should have AT LEAST one of
+			// {workspace.yaml, system-prompt.md, initial-prompt.md} —
+			// these are the markers a workspace folder is recognizable
+			// as a workspace (mirrors validator's WORKSPACE_FOLDER_MARKERS).
+ markers := []string{"workspace.yaml", "system-prompt.md", "initial-prompt.md"} + hasMarker := false + for _, name := range fileNames { + for _, m := range markers { + if name == m || strings.HasSuffix(name, "/"+m) { + hasMarker = true + break + } + } + if hasMarker { + break + } + } + if !hasMarker { + t.Errorf("files_dir %q at %q has %d files but none of the workspace markers %v — found: %v", + w.FilesDir, abs, fileCount, markers, fileNames) + } + }) + } + t.Logf("checked %d workspaces with files_dir", checked) + if checked < 25 { + t.Errorf("expected ~28 workspaces with files_dir (post-atomization); only saw %d", checked) + } +} + +// PR-C from the Phase 3a phasing (task #234): real-Gitea e2e for the +// !external resolver against the LIVE molecule-ai/molecule-dev-department +// repo. Verifies the production gitFetcher fetches the dev tree and the +// resolver grafts it correctly into a parent template that has NO +// symlink — composition is purely platform-side. +// +// Skipped if Gitea isn't reachable (offline / firewall / CI without +// network). Requires `git` binary on PATH. +func TestLocalE2E_ExternalDevDepartment(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skipf("git binary not found: %v", err) + } + + // Skip if Gitea host isn't reachable (TCP probe). Avoids network- + // dependent tests failing on offline runners. + conn, err := net.DialTimeout("tcp", "git.moleculesai.app:443", 3*time.Second) + if err != nil { + t.Skipf("git.moleculesai.app:443 unreachable: %v", err) + } + conn.Close() + + // Build a minimal parent template inline — no need for the + // /tmp/local-e2e-deploy/ symlinked fixture. The whole point of + // !external is that the parent template is self-contained; + // composition resolves over the network at import time. + parent := t.TempDir() + + orgYAML := []byte(`name: External-Only Test Parent +description: Parent template that pulls the entire dev tree via !external. +defaults: + runtime: claude-code + tier: 2 +workspaces: + - !external + repo: molecule-ai/molecule-dev-department + ref: main + path: dev-lead/workspace.yaml +`) + if err := os.WriteFile(filepath.Join(parent, "org.yaml"), orgYAML, 0o644); err != nil { + t.Fatalf("write org.yaml: %v", err) + } + + out, err := resolveYAMLIncludes(orgYAML, parent) + if err != nil { + t.Fatalf("resolveYAMLIncludes (!external against live Gitea): %v", err) + } + + var tmpl OrgTemplate + if err := yaml.Unmarshal(out, &tmpl); err != nil { + t.Fatalf("unmarshal: %v", err) + } + + // Walk the workspace tree, collect names + check files_dir paths. + flat := []OrgWorkspace{} + var walk func([]OrgWorkspace) + walk = func(ws []OrgWorkspace) { + for _, w := range ws { + flat = append(flat, w) + walk(w.Children) + } + } + walk(tmpl.Workspaces) + + t.Logf("workspaces resolved through !external: %d", len(flat)) + if len(flat) < 25 { + t.Errorf("expected ~28 dev-tree workspaces via !external; got %d", len(flat)) + } + + // Sentinel checks — same as TestLocalE2E_DevDepartmentExtraction + // (Q1+Q2 placements verified). + expected := []string{ + "Dev Lead", + "Core Platform Lead", + "Controlplane Lead", + "App & Docs Lead", + "Documentation Specialist", // Q1 + "Triage Operator", // Q2 + } + found := map[string]bool{} + for _, w := range flat { + found[w.Name] = true + } + for _, want := range expected { + if !found[want] { + t.Errorf("missing expected workspace %q", want) + } + } + + // Every workspace's files_dir must be cache-prefixed (proves the + // path-rewrite ran end-to-end). 
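+	// (Illustrative shape, sha elided:
+	//   .external-cache/git.moleculesai.app__molecule-ai__molecule-dev-department/<sha>/dev-lead
+	// i.e. host and repo flattened by safeRepoCacheDir, then the in-repo path.)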
cachePrefix := ".external-cache"
+	for _, w := range flat {
+		if w.FilesDir == "" {
+			continue
+		}
+		if !strings.HasPrefix(w.FilesDir, cachePrefix) {
+			t.Errorf("workspace %q files_dir %q missing cache prefix %q", w.Name, w.FilesDir, cachePrefix)
+		}
+	}
+
+	// Verify the fetched cache exists and resolveInsideRoot accepts
+	// every workspace's files_dir (would cause provisioning to fail
+	// if not).
+	for _, w := range flat {
+		if w.FilesDir == "" {
+			continue
+		}
+		abs, err := resolveInsideRoot(parent, w.FilesDir)
+		if err != nil {
+			t.Errorf("workspace %q files_dir %q: resolveInsideRoot: %v", w.Name, w.FilesDir, err)
+			continue
+		}
+		info, err := os.Stat(abs)
+		if err != nil {
+			t.Errorf("workspace %q: stat %q: %v", w.Name, abs, err)
+			continue
+		}
+		if !info.IsDir() {
+			t.Errorf("workspace %q files_dir %q is not a directory", w.Name, w.FilesDir)
+		}
+	}
+}
diff --git a/workspace-server/internal/handlers/org.go b/workspace-server/internal/handlers/org.go
index 5f57b24c..233cc69f 100644
--- a/workspace-server/internal/handlers/org.go
+++ b/workspace-server/internal/handlers/org.go
@@ -13,12 +13,15 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
+	"time"
 
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/channels"
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
 	"github.com/gin-gonic/gin"
+	"github.com/lib/pq"
 	"gopkg.in/yaml.v3"
 )
 
@@ -422,6 +425,16 @@ type OrgWorkspace struct {
 	Tier     int    `yaml:"tier" json:"tier"`
 	Template string `yaml:"template" json:"template"`
 	FilesDir string `yaml:"files_dir" json:"files_dir"`
+	// Spawning gates whether this workspace (AND its descendants) gets
+	// provisioned during /org/import. Pointer so we can distinguish
+	// "explicitly set to false" from "unset" (default = spawn). Use case:
+	// the dev-tree org template declares the full team structure but a
+	// developer's local machine only has RAM for a subset; setting
+	// spawning: false on a leaf or a sub-tree root skips that branch
+	// entirely without editing the canonical template structure.
+	// Counted by countWorkspaces like any spawned workspace; the subtree
+	// skip happens at provision time in createWorkspaceTree.
+	Spawning *bool `yaml:"spawning,omitempty" json:"spawning,omitempty"`
 	// SystemPrompt is an inline override. Normally each role's system-prompt.md
 	// lives at `<files_dir>/system-prompt.md` and is copied via the files_dir
 	// template-copy step; inline overrides that path for ad-hoc workspaces.
@@ -558,6 +571,19 @@ func (h *OrgHandler) Import(c *gin.Context) {
 	var body struct {
 		Dir      string      `json:"dir"`      // org template directory name
 		Template OrgTemplate `json:"template"` // or inline template
+		// Mode controls cleanup behavior of pre-existing workspaces:
+		//   "" / "merge" — additive (default; current behavior).
+		//                  Existing workspaces matched by
+		//                  (parent_id, name) are skipped; nothing
+		//                  outside the new tree is touched.
+		//   "reconcile"  — additive + cleanup. After import, any
+		//                  online workspace whose name matches an
+		//                  imported workspace's name but whose id
+		//                  isn't in the import result set is
+		//                  cascade-deleted. Catches "previous
+		//                  import survived a re-import" zombies
+		//                  (the 20:13→21:17 dev-tree case).
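+		// Request sketch (illustrative):
+		//
+		//	POST /org/import
+		//	{"dir": "molecule-dev", "mode": "reconcile"}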
+ Mode string `json:"mode"` } if err := c.ShouldBindJSON(&body); err != nil { c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request body"}) @@ -603,6 +629,19 @@ func (h *OrgHandler) Import(c *gin.Context) { return } + // Emit started AFTER the YAML is loaded so payload.name carries the + // resolved template name (was: empty when caller passed `dir` instead + // of inline `template`). Pre-parse error paths above return without + // emitting — semantically "we couldn't even start an import" — so + // every started event is guaranteed a paired completed/failed below + // (no orphan started rows in structure_events). + importStart := time.Now() + emitOrgEvent(c.Request.Context(), "org.import.started", map[string]any{ + "name": tmpl.Name, + "dir": body.Dir, + "mode": body.Mode, + }) + // Required-env preflight — refuses import when any required_env is // missing from global_secrets. No bypass: the prior `force: true` // escape hatch was removed (issue #2290) because it was the silent @@ -708,18 +747,171 @@ func (h *OrgHandler) Import(c *gin.Context) { } } + // Reconcile mode: prune workspaces present from a previous import that + // share a name with the new tree but are NOT in the new result set. + // Catches the additive-import bug where re-running /org/import with a + // changed tree shape (different parent_id for the same role name) leaves + // the prior workspace online — visible to the canvas, consuming + // containers, and looking like a duplicate. Default mode "" / "merge" + // preserves the old additive behavior. + reconcileRemovedCount := 0 + reconcileSkipped := 0 + reconcileErrs := []string{} + if body.Mode == "reconcile" && createErr == nil { + ctx := c.Request.Context() + importedNames := []string{} + walkOrgWorkspaceNames(tmpl.Workspaces, &importedNames) + + importedIDs := make([]string, 0, len(results)) + for _, r := range results { + if id, ok := r["id"].(string); ok && id != "" { + importedIDs = append(importedIDs, id) + } + } + + // Empty-set guards: if the import didn't produce any names or any + // IDs, skip — querying with empty arrays would either match + // nothing (harmless) or, worse, match every workspace if a future + // query rewrite drops the IN clause. Belt-and-suspenders. 
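+	// Worked example: a re-import that moves "Core BE" under a new parent
+	// creates a fresh "Core BE" row with a new id; the old row (same name,
+	// id not in importedIDs, still online) matches the query below and is
+	// cascade-deleted.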
+ if len(importedNames) > 0 && len(importedIDs) > 0 { + rows, err := db.DB.QueryContext(ctx, ` + SELECT id FROM workspaces + WHERE name = ANY($1::text[]) + AND id != ALL($2::uuid[]) + AND status != 'removed' + `, pq.Array(importedNames), pq.Array(importedIDs)) + if err != nil { + log.Printf("Org import reconcile: orphan query failed: %v", err) + reconcileErrs = append(reconcileErrs, fmt.Sprintf("orphan query: %v", err)) + } else { + orphanIDs := []string{} + for rows.Next() { + var orphanID string + if rows.Scan(&orphanID) == nil { + orphanIDs = append(orphanIDs, orphanID) + } + } + rows.Close() + + for _, oid := range orphanIDs { + descendantIDs, stopErrs, err := h.workspace.CascadeDelete(ctx, oid) + if err != nil { + log.Printf("Org import reconcile: CascadeDelete(%s) failed: %v", oid, err) + reconcileErrs = append(reconcileErrs, fmt.Sprintf("delete %s: %v", oid, err)) + reconcileSkipped++ + continue + } + reconcileRemovedCount += 1 + len(descendantIDs) + if len(stopErrs) > 0 { + log.Printf("Org import reconcile: %s had %d stop errors (orphan sweeper will retry)", oid, len(stopErrs)) + } + } + log.Printf("Org import reconcile: %d orphans removed (%d cascade descendants), %d skipped", len(orphanIDs), reconcileRemovedCount-len(orphanIDs), reconcileSkipped) + } + } + } + status := http.StatusCreated resp := gin.H{ "org": tmpl.Name, "workspaces": results, "count": len(results), } + if body.Mode == "reconcile" { + resp["mode"] = "reconcile" + resp["reconcile_removed_count"] = reconcileRemovedCount + if len(reconcileErrs) > 0 { + resp["reconcile_errors"] = reconcileErrs + } + } if createErr != nil { status = http.StatusMultiStatus resp["error"] = createErr.Error() } - log.Printf("Org import: %s — %d workspaces created", tmpl.Name, len(results)) + // results contains both freshly-created AND lookupExistingChild skips + // (entries with "skipped":true). Splitting the count here so the audit + // row reflects "what changed" vs "what was already there" — telemetry + // readers shouldn't need to grep stdout to tell an idempotent re-run + // apart from a fresh-create. + createdCount, skippedCount := 0, 0 + for _, r := range results { + if skipped, _ := r["skipped"].(bool); skipped { + skippedCount++ + } else { + createdCount++ + } + } + log.Printf("Org import: %s — %d created, %d skipped, %d reconciled", + tmpl.Name, createdCount, skippedCount, reconcileRemovedCount) + emitOrgEvent(c.Request.Context(), "org.import.completed", map[string]any{ + "name": tmpl.Name, + "dir": body.Dir, + "mode": body.Mode, + "created_count": createdCount, + "skipped_count": skippedCount, + "reconcile_removed_count": reconcileRemovedCount, + "reconcile_errors": len(reconcileErrs), + "duration_ms": time.Since(importStart).Milliseconds(), + "create_error": errString(createErr), + }) c.JSON(status, resp) } +// walkOrgWorkspaceNames collects every Name in the tree (in any order) into +// names. Used by reconcile to detect orphan workspaces — workspaces with the +// same role name as a freshly-imported one but a different id, surviving from +// a prior import. +func walkOrgWorkspaceNames(workspaces []OrgWorkspace, names *[]string) { + for _, w := range workspaces { + // spawning:false subtrees are still part of the imported tree + // from a logical-tree perspective — DON'T skip the recursion, + // or reconcile would orphan the rest of the subtree on every + // re-import where spawning is toggled. Names of skipped + // workspaces remain registered so reconcile won't double-create + // them when spawning flips back to true. 
+		if w.Name != "" {
+			*names = append(*names, w.Name)
+		}
+		walkOrgWorkspaceNames(w.Children, names)
+	}
+}
+
+// emitOrgEvent records an org-lifecycle event in structure_events so the
+// import history is queryable independent of stdout log retention. Errors
+// are logged and swallowed — never block the request path on telemetry.
+//
+// Event-type taxonomy (extend by appending; never rename):
+//
+//	org.import.started   — handler entered, request body parsed
+//	org.import.completed — handler exiting (success or partial)
+//	org.import.failed    — handler exiting with an unrecoverable error
+//
+// payload fields are documented at each call site.
+func emitOrgEvent(ctx context.Context, eventType string, payload map[string]any) {
+	if payload == nil {
+		payload = map[string]any{}
+	}
+	payloadJSON, err := json.Marshal(payload)
+	if err != nil {
+		log.Printf("emitOrgEvent: marshal %s payload failed: %v", eventType, err)
+		return
+	}
+	if _, err := db.DB.ExecContext(ctx, `
+		INSERT INTO structure_events (event_type, payload, created_at)
+		VALUES ($1, $2, now())
+	`, eventType, payloadJSON); err != nil {
+		log.Printf("emitOrgEvent: insert %s failed: %v", eventType, err)
+	}
+}
+
+// errString returns "" for a nil error, err.Error() otherwise. Lets us put
+// nullable error strings in event payloads without checking for nil at every
+// call site.
+func errString(err error) string {
+	if err == nil {
+		return ""
+	}
+	return err.Error()
+}
+
diff --git a/workspace-server/internal/handlers/org_external.go b/workspace-server/internal/handlers/org_external.go
new file mode 100644
index 00000000..c964782d
--- /dev/null
+++ b/workspace-server/internal/handlers/org_external.go
@@ -0,0 +1,439 @@
+package handlers
+
+import (
+	"context"
+	"fmt"
+	"net/url"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"regexp"
+	"strings"
+	"time"
+
+	"gopkg.in/yaml.v3"
+)
+
+// External-ref resolver — gitops-style cross-repo subtree composition.
+// Internal#77 RFC, Phase 3a (task #222). Prior art: Helm subcharts +
+// dependency cache, Kustomize remote bases, Terraform module sources.
+//
+// Schema (a `!external`-tagged mapping anywhere a workspace entry is
+// allowed — workspaces:, roots:, children:):
+//
+//	- !external
+//	  repo: molecule-ai/molecule-dev-department
+//	  ref: main
+//	  path: dev-lead/workspace.yaml
+//
+// At resolve time, the platform fetches the repo at ref into a content-
+// addressable cache under <rootDir>/.external-cache/<repo>/<sha>/, loads
+// the yaml at <cacheDir>/<path>, rewrites every files_dir to be
+// cache-prefixed (relative !include paths resolve against their
+// containing file, which is already inside the cache, so they need no
+// rewrite), then grafts the result in place of the !external node.
+// Downstream pipeline (resolveInsideRoot, plugin merge,
+// CopyTemplateToContainer) sees ordinary in-tree paths.
+
+// ExternalRef is the deserialized form of an `!external`-tagged mapping.
+type ExternalRef struct {
+	Repo string `yaml:"repo"`
+	Ref  string `yaml:"ref"`
+	Path string `yaml:"path"`
+
+	// URL overrides the default Gitea host. Optional; defaults to
+	// MOLECULE_EXTERNAL_GITEA_URL env or git.moleculesai.app.
+	URL string `yaml:"url,omitempty"`
+}
+
+const (
+	// maxExternalDepth caps recursion through nested `!external`s. Lower
+	// than maxIncludeDepth (16) because each level may issue a network
+	// fetch. Composition that genuinely needs >4 layers is a smell.
+	maxExternalDepth = 4
+
+	// externalCacheDirName is the per-template cache subdir under rootDir.
+	// Content-addressable: keyed by (repo, sha). Operators add this to
+	// .gitignore — cache is platform-mutated, not source-tracked.
externalCacheDirName = ".external-cache"
+
+	// gitFetchTimeout caps a single clone operation. Conservative —
+	// org template fetches are typically <100KB.
+	gitFetchTimeout = 60 * time.Second
+)
+
+// safeRefPattern restricts `ref` values to characters git itself accepts
+// for branch / tag / SHA. Belt-and-braces over git's own validation.
+var safeRefPattern = regexp.MustCompile(`^[a-zA-Z0-9_./-]+$`)
+
+// allowlistedHostPath returns true if `<host>/<repoPath>` matches the
+// configured allowlist. Default allowlist: git.moleculesai.app/molecule-ai/.
+// Override via MOLECULE_EXTERNAL_REPO_ALLOWLIST env var (comma-separated
+// patterns). Patterns are matched as prefixes (with trailing-slash
+// semantics) or as exact matches. Trailing /* is treated as "any
+// descendants of this prefix".
+//
+// Examples:
+//   - "git.moleculesai.app/molecule-ai/"  → matches molecule-ai/* (any repo)
+//   - "git.moleculesai.app/molecule-ai/*" → same; trailing /* normalized to /
+//   - "git.moleculesai.app/molecule-ai/molecule-dev-department" → exact
+//   - "git.moleculesai.app/" → matches everything on that host
+func allowlistedHostPath(host, repoPath string) bool {
+	allow := os.Getenv("MOLECULE_EXTERNAL_REPO_ALLOWLIST")
+	if allow == "" {
+		allow = "git.moleculesai.app/molecule-ai/"
+	}
+	hp := host + "/" + repoPath
+	for _, pat := range strings.Split(allow, ",") {
+		pat = strings.TrimSpace(pat)
+		if pat == "" {
+			continue
+		}
+		// Normalize trailing /* → /
+		pat = strings.TrimSuffix(pat, "*")
+		if pat == hp {
+			return true
+		}
+		if strings.HasSuffix(pat, "/") && strings.HasPrefix(hp+"/", pat) {
+			return true
+		}
+	}
+	return false
+}
+
+// externalFetcher abstracts the git-clone-into-cache step. Production
+// uses gitFetcher (shells out to git); tests inject a fake that
+// pre-stages content in a temp dir.
+type externalFetcher interface {
+	// Fetch ensures rootDir/.external-cache/<repo>/<sha>/ contains
+	// the repo content at the given ref. Returns the absolute cache
+	// dir + the resolved SHA. Cache hit = no network. Cache miss =
+	// clone.
+	Fetch(ctx context.Context, rootDir, host, repoPath, ref string) (cacheDir, sha string, err error)
+}
+
+// defaultExternalFetcher is the package-level fetcher injection point.
+// Production code uses the git-shell fetcher; tests override via
+// SetExternalFetcherForTest.
+var defaultExternalFetcher externalFetcher = &gitFetcher{}
+
+// SetExternalFetcherForTest swaps the fetcher for testing. Returns a
+// cleanup func that restores the previous fetcher.
+func SetExternalFetcherForTest(f externalFetcher) func() {
+	prev := defaultExternalFetcher
+	defaultExternalFetcher = f
+	return func() { defaultExternalFetcher = prev }
+}
+
+// resolveExternalMapping replaces an `!external`-tagged mapping node
+// with the loaded + path-rewritten yaml content from the fetched repo.
+//
+// `currentDir` and `rootDir` are inherited from expandNode's resolve
+// frame. `visited` tracks (repo, sha, path) tuples for cycle detection
+// across nested externals.
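+//
+// Illustrative before/after of the graft (hypothetical parent template):
+//
+//	workspaces:
+//	  - !external
+//	    repo: molecule-ai/molecule-dev-department
+//	    ref: main
+//	    path: dev-lead/workspace.yaml
+//
+// resolves in place to the dev-lead mapping from that repo, with every
+// files_dir rewritten to .external-cache/<repo-dir>/<sha>/... by
+// rewriteFilesDir below.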
+func resolveExternalMapping(n *yaml.Node, currentDir, rootDir string, visited map[string]bool, depth int) error { + if depth > maxExternalDepth { + return fmt.Errorf("!external: max depth %d exceeded (possible cycle)", maxExternalDepth) + } + if rootDir == "" { + return fmt.Errorf("!external at line %d requires a dir-based org template (no rootDir in inline-template mode)", n.Line) + } + + var ref ExternalRef + if err := n.Decode(&ref); err != nil { + return fmt.Errorf("!external at line %d: decode: %w", n.Line, err) + } + if ref.Repo == "" || ref.Ref == "" || ref.Path == "" { + return fmt.Errorf("!external at line %d: repo, ref, path are all required (got %+v)", n.Line, ref) + } + if !safeRefPattern.MatchString(ref.Ref) { + return fmt.Errorf("!external at line %d: ref %q contains disallowed characters", n.Line, ref.Ref) + } + // Defense-in-depth: even though git itself rejects refs containing + // `..`, the regex above currently allows them. Reject explicitly. + if strings.Contains(ref.Ref, "..") { + return fmt.Errorf("!external at line %d: ref %q must not contain '..'", n.Line, ref.Ref) + } + if strings.Contains(ref.Path, "..") || strings.HasPrefix(ref.Path, "/") { + return fmt.Errorf("!external at line %d: path %q must be relative-and-down-only", n.Line, ref.Path) + } + + host := ref.URL + if host == "" { + host = os.Getenv("MOLECULE_EXTERNAL_GITEA_URL") + } + if host == "" { + host = "git.moleculesai.app" + } + host = strings.TrimPrefix(strings.TrimPrefix(host, "https://"), "http://") + host = strings.TrimSuffix(host, "/") + + if !allowlistedHostPath(host, ref.Repo) { + return fmt.Errorf("!external at line %d: %s/%s not in MOLECULE_EXTERNAL_REPO_ALLOWLIST", n.Line, host, ref.Repo) + } + + ctx, cancel := context.WithTimeout(context.Background(), gitFetchTimeout) + defer cancel() + + cacheDir, sha, err := defaultExternalFetcher.Fetch(ctx, rootDir, host, ref.Repo, ref.Ref) + if err != nil { + return fmt.Errorf("!external at line %d: fetch %s/%s@%s: %w", n.Line, host, ref.Repo, ref.Ref, err) + } + + // Cycle key: (repo, sha, path) — same external content reachable + // via two paths is fine, but a self-referential cycle isn't. + cycleKey := fmt.Sprintf("%s/%s@%s/%s", host, ref.Repo, sha, ref.Path) + if visited[cycleKey] { + return fmt.Errorf("!external cycle detected at %q (line %d)", cycleKey, n.Line) + } + + // Validate path resolves inside the cache dir (anti-traversal). + yamlPathAbs, err := resolveInsideRoot(cacheDir, ref.Path) + if err != nil { + return fmt.Errorf("!external at line %d: path %q: %w", n.Line, ref.Path, err) + } + if _, err := os.Stat(yamlPathAbs); err != nil { + return fmt.Errorf("!external at line %d: %s/%s@%s does not contain %q: %w", n.Line, host, ref.Repo, sha, ref.Path, err) + } + + data, err := os.ReadFile(yamlPathAbs) + if err != nil { + return fmt.Errorf("!external at line %d: read %q: %w", n.Line, yamlPathAbs, err) + } + + var sub yaml.Node + if err := yaml.Unmarshal(data, &sub); err != nil { + return fmt.Errorf("!external at line %d: parse %q: %w", n.Line, yamlPathAbs, err) + } + root := &sub + if root.Kind == yaml.DocumentNode && len(root.Content) == 1 { + root = root.Content[0] + } + + // Recurse FIRST: load all nested !include / !external content into + // the tree. Then rewrite ALL files_dir scalars in the fully-resolved + // tree (top + nested) with the cache prefix in one pass. Doing + // rewrite-before-recurse would leave nested-loaded files_dir paths + // unprefixed. 
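+	// Concrete case (mirrors the integration fixture): dev-lead/workspace.yaml
+	// declares files_dir "dev-lead" and !include's ./core-be/workspace.yaml,
+	// which declares files_dir "dev-lead/core-be". The recursion loads
+	// core-be first; the single rewrite pass afterwards prefixes both.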
visited[cycleKey] = true
+	defer delete(visited, cycleKey)
+
+	subDir := filepath.Dir(yamlPathAbs)
+	if err := expandNode(root, subDir, rootDir, visited, depth+1); err != nil {
+		return err
+	}
+
+	// Path rewrite: prefix every files_dir scalar in the fully-resolved
+	// content with the cache-relative-from-rootDir prefix. After this
+	// pass, fetched workspaces look like ordinary in-tree workspaces.
+	cachePrefix, err := filepath.Rel(rootDir, cacheDir)
+	if err != nil {
+		return fmt.Errorf("!external at line %d: cannot compute cache prefix: %w", n.Line, err)
+	}
+	rewriteFilesDir(root, cachePrefix)
+
+	// Replace the !external mapping with the resolved content in-place.
+	*n = *root
+	if n.Tag == "!external" {
+		n.Tag = ""
+	}
+	return nil
+}
+
+// rewriteFilesDir walks the yaml node tree and prepends cachePrefix to
+// every files_dir scalar value. Idempotent: if a files_dir value already
+// starts with the prefix, no-op.
+//
+// !include paths are intentionally NOT rewritten. They resolve relative
+// to their containing file's directory (subDir in expandNode), and after
+// fetch that directory IS inside the cache, so relative !include paths
+// Just Work without any rewrite. Rewriting them would double-prefix on
+// recursive resolution.
+//
+// files_dir DOES need rewriting because it's consumed at workspace-
+// provisioning time relative to orgBaseDir (the parent template's root),
+// not relative to the workspace.yaml's containing dir.
+func rewriteFilesDir(n *yaml.Node, cachePrefix string) {
+	if n == nil {
+		return
+	}
+	if n.Kind == yaml.MappingNode {
+		for i := 0; i+1 < len(n.Content); i += 2 {
+			key, value := n.Content[i], n.Content[i+1]
+			if key.Kind == yaml.ScalarNode && key.Value == "files_dir" && value.Kind == yaml.ScalarNode {
+				if !strings.HasPrefix(value.Value, cachePrefix+string(filepath.Separator)) && value.Value != cachePrefix {
+					value.Value = filepath.Join(cachePrefix, value.Value)
+				}
+			}
+		}
+	}
+	for _, child := range n.Content {
+		rewriteFilesDir(child, cachePrefix)
+	}
+}
+
+// safeRepoCacheDir flattens a host + repo path pair like
+// "git.moleculesai.app" + "molecule-ai/foo" into a single filesystem-safe
+// segment "git.moleculesai.app__molecule-ai__foo". Avoids nesting cache
+// dirs (which would complicate cleanup).
+func safeRepoCacheDir(host, repoPath string) string {
+	hp := host + "/" + repoPath
+	hp = strings.ReplaceAll(hp, "/", "__")
+	hp = strings.ReplaceAll(hp, ":", "_")
+	return hp
+}
+
+// gitFetcher is the production externalFetcher: shells out to `git` to
+// clone the repo at ref into the cache dir. Cache key includes the
+// resolved SHA, so different SHAs of the same ref get different cache
+// dirs (no overwrite).
+//
+// Token handling — important for security. The auth token never enters
+// the clone URL (and therefore never lands in the cloned repo's
+// .git/config) and never appears in returned errors. We use git's
+// `http.extraHeader` config option (passed via `-c`), which sends an
+// Authorization header per-request without persisting it. The token is
+// briefly visible in the `git` process's argv (so other local users
+// with the same uid could see it via `ps`), which is the same exposure
+// it has via the env var that supplied it.
+//
+// Cache validity uses a `.complete` marker written after a successful
+// clone+rename. Cache-hit checks for the marker, not just the dir
+// existence — a partially-written cache (clone failed mid-way, or a
+// concurrent caller wrote a half-baked cache dir) is treated as cache
+// miss and re-fetched cleanly.
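+//
+// Call-shape sketch (illustrative values):
+//
+//	cacheDir, sha, err := (&gitFetcher{}).Fetch(ctx, rootDir,
+//		"git.moleculesai.app", "molecule-ai/molecule-dev-department", "main")
+//	// cacheDir ends in .external-cache/git.moleculesai.app__molecule-ai__molecule-dev-department/<sha>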
type gitFetcher struct{}
+
+// cacheCompleteMarker is the filename written after a successful clone.
+// Cache-hit requires this marker; without it, the cache dir is treated
+// as partially-written and re-fetched.
+const cacheCompleteMarker = ".complete"
+
+// Fetch resolves ref → SHA via `git ls-remote`, then `git clone --depth=1`
+// if the cache dir is missing or incomplete. Auth via MOLECULE_GITEA_TOKEN
+// injected via http.extraHeader (never via URL).
+func (g *gitFetcher) Fetch(ctx context.Context, rootDir, host, repoPath, ref string) (string, string, error) {
+	cacheRoot := filepath.Join(rootDir, externalCacheDirName, safeRepoCacheDir(host, repoPath))
+	if err := os.MkdirAll(cacheRoot, 0o755); err != nil {
+		return "", "", fmt.Errorf("mkdir cache root: %w", err)
+	}
+
+	cloneURL := buildExternalCloneURL(host, repoPath)
+	gitArgs := func(extra ...string) []string {
+		args := authConfigArgs()
+		return append(args, extra...)
+	}
+
+	// 1. Resolve ref → SHA (so cache dir is content-addressable).
+	sha, err := g.resolveRefToSHA(ctx, cloneURL, ref, gitArgs)
+	if err != nil {
+		return "", "", fmt.Errorf("ls-remote: %s", redactToken(err.Error()))
+	}
+
+	cacheDir := filepath.Join(cacheRoot, sha)
+	// Cache-hit requires the .complete marker AND the .git dir.
+	// Without the marker, cache is partially-written → treat as miss.
+	if isCacheComplete(cacheDir) {
+		return cacheDir, sha, nil
+	}
+
+	// Cache miss or partially-written — clean any stale cacheDir before
+	// cloning (a previous broken attempt would otherwise block rename).
+	os.RemoveAll(cacheDir)
+
+	// 2. Clone into a sibling tmp dir; atomic rename on success.
+	tmpDir, err := os.MkdirTemp(cacheRoot, sha+".tmp.")
+	if err != nil {
+		return "", "", fmt.Errorf("mkdir tmp: %w", err)
+	}
+	// MkdirTemp only reserves a unique name; remove the empty dir and
+	// let git clone create the destination itself.
+	os.RemoveAll(tmpDir)
+	cloneArgs := gitArgs("clone", "--quiet", "--depth=1", "-b", ref, cloneURL, tmpDir)
+	cmd := exec.CommandContext(ctx, "git", cloneArgs...)
+	cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0")
+	if out, err := cmd.CombinedOutput(); err != nil {
+		os.RemoveAll(tmpDir)
+		return "", "", fmt.Errorf("git clone: %w: %s", err, redactToken(strings.TrimSpace(string(out))))
+	}
+
+	// Write the .complete marker BEFORE the rename. If rename succeeds,
+	// the marker is in place. If rename loses the race (concurrent
+	// fetcher won), our tmp gets cleaned up and we trust the winner.
+	if err := os.WriteFile(filepath.Join(tmpDir, cacheCompleteMarker), []byte(time.Now().UTC().Format(time.RFC3339)), 0o644); err != nil {
+		os.RemoveAll(tmpDir)
+		return "", "", fmt.Errorf("write complete marker: %w", err)
+	}
+
+	if err := os.Rename(tmpDir, cacheDir); err != nil {
+		// Race: another import beat us. Validate THEIR cache, accept it.
+		os.RemoveAll(tmpDir)
+		if isCacheComplete(cacheDir) {
+			return cacheDir, sha, nil
+		}
+		return "", "", fmt.Errorf("rename clone to cache (and winner's cache is incomplete): %w", err)
+	}
+	return cacheDir, sha, nil
+}
+
+// isCacheComplete reports whether cacheDir contains both the cloned
+// repo (.git) and the .complete marker. Treats partial state as miss.
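+// (e.g. a cacheDir that has .git but no .complete marker, left behind by
+// an interrupted writer or a pre-marker build, counts as a miss; Fetch
+// removes it and re-clones.)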
+func isCacheComplete(cacheDir string) bool { + if _, err := os.Stat(filepath.Join(cacheDir, ".git")); err != nil { + return false + } + if _, err := os.Stat(filepath.Join(cacheDir, cacheCompleteMarker)); err != nil { + return false + } + return true +} + +func (g *gitFetcher) resolveRefToSHA(ctx context.Context, cloneURL, ref string, gitArgs func(...string) []string) (string, error) { + args := gitArgs("ls-remote", cloneURL, ref) + cmd := exec.CommandContext(ctx, "git", args...) + cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0") + out, err := cmd.Output() + if err != nil { + return "", err + } + line := strings.TrimSpace(string(out)) + if line == "" { + return "", fmt.Errorf("ref %q not found", ref) + } + // First whitespace-separated field is the SHA. + for i, ch := range line { + if ch == ' ' || ch == '\t' { + return line[:i], nil + } + } + return line, nil +} + +// buildExternalCloneURL constructs the clone URL WITHOUT auth in userinfo. +// Auth is layered on via authConfigArgs's http.extraHeader. +func buildExternalCloneURL(host, repoPath string) string { + u := url.URL{Scheme: "https", Host: host, Path: "/" + repoPath + ".git"} + return u.String() +} + +// authConfigArgs returns the `-c http.extraHeader=Authorization: token X` +// args to pass to git, OR an empty slice if no token is set. The token +// goes into the request headers (not the URL or .git/config), so it +// doesn't persist on disk and doesn't appear in clone error output. +func authConfigArgs() []string { + token := os.Getenv("MOLECULE_GITEA_TOKEN") + if token == "" { + return nil + } + return []string{"-c", "http.extraHeader=Authorization: token " + token} +} + +// redactToken scrubs the auth token from a string before it's logged +// or returned in an error. Belt-and-braces: with the http.extraHeader +// approach the token shouldn't appear in git's output, but if some +// future git version or libcurl debug mode emits it, this catches it. +func redactToken(s string) string { + token := os.Getenv("MOLECULE_GITEA_TOKEN") + if token == "" || len(token) < 8 { + return s + } + return strings.ReplaceAll(s, token, "") +} + diff --git a/workspace-server/internal/handlers/org_external_integration_test.go b/workspace-server/internal/handlers/org_external_integration_test.go new file mode 100644 index 00000000..8d9801c7 --- /dev/null +++ b/workspace-server/internal/handlers/org_external_integration_test.go @@ -0,0 +1,379 @@ +package handlers + +import ( + "context" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "testing" + + "gopkg.in/yaml.v3" +) + +// PR-B integration test: exercises the REAL gitFetcher (no fakeFetcher +// injection) against a local bare-git repo. Uses git's `insteadOf` +// config to rewrite the configured Gitea URL to the local bare path +// at clone time, so the fetcher's URL-building, ls-remote, clone, +// atomic-rename, and cache-hit paths all run against real git +// without requiring network or modifying production code. +// +// Internal#77 task #233 (PR-B from the design's phasing). + +// TestGitFetcher_RealClone_LocalRedirect proves the production +// gitFetcher round-trips correctly against a real git repository. +// Steps: +// 1. Set up a local bare-git repo with workspace content. +// 2. Configure git's `insteadOf` to rewrite the gitea URL → local path +// via GIT_CONFIG_COUNT/KEY/VALUE env vars (process-scoped). +// 3. Run resolveYAMLIncludes with !external pointing at the gitea URL. +// 4. 
Assert: cache dir populated; content materialized; path rewrite
+//     applied; second invocation hits cache (no second clone).
+func TestGitFetcher_RealClone_LocalRedirect(t *testing.T) {
+	if _, err := exec.LookPath("git"); err != nil {
+		t.Skipf("git binary not found: %v", err)
+	}
+
+	if runtime.GOOS == "windows" {
+		t.Skip("path-based git URLs behave differently on Windows; skipping")
+	}
+
+	// Step 1: create a local bare-git repo at <fixtures>/test-dev-dept.git
+	// with workspace content. Use a working clone to add content, then
+	// push to the bare.
+	fixtures := t.TempDir()
+	barePath := filepath.Join(fixtures, "test-dev-dept.git")
+	workPath := filepath.Join(fixtures, "work")
+
+	mustGit(t, "", "init", "--bare", "-b", "main", barePath)
+	mustGit(t, "", "clone", barePath, workPath)
+	mustGit(t, workPath, "config", "user.email", "test@example.com")
+	mustGit(t, workPath, "config", "user.name", "Integration Test")
+
+	mustWriteFile(t, filepath.Join(workPath, "dev-lead/workspace.yaml"), `name: Dev Lead
+files_dir: dev-lead
+children:
+  - !include ./core-be/workspace.yaml
+`)
+	mustWriteFile(t, filepath.Join(workPath, "dev-lead/system-prompt.md"), "Dev Lead persona body.\n")
+	mustWriteFile(t, filepath.Join(workPath, "dev-lead/core-be/workspace.yaml"), `name: Core BE
+files_dir: dev-lead/core-be
+`)
+	mustWriteFile(t, filepath.Join(workPath, "dev-lead/core-be/system-prompt.md"), "Core BE persona body.\n")
+
+	mustGit(t, workPath, "add", ".")
+	mustGit(t, workPath, "commit", "-m", "seed dev tree")
+	mustGit(t, workPath, "push", "origin", "main")
+
+	// Step 2: configure git's insteadOf rewrite. The fetcher will try
+	// to clone https://git.moleculesai.app/molecule-ai/test-dev-dept.git;
+	// git rewrites it to the local barePath.
+	//
+	// GIT_CONFIG_COUNT/KEY/VALUE injects config without touching
+	// ~/.gitconfig — process-scoped, no test pollution.
+	giteaURL := "https://git.moleculesai.app/molecule-ai/test-dev-dept.git"
+	t.Setenv("GIT_CONFIG_COUNT", "1")
+	t.Setenv("GIT_CONFIG_KEY_0", "url."+barePath+".insteadOf")
+	t.Setenv("GIT_CONFIG_VALUE_0", giteaURL)
+
+	// Step 3: run resolveYAMLIncludes with !external pointing at the
+	// gitea URL. Allowlist is the default (molecule-ai/* on Gitea host).
+ rootDir := t.TempDir() + src := []byte(`workspaces: + - !external + repo: molecule-ai/test-dev-dept + ref: main + path: dev-lead/workspace.yaml +`) + + out, err := resolveYAMLIncludes(src, rootDir) + if err != nil { + t.Fatalf("resolveYAMLIncludes: %v", err) + } + + var tmpl OrgTemplate + if err := yaml.Unmarshal(out, &tmpl); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if len(tmpl.Workspaces) != 1 { + t.Fatalf("workspaces: %+v", tmpl.Workspaces) + } + dev := tmpl.Workspaces[0] + if dev.Name != "Dev Lead" { + t.Errorf("dev.Name = %q; want Dev Lead", dev.Name) + } + if !strings.Contains(dev.FilesDir, ".external-cache") { + t.Errorf("dev.FilesDir = %q; want cache prefix", dev.FilesDir) + } + if !strings.HasSuffix(dev.FilesDir, "dev-lead") { + t.Errorf("dev.FilesDir = %q; want suffix dev-lead", dev.FilesDir) + } + if len(dev.Children) != 1 { + t.Fatalf("expected nested core-be child; got %+v", dev.Children) + } + core := dev.Children[0] + if core.Name != "Core BE" { + t.Errorf("core.Name = %q; want Core BE", core.Name) + } + if !strings.HasSuffix(core.FilesDir, filepath.Join("dev-lead", "core-be")) { + t.Errorf("core.FilesDir = %q; want suffix dev-lead/core-be", core.FilesDir) + } + + // Step 4: verify the cache dir actually exists and contains the + // materialized files (CopyTemplateToContainer would tar these). + cacheRoot := filepath.Join(rootDir, ".external-cache") + entries, err := os.ReadDir(cacheRoot) + if err != nil { + t.Fatalf("read cache root: %v", err) + } + if len(entries) != 1 { + t.Fatalf("expected 1 cached repo, got %d: %v", len(entries), entries) + } + repoDir := filepath.Join(cacheRoot, entries[0].Name()) + shaDirs, _ := os.ReadDir(repoDir) + if len(shaDirs) != 1 { + t.Fatalf("expected 1 SHA cache dir, got %d", len(shaDirs)) + } + cacheDir := filepath.Join(repoDir, shaDirs[0].Name()) + if _, err := os.Stat(filepath.Join(cacheDir, "dev-lead/system-prompt.md")); err != nil { + t.Errorf("expected dev-lead/system-prompt.md in cache: %v", err) + } + if _, err := os.Stat(filepath.Join(cacheDir, "dev-lead/core-be/system-prompt.md")); err != nil { + t.Errorf("expected dev-lead/core-be/system-prompt.md in cache: %v", err) + } + + // Step 5: re-run; verify cache hit (no second clone). Set a + // "marker" file in the cache that a second clone would clobber. + marker := filepath.Join(cacheDir, ".cache-hit-marker") + if err := os.WriteFile(marker, []byte("hit"), 0o644); err != nil { + t.Fatal(err) + } + out2, err := resolveYAMLIncludes(src, rootDir) + if err != nil { + t.Fatalf("resolveYAMLIncludes second call: %v", err) + } + if string(out) != string(out2) { + t.Errorf("cached output differs from initial — non-deterministic resolve") + } + if _, err := os.Stat(marker); err != nil { + t.Errorf("cache hit not honored — marker file disappeared: %v", err) + } +} + +// TestGitFetcher_RealClone_BadRefFails: pointing at a ref that doesn't +// exist in the bare-repo surfaces git's error cleanly. 
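+// The loose assertion below is deliberate: resolveRefToSHA reports
+// `ref "nonexistent-branch" not found` when ls-remote returns an empty
+// list, while a failing git invocation surfaces whatever context the
+// fetcher wraps around the exec error (assumed to mention ls-remote).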
+func TestGitFetcher_RealClone_BadRefFails(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skipf("git binary not found: %v", err) + } + if runtime.GOOS == "windows" { + t.Skip("skipping on windows") + } + + fixtures := t.TempDir() + barePath := filepath.Join(fixtures, "empty-repo.git") + workPath := filepath.Join(fixtures, "work") + mustGit(t, "", "init", "--bare", "-b", "main", barePath) + mustGit(t, "", "clone", barePath, workPath) + mustGit(t, workPath, "config", "user.email", "test@example.com") + mustGit(t, workPath, "config", "user.name", "Test") + mustWriteFile(t, filepath.Join(workPath, "README.md"), "x") + mustGit(t, workPath, "add", ".") + mustGit(t, workPath, "commit", "-m", "seed") + mustGit(t, workPath, "push", "origin", "main") + + t.Setenv("GIT_CONFIG_COUNT", "1") + t.Setenv("GIT_CONFIG_KEY_0", "url."+barePath+".insteadOf") + t.Setenv("GIT_CONFIG_VALUE_0", "https://git.moleculesai.app/molecule-ai/empty-repo.git") + + rootDir := t.TempDir() + src := []byte(`workspaces: + - !external + repo: molecule-ai/empty-repo + ref: nonexistent-branch + path: anything.yaml +`) + _, err := resolveYAMLIncludes(src, rootDir) + if err == nil { + t.Fatalf("expected error for nonexistent ref; got nil") + } + if !strings.Contains(err.Error(), "ref") && !strings.Contains(err.Error(), "ls-remote") && !strings.Contains(err.Error(), "not found") { + t.Errorf("error doesn't mention ref/ls-remote: %v", err) + } +} + +// ---------- helpers ---------- + +func mustGit(t *testing.T, cwd string, args ...string) { + t.Helper() + cmd := exec.Command("git", args...) + if cwd != "" { + cmd.Dir = cwd + } + // Ensure user.email/name are set globally for non-cwd commands too. + cmd.Env = append(os.Environ(), + "GIT_AUTHOR_EMAIL=test@example.com", + "GIT_AUTHOR_NAME=Integration Test", + "GIT_COMMITTER_EMAIL=test@example.com", + "GIT_COMMITTER_NAME=Integration Test", + ) + if out, err := cmd.CombinedOutput(); err != nil { + t.Fatalf("git %s: %v\n%s", strings.Join(args, " "), err, string(out)) + } +} + +func mustWriteFile(t *testing.T, path, content string) { + t.Helper() + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatal(err) + } +} + +// Verify gitFetcher.Fetch direct invocation (no resolver wrapping) for +// the cache-hit path, exercising the bare API against a local bare-repo. 
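+//
+// Contract exercised (inferred from the assertions below and the
+// resolver-level prefix checks): Fetch(ctx, rootDir, host, repoPath, ref)
+// returns (cacheDir, sha, err), with cacheDir shaped like
+// <rootDir>/.external-cache/<host>__<owner>__<repo>/<sha>.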
+func TestGitFetcher_DirectFetch_CacheHit(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skipf("git binary not found: %v", err) + } + if runtime.GOOS == "windows" { + t.Skip("skipping on windows") + } + + fixtures := t.TempDir() + barePath := filepath.Join(fixtures, "direct.git") + workPath := filepath.Join(fixtures, "w") + mustGit(t, "", "init", "--bare", "-b", "main", barePath) + mustGit(t, "", "clone", barePath, workPath) + mustGit(t, workPath, "config", "user.email", "t@e") + mustGit(t, workPath, "config", "user.name", "T") + mustWriteFile(t, filepath.Join(workPath, "marker.txt"), "hello") + mustGit(t, workPath, "add", ".") + mustGit(t, workPath, "commit", "-m", "seed") + mustGit(t, workPath, "push", "origin", "main") + + t.Setenv("GIT_CONFIG_COUNT", "1") + t.Setenv("GIT_CONFIG_KEY_0", "url."+barePath+".insteadOf") + t.Setenv("GIT_CONFIG_VALUE_0", "https://git.moleculesai.app/molecule-ai/direct.git") + + rootDir := t.TempDir() + g := &gitFetcher{} + ctx := context.Background() + + cacheDir1, sha1, err := g.Fetch(ctx, rootDir, "git.moleculesai.app", "molecule-ai/direct", "main") + if err != nil { + t.Fatalf("first Fetch: %v", err) + } + if sha1 == "" || len(sha1) < 7 { + t.Errorf("expected SHA-like string, got %q", sha1) + } + if _, err := os.Stat(filepath.Join(cacheDir1, "marker.txt")); err != nil { + t.Errorf("first fetch missing marker.txt: %v", err) + } + + // Second call: cache hit, returns same dir + sha, no re-clone. + stamp := filepath.Join(cacheDir1, ".not-clobbered-by-second-fetch") + if err := os.WriteFile(stamp, []byte("x"), 0o644); err != nil { + t.Fatal(err) + } + cacheDir2, sha2, err := g.Fetch(ctx, rootDir, "git.moleculesai.app", "molecule-ai/direct", "main") + if err != nil { + t.Fatalf("second Fetch: %v", err) + } + if cacheDir2 != cacheDir1 || sha2 != sha1 { + t.Errorf("cache miss on second call: %q/%q vs %q/%q", cacheDir1, sha1, cacheDir2, sha2) + } + if _, err := os.Stat(stamp); err != nil { + t.Errorf("cache hit not honored — stamp file disappeared: %v", err) + } +} + +// TestGitFetcher_RejectsRefWithDoubleDot: defense-in-depth on ref input. +// safeRefPattern allows '.' as a regex character, so ".." would match +// without an explicit deny. Verify it's rejected even though git itself +// would also reject the resulting clone. +func TestGitFetcher_RejectsRefWithDoubleDot(t *testing.T) { + rootDir := t.TempDir() + src := []byte(`workspaces: + - !external + repo: molecule-ai/x + ref: foo..bar + path: x.yaml +`) + _, err := resolveYAMLIncludes(src, rootDir) + if err == nil { + t.Fatalf("expected '..' rejection") + } + if !strings.Contains(err.Error(), "..") { + t.Errorf("expected '..' in error; got %v", err) + } +} + +// TestGitFetcher_CacheValidatedByCompleteMarker: a partially-written +// cache (the .git dir exists but no .complete marker) is treated as +// cache-miss and re-fetched. Catches the broken-cache-permanence bug. 
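+// (isCacheComplete requires BOTH <cacheDir>/.git and the marker file, so
+// removing either one must force a re-fetch.)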
+func TestGitFetcher_CacheValidatedByCompleteMarker(t *testing.T) {
+	if _, err := exec.LookPath("git"); err != nil {
+		t.Skipf("git not found: %v", err)
+	}
+	if runtime.GOOS == "windows" {
+		t.Skip("skipping on windows")
+	}
+
+	fixtures := t.TempDir()
+	barePath := filepath.Join(fixtures, "test.git")
+	workPath := filepath.Join(fixtures, "w")
+	mustGit(t, "", "init", "--bare", "-b", "main", barePath)
+	mustGit(t, "", "clone", barePath, workPath)
+	mustGit(t, workPath, "config", "user.email", "t@e")
+	mustGit(t, workPath, "config", "user.name", "T")
+	mustWriteFile(t, filepath.Join(workPath, "good.txt"), "from-network")
+	mustGit(t, workPath, "add", ".")
+	mustGit(t, workPath, "commit", "-m", "seed")
+	mustGit(t, workPath, "push", "origin", "main")
+	t.Setenv("GIT_CONFIG_COUNT", "1")
+	t.Setenv("GIT_CONFIG_KEY_0", "url."+barePath+".insteadOf")
+	t.Setenv("GIT_CONFIG_VALUE_0", "https://git.moleculesai.app/molecule-ai/marker-test.git")
+
+	rootDir := t.TempDir()
+	g := &gitFetcher{}
+
+	// First fetch — populates the cache (creates .complete marker).
+	cacheDir1, _, err := g.Fetch(context.Background(), rootDir, "git.moleculesai.app", "molecule-ai/marker-test", "main")
+	if err != nil {
+		t.Fatalf("first Fetch: %v", err)
+	}
+	marker := filepath.Join(cacheDir1, cacheCompleteMarker)
+	if _, err := os.Stat(marker); err != nil {
+		t.Fatalf("first fetch should have written .complete marker: %v", err)
+	}
+
+	// Now simulate a partial cache: delete the marker but leave .git
+	// in place. The next Fetch should treat this as cache-miss and
+	// re-fetch (NOT silently use the partial cache).
+	if err := os.Remove(marker); err != nil {
+		t.Fatal(err)
+	}
+	// Drop a sentinel file the second fetch will clobber if it re-fetches.
+	sentinel := filepath.Join(cacheDir1, "_should_be_clobbered")
+	if err := os.WriteFile(sentinel, []byte("partial"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	cacheDir2, _, err := g.Fetch(context.Background(), rootDir, "git.moleculesai.app", "molecule-ai/marker-test", "main")
+	if err != nil {
+		t.Fatalf("second Fetch: %v", err)
+	}
+	if cacheDir1 != cacheDir2 {
+		t.Errorf("cache dirs differ across fetches: %q vs %q", cacheDir1, cacheDir2)
+	}
+	if _, err := os.Stat(filepath.Join(cacheDir2, cacheCompleteMarker)); err != nil {
+		t.Errorf("re-fetch should have re-written .complete marker: %v", err)
+	}
+	if _, err := os.Stat(sentinel); err == nil {
+		t.Errorf("sentinel still present — re-fetch did NOT clobber partial cache")
+	}
+}
diff --git a/workspace-server/internal/handlers/org_external_test.go b/workspace-server/internal/handlers/org_external_test.go
new file mode 100644
index 00000000..a9a1e425
--- /dev/null
+++ b/workspace-server/internal/handlers/org_external_test.go
@@ -0,0 +1,331 @@
+package handlers
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"gopkg.in/yaml.v3"
+)
+
+// fakeFetcher pre-stages a "fetched" repo at a fixed path inside the
+// rootDir's .external-cache, bypassing the real git clone. Tests
+// inject this via SetExternalFetcherForTest to exercise the resolver
+// + path-rewrite logic without network.
+type fakeFetcher struct {
+	// content maps "<host>/<repoPath>@<ref>" → a function that materializes
+	// repo content under cacheDir. Returns the fake SHA to use.
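+	// Example key: "git.moleculesai.app/molecule-ai/molecule-dev-department@main".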
+	content map[string]func(cacheDir string) (sha string, err error)
+}
+
+func (f *fakeFetcher) Fetch(ctx context.Context, rootDir, host, repoPath, ref string) (string, string, error) {
+	key := host + "/" + repoPath + "@" + ref
+	stage, ok := f.content[key]
+	if !ok {
+		return "", "", &fakeNotFoundError{key: key}
+	}
+	// Use a stable SHA for the test so cache dir is deterministic.
+	cacheDir := filepath.Join(rootDir, ".external-cache", safeRepoCacheDir(host, repoPath), "deadbeef")
+	if err := os.MkdirAll(cacheDir, 0o755); err != nil {
+		return "", "", err
+	}
+	sha, err := stage(cacheDir)
+	if err != nil {
+		return "", "", err
+	}
+	return cacheDir, sha, nil
+}
+
+type fakeNotFoundError struct{ key string }
+
+func (e *fakeNotFoundError) Error() string {
+	return "fake fetcher: no content registered for " + e.key
+}
+
+// stageFiles writes a map of relative-path → content into cacheDir and
+// fabricates a .git dir so the tree looks like a finished clone. Returns
+// only an error; fakeFetcher closures pair it with a fake SHA.
+func stageFiles(cacheDir string, files map[string]string) error {
+	if err := os.MkdirAll(filepath.Join(cacheDir, ".git"), 0o755); err != nil {
+		return err
+	}
+	for path, content := range files {
+		full := filepath.Join(cacheDir, path)
+		if err := os.MkdirAll(filepath.Dir(full), 0o755); err != nil {
+			return err
+		}
+		if err := os.WriteFile(full, []byte(content), 0o644); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// TestResolveExternalMapping_HappyPath: a parent template with an
+// !external entry resolves cleanly into the fetched workspace + path-
+// rewrites files_dir + relative !include refs into the cache prefix.
+func TestResolveExternalMapping_HappyPath(t *testing.T) {
+	tmp := t.TempDir()
+
+	// Stub fetcher: "fetched" content has a workspace.yaml that uses
+	// files_dir + nested !include relative to the fetched repo's root.
+	fake := &fakeFetcher{
+		content: map[string]func(string) (string, error){
+			"git.moleculesai.app/molecule-ai/molecule-dev-department@main": func(cacheDir string) (string, error) {
+				return "deadbeef", stageFiles(cacheDir, map[string]string{
+					"dev-lead/workspace.yaml": `name: Dev Lead
+files_dir: dev-lead
+children:
+  - !include ./core-lead/workspace.yaml
+`,
+					"dev-lead/core-lead/workspace.yaml": `name: Core Platform Lead
+files_dir: dev-lead/core-lead
+`,
+				})
+			},
+		},
+	}
+	cleanup := SetExternalFetcherForTest(fake)
+	defer cleanup()
+
+	src := []byte(`name: Parent
+workspaces:
+  - !external
+    repo: molecule-ai/molecule-dev-department
+    ref: main
+    path: dev-lead/workspace.yaml
+`)
+
+	out, err := resolveYAMLIncludes(src, tmp)
+	if err != nil {
+		t.Fatalf("resolveYAMLIncludes: %v", err)
+	}
+
+	var tmpl OrgTemplate
+	if err := yaml.Unmarshal(out, &tmpl); err != nil {
+		t.Fatalf("unmarshal: %v", err)
+	}
+	if len(tmpl.Workspaces) != 1 {
+		t.Fatalf("workspaces: %+v", tmpl.Workspaces)
+	}
+	dev := tmpl.Workspaces[0]
+	if dev.Name != "Dev Lead" {
+		t.Errorf("dev.Name = %q; want Dev Lead", dev.Name)
+	}
+	// files_dir should be cache-prefixed.
+	wantPrefix := filepath.Join(".external-cache", "git.moleculesai.app__molecule-ai__molecule-dev-department", "deadbeef")
+	if !strings.HasPrefix(dev.FilesDir, wantPrefix) {
+		t.Errorf("dev.FilesDir = %q; want prefix %q", dev.FilesDir, wantPrefix)
+	}
+	if !strings.HasSuffix(dev.FilesDir, "dev-lead") {
+		t.Errorf("dev.FilesDir = %q; want suffix dev-lead", dev.FilesDir)
+	}
+	// Nested child: files_dir cache-prefixed, name Core Platform Lead.
+ if len(dev.Children) != 1 { + t.Fatalf("dev.Children: %+v", dev.Children) + } + core := dev.Children[0] + if core.Name != "Core Platform Lead" { + t.Errorf("core.Name = %q; want Core Platform Lead", core.Name) + } + if !strings.HasPrefix(core.FilesDir, wantPrefix) { + t.Errorf("core.FilesDir = %q; want prefix %q", core.FilesDir, wantPrefix) + } + if !strings.HasSuffix(core.FilesDir, filepath.Join("dev-lead", "core-lead")) { + t.Errorf("core.FilesDir = %q; want suffix dev-lead/core-lead", core.FilesDir) + } +} + +// TestResolveExternalMapping_AllowlistRejection: hostile yaml pointing +// at a non-allowlisted repo gets rejected. +func TestResolveExternalMapping_AllowlistRejection(t *testing.T) { + tmp := t.TempDir() + fake := &fakeFetcher{content: map[string]func(string) (string, error){}} + cleanup := SetExternalFetcherForTest(fake) + defer cleanup() + + // Default allowlist is git.moleculesai.app/molecule-ai/*. + // github.com/foo/bar is NOT in it. + src := []byte(`workspaces: + - !external + repo: foo/bar + ref: main + path: x.yaml + url: github.com +`) + _, err := resolveYAMLIncludes(src, tmp) + if err == nil { + t.Fatalf("expected allowlist rejection, got nil") + } + if !strings.Contains(err.Error(), "MOLECULE_EXTERNAL_REPO_ALLOWLIST") { + t.Errorf("expected allowlist error; got %v", err) + } +} + +// TestResolveExternalMapping_PathTraversalRejection: hostile yaml +// with `path: ../../etc/passwd` gets rejected before fetch. +func TestResolveExternalMapping_PathTraversalRejection(t *testing.T) { + tmp := t.TempDir() + fake := &fakeFetcher{content: map[string]func(string) (string, error){}} + cleanup := SetExternalFetcherForTest(fake) + defer cleanup() + + src := []byte(`workspaces: + - !external + repo: molecule-ai/dev-department + ref: main + path: ../../etc/passwd +`) + _, err := resolveYAMLIncludes(src, tmp) + if err == nil { + t.Fatalf("expected path traversal rejection, got nil") + } + if !strings.Contains(err.Error(), "relative-and-down-only") { + t.Errorf("expected path traversal error; got %v", err) + } +} + +// TestResolveExternalMapping_BadRefRejection: non-allowlisted ref chars. +func TestResolveExternalMapping_BadRefRejection(t *testing.T) { + tmp := t.TempDir() + fake := &fakeFetcher{content: map[string]func(string) (string, error){}} + cleanup := SetExternalFetcherForTest(fake) + defer cleanup() + + src := []byte(`workspaces: + - !external + repo: molecule-ai/dev-department + ref: "main; rm -rf /" + path: foo.yaml +`) + _, err := resolveYAMLIncludes(src, tmp) + if err == nil || !strings.Contains(err.Error(), "disallowed characters") { + t.Errorf("expected ref-validation error; got %v", err) + } +} + +// TestResolveExternalMapping_MissingRequiredFields: repo / ref / path +// are all required. 
+func TestResolveExternalMapping_MissingRequiredFields(t *testing.T) { + tmp := t.TempDir() + fake := &fakeFetcher{content: map[string]func(string) (string, error){}} + cleanup := SetExternalFetcherForTest(fake) + defer cleanup() + + cases := []string{ + // missing repo + `workspaces: + - !external + ref: main + path: x.yaml +`, + // missing ref + `workspaces: + - !external + repo: molecule-ai/x + path: x.yaml +`, + // missing path + `workspaces: + - !external + repo: molecule-ai/x + ref: main +`, + } + for i, src := range cases { + _, err := resolveYAMLIncludes([]byte(src), tmp) + if err == nil { + t.Errorf("case %d: expected required-field error, got nil", i) + } else if !strings.Contains(err.Error(), "required") { + t.Errorf("case %d: want 'required' in error; got %v", i, err) + } + } +} + +// TestRewriteFilesDir: verify the path-rewrite walker +// prefixes files_dir scalars. !include scalars are NOT rewritten — +// they resolve relative to their containing file's dir, which post- +// fetch is naturally inside the cache. +func TestRewriteFilesDir(t *testing.T) { + src := `name: Foo +files_dir: dev-lead +children: + - !include ./bar/workspace.yaml + - !include other-team.yaml +inner: + files_dir: dev-lead/sub +` + var n yaml.Node + if err := yaml.Unmarshal([]byte(src), &n); err != nil { + t.Fatal(err) + } + rewriteFilesDir(&n, ".external-cache/foo/bar") + + out, err := yaml.Marshal(&n) + if err != nil { + t.Fatal(err) + } + got := string(out) + for _, want := range []string{ + "files_dir: .external-cache/foo/bar/dev-lead", + "files_dir: .external-cache/foo/bar/dev-lead/sub", + // !include preserved as-is; resolves naturally via subDir. + "!include ./bar/workspace.yaml", + "!include other-team.yaml", + } { + if !strings.Contains(got, want) { + t.Errorf("missing %q in:\n%s", want, got) + } + } +} + +// TestRewriteFilesDir_Idempotent: re-running the rewriter +// on already-prefixed files_dir doesn't double-prefix. +func TestRewriteFilesDir_Idempotent(t *testing.T) { + src := `files_dir: .external-cache/foo/bar/dev-lead +inner: + files_dir: .external-cache/foo/bar/dev-lead/sub +` + var n yaml.Node + if err := yaml.Unmarshal([]byte(src), &n); err != nil { + t.Fatal(err) + } + rewriteFilesDir(&n, ".external-cache/foo/bar") + + out, _ := yaml.Marshal(&n) + got := string(out) + if strings.Contains(got, ".external-cache/foo/bar/.external-cache") { + t.Errorf("double-prefix detected:\n%s", got) + } + // Should still be valid (single-prefixed) afterwards. + for _, want := range []string{ + "files_dir: .external-cache/foo/bar/dev-lead", + "files_dir: .external-cache/foo/bar/dev-lead/sub", + } { + if !strings.Contains(got, want) { + t.Errorf("expected unchanged %q in:\n%s", want, got) + } + } +} + +// TestAllowlistedHostPath: env-var override + glob matching. 
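+//
+// Allowlist entries are comma-separated "<host>/<path-glob>" patterns;
+// the override case below uses "github.com/me/*,git.moleculesai.app/*".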
+func TestAllowlistedHostPath(t *testing.T) {
+	t.Setenv("MOLECULE_EXTERNAL_REPO_ALLOWLIST", "")
+	if !allowlistedHostPath("git.moleculesai.app", "molecule-ai/foo") {
+		t.Error("default allowlist should accept molecule-ai/*")
+	}
+	if allowlistedHostPath("github.com", "molecule-ai/foo") {
+		t.Error("default allowlist should reject github.com")
+	}
+	t.Setenv("MOLECULE_EXTERNAL_REPO_ALLOWLIST", "github.com/me/*,git.moleculesai.app/*")
+	if !allowlistedHostPath("github.com", "me/x") {
+		t.Error("override should accept github.com/me/*")
+	}
+	if !allowlistedHostPath("git.moleculesai.app", "any/repo") {
+		t.Error("override should accept git.moleculesai.app/*")
+	}
+	if allowlistedHostPath("github.com", "evil/x") {
+		t.Error("override should reject github.com/evil/*")
+	}
+}
diff --git a/workspace-server/internal/handlers/org_helpers.go b/workspace-server/internal/handlers/org_helpers.go
index f84baf3d..824fd2d7 100644
--- a/workspace-server/internal/handlers/org_helpers.go
+++ b/workspace-server/internal/handlers/org_helpers.go
@@ -6,6 +6,7 @@ package handlers
 import (
 	"fmt"
+	"log"
 	"os"
 	"path/filepath"
 	"regexp"
@@ -102,6 +103,56 @@ func loadWorkspaceEnv(orgBaseDir, filesDir string) map[string]string {
 	return envVars
 }
 
+// loadPersonaEnvFile merges per-role persona credentials into out. The file
+// lives at $MOLECULE_PERSONA_ROOT/<role>/env (default
+// /etc/molecule-bootstrap/personas) and is populated by the operator-host
+// bootstrap kit — one persona per dev-tree role, each carrying the role's
+// Gitea identity (GITEA_USER, GITEA_TOKEN, GITEA_TOKEN_SCOPES,
+// GITEA_USER_EMAIL, GITEA_SSH_KEY_PATH).
+//
+// Lower precedence than the org and workspace .env files: callers should
+// invoke this BEFORE parseEnvFile on those, so a workspace .env can
+// override a persona-default value when needed.
+//
+// Silent no-op when role is empty, when the role name fails the safe-segment
+// check, or when the env file does not exist (workspaces without a role —
+// or running on hosts that don't ship the bootstrap dir — keep their old
+// behavior).
+func loadPersonaEnvFile(role string, out map[string]string) {
+	if !isSafeRoleName(role) {
+		if role != "" {
+			log.Printf("Org import: refusing persona env load for unsafe role name %q", role)
+		}
+		return
+	}
+	root := os.Getenv("MOLECULE_PERSONA_ROOT")
+	if root == "" {
+		root = "/etc/molecule-bootstrap/personas"
+	}
+	parseEnvFile(filepath.Join(root, role, "env"), out)
+}
+
+// isSafeRoleName accepts a single path segment of [A-Za-z0-9_-]+. Rejects
+// empty, ".", "..", and anything containing a path separator — even though
+// the construct is admin-only, defense-in-depth keeps the persona dir
+// shape invariant: one flat directory per role, no climbing out.
+func isSafeRoleName(s string) bool {
+	if s == "" || s == "." || s == ".." {
+		return false
+	}
+	for _, c := range s {
+		switch {
+		case c >= 'a' && c <= 'z':
+		case c >= 'A' && c <= 'Z':
+		case c >= '0' && c <= '9':
+		case c == '-' || c == '_':
+		default:
+			return false
+		}
+	}
+	return true
+}
+
 // parseEnvFile reads a .env file and adds KEY=VALUE pairs to the map.
 // Skips comments (#) and empty lines. Values can be quoted.
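+// Example accepted input (illustrative):
+//
+//	# comments are skipped
+//	GITEA_USER=dev-lead
+//	GREETING="hello world"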
 func parseEnvFile(path string, out map[string]string) {
diff --git a/workspace-server/internal/handlers/org_import.go b/workspace-server/internal/handlers/org_import.go
index d67087ca..2e06479f 100644
--- a/workspace-server/internal/handlers/org_import.go
+++ b/workspace-server/internal/handlers/org_import.go
@@ -42,6 +42,20 @@ import (
 // straight into the parent's child-coordinate space without doing a
 // canvas-wide absolute-position walk.
 func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX, absY, relX, relY float64, defaults OrgDefaults, orgBaseDir string, results *[]map[string]interface{}, provisionSem chan struct{}) error {
+	// spawning: false guard — skip this workspace AND all descendants.
+	// Pointer-typed so we distinguish "explicitly false" from "unset"
+	// (unset = default to spawn). The guard sits BEFORE any side effect
+	// (no DB row, no docker provision, no children recursion) so a
+	// false-spawning subtree is genuinely a no-op except for the log line.
+	// Use case: dev-tree org template ships the full role taxonomy but a
+	// developer's machine only has RAM for a subset; a per-workspace
+	// `spawning: false` lets them narrow without editing the parent
+	// template's structure.
+	if ws.Spawning != nil && !*ws.Spawning {
+		log.Printf("Org import: skipping workspace %q (spawning=false; %d descendant workspace(s) in subtree also skipped)", ws.Name, countWorkspaces(ws.Children))
+		return nil
+	}
+
 	// Apply defaults
 	runtime := ws.Runtime
 	if runtime == "" {
@@ -443,10 +457,35 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
 		configFiles["system-prompt.md"] = []byte(ws.SystemPrompt)
 	}
 
-	// Inject secrets from .env files as workspace secrets.
-	// Resolution: workspace .env → org root .env (workspace overrides org root).
+	// Inject secrets from persona env + .env files as workspace secrets.
+	// Resolution (later overrides earlier):
+	//   0. Persona env (per-role bootstrap creds; only when the workspace's
+	//      files_dir slug matches a persona shipped by the operator-host
+	//      bootstrap dir)
+	//   1. Org root .env (shared defaults)
+	//   2. Workspace-specific .env (per-workspace overrides)
 	// Each line: KEY=VALUE → stored as encrypted workspace secret.
 	envVars := map[string]string{}
+	// 0. Persona env (lowest precedence; injects the role's Gitea identity:
+	//    GITEA_USER, GITEA_TOKEN, GITEA_TOKEN_SCOPES, GITEA_USER_EMAIL,
+	//    GITEA_SSH_KEY_PATH, plus MODEL_PROVIDER/MODEL and the LLM auth
+	//    token like CLAUDE_CODE_OAUTH_TOKEN or MINIMAX_API_KEY).
+	//    Workspace and org .env can override.
+	//
+	//    Use ws.FilesDir as the persona-dir lookup key, NOT ws.Role. In the
+	//    dev-tree org.yaml shape, `role:` carries the multi-line descriptive
+	//    text the agent reads from its prompt ("Engineering planning and
+	//    team coordination — leads Core Platform, Controlplane, ..."), while
+	//    `files_dir:` holds the short slug (`core-lead`, `dev-lead`, etc.)
+	//    matching `~/.molecule-ai/personas/<files_dir>/env`
+	//    (bind-mounted to `/etc/molecule-bootstrap/personas/<files_dir>/env`).
+	//
+	//    Pre-fix, this passed `ws.Role` whose multi-word content failed
+	//    isSafeRoleName silently, so every imported workspace booted with
+	//    zero persona-env rows in workspace_secrets — no ANTHROPIC /
+	//    CLAUDE_CODE auth in the container env. The claude_agent_sdk
+	//    then wedged on `query.initialize()` with a 60s control-request
+	//    timeout (caught 2026-05-08 right after dev-only org/import).
+	loadPersonaEnvFile(ws.FilesDir, envVars)
 	if orgBaseDir != "" {
+		// 1. 
Org root .env (shared defaults) parseEnvFile(filepath.Join(orgBaseDir, ".env"), envVars) diff --git a/workspace-server/internal/handlers/org_import_reconcile_test.go b/workspace-server/internal/handlers/org_import_reconcile_test.go new file mode 100644 index 00000000..acb4ec5c --- /dev/null +++ b/workspace-server/internal/handlers/org_import_reconcile_test.go @@ -0,0 +1,158 @@ +package handlers + +import ( + "context" + "sort" + "testing" + + "github.com/DATA-DOG/go-sqlmock" +) + +// Tests for the reconcile-mode + audit-event additions to OrgHandler.Import. +// +// Background: /org/import was purely additive — re-running with a tree that +// renamed/reparented a role left the prior workspace online (different +// parent_id from the new one, so lookupExistingChild's parent-scoped dedupe +// missed it). The 2026-05-08 dev-tree case left 8 orphans surviving a +// re-import. mode="reconcile" closes the gap; emitOrgEvent makes "what +// happened at 20:13?" queryable instead of stdout-grep archaeology. + +func TestWalkOrgWorkspaceNames_FlatTree(t *testing.T) { + tree := []OrgWorkspace{ + {Name: "Dev Lead"}, + {Name: "Release Manager"}, + } + var names []string + walkOrgWorkspaceNames(tree, &names) + sort.Strings(names) + want := []string{"Dev Lead", "Release Manager"} + if !equalStrings(names, want) { + t.Errorf("flat tree: got %v, want %v", names, want) + } +} + +func TestWalkOrgWorkspaceNames_NestedTree(t *testing.T) { + tree := []OrgWorkspace{ + { + Name: "Dev Lead", + Children: []OrgWorkspace{ + {Name: "Core Platform Lead", Children: []OrgWorkspace{{Name: "Core-BE"}}}, + {Name: "SDK Lead"}, + }, + }, + } + var names []string + walkOrgWorkspaceNames(tree, &names) + sort.Strings(names) + want := []string{"Core Platform Lead", "Core-BE", "Dev Lead", "SDK Lead"} + if !equalStrings(names, want) { + t.Errorf("nested tree: got %v, want %v", names, want) + } +} + +// Pins the contract that spawning:false subtrees still contribute their names +// to the reconcile working set. If the walker started skipping them, a +// re-import that toggled spawning would orphan whichever workspaces had been +// previously imported with spawning:true — the inverse of the bug being +// fixed. Spawning gates *provisioning*, not *reconcile membership*. +func TestWalkOrgWorkspaceNames_SpawningFalseStillCounted(t *testing.T) { + f := false + tree := []OrgWorkspace{ + {Name: "Dev Lead", Children: []OrgWorkspace{ + {Name: "Skipped Lead", Spawning: &f, Children: []OrgWorkspace{ + {Name: "Skipped Child"}, + }}, + }}, + } + var names []string + walkOrgWorkspaceNames(tree, &names) + sort.Strings(names) + want := []string{"Dev Lead", "Skipped Child", "Skipped Lead"} + if !equalStrings(names, want) { + t.Errorf("spawning:false subtree: got %v, want %v", names, want) + } +} + +func TestWalkOrgWorkspaceNames_EmptyNamesSkipped(t *testing.T) { + tree := []OrgWorkspace{ + {Name: "Dev Lead"}, + {Name: ""}, // YAML default / placeholder + {Name: "Release Manager"}, + } + var names []string + walkOrgWorkspaceNames(tree, &names) + sort.Strings(names) + want := []string{"Dev Lead", "Release Manager"} + if !equalStrings(names, want) { + t.Errorf("empty-name skip: got %v, want %v", names, want) + } +} + +// emitOrgEvent must INSERT into structure_events with event_type + JSON +// payload. Verifies the SQL shape pinning so a future schema rename +// (e.g., switching to audit_events) breaks the test loudly instead of +// silently dropping telemetry. 
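+// (The regex pins only the table name; the WithArgs expectations pin the
+// (event_type, payload-JSON) pair, leaving the exact column list to the
+// implementation.)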
+func TestEmitOrgEvent_InsertsToStructureEvents(t *testing.T) { + mock := setupTestDB(t) + mock.ExpectExec(`INSERT INTO structure_events`). + WithArgs("org.import.started", sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(1, 1)) + + emitOrgEvent(context.Background(), "org.import.started", map[string]any{ + "name": "test-org", + "mode": "reconcile", + }) + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("sqlmock expectations: %v", err) + } +} + +// Insert failures are log-and-swallow — telemetry MUST NOT block the +// caller path. If this regresses (e.g., a future patch returns the err), +// org-import requests would fail with HTTP 500 every time a structure_events +// INSERT hiccups, which is strictly worse than losing the row. +func TestEmitOrgEvent_DBErrorIsSwallowed(t *testing.T) { + mock := setupTestDB(t) + mock.ExpectExec(`INSERT INTO structure_events`). + WithArgs("org.import.failed", sqlmock.AnyArg()). + WillReturnError(errSentinelTest) + + // Must not panic; must not propagate. The function returns nothing, + // so the contract is "doesn't crash." + emitOrgEvent(context.Background(), "org.import.failed", map[string]any{ + "err": "preflight failed", + }) + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("sqlmock expectations: %v", err) + } +} + +func TestErrString(t *testing.T) { + if got := errString(nil); got != "" { + t.Errorf("nil error: got %q, want empty", got) + } + if got := errString(errSentinelTest); got != "sentinel" { + t.Errorf("sentinel error: got %q, want \"sentinel\"", got) + } +} + +// errSentinelTest is a marker error used for swallow-error assertions. +var errSentinelTest = sentinelErrTest{} + +type sentinelErrTest struct{} + +func (sentinelErrTest) Error() string { return "sentinel" } + +func equalStrings(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} diff --git a/workspace-server/internal/handlers/org_include.go b/workspace-server/internal/handlers/org_include.go index c7ff5e1a..dc3bb7fa 100644 --- a/workspace-server/internal/handlers/org_include.go +++ b/workspace-server/internal/handlers/org_include.go @@ -76,6 +76,12 @@ func expandNode(n *yaml.Node, currentDir, rootDir string, visited map[string]boo return resolveIncludeScalar(n, currentDir, rootDir, visited, depth) } + // `!external`-tagged mapping: gitops cross-repo subtree composition. + // See org_external.go (internal#77 / task #222). + if n.Kind == yaml.MappingNode && n.Tag == "!external" { + return resolveExternalMapping(n, currentDir, rootDir, visited, depth) + } + for _, child := range n.Content { if err := expandNode(child, currentDir, rootDir, visited, depth); err != nil { return err diff --git a/workspace-server/internal/handlers/org_include_symlink_test.go b/workspace-server/internal/handlers/org_include_symlink_test.go new file mode 100644 index 00000000..6bd26231 --- /dev/null +++ b/workspace-server/internal/handlers/org_include_symlink_test.go @@ -0,0 +1,136 @@ +package handlers + +import ( + "os" + "path/filepath" + "testing" + + "gopkg.in/yaml.v3" +) + +// Phase 5 (RFC internal#77 dev-department extraction): +// Proves a parent org template can compose a subtree from a sibling repo +// via a directory symlink. 
Pattern that gets shipped:
+//
+//	/org-templates/parent-template/            ← imported by POST /org/import
+//	  org.yaml  (workspaces: !include dev/dev-lead/workspace.yaml)
+//	  dev → /org-templates/molecule-dev-department/   (symlink)
+//	/org-templates/molecule-dev-department/    (sibling repo)
+//	  dev-lead/
+//	    workspace.yaml  (children: !include ./core-platform/workspace.yaml)
+//	    core-platform/
+//	      workspace.yaml
+//
+// resolveYAMLIncludes resolves paths via filepath.Abs/Rel (no symlink
+// following at the path-string layer), so the security check passes. The
+// actual file open uses os.ReadFile, which DOES follow symlinks — so the
+// content from the sibling repo gets inlined. This test pins that contract.
+func TestResolveYAMLIncludes_FollowsDirectorySymlink(t *testing.T) {
+	tmp := t.TempDir()
+
+	// Subtree repo: dev-department/dev-lead/...
+	devDept := filepath.Join(tmp, "molecule-dev-department")
+	devLead := filepath.Join(devDept, "dev-lead")
+	corePlatform := filepath.Join(devLead, "core-platform")
+	if err := os.MkdirAll(corePlatform, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	// dev-lead/workspace.yaml — uses `./core-platform/workspace.yaml`,
+	// relative to its own dir, which once the symlink is followed is
+	// dev-department/dev-lead/.
+	devLeadYAML := []byte(`name: Dev Lead
+tier: 3
+children:
+  - !include ./core-platform/workspace.yaml
+`)
+	if err := os.WriteFile(filepath.Join(devLead, "workspace.yaml"), devLeadYAML, 0o644); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(corePlatform, "workspace.yaml"), []byte("name: Core Platform\ntier: 3\n"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	// Parent template: parent/, with `dev` symlink → ../molecule-dev-department/
+	parent := filepath.Join(tmp, "parent-template")
+	if err := os.MkdirAll(parent, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	// Symlink TARGET is a relative path (matches operator-side deploy
+	// convention where both repos are cloned as siblings under a shared
+	// /org-templates/ dir).
+	if err := os.Symlink("../molecule-dev-department", filepath.Join(parent, "dev")); err != nil {
+		t.Skipf("symlinks unsupported on this fs: %v", err)
+	}
+
+	// Parent's org.yaml: !include into the symlinked subtree.
+	src := []byte(`name: Parent
+workspaces:
+  - !include dev/dev-lead/workspace.yaml
+`)
+
+	out, err := resolveYAMLIncludes(src, parent)
+	if err != nil {
+		t.Fatalf("resolveYAMLIncludes through symlink failed: %v", err)
+	}
+
+	var tmpl OrgTemplate
+	if err := yaml.Unmarshal(out, &tmpl); err != nil {
+		t.Fatalf("unmarshal: %v", err)
+	}
+	if len(tmpl.Workspaces) != 1 {
+		t.Fatalf("expected 1 workspace, got %d", len(tmpl.Workspaces))
+	}
+	if tmpl.Workspaces[0].Name != "Dev Lead" {
+		t.Fatalf("workspace[0].Name = %q; want Dev Lead", tmpl.Workspaces[0].Name)
+	}
+	kids := tmpl.Workspaces[0].Children
+	if len(kids) != 1 {
+		t.Fatalf("expected 1 child workspace, got %d", len(kids))
+	}
+	if kids[0].Name != "Core Platform" {
+		t.Fatalf("child[0].Name = %q; want Core Platform — symlink-aware nested !include broken", kids[0].Name)
+	}
+}
+
+// Companion: document that the path-string security check does NOT stop a
+// symlink whose target is OUTSIDE the parent template's root. This is the
+// "hostile symlink" case — an org.yaml that tries to slip in arbitrary
+// files from /etc.
+func TestResolveYAMLIncludes_SymlinkEscapingRootIsFollowed(t *testing.T) {
+	tmp := t.TempDir()
+	parent := filepath.Join(tmp, "parent-template")
+	outside := filepath.Join(tmp, "outside")
+	if err := os.MkdirAll(parent, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.MkdirAll(outside, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(outside, "evil.yaml"), []byte("name: Evil\n"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	// Symlink that escapes the parent root via `../outside/...`. The path
+	// STRING `evil` resolves to parent/evil — passes the rel2 check. But
+	// because filepath.Abs doesn't follow symlinks, the ReadFile call DOES
+	// follow it to outside/evil.yaml. This is the trade-off the symlink
+	// approach accepts: the security boundary is a deployment-layer
+	// invariant, not a code-layer one. Documented in dev-department/README.
+	if err := os.Symlink(filepath.Join(outside, "evil.yaml"), filepath.Join(parent, "evil.yaml")); err != nil {
+		t.Skipf("symlinks unsupported on this fs: %v", err)
+	}
+	src := []byte("workspaces:\n  - !include evil.yaml\n")
+	out, err := resolveYAMLIncludes(src, parent)
+	if err != nil {
+		// If the resolver is later hardened to refuse symlink targets
+		// outside the root (e.g. via filepath.EvalSymlinks), this test
+		// will start failing — and the dev-department symlink approach
+		// would need to be updated accordingly.
+		t.Fatalf("expected the symlink to resolve under the current resolver; got error: %v", err)
+	}
+	var tmpl OrgTemplate
+	if err := yaml.Unmarshal(out, &tmpl); err != nil {
+		t.Fatalf("unmarshal: %v", err)
+	}
+	if len(tmpl.Workspaces) != 1 || tmpl.Workspaces[0].Name != "Evil" {
+		t.Fatalf("expected Evil workspace via symlink; got %+v", tmpl.Workspaces)
+	}
+}
diff --git a/workspace-server/internal/handlers/org_persona_env_test.go b/workspace-server/internal/handlers/org_persona_env_test.go
new file mode 100644
index 00000000..0c3bad59
--- /dev/null
+++ b/workspace-server/internal/handlers/org_persona_env_test.go
@@ -0,0 +1,171 @@
+package handlers
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// TestLoadPersonaEnvFile_HappyPath: the standard case — a persona-shaped
+// env file exists at <root>/<role>/env and its KEY=VALUE pairs land in
+// the out map. Mirrors what the operator-host bootstrap kit ships:
+// GITEA_USER, GITEA_TOKEN, GITEA_TOKEN_SCOPES, GITEA_USER_EMAIL,
+// GITEA_SSH_KEY_PATH.
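+//
+// Caller-side layering this helper slots into (sketch per org_import.go;
+// the env-path variable names here are illustrative only):
+//
+//	envVars := map[string]string{}
+//	loadPersonaEnvFile(ws.FilesDir, envVars) // 0. persona (lowest)
+//	parseEnvFile(orgRootEnv, envVars)        // 1. org root .env
+//	parseEnvFile(workspaceEnv, envVars)      // 2. workspace .env (highest)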
+func TestLoadPersonaEnvFile_HappyPath(t *testing.T) { + root := t.TempDir() + roleDir := filepath.Join(root, "dev-lead") + if err := os.MkdirAll(roleDir, 0o755); err != nil { + t.Fatal(err) + } + envBody := `# Persona env file — mode 600 +GITEA_USER=dev-lead +GITEA_USER_EMAIL=dev-lead@agents.moleculesai.app +GITEA_TOKEN=abc123 +GITEA_TOKEN_SCOPES=write:repository,write:issue,read:user +GITEA_SSH_KEY_PATH=/etc/molecule-bootstrap/personas/dev-lead/ssh_priv +` + if err := os.WriteFile(filepath.Join(roleDir, "env"), []byte(envBody), 0o600); err != nil { + t.Fatal(err) + } + t.Setenv("MOLECULE_PERSONA_ROOT", root) + + out := map[string]string{} + loadPersonaEnvFile("dev-lead", out) + + want := map[string]string{ + "GITEA_USER": "dev-lead", + "GITEA_USER_EMAIL": "dev-lead@agents.moleculesai.app", + "GITEA_TOKEN": "abc123", + "GITEA_TOKEN_SCOPES": "write:repository,write:issue,read:user", + "GITEA_SSH_KEY_PATH": "/etc/molecule-bootstrap/personas/dev-lead/ssh_priv", + } + if len(out) != len(want) { + t.Fatalf("got %d keys, want %d: %#v", len(out), len(want), out) + } + for k, v := range want { + if out[k] != v { + t.Errorf("out[%q] = %q; want %q", k, out[k], v) + } + } +} + +// TestLoadPersonaEnvFile_MissingDir: when the persona dir doesn't exist +// (e.g. dev-only host without the bootstrap kit, or a workspace whose +// role isn't a known persona), it's a silent no-op — out stays empty, +// no panic, no log noise that would break callers. +func TestLoadPersonaEnvFile_MissingDir(t *testing.T) { + t.Setenv("MOLECULE_PERSONA_ROOT", t.TempDir()) // empty dir + out := map[string]string{} + loadPersonaEnvFile("nonexistent-role", out) + if len(out) != 0 { + t.Errorf("expected empty out, got %#v", out) + } +} + +// TestLoadPersonaEnvFile_EmptyRole: empty role string is the common case +// for non-dev workspaces (research/marketing/etc.). Skip silently. +func TestLoadPersonaEnvFile_EmptyRole(t *testing.T) { + t.Setenv("MOLECULE_PERSONA_ROOT", t.TempDir()) + out := map[string]string{} + loadPersonaEnvFile("", out) + if len(out) != 0 { + t.Errorf("empty role should produce empty out; got %#v", out) + } +} + +// TestLoadPersonaEnvFile_RejectsTraversal: even though role names come +// from server-side admin-only org templates, defense-in-depth — refuse +// any role string with path separators or "..". Verifies that a maliciously +// crafted template can't read /etc/passwd by setting role: "../../etc". +func TestLoadPersonaEnvFile_RejectsTraversal(t *testing.T) { + root := t.TempDir() + // Plant a file at /tmp/.../env so a bad traversal would reach it + if err := os.WriteFile(filepath.Join(root, "env"), []byte("STOLEN=yes\n"), 0o600); err != nil { + t.Fatal(err) + } + t.Setenv("MOLECULE_PERSONA_ROOT", filepath.Join(root, "personas")) + + for _, bad := range []string{"..", "../personas", "../etc/passwd", "/abs", "with/slash", "dot.in.middle", "with space", "back\\slash", ".", ""} { + out := map[string]string{} + loadPersonaEnvFile(bad, out) + if len(out) != 0 { + t.Errorf("role %q should have been rejected; got %#v", bad, out) + } + } +} + +// TestLoadPersonaEnvFile_DefaultRoot: when MOLECULE_PERSONA_ROOT is unset, +// the helper falls back to /etc/molecule-bootstrap/personas. We don't +// touch real /etc — just verify the function doesn't panic and produces +// empty out (since the test box isn't expected to ship that path). 
+func TestLoadPersonaEnvFile_DefaultRoot(t *testing.T) {
+	t.Setenv("MOLECULE_PERSONA_ROOT", "") // explicit empty
+	out := map[string]string{}
+	loadPersonaEnvFile("dev-lead", out)
+	// Don't assert content — production CI might or might not have the
+	// /etc dir mounted. Just verify the call returns cleanly.
+	_ = out
+}
+
+// TestLoadPersonaEnvFile_OverwritesEmptyMap: the contract is "lower
+// precedence than later .env files," but that precedence is enforced by
+// caller-side ordering (persona → org → workspace), not by the helper
+// itself: parseEnvFile overwrites existing keys, so loading a persona
+// file into a pre-populated map replaces earlier values. This test pins
+// that overwrite semantic, which is exactly why callers must load the
+// persona layer into a fresh map before the org and workspace layers.
+func TestLoadPersonaEnvFile_OverwritesEmptyMap(t *testing.T) {
+	root := t.TempDir()
+	roleDir := filepath.Join(root, "core-be")
+	if err := os.MkdirAll(roleDir, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(roleDir, "env"),
+		[]byte("GITEA_TOKEN=persona-value\n"), 0o600); err != nil {
+		t.Fatal(err)
+	}
+	t.Setenv("MOLECULE_PERSONA_ROOT", root)
+
+	out := map[string]string{"GITEA_TOKEN": "preset"}
+	loadPersonaEnvFile("core-be", out)
+
+	// Persona helper is meant to populate a FRESH map first in the
+	// caller's flow; calling it on a pre-populated map and seeing the
+	// value get overwritten is consistent with parseEnvFile semantics.
+	if out["GITEA_TOKEN"] != "persona-value" {
+		t.Errorf("loadPersonaEnvFile did not write into existing map; got %q", out["GITEA_TOKEN"])
+	}
+}
+
+// TestIsSafeRoleName_Acceptance: positive + negative cases for the
+// validator. Pinned because every dev-tree role name must pass.
+func TestIsSafeRoleName_Acceptance(t *testing.T) {
+	good := []string{
+		"dev-lead", "core-be", "cp-security", "infra-runtime-be",
+		"sdk-dev", "plugin-dev", "documentation-specialist",
+		"triage-operator", "fullstack-engineer", "release-manager",
+		"core_underscore_ok", "X", "a1", "Z9-0",
+		"trailing-", // hyphens are allowed anywhere, including trailing
+	}
+	for _, s := range good {
+		if !isSafeRoleName(s) {
+			t.Errorf("isSafeRoleName(%q) = false; want true", s)
+		}
+	}
+	bad := []string{
+		"", ".", "..", "with/slash", "/abs", "dot.in.middle",
+		"with space", "back\\slash", "with$dollar", "with?question",
+		"newline\nsplit",
+	}
+	for _, s := range bad {
+		if isSafeRoleName(s) {
+			t.Errorf("isSafeRoleName(%q) = true; want false", s)
+		}
+	}
+}
diff --git a/workspace-server/internal/handlers/plugins_atomic.go b/workspace-server/internal/handlers/plugins_atomic.go
new file mode 100644
index 00000000..7cf8de67
--- /dev/null
+++ b/workspace-server/internal/handlers/plugins_atomic.go
@@ -0,0 +1,207 @@
+package handlers
+
+// plugins_atomic.go — atomic install pattern for plugin delivery into a
+// running workspace container. Closes molecule-core#114.
+//
+// Replaces the prior "tar + docker.CopyToContainer to /configs/plugins/"
+// single-step write (no atomicity, no marker, no rollback) with a 4-step
+// dance:
+//
+//  1. STAGE    — extract tar into /configs/plugins/.staging/<plugin>.<stamp>/
+//  2. SNAPSHOT — if /configs/plugins/<plugin>/ exists, mv it to
+//     .previous/<plugin>.<stamp>/
+//  3. 
SWAP     — mv /configs/plugins/.staging/<plugin>.<stamp>/ → /configs/plugins/<plugin>/
+//  4. MARKER   — touch /configs/plugins/<plugin>/.complete
+//
+// On any post-snapshot failure we attempt a best-effort rollback by mv-ing
+// the previous snapshot back into place. The .complete marker is the
+// canonical "this install is fully landed" signal — workspace-side plugin
+// loaders should refuse to load a plugin dir without it.
+//
+// Scope: docker path only (workspace running as a local container). The
+// SaaS path (deliverViaEIC, SSH-into-EC2) is unchanged in this PR; tracked
+// as a follow-up. The same stage-then-swap shape applies but the exec
+// primitives differ (ssh vs docker exec), and shipping both paths in one
+// PR doubles the test surface.
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"path"
+	"strings"
+	"time"
+
+	"github.com/docker/docker/api/types/container"
+)
+
+const (
+	pluginsRoot       = "/configs/plugins"
+	pluginsStagingDir = "/configs/plugins/.staging"
+	pluginsPrevDir    = "/configs/plugins/.previous"
+	completeMarker    = ".complete"
+)
+
+// installVersion identifies one install attempt — the plugin name plus a
+// monotonic-ish UTC timestamp suffix. Used to namespace the staging dir
+// and any snapshot of the previous version, so a reinstall mid-flight
+// can't collide with a concurrent reinstall.
+type installVersion struct {
+	plugin string
+	stamp  string // e.g. 20260508T141530Z
+}
+
+func newInstallVersion(plugin string) installVersion {
+	return installVersion{
+		plugin: plugin,
+		stamp:  time.Now().UTC().Format("20060102T150405Z"),
+	}
+}
+
+// stagedPath is the container path where the new content lands during fetch.
+// e.g. /configs/plugins/.staging/molecule-skill-foo.20260508T141530Z
+func (v installVersion) stagedPath() string {
+	return path.Join(pluginsStagingDir, v.plugin+"."+v.stamp)
+}
+
+// previousPath is where the prior live version is moved before swap.
+// e.g. /configs/plugins/.previous/molecule-skill-foo.20260508T141530Z
+func (v installVersion) previousPath() string {
+	return path.Join(pluginsPrevDir, v.plugin+"."+v.stamp)
+}
+
+// livePath is the destination after swap.
+// e.g. /configs/plugins/molecule-skill-foo
+func (v installVersion) livePath() string {
+	return path.Join(pluginsRoot, v.plugin)
+}
+
+// markerPath is the .complete file inside the live dir written last.
+func (v installVersion) markerPath() string {
+	return path.Join(v.livePath(), completeMarker)
+}
+
+// atomicCopyToContainer does a stage→snapshot→swap→marker install of a
+// host-side staged plugin tree into a running container's
+// /configs/plugins/<plugin>/. Returns nil on success.
+//
+// On post-snapshot failure (swap or marker write), best-effort rollback
+// restores the previous snapshot to the live path. Returns the original
+// error wrapped — the caller should surface it; rollback success is
+// logged separately.
+func (h *PluginsHandler) atomicCopyToContainer(
+	ctx context.Context, containerName, hostDir, pluginName string,
+) error {
+	v := newInstallVersion(pluginName)
+
+	// Step 0a: ensure staging + previous root dirs exist (idempotent).
+	if _, err := h.execAsRoot(ctx, containerName, []string{
+		"mkdir", "-p", pluginsStagingDir, pluginsPrevDir,
+	}); err != nil {
+		return fmt.Errorf("atomic install: mkdir staging/previous: %w", err)
+	}
+
+	// Step 0b: tar the host content with a path prefix that lands it in the
+	// staging dir — NOT directly into the live name. The prefix has no 
The prefix has no + // leading "/" because docker.CopyToContainer extracts paths relative + // to the dstPath argument we pass below. + stagedRel := strings.TrimPrefix(v.stagedPath(), "/") + tarBuf, err := tarHostDirWithPrefix(hostDir, stagedRel) + if err != nil { + return fmt.Errorf("atomic install: tar host dir: %w", err) + } + + // Step 1: STAGE — extract tar into /configs/plugins/.staging/./ + if err := h.docker.CopyToContainer(ctx, containerName, "/", &tarBuf, + container.CopyToContainerOptions{}); err != nil { + // Best-effort: clean up any partial staging extract before returning. + _, _ = h.execAsRoot(ctx, containerName, []string{ + "rm", "-rf", v.stagedPath(), + }) + return fmt.Errorf("atomic install: copy to container: %w", err) + } + + // Step 2: SNAPSHOT — if a live version exists, move it aside. + // `test -d` exits 0 if the dir exists, non-zero otherwise; the helper + // returns a non-nil error in the non-zero case which we treat as + // "no previous version" rather than a real failure. + snapshotted := false + if _, err := h.execAsRoot(ctx, containerName, []string{ + "test", "-d", v.livePath(), + }); err == nil { + if _, err := h.execAsRoot(ctx, containerName, []string{ + "mv", v.livePath(), v.previousPath(), + }); err != nil { + // Snapshot failure: roll back the staged extract before failing. + _, _ = h.execAsRoot(ctx, containerName, []string{ + "rm", "-rf", v.stagedPath(), + }) + return fmt.Errorf("atomic install: snapshot previous version: %w", err) + } + snapshotted = true + } + + // Step 3: SWAP — atomic rename of the staged dir into the live name. + // `mv` on the same filesystem is a single rename(2), atomic at the FS level. + if _, err := h.execAsRoot(ctx, containerName, []string{ + "mv", v.stagedPath(), v.livePath(), + }); err != nil { + // Swap failure: roll back if we had a snapshot. + if snapshotted { + if _, rbErr := h.execAsRoot(ctx, containerName, []string{ + "mv", v.previousPath(), v.livePath(), + }); rbErr != nil { + return fmt.Errorf("atomic install: swap failed AND rollback failed: swap=%w, rollback=%v", err, rbErr) + } + } + // Best-effort cleanup of the still-staged dir. + _, _ = h.execAsRoot(ctx, containerName, []string{ + "rm", "-rf", v.stagedPath(), + }) + return fmt.Errorf("atomic install: swap to live path: %w", err) + } + + // Step 4: MARKER — touch .complete inside the live dir as the last write. + // Workspace-side plugin loaders treat a plugin dir without this marker + // as half-installed and skip it (or surface a clear error to the + // operator instead of loading a possibly-partial tree). + if _, err := h.execAsRoot(ctx, containerName, []string{ + "touch", v.markerPath(), + }); err != nil { + // Marker write failure with the new content already in place is a + // weird state — content is fine on disk, but the plugin loader + // will refuse to use it. Log loudly; do NOT roll back, since the + // content is the latest, just unmarked. Operator can manually + // `touch /.complete` to recover. + return fmt.Errorf("atomic install: write .complete marker (content landed but unmarked, manual recovery: touch %s): %w", v.markerPath(), err) + } + + // Step 5: GC — best-effort delete the previous snapshot. Failures here + // just leave a directory; not load-bearing for correctness, the next + // install or a separate sweeper will reclaim the space. 
+	if snapshotted {
+		_, _ = h.execAsRoot(ctx, containerName, []string{
+			"rm", "-rf", v.previousPath(),
+		})
+	}
+
+	return nil
+}
+
+// tarHostDirWithPrefix walks hostDir and writes a tar to a buffer with
+// every entry's name prefixed by `prefix`. Mirrors the prior streaming
+// shape used in copyPluginToContainer but with a configurable prefix
+// (the prior version hardcoded "plugins/<name>/"; we use a full
+// staging path so the extracted layout is the staging dir directly).
+//
+// Symlinks are skipped — same posture as streamDirAsTar elsewhere in
+// this file. Skipping prevents a hostile plugin from injecting a
+// symlink that, post-extract, points outside the plugin's own dir.
+func tarHostDirWithPrefix(hostDir, prefix string) (bytes.Buffer, error) {
+	var buf bytes.Buffer
+	tw := newTarWriter(&buf)
+	if err := tarWalk(hostDir, prefix, tw); err != nil {
+		tw.Close()
+		return bytes.Buffer{}, err
+	}
+	// Close BEFORE the return: `return buf, nil` copies the buffer, so a
+	// deferred Close would append the tar trailer to the original after
+	// the copy was taken, handing the caller a footer-less archive.
+	if err := tw.Close(); err != nil {
+		return bytes.Buffer{}, err
+	}
+	return buf, nil
+}
diff --git a/workspace-server/internal/handlers/plugins_atomic_tar.go b/workspace-server/internal/handlers/plugins_atomic_tar.go
new file mode 100644
index 00000000..e0e8cb80
--- /dev/null
+++ b/workspace-server/internal/handlers/plugins_atomic_tar.go
@@ -0,0 +1,70 @@
+package handlers
+
+// plugins_atomic_tar.go — tar-walk helpers split out so the main atomic
+// install flow stays readable. The prefix argument lets the caller
+// arrange where the tar's contents land at extract time.
+
+import (
+	"archive/tar"
+	"io"
+	"os"
+	"path/filepath"
+)
+
+// newTarWriter is a thin wrapper so atomic_test.go can swap the writer
+// destination if it needs to.
+func newTarWriter(w io.Writer) *tar.Writer {
+	return tar.NewWriter(w)
+}
+
+// tarWalk walks hostDir and writes every regular file + dir to the tar
+// writer with paths of the form `<prefix>/<relpath>`. Symlinks are
+// skipped — same posture as streamDirAsTar in plugins_install_pipeline.go.
+//
+// The trailing slash on prefix is normalized away: prefix "foo" and
+// prefix "foo/" produce identical archives.
+func tarWalk(hostDir, prefix string, tw *tar.Writer) error {
+	prefix = filepath.Clean(prefix)
+	return filepath.Walk(hostDir, func(p string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+		if info.Mode()&os.ModeSymlink != 0 {
+			return nil // skip symlinks; see doc above
+		}
+		rel, err := filepath.Rel(hostDir, p)
+		if err != nil {
+			return err
+		}
+		if rel == "." {
+			// Emit the prefix dir itself once, with the source dir's mode.
+			hdr, err := tar.FileInfoHeader(info, "")
+			if err != nil {
+				return err
+			}
+			hdr.Name = prefix + "/"
+			return tw.WriteHeader(hdr)
+		}
+		hdr, err := tar.FileInfoHeader(info, "")
+		if err != nil {
+			return err
+		}
+		hdr.Name = filepath.Join(prefix, rel)
+		if info.IsDir() {
+			hdr.Name += "/"
+		}
+		if err := tw.WriteHeader(hdr); err != nil {
+			return err
+		}
+		if !info.Mode().IsRegular() {
+			return nil
+		}
+		f, err := os.Open(p)
+		if err != nil {
+			return err
+		}
+		defer f.Close()
+		_, err = io.Copy(tw, f)
+		return err
+	})
+}
diff --git a/workspace-server/internal/handlers/plugins_atomic_test.go b/workspace-server/internal/handlers/plugins_atomic_test.go
new file mode 100644
index 00000000..bbd43482
--- /dev/null
+++ b/workspace-server/internal/handlers/plugins_atomic_test.go
@@ -0,0 +1,193 @@
+package handlers
+
+import (
+	"archive/tar"
+	"bytes"
+	"io"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"testing"
+	"time"
+)
+
+// TestInstallVersion_Paths: the path helpers must produce a stable shape
+// the in-container exec calls depend on. 
Pinning the layout here
+// catches a future refactor that accidentally changes where staging /
+// previous / live dirs live, which would break the swap atomicity.
+func TestInstallVersion_Paths(t *testing.T) {
+	v := installVersion{plugin: "molecule-skill-foo", stamp: "20260508T141530Z"}
+
+	if got, want := v.stagedPath(), "/configs/plugins/.staging/molecule-skill-foo.20260508T141530Z"; got != want {
+		t.Errorf("stagedPath = %q; want %q", got, want)
+	}
+	if got, want := v.previousPath(), "/configs/plugins/.previous/molecule-skill-foo.20260508T141530Z"; got != want {
+		t.Errorf("previousPath = %q; want %q", got, want)
+	}
+	if got, want := v.livePath(), "/configs/plugins/molecule-skill-foo"; got != want {
+		t.Errorf("livePath = %q; want %q", got, want)
+	}
+	if got, want := v.markerPath(), "/configs/plugins/molecule-skill-foo/.complete"; got != want {
+		t.Errorf("markerPath = %q; want %q", got, want)
+	}
+}
+
+// TestInstallVersion_StampShape: two newInstallVersion calls within
+// the same second produce the same stamp (we use second precision); the
+// caller relies on the mv-rename being atomic, so collision-free
+// stamping is NOT a correctness requirement — but a regression that
+// changes the stamp shape (e.g. RFC3339 with colons) would break the
+// path helpers, since path.Join treats a colon as a regular character
+// but shell quoting under ssh + docker exec generally doesn't. Pin the
+// no-colon shape.
+func TestInstallVersion_StampShape(t *testing.T) {
+	v := newInstallVersion("anything")
+	if strings.Contains(v.stamp, ":") {
+		t.Errorf("stamp must not contain colons (breaks shell-quoting in exec): %q", v.stamp)
+	}
+	if strings.Contains(v.stamp, " ") {
+		t.Errorf("stamp must not contain spaces: %q", v.stamp)
+	}
+	// Sanity: stamp parses as the documented format.
+	if _, err := time.Parse("20060102T150405Z", v.stamp); err != nil {
+		t.Errorf("stamp %q does not parse as 20060102T150405Z: %v", v.stamp, err)
+	}
+}
+
+// TestTarHostDirWithPrefix_HappyPath: walks a host dir, builds a tar with
+// the configured prefix, verifies every entry's name is rooted under
+// the prefix, and the file contents survive round-trip.
+func TestTarHostDirWithPrefix_HappyPath(t *testing.T) {
+	hostDir := t.TempDir()
+
+	// Plant: /plugin.yaml + /skills/foo/SKILL.md + /.complete
+	files := map[string]string{
+		"plugin.yaml":         "name: foo\nversion: 1.0.0\n",
+		"skills/foo/SKILL.md": "# Foo skill\n",
+		".complete":           "", // upstream may already have a marker
+	}
+	for rel, body := range files {
+		full := filepath.Join(hostDir, rel)
+		if err := os.MkdirAll(filepath.Dir(full), 0o755); err != nil {
+			t.Fatal(err)
+		}
+		if err := os.WriteFile(full, []byte(body), 0o644); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	prefix := "configs/plugins/.staging/foo.20260508T141530Z"
+	buf, err := tarHostDirWithPrefix(hostDir, prefix)
+	if err != nil {
+		t.Fatalf("tar: %v", err)
+	}
+
+	// Read back the tar; collect names + body for regular files.
+ got := map[string]string{} + tr := tar.NewReader(&buf) + for { + hdr, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + t.Fatalf("tar reader: %v", err) + } + // Every entry must start with the prefix + if !strings.HasPrefix(hdr.Name, prefix) { + t.Errorf("entry %q does not start with prefix %q", hdr.Name, prefix) + } + if hdr.Typeflag == tar.TypeReg { + body, err := io.ReadAll(tr) + if err != nil { + t.Fatal(err) + } + rel := strings.TrimPrefix(hdr.Name, prefix+"/") + got[rel] = string(body) + } + } + + for rel, want := range files { + if got[rel] != want { + t.Errorf("body[%q] = %q; want %q", rel, got[rel], want) + } + } +} + +// TestTarHostDirWithPrefix_SkipsSymlinks: a hostile plugin shouldn't be +// able to ship a symlink that, post-extract, points outside its own +// dir. The walker silently skips symlinks (same posture as +// streamDirAsTar). Verify a planted symlink doesn't appear in the tar. +func TestTarHostDirWithPrefix_SkipsSymlinks(t *testing.T) { + hostDir := t.TempDir() + // Plant a real file + a symlink pointing outside hostDir. + if err := os.WriteFile(filepath.Join(hostDir, "real.txt"), []byte("ok"), 0o644); err != nil { + t.Fatal(err) + } + target := filepath.Join(t.TempDir(), "outside") + if err := os.WriteFile(target, []byte("SHOULD NOT APPEAR"), 0o644); err != nil { + t.Fatal(err) + } + if err := os.Symlink(target, filepath.Join(hostDir, "evil")); err != nil { + t.Fatal(err) + } + + buf, err := tarHostDirWithPrefix(hostDir, "p") + if err != nil { + t.Fatal(err) + } + + names := []string{} + tr := tar.NewReader(&buf) + for { + hdr, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + t.Fatal(err) + } + names = append(names, hdr.Name) + } + sort.Strings(names) + + for _, n := range names { + if strings.Contains(n, "evil") { + t.Errorf("symlink leaked into tar: %q", n) + } + } + // real.txt should be present + found := false + for _, n := range names { + if strings.HasSuffix(n, "real.txt") { + found = true + break + } + } + if !found { + t.Errorf("real.txt missing from tar; got names: %v", names) + } +} + +// TestTarHostDirWithPrefix_PrefixNormalization: trailing slash on prefix +// should not change the archive shape. Pinning this so a future caller +// passing "foo/" instead of "foo" doesn't double-slash entry names. +func TestTarHostDirWithPrefix_PrefixNormalization(t *testing.T) { + hostDir := t.TempDir() + if err := os.WriteFile(filepath.Join(hostDir, "x"), []byte("y"), 0o644); err != nil { + t.Fatal(err) + } + + a, err := tarHostDirWithPrefix(hostDir, "foo") + if err != nil { + t.Fatal(err) + } + b, err := tarHostDirWithPrefix(hostDir, "foo/") + if err != nil { + t.Fatal(err) + } + + if !bytes.Equal(a.Bytes(), b.Bytes()) { + t.Errorf("trailing-slash on prefix changed archive shape; tarHostDirWithPrefix should be slash-insensitive") + } +} diff --git a/workspace-server/internal/handlers/plugins_classifier.go b/workspace-server/internal/handlers/plugins_classifier.go new file mode 100644 index 00000000..14595ed6 --- /dev/null +++ b/workspace-server/internal/handlers/plugins_classifier.go @@ -0,0 +1,214 @@ +package handlers + +// plugins_classifier.go — diff classifier for plugin updates. +// +// Closes molecule-core#112. 
Composes with #114 (atomic install) so the
+// platform can decide *before* triggering restartFunc whether the
+// update is content-only (SKILL.md text changed; agent re-reads at next
+// Skill invocation) or structural (hooks/settings/plugin.yaml/file added
+// or removed; agent must restart to pick up the new state).
+//
+// SKILL.md content is hot-reloadable because Claude Code reads the file
+// on each Skill invocation — no in-memory cache. Hooks and settings.json
+// are loaded at session start and need a session restart. plugin.yaml
+// changes are structural by definition (manifest controls everything
+// else).
+//
+// CLASSIFICATION RULE
+//   classify(staged, live) → "skill-content-only" if and only if
+//   every file present in either tree is one of:
+//     - identical between staged and live, OR
+//     - a **/SKILL.md file with content change (text body modified)
+//   AND no files were added or removed.
+//   Anything else → "cold" (the safe default).
+//
+// The classifier hashes live-tree files from inside the container via a
+// single `docker exec` find + sha256sum pass (see hashContainerTree).
+// Comparison is by SHA-256 over file content, not mtime — mtime changes
+// on every install regardless of content.
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"strings"
+)
+
+const (
+	// classifyKindSkillContentOnly: install can skip restartFunc; the
+	// only changes are SKILL.md body text.
+	classifyKindSkillContentOnly = "skill-content-only"
+	// classifyKindCold: must restart the workspace container; structural
+	// or hook/settings change.
+	classifyKindCold = "cold"
+)

+// classifyInstallChanges compares the staged plugin tree (host filesystem)
+// against the currently-live plugin tree inside the container. Returns
+// classifyKindSkillContentOnly when the only diff is SKILL.md content
+// changes, classifyKindCold otherwise (added/removed files, hooks/
+// settings.json edits, plugin.yaml edits, anything else).
+//
+// A missing live tree (/configs/plugins/<name> doesn't exist, i.e. the
+// first install of this plugin) is treated as cold — there is no live
+// state to hot-reload into.
+func (h *PluginsHandler) classifyInstallChanges(
+	ctx context.Context, containerName, hostStagedDir, pluginName string,
+) (string, error) {
+	livePath := "/configs/plugins/" + pluginName
+
+	// Probe: does live exist? If not, this is a first install — cold.
+	if _, err := h.execAsRoot(ctx, containerName, []string{
+		"test", "-d", livePath,
+	}); err != nil {
+		return classifyKindCold, nil
+	}
+
+	// Build hash maps for both trees.
+	stagedHashes, err := hashLocalTree(hostStagedDir)
+	if err != nil {
+		return classifyKindCold, fmt.Errorf("classifier: hash staged: %w", err)
+	}
+	liveHashes, err := h.hashContainerTree(ctx, containerName, livePath)
+	if err != nil {
+		// Live tree read failure: be conservative, cold-restart.
+		return classifyKindCold, nil
+	}
+
+	// Drop the .complete marker from comparison — the live tree always
+	// carries one (written as the final install step) while the staged
+	// tree usually doesn't; leaving it in would make the set-difference
+	// below see an added/removed file and force-cold every reinstall.
+	delete(stagedHashes, ".complete")
+	delete(liveHashes, ".complete")
+
+	// Set difference: any file in one but not the other → cold.
+	for rel := range stagedHashes {
+		if _, ok := liveHashes[rel]; !ok {
+			return classifyKindCold, nil // file added
+		}
+	}
+	for rel := range liveHashes {
+		if _, ok := stagedHashes[rel]; !ok {
+			return classifyKindCold, nil // file removed
+		}
+	}
+
+	// Same set of files. Walk the diff.
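+	// Worked example of the rule, with hypothetical hashes:
+	//
+	//   staged = {"plugin.yaml": "aaa", "skills/foo/SKILL.md": "bbb"}
+	//   live   = {"plugin.yaml": "aaa", "skills/foo/SKILL.md": "ccc"}
+	//     → only a SKILL.md body differs → skill-content-only
+	//
+	//   same trees, but staged["plugin.yaml"] = "ddd"
+	//     → non-SKILL.md content change → cold (caught by the loop below)
+	//
+	//   staged additionally contains "hooks.json"
+	//     → already returned cold in the set difference above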
+	for rel, stagedHash := range stagedHashes {
+		liveHash := liveHashes[rel]
+		if stagedHash == liveHash {
+			continue
+		}
+		// Content differs. Allow if and only if it's a SKILL.md.
+		if !isSkillMarkdown(rel) {
+			return classifyKindCold, nil
+		}
+	}
+	return classifyKindSkillContentOnly, nil
+}
+
+// isSkillMarkdown returns true for any path whose basename is SKILL.md
+// (case-sensitive, matches Claude Code's skill discovery rule).
+func isSkillMarkdown(rel string) bool {
+	return filepath.Base(rel) == "SKILL.md"
+}
+
+// hashLocalTree walks a host directory and returns rel-path → sha256-hex.
+// Symlinks are skipped (same posture as the tar walker).
+func hashLocalTree(root string) (map[string]string, error) {
+	out := map[string]string{}
+	err := filepath.WalkDir(root, func(p string, d fs.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+		if d.IsDir() {
+			return nil
+		}
+		info, err := d.Info()
+		if err != nil {
+			return err
+		}
+		if info.Mode()&os.ModeSymlink != 0 {
+			return nil
+		}
+		if !info.Mode().IsRegular() {
+			return nil
+		}
+		rel, err := filepath.Rel(root, p)
+		if err != nil {
+			return err
+		}
+		body, err := os.ReadFile(p)
+		if err != nil {
+			return err
+		}
+		sum := sha256.Sum256(body)
+		out[filepath.ToSlash(rel)] = hex.EncodeToString(sum[:])
+		return nil
+	})
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+// hashContainerTree hashes every regular file under livePath with one
+// docker exec: `sh -c 'cd <livePath> && find . -type f -print0 |
+// xargs -0 -r sha256sum'`.
+//
+// Each output line has the shape `<hash>  ./<rel>` (two-space separator)
+// and is parsed into rel-path → sha256-hex.
+func (h *PluginsHandler) hashContainerTree(
+	ctx context.Context, containerName, livePath string,
+) (map[string]string, error) {
+	out, err := h.execAsRoot(ctx, containerName, []string{
+		"sh", "-c",
+		// Find regular files, hash each; output is `<hash>  ./<rel>` per line.
+		// `cd` then `find .` keeps paths relative to livePath.
+		fmt.Sprintf("cd %s && find . -type f -print0 | xargs -0 -r sha256sum 2>/dev/null", shQuote(livePath)),
+	})
+	if err != nil {
+		return nil, fmt.Errorf("hash container tree: %w", err)
+	}
+	hashes := map[string]string{}
+	for _, line := range strings.Split(out, "\n") {
+		line = strings.TrimSpace(line)
+		if line == "" {
+			continue
+		}
+		// sha256sum separates hash and path with TWO spaces; splitting on
+		// a single space would leave a leading space on the path, and the
+		// "./" trim below would never fire.
+		parts := strings.SplitN(line, "  ", 2)
+		if len(parts) != 2 {
+			continue
+		}
+		hash := parts[0]
+		rel := strings.TrimPrefix(parts[1], "./")
+		hashes[rel] = hash
+	}
+	return hashes, nil
+}
+
+// shQuote single-quotes a string for safe insertion into a shell command.
+// Returns the input unchanged if it's already shell-safe (alphanumeric +
+// /._-). Otherwise wraps in single quotes and escapes inner '.
+func shQuote(s string) string {
+	safe := true
+	for _, c := range s {
+		switch {
+		case c >= 'a' && c <= 'z':
+		case c >= 'A' && c <= 'Z':
+		case c >= '0' && c <= '9':
+		case c == '/' || c == '.' || c == '_' || c == '-':
+		default:
+			safe = false
+		}
+		if !safe {
+			break
+		}
+	}
+	if safe {
+		return s
+	}
+	return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'"
+}
diff --git a/workspace-server/internal/handlers/plugins_classifier_test.go b/workspace-server/internal/handlers/plugins_classifier_test.go
new file mode 100644
index 00000000..ff792fbb
--- /dev/null
+++ b/workspace-server/internal/handlers/plugins_classifier_test.go
@@ -0,0 +1,121 @@
+package handlers
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// TestIsSkillMarkdown: pin which paths the classifier considers
+// hot-reloadable.
SKILL.md by basename only — case-sensitive. +func TestIsSkillMarkdown(t *testing.T) { + yes := []string{ + "SKILL.md", + "skills/foo/SKILL.md", + "deeply/nested/SKILL.md", + } + no := []string{ + "plugin.yaml", + "hooks.json", + "settings.json", + "README.md", + "skill.md", // case-sensitive + "SKILLS.md", // not a skill file + "skills/foo/extra.md", + } + for _, s := range yes { + if !isSkillMarkdown(s) { + t.Errorf("isSkillMarkdown(%q) = false; want true", s) + } + } + for _, s := range no { + if isSkillMarkdown(s) { + t.Errorf("isSkillMarkdown(%q) = true; want false", s) + } + } +} + +// TestHashLocalTree_StableHash: hashing the same content twice must +// produce identical maps. Pinned because if hashLocalTree ever picks up +// mtime/inode (e.g. via a refactor to use os.Lstat metadata), every +// install would classify as cold and we'd lose the hot-reload. +func TestHashLocalTree_StableHash(t *testing.T) { + dir := t.TempDir() + if err := os.MkdirAll(filepath.Join(dir, "skills/foo"), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(dir, "plugin.yaml"), []byte("name: foo\n"), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(dir, "skills/foo/SKILL.md"), []byte("# Foo\n"), 0o644); err != nil { + t.Fatal(err) + } + + h1, err := hashLocalTree(dir) + if err != nil { + t.Fatal(err) + } + h2, err := hashLocalTree(dir) + if err != nil { + t.Fatal(err) + } + if len(h1) != len(h2) { + t.Fatalf("hash count differs: %d vs %d", len(h1), len(h2)) + } + for k, v := range h1 { + if h2[k] != v { + t.Errorf("hash[%q] differs: %q vs %q", k, v, h2[k]) + } + } +} + +// TestHashLocalTree_SymlinkSkipped: symlinks should not appear in the +// hash map — same posture as the tar walker. Otherwise a hostile plugin +// could include a symlink whose hash changes when its target changes, +// silently flipping classification. +func TestHashLocalTree_SymlinkSkipped(t *testing.T) { + dir := t.TempDir() + if err := os.WriteFile(filepath.Join(dir, "real.txt"), []byte("ok"), 0o644); err != nil { + t.Fatal(err) + } + target := filepath.Join(t.TempDir(), "target") + if err := os.WriteFile(target, []byte("outside"), 0o644); err != nil { + t.Fatal(err) + } + if err := os.Symlink(target, filepath.Join(dir, "link")); err != nil { + t.Fatal(err) + } + + h, err := hashLocalTree(dir) + if err != nil { + t.Fatal(err) + } + if _, exists := h["link"]; exists { + t.Errorf("symlink leaked into hash map: %v", h) + } + if _, exists := h["real.txt"]; !exists { + t.Errorf("real.txt missing from hash map: %v", h) + } +} + +// TestShQuote: the classifier injects livePath into a shell command via +// docker exec. Path must be quoted to handle pluginName entries with +// hyphens (which are safe but exercised here) and any future special- +// character edge case. Pin the safe-vs-quoted boundary. 
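The quoted branch leans on the standard single-quote splice: a single-quoted shell word cannot contain a literal quote, so shQuote closes the word, emits an escaped quote, and reopens it, and the shell glues the three pieces back into one word. A standalone sketch (hypothetical path; the helper body is inlined here for illustration):

package main

import (
	"fmt"
	"strings"
)

// quote mirrors shQuote's non-safe branch, inlined for illustration.
func quote(s string) string {
	return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'"
}

func main() {
	p := "/configs/plugins/it's odd" // hypothetical: quote + space force quoting
	fmt.Println(quote(p))
	// → '/configs/plugins/it'\''s odd'
	// The shell parses that as '/configs/plugins/it' + \' + 's odd': one word.
	fmt.Printf("cd %s && find . -type f -print0 | xargs -0 -r sha256sum\n", quote(p))
}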
+func TestShQuote(t *testing.T) { + cases := []struct { + in, want string + }{ + {"foo", "foo"}, + {"/configs/plugins/foo-bar", "/configs/plugins/foo-bar"}, + {"with space", "'with space'"}, + {"with'quote", "'with'\\''quote'"}, + {"$envvar", "'$envvar'"}, + {"path/with/dots.txt", "path/with/dots.txt"}, + } + for _, tc := range cases { + if got := shQuote(tc.in); got != tc.want { + t.Errorf("shQuote(%q) = %q; want %q", tc.in, got, tc.want) + } + } +} diff --git a/workspace-server/internal/handlers/plugins_install.go b/workspace-server/internal/handlers/plugins_install.go index b0031bfb..32232727 100644 --- a/workspace-server/internal/handlers/plugins_install.go +++ b/workspace-server/internal/handlers/plugins_install.go @@ -91,6 +91,14 @@ func (h *PluginsHandler) Install(c *gin.Context) { return } + // Record the install in workspace_plugins (core#113 — version-subscription + // foundation). Best-effort: DB write failure is logged but doesn't fail + // the install — the plugin IS in the container; surfacing a 500 here + // would mislead the caller about the install state. + if err := recordWorkspacePluginInstall(ctx, workspaceID, result.PluginName, result.Source.Raw(), req.Track); err != nil { + log.Printf("Plugin install: failed to record %s for %s in workspace_plugins: %v (install succeeded; tracking row missing)", result.PluginName, workspaceID, err) + } + log.Printf("Plugin install: %s via %s → workspace %s (restarting)", result.PluginName, result.Source.Scheme, workspaceID) c.JSON(http.StatusOK, gin.H{ "status": "installed", diff --git a/workspace-server/internal/handlers/plugins_install_pipeline.go b/workspace-server/internal/handlers/plugins_install_pipeline.go index 6c6fb217..31d1239e 100644 --- a/workspace-server/internal/handlers/plugins_install_pipeline.go +++ b/workspace-server/internal/handlers/plugins_install_pipeline.go @@ -114,6 +114,15 @@ type installRequest struct { // When present, resolveAndStage verifies the fetched content matches // before allowing the install to proceed (SAFE-T1102 supply-chain hardening). SHA256 string `json:"sha256,omitempty"` + // Track is the version-subscription mode for this install (core#113): + // "none" — no auto-update tracking (default) + // "tag:vX.Y.Z" — track a specific version tag + // "tag:latest" — track latest tag, drift on every new tag + // "sha:" — pinned, no drift ever + // The drift detector (separate component, follow-up) reads + // workspace_plugins rows where tracked_ref != 'none' and queues + // updates when upstream resolves to a different SHA. + Track string `json:"track,omitempty"` } // stageResult bundles the outputs of resolveAndStage for the caller. @@ -276,7 +285,22 @@ func (h *PluginsHandler) resolveAndStage(ctx context.Context, req installRequest // using NewPluginsHandler without a DB; production wires it in router.go. func (h *PluginsHandler) deliverToContainer(ctx context.Context, workspaceID string, r *stageResult) error { if containerName := h.findRunningContainer(ctx, workspaceID); containerName != "" { - if err := h.copyPluginToContainer(ctx, containerName, r.StagedDir, r.PluginName); err != nil { + // Hot-reload classifier (molecule-core#112) — decide BEFORE the + // install whether this update can skip restartFunc. SKILL.md + // content changes are filesystem-visible to Claude Code on the + // next Skill invocation; hooks / settings.json / plugin.yaml / + // added-or-removed files need a container restart. 
+ // Classifier reads live tree from container; on any read error + // it returns kindCold so we never hot-reload speculatively. + kind, _ := h.classifyInstallChanges(ctx, containerName, r.StagedDir, r.PluginName) + + // Atomic stage→snapshot→swap→marker (molecule-core#114). + // Replaces the prior single docker.CopyToContainer write that + // left a partially-extracted tree on mid-install failure with + // no rollback path. atomicCopyToContainer writes a .complete + // marker as the last step; workspace-side plugin loaders should + // refuse to load a plugin dir without it. + if err := h.atomicCopyToContainer(ctx, containerName, r.StagedDir, r.PluginName); err != nil { log.Printf("Plugin install: failed to copy %s to %s: %v", r.PluginName, workspaceID, err) return newHTTPErr(http.StatusInternalServerError, gin.H{"error": "failed to copy plugin to container"}) } @@ -284,7 +308,11 @@ func (h *PluginsHandler) deliverToContainer(ctx context.Context, workspaceID str "chown", "-R", "1000:1000", "/configs/plugins/" + r.PluginName, }) if h.restartFunc != nil { - go h.restartFunc(workspaceID) + if kind == classifyKindSkillContentOnly { + log.Printf("Plugin install: %s → workspace %s — SKILL-content-only update, SKIPPING restart", r.PluginName, workspaceID) + } else { + go h.restartFunc(workspaceID) + } } return nil } diff --git a/workspace-server/internal/handlers/plugins_tracking.go b/workspace-server/internal/handlers/plugins_tracking.go new file mode 100644 index 00000000..56831a06 --- /dev/null +++ b/workspace-server/internal/handlers/plugins_tracking.go @@ -0,0 +1,78 @@ +package handlers + +// plugins_tracking.go — workspace_plugins DB tracking for the +// version-subscription model (core#113). +// +// Schema lives in migration 20260508160000_workspace_plugins_tracking.up.sql. +// This file is the Go-side write surface used at install time to record +// each plugin's install record. Drift detection / queue / apply are +// follow-up scope (filed as a separate issue once this lands). + +import ( + "context" + "errors" + "fmt" + "strings" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" +) + +// trackedRefValues is the closed set of bare-string values the +// workspace_plugins.tracked_ref column accepts. Prefixed values +// ("tag:..." / "sha:...") are validated structurally below. +var trackedRefValues = map[string]bool{ + "none": true, +} + +// validateTrackedRef returns the canonical form of a track string, or +// an error if the input is malformed. Empty input → "none" (default). +// +// Accepted shapes: +// +// "" — defaults to "none" +// "none" — no tracking +// "tag:vX.Y.Z" — track a specific tag +// "tag:latest" — track latest tag, drift on every new tag +// "sha:" — pinned to commit SHA +func validateTrackedRef(s string) (string, error) { + s = strings.TrimSpace(s) + if s == "" { + return "none", nil + } + if trackedRefValues[s] { + return s, nil + } + if strings.HasPrefix(s, "tag:") && len(s) > 4 { + return s, nil + } + if strings.HasPrefix(s, "sha:") && len(s) > 4 { + return s, nil + } + return "", fmt.Errorf("invalid track value %q: expected 'none' | 'tag:vX.Y.Z' | 'tag:latest' | 'sha:'", s) +} + +// recordWorkspacePluginInstall upserts the workspace_plugins row for a +// plugin install. ON CONFLICT (workspace_id, plugin_name) DO UPDATE so +// reinstalling the same plugin name (with a possibly-different source or +// track value) updates the existing row rather than failing. 
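For orientation before the write path below: the read side this table feeds is follow-up scope, but a hedged sketch of the scan a drift detector might run looks like this (function and package names are hypothetical; only the table shape comes from the migration in this PR):

package driftsketch

import (
	"context"
	"database/sql"
)

// driftCandidates reads only rows opted into tracking — the query shape
// the partial index workspace_plugins_tracked_not_none serves. Upstream
// resolution (tag → SHA) and queueing would happen on the result.
func driftCandidates(ctx context.Context, dbc *sql.DB) (map[string]string, error) {
	rows, err := dbc.QueryContext(ctx, `
		SELECT workspace_id, plugin_name, tracked_ref
		FROM workspace_plugins
		WHERE tracked_ref != 'none'`)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	out := map[string]string{} // "workspace_id/plugin_name" → tracked_ref
	for rows.Next() {
		var ws, name, ref string
		if err := rows.Scan(&ws, &name, &ref); err != nil {
			return nil, err
		}
		out[ws+"/"+name] = ref
	}
	return out, rows.Err()
}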
+func recordWorkspacePluginInstall( + ctx context.Context, workspaceID, pluginName, sourceRaw, track string, +) error { + if workspaceID == "" || pluginName == "" || sourceRaw == "" { + return errors.New("recordWorkspacePluginInstall: missing required field") + } + canonicalTrack, err := validateTrackedRef(track) + if err != nil { + return err + } + _, err = db.DB.ExecContext(ctx, ` + INSERT INTO workspace_plugins (workspace_id, plugin_name, source_raw, tracked_ref) + VALUES ($1, $2, $3, $4) + ON CONFLICT (workspace_id, plugin_name) + DO UPDATE SET + source_raw = EXCLUDED.source_raw, + tracked_ref = EXCLUDED.tracked_ref, + updated_at = NOW() + `, workspaceID, pluginName, sourceRaw, canonicalTrack) + return err +} diff --git a/workspace-server/internal/handlers/plugins_tracking_test.go b/workspace-server/internal/handlers/plugins_tracking_test.go new file mode 100644 index 00000000..8c33023d --- /dev/null +++ b/workspace-server/internal/handlers/plugins_tracking_test.go @@ -0,0 +1,54 @@ +package handlers + +import "testing" + +// TestValidateTrackedRef: pin the exact set of accepted track values +// the install endpoint stores. Drift detector reads this column; any +// value that slips through here without structural validation would +// silently fail at drift-check time. +func TestValidateTrackedRef(t *testing.T) { + cases := []struct { + in string + want string + err bool + }{ + // Defaults + {"", "none", false}, + {" ", "none", false}, + {"none", "none", false}, + + // Tag shape + {"tag:v1.0.0", "tag:v1.0.0", false}, + {"tag:v0.4.0-gitea.1", "tag:v0.4.0-gitea.1", false}, + {"tag:latest", "tag:latest", false}, + + // SHA shape + {"sha:abc123", "sha:abc123", false}, + {"sha:0123456789abcdef0123456789abcdef01234567", "sha:0123456789abcdef0123456789abcdef01234567", false}, + + // Reject malformed + {"tag:", "", true}, // empty after prefix + {"sha:", "", true}, // empty after prefix + {"latest", "", true}, // bare 'latest' is ambiguous (tag? branch?) + {"main", "", true}, // bare branch name not allowed + {"v1.0.0", "", true}, // missing tag: prefix + {"random", "", true}, // not in allowlist + {"tag", "", true}, // prefix without separator + } + for _, tc := range cases { + got, err := validateTrackedRef(tc.in) + if tc.err { + if err == nil { + t.Errorf("validateTrackedRef(%q) = (%q, nil); want error", tc.in, got) + } + continue + } + if err != nil { + t.Errorf("validateTrackedRef(%q) error: %v", tc.in, err) + continue + } + if got != tc.want { + t.Errorf("validateTrackedRef(%q) = %q; want %q", tc.in, got, tc.want) + } + } +} diff --git a/workspace-server/internal/handlers/workspace_crud.go b/workspace-server/internal/handlers/workspace_crud.go index 200356b1..cc487a4a 100644 --- a/workspace-server/internal/handlers/workspace_crud.go +++ b/workspace-server/internal/handlers/workspace_crud.go @@ -323,161 +323,19 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) { return } - // Cascade delete: collect ALL descendants (not just direct children) via - // recursive CTE, then stop each container and remove each volume. - // Previous bug: only direct children's containers were stopped, leaving - // grandchildren as orphan running containers after a cascade delete. 
- descendantIDs := []string{} - if len(children) > 0 { - descRows, err := db.DB.QueryContext(ctx, ` - WITH RECURSIVE descendants AS ( - SELECT id FROM workspaces WHERE parent_id = $1 AND status != 'removed' - UNION ALL - SELECT w.id FROM workspaces w JOIN descendants d ON w.parent_id = d.id WHERE w.status != 'removed' - ) - SELECT id FROM descendants - `, id) - if err != nil { - log.Printf("Delete: descendant query error for %s: %v", id, err) - } else { - for descRows.Next() { - var descID string - if descRows.Scan(&descID) == nil { - descendantIDs = append(descendantIDs, descID) - } - } - descRows.Close() - } + // Delegate the cascade to CascadeDelete so the HTTP path and the + // OrgImport reconcile path share one teardown sequence (#73 race + // guard, container stop, volume removal, token revocation, schedule + // disable, broadcast). The HTTP-specific bits — direct-children 409 + // gate above, ?purge=true hard-delete below, response shaping — + // stay in this handler. + descendantIDs, stopErrs, err := h.CascadeDelete(ctx, id) + if err != nil { + log.Printf("Delete: CascadeDelete(%s) failed: %v", id, err) + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return } - - // #73 fix: mark rows 'removed' in the DB FIRST, BEFORE stopping containers - // or removing volumes. Previously the sequence was stop → update-status, - // which left a gap where: - // - the container's last pre-teardown heartbeat could resurrect the row - // via the register-handler UPSERT (now also guarded in #73) - // - the liveness monitor could observe 'online' status + expired Redis - // TTL and trigger RestartByID, recreating a container we're trying - // to destroy - // Marking 'removed' first makes both of those paths no-op via their - // existing `status NOT IN ('removed', ...)` guards. allIDs := append([]string{id}, descendantIDs...) - if _, err := db.DB.ExecContext(ctx, - `UPDATE workspaces SET status = $1, updated_at = now() WHERE id = ANY($2::uuid[])`, - models.StatusRemoved, pq.Array(allIDs)); err != nil { - log.Printf("Delete status update error for %s: %v", id, err) - } - if _, err := db.DB.ExecContext(ctx, - `DELETE FROM canvas_layouts WHERE workspace_id = ANY($1::uuid[])`, - pq.Array(allIDs)); err != nil { - log.Printf("Delete canvas_layouts error for %s: %v", id, err) - } - // Revoke all auth tokens for the deleted workspaces. Once the workspace is - // gone its tokens are meaningless; leaving them alive would keep - // HasAnyLiveTokenGlobal = true even after the platform is otherwise empty, - // which prevents AdminAuth from returning to fail-open and breaks the E2E - // test's count-zero assertion (and local re-run cleanup). - if _, err := db.DB.ExecContext(ctx, - `UPDATE workspace_auth_tokens SET revoked_at = now() - WHERE workspace_id = ANY($1::uuid[]) AND revoked_at IS NULL`, - pq.Array(allIDs)); err != nil { - log.Printf("Delete token revocation error for %s: %v", id, err) - } - // #1027: cascade-disable all schedules for the deleted workspaces so - // the scheduler never fires a cron into a removed container. - if _, err := db.DB.ExecContext(ctx, - `UPDATE workspace_schedules SET enabled = false, updated_at = now() - WHERE workspace_id = ANY($1::uuid[]) AND enabled = true`, - pq.Array(allIDs)); err != nil { - log.Printf("Delete schedule disable error for %s: %v", id, err) - } - - // Now stop containers + remove volumes for all descendants (any depth). 
- // Any concurrent heartbeat / registration / liveness-triggered restart - // will see status='removed' and bail out early. - // - // Combines two concerns: - // - // 1. Detach cleanup from the request ctx via WithoutCancel + a 30s - // timeout, so when the canvas's `api.del` resolves on our 200 - // (and gin cancels c.Request.Context()), in-flight Docker - // stop/remove calls don't get cancelled mid-operation. The - // previous shape leaked containers every time the canvas hung - // up promptly: Stop returned "context canceled", the container - // stayed up, and the next RemoveVolume failed with - // "volume in use". 30s is generous for Docker daemon round- - // trips (typical: <2s) and bounds a stuck daemon. - // - // 2. #1843: aggregate Stop() failures into stopErrs so the - // post-deletion block surfaces them as 500. On the CP/EC2 - // backend, Stop() calls control plane's DELETE endpoint to - // terminate the EC2; if that errors (transient 5xx, network), - // the EC2 stays running with no DB row to track it (the - // "orphan EC2 on a 0-customer account" scenario). Loud-fail - // instead of silent-leak — clients retry, Stop's instance_id - // lookup is idempotent against status='removed'. RemoveVolume - // errors stay log-and-continue (local cleanup, not infra-leak). - cleanupCtx, cleanupCancel := context.WithTimeout( - context.WithoutCancel(ctx), 30*time.Second) - defer cleanupCancel() - - var stopErrs []error - stopAndRemove := func(wsID string) { - // Stop the workload first via the backend dispatcher (CP for - // SaaS, Docker for self-hosted). Pre-2026-05-05 this gate was - // `if h.provisioner == nil { return }` — early-returning on - // every SaaS tenant left the EC2 running with no DB row to - // track it (issue #2814; the comment below claimed "loud-fail - // instead of silent-leak" but the early-return made it the - // silent path on SaaS). - // - // Check Stop's error before any volume cleanup — the previous - // code discarded it and immediately tried RemoveVolume, which - // always fails with "volume in use" when Stop didn't actually - // kill the container. The orphan sweeper - // (registry/orphan_sweeper.go) catches what we skip here on - // the next reconcile pass. - if err := h.StopWorkspaceAuto(cleanupCtx, wsID); err != nil { - log.Printf("Delete %s stop failed: %v — leaving cleanup for orphan sweeper", wsID, err) - stopErrs = append(stopErrs, fmt.Errorf("stop %s: %w", wsID, err)) - return - } - // Volume cleanup is Docker-only — CP-managed workspaces have - // no host-bind volumes to remove. Skip silently when no Docker - // provisioner is wired (the SaaS path already terminated the - // EC2 above; nothing left to do). - if h.provisioner != nil { - if err := h.provisioner.RemoveVolume(cleanupCtx, wsID); err != nil { - log.Printf("Delete %s volume removal warning: %v", wsID, err) - } - } - } - - for _, descID := range descendantIDs { - stopAndRemove(descID) - db.ClearWorkspaceKeys(cleanupCtx, descID) - // #2269: drop the per-workspace restartState entry so it - // doesn't accumulate across the platform's lifetime. The - // LoadOrStore that creates the entry (workspace_restart.go) - // has no companion remove path; without this Delete, every - // short-lived workspace leaks ~16 bytes forever. - restartStates.Delete(descID) - // Detach broadcaster ctx for the same reason as the cleanup - // above — RecordAndBroadcast does an INSERT INTO - // structure_events + Redis Publish. 
If the canvas hangs up, - // a request-ctx-bound INSERT can be cancelled mid-write, - // leaving other WS clients ignorant of the cascade. The DB - // row is already 'removed' so it's recoverable, but the - // inconsistency is avoidable. - h.broadcaster.RecordAndBroadcast(cleanupCtx, string(events.EventWorkspaceRemoved), descID, map[string]interface{}{}) - } - - stopAndRemove(id) - db.ClearWorkspaceKeys(cleanupCtx, id) - restartStates.Delete(id) // #2269: same as descendants above - - h.broadcaster.RecordAndBroadcast(cleanupCtx, string(events.EventWorkspaceRemoved), id, map[string]interface{}{ - "cascade_deleted": len(descendantIDs), - }) // If any Stop call failed, surface 500 so the client retries. The DB // row is already 'removed' (idempotent), and Stop's instance_id @@ -543,6 +401,104 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) { c.JSON(http.StatusOK, gin.H{"status": "removed", "cascade_deleted": len(descendantIDs)}) } +// CascadeDelete performs the cascade-removal sequence used by the HTTP +// DELETE handler and by OrgImport's reconcile mode: walk descendants, mark +// self+descendants 'removed' first (#73 race guard), stop containers / EC2s, +// remove volumes, revoke tokens, disable schedules, broadcast events. +// +// Idempotent against already-removed rows (the descendant CTE and all UPDATE +// guards skip status='removed'). Returns the descendant id list so the HTTP +// caller can drive the optional `?purge=true` hard-delete path against the +// same set the cascade just touched, plus any per-workspace stop errors so +// callers can surface a retryable failure instead of a silent-leak. +// +// Caller is responsible for the children-confirmation gate (the HTTP handler +// returns 409 when children exist + ?confirm=true is missing); this helper +// always cascades. +func (h *WorkspaceHandler) CascadeDelete(ctx context.Context, id string) ([]string, []error, error) { + if err := validateWorkspaceID(id); err != nil { + return nil, nil, err + } + + descendantIDs := []string{} + descRows, err := db.DB.QueryContext(ctx, ` + WITH RECURSIVE descendants AS ( + SELECT id FROM workspaces WHERE parent_id = $1 AND status != 'removed' + UNION ALL + SELECT w.id FROM workspaces w JOIN descendants d ON w.parent_id = d.id WHERE w.status != 'removed' + ) + SELECT id FROM descendants + `, id) + if err != nil { + return nil, nil, fmt.Errorf("descendant query: %w", err) + } + for descRows.Next() { + var descID string + if descRows.Scan(&descID) == nil { + descendantIDs = append(descendantIDs, descID) + } + } + descRows.Close() + + allIDs := append([]string{id}, descendantIDs...) 
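CascadeDelete inherits the detach pattern the inline code used. In isolation, the context.WithoutCancel (Go 1.21+) plus timeout combination behaves like this minimal sketch:

package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	req, hangUp := context.WithCancel(context.Background())
	hangUp() // simulate the client disconnecting mid-request

	fmt.Println(req.Err()) // context.Canceled: anything derived normally is dead

	// WithoutCancel severs the cancellation link (values survive); the
	// timeout re-bounds the cleanup so a stuck daemon can't pin it forever.
	cleanup, stop := context.WithTimeout(context.WithoutCancel(req), 30*time.Second)
	defer stop()
	fmt.Println(cleanup.Err()) // <nil>: cleanup outlives the request ctx
}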
+ + if _, err := db.DB.ExecContext(ctx, + `UPDATE workspaces SET status = $1, updated_at = now() WHERE id = ANY($2::uuid[])`, + models.StatusRemoved, pq.Array(allIDs)); err != nil { + log.Printf("CascadeDelete status update for %s: %v", id, err) + } + if _, err := db.DB.ExecContext(ctx, + `DELETE FROM canvas_layouts WHERE workspace_id = ANY($1::uuid[])`, + pq.Array(allIDs)); err != nil { + log.Printf("CascadeDelete canvas_layouts for %s: %v", id, err) + } + if _, err := db.DB.ExecContext(ctx, + `UPDATE workspace_auth_tokens SET revoked_at = now() + WHERE workspace_id = ANY($1::uuid[]) AND revoked_at IS NULL`, + pq.Array(allIDs)); err != nil { + log.Printf("CascadeDelete token revocation for %s: %v", id, err) + } + if _, err := db.DB.ExecContext(ctx, + `UPDATE workspace_schedules SET enabled = false, updated_at = now() + WHERE workspace_id = ANY($1::uuid[]) AND enabled = true`, + pq.Array(allIDs)); err != nil { + log.Printf("CascadeDelete schedule disable for %s: %v", id, err) + } + + cleanupCtx, cleanupCancel := context.WithTimeout( + context.WithoutCancel(ctx), 30*time.Second) + defer cleanupCancel() + + var stopErrs []error + stopAndRemove := func(wsID string) { + if err := h.StopWorkspaceAuto(cleanupCtx, wsID); err != nil { + log.Printf("CascadeDelete %s stop failed: %v — leaving cleanup for orphan sweeper", wsID, err) + stopErrs = append(stopErrs, fmt.Errorf("stop %s: %w", wsID, err)) + return + } + if h.provisioner != nil { + if err := h.provisioner.RemoveVolume(cleanupCtx, wsID); err != nil { + log.Printf("CascadeDelete %s volume removal warning: %v", wsID, err) + } + } + } + + for _, descID := range descendantIDs { + stopAndRemove(descID) + db.ClearWorkspaceKeys(cleanupCtx, descID) + restartStates.Delete(descID) + h.broadcaster.RecordAndBroadcast(cleanupCtx, string(events.EventWorkspaceRemoved), descID, map[string]interface{}{}) + } + stopAndRemove(id) + db.ClearWorkspaceKeys(cleanupCtx, id) + restartStates.Delete(id) + h.broadcaster.RecordAndBroadcast(cleanupCtx, string(events.EventWorkspaceRemoved), id, map[string]interface{}{ + "cascade_deleted": len(descendantIDs), + }) + + return descendantIDs, stopErrs, nil +} + // validateWorkspaceID returns an error when id is not a valid UUID. // #687: prevents 500s from Postgres when a garbage string (e.g. ../../etc/passwd) // is passed as the :id path parameter. diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go index 981ee5da..f3657d0b 100644 --- a/workspace-server/internal/handlers/workspace_provision.go +++ b/workspace-server/internal/handlers/workspace_provision.go @@ -715,14 +715,30 @@ func deriveProviderFromModelSlug(model string) string { // payload.Model at boot), this is a no-op — no harm in the switch // being empty for those cases. func applyRuntimeModelEnv(envVars map[string]string, runtime, model string) { - // Fall back to the MODEL_PROVIDER workspace secret when the caller - // didn't pass one explicitly. This is the path that "Save+Restart" - // hits — Restart builds its payload from the workspaces row (no model - // column there) so payload.Model is always empty, but the user's - // canvas selection was stored as MODEL_PROVIDER via PUT /model and - // is already loaded into envVars here. Without this fallback hermes - // silently boots with the template default and errors "No LLM - // provider configured" even though the user picked a valid model. + // Resolution order (priority high → low): + // 1. 
payload.Model (caller passed the canvas-picked model id verbatim)
+	//  2. envVars["MODEL"] (workspace_secret persisted by /org/import via
+	//     the persona env file — MODEL=MiniMax-M2.7-highspeed etc.)
+	//  3. envVars["MODEL_PROVIDER"] (legacy: this secret was historically a
+	//     *model id* set by canvas Save+Restart's PUT /model; on the
+	//     post-2026-05-08 persona-env convention it's a *provider slug*
+	//     (e.g. "minimax") which is NOT a valid model id, so this fallback
+	//     only fires when MODEL is absent.)
+	//
+	// Pre-fix bug: this function unconditionally OVERWROTE envVars["MODEL"]
+	// with the MODEL_PROVIDER slug (when payload.Model was empty), wiping
+	// the operator's explicit per-persona MODEL secret on every restart.
+	// Symptom: a workspace whose persona env said
+	// MODEL=MiniMax-M2.7-highspeed booted fine on first /org/import (the
+	// envVars map was populated directly from the env file), then on the
+	// next Restart the workspace_secrets-derived MODEL got clobbered by
+	// MODEL_PROVIDER="minimax" — the literal slug, not a valid model id —
+	// and the workspace template's adapter routed to providers[0]
+	// (anthropic-oauth) and wedged at SDK initialize. Caught 2026-05-08
+	// during Phase 4 verification of template-claude-code PR #9.
+	if model == "" {
+		model = envVars["MODEL"]
+	}
 	if model == "" {
 		model = envVars["MODEL_PROVIDER"]
 	}
diff --git a/workspace-server/internal/handlers/workspace_provision_shared_test.go b/workspace-server/internal/handlers/workspace_provision_shared_test.go
index 51391c93..7a85d118 100644
--- a/workspace-server/internal/handlers/workspace_provision_shared_test.go
+++ b/workspace-server/internal/handlers/workspace_provision_shared_test.go
@@ -724,3 +724,68 @@ func TestApplyRuntimeModelEnv_SetsUniversalMODELForAllRuntimes(t *testing.T) {
 		})
 	}
 }
+
+// TestApplyRuntimeModelEnv_PersonaEnvMODELSecretPreserved locks in the
+// 2026-05-08 fix that prevents the MODEL_PROVIDER-as-slug fallback from
+// silently overwriting a per-persona MODEL workspace_secret on restart.
+//
+// Pre-fix bug recurrence guard: when the persona env file (loaded into
+// workspace_secrets at /org/import time) declares both MODEL= and
+// MODEL_PROVIDER=, the restart path used to overwrite envVars["MODEL"]
+// with the MODEL_PROVIDER slug because applyRuntimeModelEnv's
+// payload.Model fallback consulted MODEL_PROVIDER first. Symptom: dev-tree
+// workspaces booted fine on first /org/import, then on next restart the
+// model id became literal "minimax" and the workspace template's adapter
+// failed to match any registry prefix, fell through to anthropic-oauth,
+// and wedged at SDK initialize. Caught during Phase 4 verification of
+// template-claude-code PR #9.
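Reduced to a pure function, the precedence reads as below. resolveModel is a hypothetical name for illustration; the real logic stays inline in applyRuntimeModelEnv:

package sketch

// resolveModel mirrors the three-step fallback described above.
func resolveModel(payloadModel string, envVars map[string]string) string {
	if payloadModel != "" {
		return payloadModel // 1. caller passed the canvas-picked model id
	}
	if m := envVars["MODEL"]; m != "" {
		return m // 2. persona-env secret, e.g. "MiniMax-M2.7-highspeed"
	}
	return envVars["MODEL_PROVIDER"] // 3. legacy fallback, possibly a bare slug
}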
+func TestApplyRuntimeModelEnv_PersonaEnvMODELSecretPreserved(t *testing.T) {
+	cases := []struct {
+		name      string
+		envMODEL  string
+		envMP     string
+		wantMODEL string
+	}{
+		{
+			name:      "MODEL secret wins over MODEL_PROVIDER slug (persona-env shape on restart)",
+			envMODEL:  "MiniMax-M2.7-highspeed",
+			envMP:     "minimax",
+			wantMODEL: "MiniMax-M2.7-highspeed",
+		},
+		{
+			name:      "MODEL secret wins when MODEL_PROVIDER matches the runtime slug",
+			envMODEL:  "opus",
+			envMP:     "claude-code",
+			wantMODEL: "opus",
+		},
+		{
+			name:      "MODEL absent → fall back to MODEL_PROVIDER (legacy canvas Save+Restart shape)",
+			envMODEL:  "",
+			envMP:     "MiniMax-M2.7",
+			wantMODEL: "MiniMax-M2.7",
+		},
+		{
+			name:      "Both absent → no MODEL set",
+			envMODEL:  "",
+			envMP:     "",
+			wantMODEL: "",
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			envVars := map[string]string{}
+			if tc.envMODEL != "" {
+				envVars["MODEL"] = tc.envMODEL
+			}
+			if tc.envMP != "" {
+				envVars["MODEL_PROVIDER"] = tc.envMP
+			}
+			// payload.Model is empty (the restart case)
+			applyRuntimeModelEnv(envVars, "claude-code", "")
+			if got := envVars["MODEL"]; got != tc.wantMODEL {
+				t.Errorf("MODEL = %q, want %q (envMODEL=%q envMP=%q)",
+					got, tc.wantMODEL, tc.envMODEL, tc.envMP)
+			}
+		})
+	}
+}
diff --git a/workspace-server/internal/handlers/workspace_test.go b/workspace-server/internal/handlers/workspace_test.go
index 4e17ca6a..180d6735 100644
--- a/workspace-server/internal/handlers/workspace_test.go
+++ b/workspace-server/internal/handlers/workspace_test.go
@@ -813,6 +813,12 @@ func TestWorkspaceDelete_DisablesSchedules(t *testing.T) {
 		WithArgs(wsID).
 		WillReturnRows(sqlmock.NewRows([]string{"id", "name"}))
 
+	// CascadeDelete walks descendants unconditionally — 0-children case
+	// returns 0 rows here.
+	mock.ExpectQuery("WITH RECURSIVE descendants").
+		WithArgs(wsID).
+		WillReturnRows(sqlmock.NewRows([]string{"id"}))
+
 	// Mark workspace as removed
 	mock.ExpectExec("UPDATE workspaces SET status =").
 		WillReturnResult(sqlmock.NewResult(0, 1))
@@ -935,6 +941,12 @@ func TestWorkspaceDelete_ScheduleDisableOnlyTargetsDeletedWorkspace(t *testing.T
 		WithArgs(wsA).
 		WillReturnRows(sqlmock.NewRows([]string{"id", "name"}))
 
+	// CascadeDelete walks descendants unconditionally — 0-children case
+	// returns 0 rows here.
+	mock.ExpectQuery("WITH RECURSIVE descendants").
+		WithArgs(wsA).
+		WillReturnRows(sqlmock.NewRows([]string{"id"}))
+
 	// Mark only workspace A as removed
 	mock.ExpectExec("UPDATE workspaces SET status =").
 		WillReturnResult(sqlmock.NewResult(0, 1))
diff --git a/workspace-server/internal/pendinguploads/sweeper_test.go b/workspace-server/internal/pendinguploads/sweeper_test.go
index 4133125d..8095e83d 100644
--- a/workspace-server/internal/pendinguploads/sweeper_test.go
+++ b/workspace-server/internal/pendinguploads/sweeper_test.go
@@ -207,20 +207,25 @@ func TestStartSweeper_TransientErrorDoesNotCrashLoop(t *testing.T) {
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
 
-	// 50ms ticker so the second cycle fires quickly enough for the test.
-	// We re-export SweepInterval as a const, but tests use the public
-	// StartSweeper that takes its own interval — wait, the public
- // - // (We patch the test below to just look at the immediate-sweep call - // + an error path, since the immediate call is enough to prove the - // "error doesn't crash" contract — the loop continues afterward - // regardless of timing.) + // Capture metric baseline so we can wait for the error counter to + // settle before returning — otherwise this test's leaked metric + // write races with the next test's metricDelta() baseline read and + // causes a non-deterministic +1 leak (manifests as + // TestStartSweeper_RecordsMetricsOnSuccess: "error counter delta=1, + // want 0"). cycleDone fires inside the fake's Sweep defer, BEFORE + // sweepOnce records the error metric — so cancel() right after + // waitForCycle is too early. + _, _, deltaError := metricDelta(t) + go pendinguploads.StartSweeper(ctx, store, time.Hour) // Wait for the first (errored) cycle. store.waitForCycle(t, 1, 2*time.Second) + // Wait for the goroutine to record the error metric. After this + // returns, sweepOnce has fully completed and a subsequent cancel() + // stops the loop on the next select pass with no in-flight metric + // writes outstanding. + waitForMetricDelta(t, deltaError, 1, 2*time.Second) // Cancel — the goroutine returns cleanly, proving the error path // didn't crash the loop. Without this fix the goroutine would have // either panicked (process abort visible at exit) or stuck (this diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index 8bccb3d8..c46c59db 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -765,6 +765,21 @@ func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configM // CopyTemplateToContainer copies files from a host directory into /configs in the container. func (p *Provisioner) CopyTemplateToContainer(ctx context.Context, containerID, templatePath string) error { + // Resolve symlinks at the root before walking. filepath.Walk does + // NOT follow a symlink that IS the root — it Lstats the path, sees + // a symlink (non-directory), and emits exactly one entry without + // descending. With cross-repo composition (parent template's + // dev-lead → ../sibling-repo/dev-lead/, see internal#77), the + // caller routinely passes a symlink as templatePath. Without this + // resolution the workspace's /configs/ mount lands empty. + // + // Security: templatePath has already passed resolveInsideRoot's + // path-string check at the call site — the trust boundary is the + // operator-side /org-templates/ filesystem layout, not this + // resolution step. 
+	if resolved, err := filepath.EvalSymlinks(templatePath); err == nil {
+		templatePath = resolved
+	}
 	var buf bytes.Buffer
 	tw := tar.NewWriter(&buf)
diff --git a/workspace-server/migrations/20260508160000_workspace_plugins_tracking.down.sql b/workspace-server/migrations/20260508160000_workspace_plugins_tracking.down.sql
new file mode 100644
index 00000000..9beee5cf
--- /dev/null
+++ b/workspace-server/migrations/20260508160000_workspace_plugins_tracking.down.sql
@@ -0,0 +1,3 @@
+DROP INDEX IF EXISTS workspace_plugins_tracked_not_none;
+DROP INDEX IF EXISTS workspace_plugins_workspace_name;
+DROP TABLE IF EXISTS workspace_plugins;
diff --git a/workspace-server/migrations/20260508160000_workspace_plugins_tracking.up.sql b/workspace-server/migrations/20260508160000_workspace_plugins_tracking.up.sql
new file mode 100644
index 00000000..a6164832
--- /dev/null
+++ b/workspace-server/migrations/20260508160000_workspace_plugins_tracking.up.sql
@@ -0,0 +1,39 @@
+-- workspace_plugins: per-workspace record of installed plugins, with the
+-- tracked-ref needed for the version-subscription model (core#113).
+--
+-- Today plugin install state is filesystem-only — `/configs/plugins/<name>/`
+-- inside the workspace container. There's no DB record of "what's installed
+-- where, from what source, pinned to what." That's fine until you want
+-- drift detection (comparing an upstream tag's resolved SHA against the
+-- installed one); this table is the foundation for it.
+--
+-- This migration is purely additive: existing install paths keep working;
+-- they'll write to this table on next install. Workspaces with plugins
+-- already installed before this migration won't have rows until they're
+-- re-installed (acceptable — the tracking is forward-looking).
+--
+-- tracked_ref values:
+--   'none'       — no auto-update tracking (default)
+--   'tag:vX.Y.Z' — track a specific version tag
+--   'tag:latest' — track the latest tag (drift on every new tag)
+--   'sha:<sha>'  — pinned to a specific commit SHA (no drift ever)
+--
+-- A subsequent migration adds the plugin_update_queue table once drift
+-- detection lands.
+
+CREATE TABLE IF NOT EXISTS workspace_plugins (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    workspace_id UUID NOT NULL REFERENCES workspaces(id) ON DELETE CASCADE,
+    plugin_name TEXT NOT NULL,
+    source_raw TEXT NOT NULL,
+    tracked_ref TEXT NOT NULL DEFAULT 'none',
+    installed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+CREATE UNIQUE INDEX IF NOT EXISTS workspace_plugins_workspace_name
+    ON workspace_plugins(workspace_id, plugin_name);
+
+-- Partial index for the drift detector: only scan rows opted into tracking.
+CREATE INDEX IF NOT EXISTS workspace_plugins_tracked_not_none + ON workspace_plugins(tracked_ref) WHERE tracked_ref != 'none'; diff --git a/workspace-server/migrations/20260508170000_workspaces_update_tier.down.sql b/workspace-server/migrations/20260508170000_workspaces_update_tier.down.sql new file mode 100644 index 00000000..2bb00727 --- /dev/null +++ b/workspace-server/migrations/20260508170000_workspaces_update_tier.down.sql @@ -0,0 +1,2 @@ +DROP INDEX IF EXISTS workspaces_update_tier_canary; +ALTER TABLE workspaces DROP COLUMN IF EXISTS update_tier; diff --git a/workspace-server/migrations/20260508170000_workspaces_update_tier.up.sql b/workspace-server/migrations/20260508170000_workspaces_update_tier.up.sql new file mode 100644 index 00000000..4ed9d522 --- /dev/null +++ b/workspace-server/migrations/20260508170000_workspaces_update_tier.up.sql @@ -0,0 +1,26 @@ +-- workspaces.update_tier — canary vs production filter for plugin updates +-- (core#115). Composes with the version-subscription DB foundation +-- (core#113, merged) and the upcoming drift+queue+apply endpoint +-- (core#123). +-- +-- Tiers: +-- 'production' (default) — fan-out target; only updated AFTER canary soak +-- 'canary' — early-adopter target; updates land here first +-- +-- Default 'production' so existing customers (Reno-Stars + any future +-- live tenant) are default-safe. Synthetic dogfooding workspaces opt +-- INTO 'canary' explicitly. +-- +-- The column is just metadata at this layer; the actual filter logic +-- ('apply this update only to canary tier first') lives in the future +-- POST /admin/plugin-updates/:id/apply endpoint (core#123). + +ALTER TABLE workspaces + ADD COLUMN IF NOT EXISTS update_tier TEXT NOT NULL DEFAULT 'production' + CHECK (update_tier IN ('canary', 'production')); + +-- Partial index for the apply endpoint's canary-tier scan: only +-- index canary rows since the apply path queries them most often +-- and the production set is the much larger default. +CREATE INDEX IF NOT EXISTS workspaces_update_tier_canary + ON workspaces(update_tier) WHERE update_tier = 'canary';
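Taken together, the two migrations give the future apply endpoint (core#123) its selection filter. A hedged sketch of the canary-first query; the function and package names are hypothetical, only the two tables and their partial indexes above are real:

package applysketch

import (
	"context"
	"database/sql"
)

// canaryTargets selects the canary-tier workspaces that have pluginName
// installed with tracking enabled: the set an apply pass would update
// first, before any production fan-out.
func canaryTargets(ctx context.Context, dbc *sql.DB, pluginName string) ([]string, error) {
	rows, err := dbc.QueryContext(ctx, `
		SELECT wp.workspace_id
		FROM workspace_plugins wp
		JOIN workspaces w ON w.id = wp.workspace_id
		WHERE wp.plugin_name = $1
		  AND wp.tracked_ref != 'none'
		  AND w.update_tier = 'canary'`, pluginName)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var ids []string
	for rows.Next() {
		var id string
		if err := rows.Scan(&id); err != nil {
			return nil, err
		}
		ids = append(ids, id)
	}
	return ids, rows.Err()
}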