diff --git a/.github/workflows/publish-workspace-server-image.yml b/.github/workflows/publish-workspace-server-image.yml index f95e3609..1e7b4630 100644 --- a/.github/workflows/publish-workspace-server-image.yml +++ b/.github/workflows/publish-workspace-server-image.yml @@ -142,6 +142,13 @@ jobs: ${{ env.IMAGE_NAME }}:staging-latest cache-from: type=gha cache-to: type=gha,mode=max + # GIT_SHA bakes into the Go binary via -ldflags so /buildinfo + # returns it at runtime — see Dockerfile + buildinfo/buildinfo.go. + # This is the same value as the OCI revision label below; passing + # it twice is intentional, the OCI label is for registry tooling + # while /buildinfo is for the redeploy verification step. + build-args: | + GIT_SHA=${{ github.sha }} labels: | org.opencontainers.image.source=https://github.com/${{ github.repository }} org.opencontainers.image.revision=${{ github.sha }} @@ -172,6 +179,7 @@ jobs: # NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080). build-args: | NEXT_PUBLIC_PLATFORM_URL= + GIT_SHA=${{ github.sha }} labels: | org.opencontainers.image.source=https://github.com/${{ github.repository }} org.opencontainers.image.revision=${{ github.sha }} diff --git a/.github/workflows/redeploy-tenants-on-main.yml b/.github/workflows/redeploy-tenants-on-main.yml index 0fd8820b..02c21e24 100644 --- a/.github/workflows/redeploy-tenants-on-main.yml +++ b/.github/workflows/redeploy-tenants-on-main.yml @@ -175,4 +175,120 @@ jobs: echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)" exit 1 fi - echo "::notice::Tenant fleet redeploy complete." + echo "::notice::Tenant fleet redeploy reported ssm_status=Success — verifying actual image roll on each tenant..." + + # Stash the response for the verify step. $RUNNER_TEMP outlasts + # the step boundary; $HTTP_RESPONSE doesn't. + cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json" + + - name: Verify each tenant /buildinfo matches published SHA + # ROOT FIX FOR #2395. 
+ # + # `redeploy-fleet`'s `ssm_status=Success` means "the SSM RPC + # didn't error" — NOT "the new image is running on the tenant." + # `:latest` lives in the local Docker daemon's image cache; if + # the SSM document does `docker compose up -d` without an + # explicit `docker pull`, the daemon serves the previously- + # cached digest and the container restarts on stale code. + # 2026-04-30 incident: hongmingwang's tenant reported + # ssm_status=Success at 17:00:53Z but kept serving pre-501a42d7 + # chat_files for 30+ min — the lazy-heal fix never reached the + # user despite green deploy + green redeploy. + # + # This step closes the gap by curling each tenant's /buildinfo + # endpoint (added in workspace-server/internal/buildinfo + + # /Dockerfile* GIT_SHA build-arg, this PR) and comparing the + # returned git_sha to the SHA the workflow expects. Mismatches + # fail the workflow, which is what `ok=true` should have + # guaranteed all along. + # + # When the redeploy was triggered by workflow_dispatch with a + # specific tag (target_tag != "latest"), the expected SHA may + # not equal ${{ github.sha }} — in that case we resolve via + # GHCR's manifest. For workflow_run (default :latest) the + # workflow_run.head_sha is the SHA that just published. + env: + EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }} + TARGET_TAG: ${{ inputs.target_tag || 'latest' }} + # Tenant subdomain template — slugs from the response are + # appended. Production CP issues `.moleculesai.app`; + # staging CP issues `.staging.moleculesai.app`. This + # workflow runs on main → prod CP → no `staging.` infix. + TENANT_DOMAIN: 'moleculesai.app' + run: | + set -euo pipefail + + if [ "$TARGET_TAG" != "latest" ] && [ "$TARGET_TAG" != "$EXPECTED_SHA" ]; then + # workflow_dispatch with a pinned tag that isn't the head + # SHA — operator is rolling back / pinning. 
Skip the + # verification because we don't have the expected SHA in + # this context (would need to crane-inspect the GHCR + # manifest, which is a follow-up). Failing-open here is + # safe: the operator chose the tag deliberately. + echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification." + exit 0 + fi + + RESP="$RUNNER_TEMP/redeploy-response.json" + if [ ! -s "$RESP" ]; then + echo "::error::redeploy-response.json missing or empty — verify step ran without a response to read" + exit 1 + fi + + # Pull only successfully-redeployed tenants. Any tenant that + # halted the rollout already failed the previous step, so we + # don't double-count them here. + mapfile -t SLUGS < <(jq -r '.results[]? | select(.healthz_ok == true) | .slug' "$RESP") + if [ ${#SLUGS[@]} -eq 0 ]; then + echo "::warning::No tenants reported healthz_ok — nothing to verify" + exit 0 + fi + + echo "Verifying ${#SLUGS[@]} tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..." + + MISMATCH_COUNT=0 + MISMATCH_LINES=() + for slug in "${SLUGS[@]}"; do + URL="https://${slug}.${TENANT_DOMAIN}/buildinfo" + # 30s total: tenant just SSM-restarted, may still be coming + # up. Retry-on-empty rather than retry-on-status — we want + # to fail fast on "responded with wrong SHA", not "still + # warming up". 
+ BODY=$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$URL" || true) + ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "") + if [ -z "$ACTUAL_SHA" ]; then + MISMATCH_COUNT=$((MISMATCH_COUNT + 1)) + MISMATCH_LINES+=("| $slug | (no /buildinfo response) | ${EXPECTED_SHA:0:7} | ❌ unreachable |") + continue + fi + if [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then + echo " $slug: ${ACTUAL_SHA:0:7} ✓" + else + MISMATCH_COUNT=$((MISMATCH_COUNT + 1)) + MISMATCH_LINES+=("| $slug | ${ACTUAL_SHA:0:7} | ${EXPECTED_SHA:0:7} | ❌ stale |") + fi + done + + { + echo "" + echo "### Per-tenant /buildinfo verification" + echo "" + echo "Expected SHA: \`${EXPECTED_SHA:0:7}\`" + echo "" + if [ $MISMATCH_COUNT -gt 0 ]; then + echo "**${MISMATCH_COUNT} mismatch(es) — these tenants did NOT pick up the new image despite ssm_status=Success:**" + echo "" + echo "| Slug | Actual /buildinfo SHA | Expected | Status |" + echo "|------|----------------------|----------|--------|" + for line in "${MISMATCH_LINES[@]}"; do echo "$line"; done + else + echo "All ${#SLUGS[@]} tenants returned matching SHA. ✓" + fi + } >> "$GITHUB_STEP_SUMMARY" + + if [ $MISMATCH_COUNT -gt 0 ]; then + echo "::error::$MISMATCH_COUNT tenant(s) did not pick up the new image. ssm_status=Success was misleading — see job summary." + exit 1 + fi + + echo "::notice::Tenant fleet redeploy complete + verified — all tenants on ${EXPECTED_SHA:0:7}." diff --git a/.github/workflows/redeploy-tenants-on-staging.yml b/.github/workflows/redeploy-tenants-on-staging.yml index 4da11d51..82eb16a0 100644 --- a/.github/workflows/redeploy-tenants-on-staging.yml +++ b/.github/workflows/redeploy-tenants-on-staging.yml @@ -181,4 +181,83 @@ jobs: echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)" exit 1 fi - echo "::notice::Staging tenant fleet redeploy complete." 
+ echo "::notice::Staging tenant fleet redeploy reported ssm_status=Success — verifying actual image roll on each tenant..." + + cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json" + + - name: Verify each staging tenant /buildinfo matches published SHA + # Mirror of the verify step in redeploy-tenants-on-main.yml — see + # there for the rationale (#2395 root fix). Staging has the same + # ssm_status-success-but-stale-image hazard and benefits from the + # same gate. Diff: TENANT_DOMAIN includes the `staging.` infix. + env: + EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }} + TARGET_TAG: ${{ inputs.target_tag || 'staging-latest' }} + TENANT_DOMAIN: 'staging.moleculesai.app' + run: | + set -euo pipefail + + # staging-latest is the staging-side moving tag; treat it the + # same way main treats `latest`. Operator-pinned SHAs skip + # verification (see main variant for why). + if [ "$TARGET_TAG" != "staging-latest" ] && [ "$TARGET_TAG" != "latest" ] && [ "$TARGET_TAG" != "$EXPECTED_SHA" ]; then + echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification." + exit 0 + fi + + RESP="$RUNNER_TEMP/redeploy-response.json" + if [ ! -s "$RESP" ]; then + echo "::error::redeploy-response.json missing or empty" + exit 1 + fi + + mapfile -t SLUGS < <(jq -r '.results[]? | select(.healthz_ok == true) | .slug' "$RESP") + if [ ${#SLUGS[@]} -eq 0 ]; then + echo "::warning::No staging tenants reported healthz_ok — nothing to verify" + exit 0 + fi + + echo "Verifying ${#SLUGS[@]} staging tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..." 
+ + MISMATCH_COUNT=0 + MISMATCH_LINES=() + for slug in "${SLUGS[@]}"; do + URL="https://${slug}.${TENANT_DOMAIN}/buildinfo" + BODY=$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$URL" || true) + ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "") + if [ -z "$ACTUAL_SHA" ]; then + MISMATCH_COUNT=$((MISMATCH_COUNT + 1)) + MISMATCH_LINES+=("| $slug | (no /buildinfo response) | ${EXPECTED_SHA:0:7} | ❌ unreachable |") + continue + fi + if [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then + echo " $slug: ${ACTUAL_SHA:0:7} ✓" + else + MISMATCH_COUNT=$((MISMATCH_COUNT + 1)) + MISMATCH_LINES+=("| $slug | ${ACTUAL_SHA:0:7} | ${EXPECTED_SHA:0:7} | ❌ stale |") + fi + done + + { + echo "" + echo "### Per-tenant /buildinfo verification (staging)" + echo "" + echo "Expected SHA: \`${EXPECTED_SHA:0:7}\`" + echo "" + if [ $MISMATCH_COUNT -gt 0 ]; then + echo "**${MISMATCH_COUNT} mismatch(es) — these staging tenants did NOT pick up the new image despite ssm_status=Success:**" + echo "" + echo "| Slug | Actual /buildinfo SHA | Expected | Status |" + echo "|------|----------------------|----------|--------|" + for line in "${MISMATCH_LINES[@]}"; do echo "$line"; done + else + echo "All ${#SLUGS[@]} staging tenants returned matching SHA. ✓" + fi + } >> "$GITHUB_STEP_SUMMARY" + + if [ $MISMATCH_COUNT -gt 0 ]; then + echo "::error::$MISMATCH_COUNT staging tenant(s) did not pick up the new image. ssm_status=Success was misleading — see job summary." + exit 1 + fi + + echo "::notice::Staging tenant fleet redeploy complete + verified — all tenants on ${EXPECTED_SHA:0:7}." 
diff --git a/workspace-server/Dockerfile b/workspace-server/Dockerfile index dcd7841e..7065e405 100644 --- a/workspace-server/Dockerfile +++ b/workspace-server/Dockerfile @@ -16,7 +16,11 @@ RUN echo 'replace github.com/Molecule-AI/molecule-ai-plugin-github-app-auth => / RUN sed -i 's|replace github.com/Molecule-AI/molecule-monorepo/platform => .*|replace github.com/Molecule-AI/molecule-monorepo/platform => /app|' /plugin/go.mod RUN go mod download COPY workspace-server/ . -RUN CGO_ENABLED=0 GOOS=linux go build -o /platform ./cmd/server +# GIT_SHA mirror of Dockerfile.tenant — see that file for the rationale. +ARG GIT_SHA=dev +RUN CGO_ENABLED=0 GOOS=linux go build \ + -ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \ + -o /platform ./cmd/server # Clone templates + plugins at build time from manifest.json FROM alpine:3.20 AS templates diff --git a/workspace-server/Dockerfile.tenant b/workspace-server/Dockerfile.tenant index c4d8fc88..23140a67 100644 --- a/workspace-server/Dockerfile.tenant +++ b/workspace-server/Dockerfile.tenant @@ -21,7 +21,19 @@ COPY workspace-server/go.mod workspace-server/go.sum ./ RUN echo 'replace github.com/Molecule-AI/molecule-ai-plugin-github-app-auth => /plugin' >> go.mod RUN go mod download COPY workspace-server/ . -RUN CGO_ENABLED=0 GOOS=linux go build -o /platform ./cmd/server + +# GIT_SHA is baked into the binary via -ldflags so /buildinfo can return +# it at runtime. CI passes ${{ github.sha }}; local builds default to +# "dev" so an unset value never reads as a real SHA. +# +# Why this matters: the redeploy verification step compares each tenant's +# /buildinfo against the SHA the workflow expects. If GIT_SHA isn't +# threaded through here, every tenant returns "dev" and the verification +# fails closed — which is the correct fail-direction (#2395 root fix). 
+ARG GIT_SHA=dev +RUN CGO_ENABLED=0 GOOS=linux go build \ + -ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \ + -o /platform ./cmd/server # ── Stage 2: Canvas Next.js standalone ──────────────────────────────── FROM node:20-alpine AS canvas-builder diff --git a/workspace-server/internal/buildinfo/buildinfo.go b/workspace-server/internal/buildinfo/buildinfo.go new file mode 100644 index 00000000..af92945a --- /dev/null +++ b/workspace-server/internal/buildinfo/buildinfo.go @@ -0,0 +1,26 @@ +// Package buildinfo exposes the git SHA the binary was built from. +// +// Set at link time: +// +// go build -ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=<sha>" +// +// CI passes ${{ github.sha }} via Dockerfile.tenant ARG GIT_SHA; local +// dev builds default to "dev" so unset never reads as success. +// +// Why this package exists: redeploy-fleet (CP) returns ssm_status=Success +// when the SSM RPC didn't error — that's "the deploy command ran", +// NOT "the new code is running on every tenant." A mutable image tag +// (`:latest`) is resolved from the local Docker daemon's cache, so `docker compose up -d` +// without an explicit `docker pull` is a no-op when the cached tag hasn't been +// invalidated. Both observed 2026-04-30: the user's tenant kept serving +// pre-501a42d7 chat_files even after main published the lazy-heal fix +// (#2395). Exposing GitSHA at /buildinfo lets the redeploy workflow +// verify EVERY tenant is actually running the published SHA before +// reporting success. +package buildinfo + +// GitSHA is overwritten at build time via -ldflags. Default catches +// dev builds + any deploy that forgot to wire the build-arg through. +// "dev" is intentional — comparing it to a real SHA always fails, +// which is what we want for an unconfigured deploy. 
+var GitSHA = "dev" diff --git a/workspace-server/internal/buildinfo/buildinfo_test.go b/workspace-server/internal/buildinfo/buildinfo_test.go new file mode 100644 index 00000000..a1582ba4 --- /dev/null +++ b/workspace-server/internal/buildinfo/buildinfo_test.go @@ -0,0 +1,81 @@ +package buildinfo_test + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo" + "github.com/gin-gonic/gin" +) + +// TestGitSHA_DefaultDevSentinel pins the contract that an unset +// GIT_SHA at build time reads as "dev", NOT as an empty string. The +// redeploy verification step compares the deployed /buildinfo against +// the workflow's expected SHA — if GitSHA were "" by default, a +// misconfigured deploy would round-trip "" successfully if the +// expected SHA were also somehow ""; "dev" guarantees the comparison +// always fails closed for an unset deploy. +// +// Linker tests can't directly exercise -ldflags injection from inside +// `go test`, but they can pin the default the linker overrides. +func TestGitSHA_DefaultDevSentinel(t *testing.T) { + if buildinfo.GitSHA != "dev" { + t.Errorf("GitSHA default = %q, want %q (CI ldflags override expected to set this; tests run without ldflags so this should be the dev sentinel)", buildinfo.GitSHA, "dev") + } +} + +// TestBuildInfoEndpoint_ReturnsGitSHA pins the wire shape of the +// /buildinfo response. The redeploy verification step reads +// `.git_sha` from this JSON; renaming the field would silently break +// every tenant verification (the jq lookup would return null + the +// step would interpret it as "tenant unreachable" and fail closed, +// which is correct but noisy). +// +// Test routes the handler against an httptest server rather than +// constructing a router.Setup() — that constructor takes a Hub + +// Broadcaster + Provisioner + WorkspaceHandler + ChannelMgr, and +// /buildinfo doesn't depend on any of them. 
Using a minimal gin +// engine here keeps the test fast and isolated to the contract under +// test. +func TestBuildInfoEndpoint_ReturnsGitSHA(t *testing.T) { + // Stash + restore so other tests that read GitSHA see a stable + // value. The package-level var is mutable by design (-ldflags), + // so test isolation requires explicit save/restore. + prev := buildinfo.GitSHA + t.Cleanup(func() { buildinfo.GitSHA = prev }) + buildinfo.GitSHA = "abc1234deadbeef" + + gin.SetMode(gin.TestMode) + r := gin.New() + r.GET("/buildinfo", func(c *gin.Context) { + c.JSON(200, gin.H{"git_sha": buildinfo.GitSHA}) + }) + + srv := httptest.NewServer(r) + t.Cleanup(srv.Close) + + resp, err := http.Get(srv.URL + "/buildinfo") + if err != nil { + t.Fatalf("GET /buildinfo: %v", err) + } + t.Cleanup(func() { _ = resp.Body.Close() }) + + if resp.StatusCode != 200 { + t.Fatalf("status = %d, want 200", resp.StatusCode) + } + + var body map[string]string + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + t.Fatalf("decode: %v", err) + } + + got, ok := body["git_sha"] + if !ok { + t.Fatalf("response missing git_sha field — would break the redeploy verification jq lookup. 
Body: %+v", body) + } + if got != "abc1234deadbeef" { + t.Errorf("git_sha = %q, want %q", got, "abc1234deadbeef") + } +} diff --git a/workspace-server/internal/router/router.go b/workspace-server/internal/router/router.go index 4fa70632..3d04b12e 100644 --- a/workspace-server/internal/router/router.go +++ b/workspace-server/internal/router/router.go @@ -8,6 +8,7 @@ import ( "strings" "time" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo" "github.com/Molecule-AI/molecule-monorepo/platform/internal/channels" "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" "github.com/Molecule-AI/molecule-monorepo/platform/internal/events" @@ -80,6 +81,18 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi c.JSON(200, gin.H{"status": "ok"}) }) + // Build info — public, no auth. Returns the git SHA the binary was + // linked from. Existence reason is in buildinfo/buildinfo.go: lets the + // redeploy workflow verify each tenant is actually running the + // published code (closing #2395 — ssm_status=Success is "the deploy + // command ran", not "the new code is running"). Public is intentional: + // it's a build identifier, not operational state. The same string is + // already published as org.opencontainers.image.revision on the + // container image, so no new info is exposed. + r.GET("/buildinfo", func(c *gin.Context) { + c.JSON(200, gin.H{"git_sha": buildinfo.GitSHA}) + }) + // /admin/liveness — per-subsystem last-tick timestamps. Operators read this // to catch stuck-but-not-crashed goroutines (the failure mode that caused // the 12h scheduler outage of 2026-04-14, issue #85). Any subsystem whose