feat(deploy): verify each tenant /buildinfo matches published SHA after redeploy

Closes the gap that let issue #2395 ship: redeploy-fleet workflows reported
ssm_status=Success based on SSM RPC return code alone, while EC2 tenants
silently kept serving the previous :latest digest because docker compose up
without an explicit pull is a no-op when the local tag already exists.

Wire:
  - new buildinfo package exposes GitSHA, set at link time via -ldflags from
    the GIT_SHA build-arg (defaults to "dev", so builds and test runs without
    ldflags fail closed against an unset deploy)
  - router exposes GET /buildinfo returning {git_sha} — public, no auth,
    cheap enough to curl from CI for every tenant
  - both Dockerfiles thread GIT_SHA into the Go build
  - publish-workspace-server-image.yml passes GIT_SHA=github.sha for both
    images
  - redeploy-tenants-on-main.yml + redeploy-tenants-on-staging.yml curl each
    tenant's /buildinfo after the redeploy SSM RPC and fail the workflow on
    digest mismatch; staging treats both :latest and :staging-latest as
    moving tags; verification is skipped only when an operator pinned a
    specific tag via workflow_dispatch
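
For reference, a local smoke test of this wiring might look like the
following (the :8080 port and /tmp output path are illustrative, not
pinned anywhere in this change):

  # Link a real SHA instead of the "dev" default, then confirm the
  # endpoint echoes it back.
  GIT_SHA=$(git rev-parse HEAD)
  go build -ldflags \
    "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \
    -o /tmp/platform ./cmd/server
  /tmp/platform &   # assumes the server serves HTTP on :8080 locally
  curl -s localhost:8080/buildinfo | jq -r .git_sha   # expect ${GIT_SHA}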

Tests:
  - TestGitSHA_DefaultDevSentinel pins the dev default
  - TestBuildInfoEndpoint_ReturnsGitSHA pins the wire shape that the
    workflow's jq lookup depends on

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Hongming Wang 2026-04-30 10:55:08 -07:00
parent 36fd658cc0
commit 998e13c4bd
8 changed files with 343 additions and 4 deletions

View File: publish-workspace-server-image.yml

@@ -142,6 +142,13 @@ jobs:
${{ env.IMAGE_NAME }}:staging-latest
cache-from: type=gha
cache-to: type=gha,mode=max
# GIT_SHA bakes into the Go binary via -ldflags so /buildinfo
# returns it at runtime — see Dockerfile + buildinfo/buildinfo.go.
# This is the same value as the OCI revision label below; passing
# it twice is intentional, the OCI label is for registry tooling
# while /buildinfo is for the redeploy verification step.
build-args: |
GIT_SHA=${{ github.sha }}
labels: |
org.opencontainers.image.source=https://github.com/${{ github.repository }}
org.opencontainers.image.revision=${{ github.sha }}
@@ -172,6 +179,7 @@ jobs:
# NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080).
build-args: |
NEXT_PUBLIC_PLATFORM_URL=
GIT_SHA=${{ github.sha }}
labels: |
org.opencontainers.image.source=https://github.com/${{ github.repository }}
org.opencontainers.image.revision=${{ github.sha }}

View File: redeploy-tenants-on-main.yml

@@ -175,4 +175,120 @@ jobs:
echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
exit 1
fi
echo "::notice::Tenant fleet redeploy complete."
echo "::notice::Tenant fleet redeploy reported ssm_status=Success — verifying actual image roll on each tenant..."
# Stash the response for the verify step. $RUNNER_TEMP outlasts
# the step boundary; $HTTP_RESPONSE doesn't.
cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json"
- name: Verify each tenant /buildinfo matches published SHA
# ROOT FIX FOR #2395.
#
# `redeploy-fleet`'s `ssm_status=Success` means "the SSM RPC
# didn't error" — NOT "the new image is running on the tenant."
# `:latest` lives in the local Docker daemon's image cache; if
# the SSM document does `docker compose up -d` without an
# explicit `docker pull`, the daemon serves the previously-
# cached digest and the container restarts on stale code.
# 2026-04-30 incident: hongmingwang's tenant reported
# ssm_status=Success at 17:00:53Z but kept serving pre-501a42d7
# chat_files for 30+ min — the lazy-heal fix never reached the
# user despite green deploy + green redeploy.
#
# This step closes the gap by curling each tenant's /buildinfo
# endpoint (added in workspace-server/internal/buildinfo +
# /Dockerfile* GIT_SHA build-arg, this PR) and comparing the
# returned git_sha to the SHA the workflow expects. Mismatches
# fail the workflow, which is what `ok=true` should have
# guaranteed all along.
#
# When the redeploy was triggered by workflow_dispatch with a
# specific tag (target_tag != "latest"), the expected SHA may
# not equal ${{ github.sha }}; in that case verification is
# skipped (see the sketch below). For workflow_run (default
# :latest), workflow_run.head_sha is the SHA that just published.
env:
EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
TARGET_TAG: ${{ inputs.target_tag || 'latest' }}
# Tenant domain suffix; slugs from the response are prefixed
# to it. Production CP issues `<slug>.moleculesai.app`;
# staging CP issues `<slug>.staging.moleculesai.app`. This
# workflow runs on main → prod CP → no `staging.` infix.
TENANT_DOMAIN: 'moleculesai.app'
run: |
set -euo pipefail
if [ "$TARGET_TAG" != "latest" ] && [ "$TARGET_TAG" != "$EXPECTED_SHA" ]; then
# workflow_dispatch with a pinned tag that isn't the head
# SHA — operator is rolling back / pinning. Skip the
# verification because we don't have the expected SHA in
# this context (would need to crane-inspect the GHCR
# manifest, which is a follow-up). Failing-open here is
# safe: the operator chose the tag deliberately.
echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification."
exit 0
fi
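# Follow-up sketch (not implemented in this step): a pinned tag
# could be resolved to its SHA via the OCI revision label the
# publish workflow already attaches. `crane` and the $IMAGE name
# here are illustrative; this workflow defines neither.
#   EXPECTED_SHA=$(crane config "ghcr.io/${IMAGE}:${TARGET_TAG}" \
#     | jq -r '.config.Labels["org.opencontainers.image.revision"]')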
RESP="$RUNNER_TEMP/redeploy-response.json"
if [ ! -s "$RESP" ]; then
echo "::error::redeploy-response.json missing or empty — verify step ran without a response to read"
exit 1
fi
# Pull only successfully-redeployed tenants. Any tenant that
# halted the rollout already failed the previous step, so we
# don't double-count them here.
mapfile -t SLUGS < <(jq -r '.results[]? | select(.healthz_ok == true) | .slug' "$RESP")
if [ ${#SLUGS[@]} -eq 0 ]; then
echo "::warning::No tenants reported healthz_ok — nothing to verify"
exit 0
fi
echo "Verifying ${#SLUGS[@]} tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..."
MISMATCH_COUNT=0
MISMATCH_LINES=()
for slug in "${SLUGS[@]}"; do
URL="https://${slug}.${TENANT_DOMAIN}/buildinfo"
# 30s total: the tenant just SSM-restarted and may still be
# coming up. --retry-connrefused retries connection failures
# during warm-up; a response that arrives with the wrong SHA
# is never retried, so we fail fast on stale code.
BODY=$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$URL" || true)
ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
if [ -z "$ACTUAL_SHA" ]; then
MISMATCH_COUNT=$((MISMATCH_COUNT + 1))
MISMATCH_LINES+=("| $slug | (no /buildinfo response) | ${EXPECTED_SHA:0:7} | ❌ unreachable |")
continue
fi
if [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then
echo " $slug: ${ACTUAL_SHA:0:7} ✓"
else
MISMATCH_COUNT=$((MISMATCH_COUNT + 1))
MISMATCH_LINES+=("| $slug | ${ACTUAL_SHA:0:7} | ${EXPECTED_SHA:0:7} | ❌ stale |")
fi
done
{
echo ""
echo "### Per-tenant /buildinfo verification"
echo ""
echo "Expected SHA: \`${EXPECTED_SHA:0:7}\`"
echo ""
if [ $MISMATCH_COUNT -gt 0 ]; then
echo "**${MISMATCH_COUNT} mismatch(es) — these tenants did NOT pick up the new image despite ssm_status=Success:**"
echo ""
echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
echo "|------|----------------------|----------|--------|"
for line in "${MISMATCH_LINES[@]}"; do echo "$line"; done
else
echo "All ${#SLUGS[@]} tenants returned matching SHA. ✓"
fi
} >> "$GITHUB_STEP_SUMMARY"
if [ $MISMATCH_COUNT -gt 0 ]; then
echo "::error::$MISMATCH_COUNT tenant(s) did not pick up the new image. ssm_status=Success was misleading — see job summary."
exit 1
fi
echo "::notice::Tenant fleet redeploy complete + verified — all tenants on ${EXPECTED_SHA:0:7}."

View File: redeploy-tenants-on-staging.yml

@@ -181,4 +181,83 @@ jobs:
echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
exit 1
fi
echo "::notice::Staging tenant fleet redeploy complete."
echo "::notice::Staging tenant fleet redeploy reported ssm_status=Success — verifying actual image roll on each tenant..."
cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json"
- name: Verify each staging tenant /buildinfo matches published SHA
# Mirror of the verify step in redeploy-tenants-on-main.yml — see
# there for the rationale (#2395 root fix). Staging has the same
# ssm_status-success-but-stale-image hazard and benefits from the
# same gate. Diff: TENANT_DOMAIN includes the `staging.` infix.
env:
EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
TARGET_TAG: ${{ inputs.target_tag || 'staging-latest' }}
TENANT_DOMAIN: 'staging.moleculesai.app'
run: |
set -euo pipefail
# staging-latest is the staging-side moving tag; treat it the
# same way main treats `latest`. Operator-pinned SHAs skip
# verification (see main variant for why).
if [ "$TARGET_TAG" != "staging-latest" ] && [ "$TARGET_TAG" != "latest" ] && [ "$TARGET_TAG" != "$EXPECTED_SHA" ]; then
echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification."
exit 0
fi
RESP="$RUNNER_TEMP/redeploy-response.json"
if [ ! -s "$RESP" ]; then
echo "::error::redeploy-response.json missing or empty"
exit 1
fi
mapfile -t SLUGS < <(jq -r '.results[]? | select(.healthz_ok == true) | .slug' "$RESP")
if [ ${#SLUGS[@]} -eq 0 ]; then
echo "::warning::No staging tenants reported healthz_ok — nothing to verify"
exit 0
fi
echo "Verifying ${#SLUGS[@]} staging tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..."
MISMATCH_COUNT=0
MISMATCH_LINES=()
for slug in "${SLUGS[@]}"; do
URL="https://${slug}.${TENANT_DOMAIN}/buildinfo"
BODY=$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$URL" || true)
ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
if [ -z "$ACTUAL_SHA" ]; then
MISMATCH_COUNT=$((MISMATCH_COUNT + 1))
MISMATCH_LINES+=("| $slug | (no /buildinfo response) | ${EXPECTED_SHA:0:7} | ❌ unreachable |")
continue
fi
if [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then
echo " $slug: ${ACTUAL_SHA:0:7} ✓"
else
MISMATCH_COUNT=$((MISMATCH_COUNT + 1))
MISMATCH_LINES+=("| $slug | ${ACTUAL_SHA:0:7} | ${EXPECTED_SHA:0:7} | ❌ stale |")
fi
done
{
echo ""
echo "### Per-tenant /buildinfo verification (staging)"
echo ""
echo "Expected SHA: \`${EXPECTED_SHA:0:7}\`"
echo ""
if [ $MISMATCH_COUNT -gt 0 ]; then
echo "**${MISMATCH_COUNT} mismatch(es) — these staging tenants did NOT pick up the new image despite ssm_status=Success:**"
echo ""
echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
echo "|------|----------------------|----------|--------|"
for line in "${MISMATCH_LINES[@]}"; do echo "$line"; done
else
echo "All ${#SLUGS[@]} staging tenants returned matching SHA. ✓"
fi
} >> "$GITHUB_STEP_SUMMARY"
if [ $MISMATCH_COUNT -gt 0 ]; then
echo "::error::$MISMATCH_COUNT staging tenant(s) did not pick up the new image. ssm_status=Success was misleading — see job summary."
exit 1
fi
echo "::notice::Staging tenant fleet redeploy complete + verified — all tenants on ${EXPECTED_SHA:0:7}."

View File: Dockerfile (non-tenant image)

@@ -16,7 +16,11 @@ RUN echo 'replace github.com/Molecule-AI/molecule-ai-plugin-github-app-auth => /
RUN sed -i 's|replace github.com/Molecule-AI/molecule-monorepo/platform => .*|replace github.com/Molecule-AI/molecule-monorepo/platform => /app|' /plugin/go.mod
RUN go mod download
COPY workspace-server/ .
RUN CGO_ENABLED=0 GOOS=linux go build -o /platform ./cmd/server
# GIT_SHA mirror of Dockerfile.tenant — see that file for the rationale.
ARG GIT_SHA=dev
RUN CGO_ENABLED=0 GOOS=linux go build \
-ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \
-o /platform ./cmd/server
# Clone templates + plugins at build time from manifest.json
FROM alpine:3.20 AS templates

View File: Dockerfile.tenant

@@ -21,7 +21,19 @@ COPY workspace-server/go.mod workspace-server/go.sum ./
RUN echo 'replace github.com/Molecule-AI/molecule-ai-plugin-github-app-auth => /plugin' >> go.mod
RUN go mod download
COPY workspace-server/ .
RUN CGO_ENABLED=0 GOOS=linux go build -o /platform ./cmd/server
# GIT_SHA is baked into the binary via -ldflags so /buildinfo can return
# it at runtime. CI passes ${{ github.sha }}; local builds default to
# "dev" so an unset value never reads as a real SHA.
#
# Why this matters: the redeploy verification step compares each tenant's
# /buildinfo against the SHA the workflow expects. If GIT_SHA isn't
# threaded through here, every tenant returns "dev" and the verification
# fails closed — which is the correct fail-direction (#2395 root fix).
ARG GIT_SHA=dev
RUN CGO_ENABLED=0 GOOS=linux go build \
-ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \
-o /platform ./cmd/server
# ── Stage 2: Canvas Next.js standalone ────────────────────────────────
FROM node:20-alpine AS canvas-builder
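
A local build exercising the new build-arg might look like this (image
name, tag, and build context are illustrative):

  docker build -f Dockerfile.tenant \
    --build-arg GIT_SHA=$(git rev-parse HEAD) \
    -t workspace-server:dev-local .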

View File: internal/buildinfo/buildinfo.go

@@ -0,0 +1,26 @@
// Package buildinfo exposes the git SHA the binary was built from.
//
// Set at link time:
//
// go build -ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=<sha>"
//
// CI passes ${{ github.sha }} via Dockerfile.tenant ARG GIT_SHA; local
// dev builds default to "dev" so unset never reads as success.
//
// Why this package exists: redeploy-fleet (CP) returns ssm_status=Success
// when the SSM RPC didn't error — that's "the deploy command ran",
// NOT "the new code is running on every tenant." Image-tag-as-tag
// (`:latest`) caches in the local Docker daemon so `docker compose up -d`
// without an explicit `docker pull` is a no-op when the tag hasn't been
// invalidated. Both observed 2026-04-30: the user's tenant kept serving
// pre-501a42d7 chat_files even after main published the lazy-heal fix
// (#2395). Exposing GitSHA at /buildinfo lets the redeploy workflow
// verify EVERY tenant is actually running the published SHA before
// reporting success.
package buildinfo

// GitSHA is overwritten at build time via -ldflags. Default catches
// dev builds + any deploy that forgot to wire the build-arg through.
// "dev" is intentional — comparing it to a real SHA always fails,
// which is what we want for an unconfigured deploy.
var GitSHA = "dev"

View File: internal/buildinfo package tests

@@ -0,0 +1,81 @@
package buildinfo_test

import (
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo"
"github.com/gin-gonic/gin"
)
// TestGitSHA_DefaultDevSentinel pins the contract that an unset
// GIT_SHA at build time reads as "dev", NOT as an empty string. The
// redeploy verification step compares the deployed /buildinfo against
// the workflow's expected SHA — if GitSHA were "" by default, a
// misconfigured deploy would round-trip "" successfully if the
// expected SHA were also somehow ""; "dev" guarantees the comparison
// always fails closed for an unset deploy.
//
// Tests can't exercise -ldflags injection from inside `go test`,
// but they can pin the default that the linker overrides.
func TestGitSHA_DefaultDevSentinel(t *testing.T) {
if buildinfo.GitSHA != "dev" {
t.Errorf("GitSHA default = %q, want %q (CI ldflags override expected to set this; tests run without ldflags so this should be the dev sentinel)", buildinfo.GitSHA, "dev")
}
}
// TestBuildInfoEndpoint_ReturnsGitSHA pins the wire shape of the
// /buildinfo response. The redeploy verification step reads
// `.git_sha` from this JSON; renaming the field would silently break
// every tenant verification (the jq lookup would return null + the
// step would interpret it as "tenant unreachable" and fail closed,
// which is correct but noisy).
//
// The test mounts the handler on a minimal gin engine behind an
// httptest server rather than calling router.Setup(): that
// constructor takes a Hub + Broadcaster + Provisioner +
// WorkspaceHandler + ChannelMgr, and /buildinfo depends on none of
// them. The minimal engine keeps the test fast and isolated to the
// contract under test.
func TestBuildInfoEndpoint_ReturnsGitSHA(t *testing.T) {
// Stash + restore so other tests that read GitSHA see a stable
// value. The package-level var is mutable by design (-ldflags),
// so test isolation requires explicit save/restore.
prev := buildinfo.GitSHA
t.Cleanup(func() { buildinfo.GitSHA = prev })
buildinfo.GitSHA = "abc1234deadbeef"
gin.SetMode(gin.TestMode)
r := gin.New()
r.GET("/buildinfo", func(c *gin.Context) {
c.JSON(200, gin.H{"git_sha": buildinfo.GitSHA})
})
srv := httptest.NewServer(r)
t.Cleanup(srv.Close)
resp, err := http.Get(srv.URL + "/buildinfo")
if err != nil {
t.Fatalf("GET /buildinfo: %v", err)
}
t.Cleanup(func() { _ = resp.Body.Close() })
if resp.StatusCode != 200 {
t.Fatalf("status = %d, want 200", resp.StatusCode)
}
var body map[string]string
if err := json.NewDecoder(resp.Body).Decode(&body); err != nil {
t.Fatalf("decode: %v", err)
}
got, ok := body["git_sha"]
if !ok {
t.Fatalf("response missing git_sha field — would break the redeploy verification jq lookup. Body: %+v", body)
}
if got != "abc1234deadbeef" {
t.Errorf("git_sha = %q, want %q", got, "abc1234deadbeef")
}
}
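
Both tests run with the stock toolchain, no ldflags needed, which is
what the dev-sentinel test depends on. From the workspace-server
module root (path per the import above):

  go test ./internal/buildinfo/...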

View File: router (Setup)

@@ -8,6 +8,7 @@ import (
"strings"
"time"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/channels"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
@@ -80,6 +81,18 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
c.JSON(200, gin.H{"status": "ok"})
})
// Build info — public, no auth. Returns the git SHA the binary was
// linked from. The rationale lives in buildinfo/buildinfo.go: it lets
// the redeploy workflow verify each tenant is actually running the
// published code (closing #2395 — ssm_status=Success is "the deploy
// command ran", not "the new code is running"). Public is intentional:
// it's a build identifier, not operational state. The same string is
// already published as org.opencontainers.image.revision on the
// container image, so no new info is exposed.
r.GET("/buildinfo", func(c *gin.Context) {
c.JSON(200, gin.H{"git_sha": buildinfo.GitSHA})
})
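// Example probe (slug hypothetical), matching what the redeploy
// verification step curls:
//
//   curl -s https://acme.moleculesai.app/buildinfo
//   {"git_sha":"<sha, or \"dev\" for an unconfigured build>"}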
// /admin/liveness — per-subsystem last-tick timestamps. Operators read this
// to catch stuck-but-not-crashed goroutines (the failure mode that caused
// the 12h scheduler outage of 2026-04-14, issue #85). Any subsystem whose