feat(canary): smoke harness + GHA verification workflow (Phase 2)
Post-deploy verification for staging tenant images. Runs against the canary fleet after each publish-workspace-server-image build — catches auto-update breakage (a la today's E2E current_task drift) before it propagates to the prod tenant fleet that auto-pulls :latest every 5 min. scripts/canary-smoke.sh iterates a space-sep list of canary base URLs (paired with their ADMIN_TOKENs) and checks: - /admin/liveness reachable with admin bearer (tenant boot OK) - /workspaces list responds (wsAuth + DB path OK) - /memories/commit + /memories/search round-trip (encryption + scrubber) - /events admin read (AdminAuth C4 path) - /admin/liveness without bearer returns 401 (C4 fail-closed regression) .github/workflows/canary-verify.yml runs after publish succeeds: - 6-min sleep (tenant auto-updater pulls every 5 min) - bash scripts/canary-smoke.sh with secrets pulled from repo settings - on failure: writes a Step Summary flagging that :latest should be rolled back to prior known-good digest Phase 3 follow-up will split the publish workflow so only :staging-<sha> ships initially, and canary-verify's green gate is what promotes :staging-<sha> → :latest. This commit lays the test gate alone so we have something running against tenants immediately. Secrets to set in GitHub repo settings before this workflow can run: - CANARY_TENANT_URLS (space-sep list) - CANARY_ADMIN_TOKENS (same order as URLs) - CANARY_CP_SHARED_SECRET (matches staging CP PROVISION_SHARED_SECRET) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c40e7ba68c
commit
c1ac32365d
56
.github/workflows/canary-verify.yml
vendored
Normal file
56
.github/workflows/canary-verify.yml
vendored
Normal file
@ -0,0 +1,56 @@
|
||||
name: canary-verify
|
||||
|
||||
# Runs the canary smoke suite against the staging canary tenant fleet
|
||||
# after a new workspace-server image lands on :latest. On failure,
|
||||
# alerts via a GitHub Actions summary — follow-up PR will add:
|
||||
# - :staging-<sha> intermediate tag published BY publish workflow
|
||||
# - retag :staging-<sha> → :latest ONLY when this workflow is green
|
||||
# - Telegram/Slack notifier on red
|
||||
# For now this exists as the test gate itself so we can catch
|
||||
# auto-update breakage before it reaches the prod tenant fleet
|
||||
# (which auto-pulls :latest every 5 min).
|
||||
|
||||
on:
|
||||
workflow_run:
|
||||
workflows: ["publish-workspace-server-image"]
|
||||
types: [completed]
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
actions: read
|
||||
|
||||
jobs:
|
||||
canary-smoke:
|
||||
# Skip when the upstream workflow failed — no image to test against.
|
||||
if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Wait for canary tenants to pick up new image
|
||||
# Tenant auto-updater runs every 5 min. Sleep 6 min to give every
|
||||
# canary time to pull + restart. Cheaper than polling.
|
||||
run: sleep 360
|
||||
|
||||
- name: Run canary smoke suite
|
||||
env:
|
||||
CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }}
|
||||
CANARY_ADMIN_TOKENS: ${{ secrets.CANARY_ADMIN_TOKENS }}
|
||||
CANARY_CP_BASE_URL: https://staging-api.moleculesai.app
|
||||
CANARY_CP_SHARED_SECRET: ${{ secrets.CANARY_CP_SHARED_SECRET }}
|
||||
run: bash scripts/canary-smoke.sh
|
||||
|
||||
- name: Summary on failure
|
||||
if: ${{ failure() }}
|
||||
run: |
|
||||
{
|
||||
echo "## Canary smoke FAILED"
|
||||
echo
|
||||
echo "The staging canary tenant fleet failed its post-deploy smoke suite."
|
||||
echo "The :latest tag on ghcr.io/molecule-ai/platform should be rolled back"
|
||||
echo "to the prior known-good digest until this is resolved."
|
||||
echo
|
||||
echo "See job log above for the specific failed assertions."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
120
scripts/canary-smoke.sh
Executable file
120
scripts/canary-smoke.sh
Executable file
@ -0,0 +1,120 @@
|
||||
#!/bin/bash
|
||||
# canary-smoke.sh — runs the post-deploy smoke suite against the
|
||||
# staging canary tenant fleet. Called by the canary-verify.yml GitHub
|
||||
# Actions workflow after a new workspace-server image gets pushed to
|
||||
# GHCR; exits non-zero on any failure so the workflow can skip the
|
||||
# :staging-sha → :latest retag that would otherwise release broken
|
||||
# code to the prod tenant fleet.
|
||||
#
|
||||
# Environment:
|
||||
# CANARY_TENANT_URLS space-sep list of canary tenant base URLs
|
||||
# (e.g. "https://canary-pm.staging.moleculesai.app
|
||||
# https://canary-mcp.staging.moleculesai.app")
|
||||
# CANARY_ADMIN_TOKENS space-sep list of ADMIN_TOKENs, positionally
|
||||
# matched to CANARY_TENANT_URLS. Canary tenants
|
||||
# are provisioned with known ADMIN_TOKENs so CI
|
||||
# can hit their admin-gated endpoints.
|
||||
# CANARY_CP_BASE_URL CP base URL the canaries call back to
|
||||
# (https://staging-api.moleculesai.app)
|
||||
# CANARY_CP_SHARED_SECRET matches CP's PROVISION_SHARED_SECRET so this
|
||||
# script can also exercise /cp/workspaces/* via
|
||||
# the canary's own CPProvisioner identity.
|
||||
#
|
||||
# Exit codes: 0 = all green, 1 = assertion failure, 2 = setup/env problem.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# ── Setup ────────────────────────────────────────────────────────────────
|
||||
|
||||
: "${CANARY_TENANT_URLS:?space-sep list of canary base URLs required}"
|
||||
: "${CANARY_ADMIN_TOKENS:?space-sep list of ADMIN_TOKENs required, same order as URLs}"
|
||||
: "${CANARY_CP_BASE_URL:?CP base URL required}"
|
||||
|
||||
read -r -a URLS <<< "$CANARY_TENANT_URLS"
|
||||
read -r -a TOKENS <<< "$CANARY_ADMIN_TOKENS"
|
||||
|
||||
if [ "${#URLS[@]}" -ne "${#TOKENS[@]}" ]; then
|
||||
echo "ERROR: URLS(${#URLS[@]}) and TOKENS(${#TOKENS[@]}) length mismatch" >&2
|
||||
exit 2
|
||||
fi
|
||||
if [ "${#URLS[@]}" -eq 0 ]; then
|
||||
echo "ERROR: no canary URLs configured" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
PASS=0
|
||||
FAIL=0
|
||||
|
||||
# ── Helpers ──────────────────────────────────────────────────────────────
|
||||
|
||||
check() {
|
||||
local desc="$1" expected="$2" actual="$3"
|
||||
if echo "$actual" | grep -qF "$expected"; then
|
||||
printf " PASS %s\n" "$desc"
|
||||
PASS=$((PASS + 1))
|
||||
else
|
||||
printf " FAIL %s\n expected to contain: %s\n got: %s\n" "$desc" "$expected" "$actual" >&2
|
||||
FAIL=$((FAIL + 1))
|
||||
fi
|
||||
}
|
||||
|
||||
# acurl does an admin-authenticated GET/POST/etc. against a canary tenant.
|
||||
# Takes +BASE_URL +ADMIN_TOKEN as its first two positional args; the rest
|
||||
# are passed through to curl. Keeps the two values paired so the wrong
|
||||
# tenant never gets the wrong token.
|
||||
acurl() {
|
||||
local base="$1" token="$2"; shift 2
|
||||
curl -sS --max-time 20 -H "Authorization: Bearer $token" "$@" -- "$base${CANARY_ACURL_PATH:-}"
|
||||
}
|
||||
|
||||
# ── Checks (run per canary tenant) ───────────────────────────────────────
|
||||
|
||||
for i in "${!URLS[@]}"; do
|
||||
base="${URLS[$i]}"
|
||||
token="${TOKENS[$i]}"
|
||||
printf "\n── %s ──\n" "$base"
|
||||
|
||||
# 1. Liveness — the tenant is up and responding to admin auth.
|
||||
CANARY_ACURL_PATH="/admin/liveness" resp=$(acurl "$base" "$token" || true)
|
||||
check "liveness returns a subsystems map" '"subsystems"' "$resp"
|
||||
|
||||
# 2. CP env refresh — the workspace-server fetched MOLECULE_CP_SHARED_SECRET
|
||||
# from CP on startup. We can't read env directly, but we can assert the
|
||||
# liveness + workspace list both work, which together imply the binary
|
||||
# booted without crashing on the refresh call. A startup failure in
|
||||
# refreshEnvFromCP logs but still boots (best-effort semantics), so
|
||||
# this is a sanity check, not a proof.
|
||||
CANARY_ACURL_PATH="/workspaces" resp=$(acurl "$base" "$token" || true)
|
||||
check "workspace list is JSON array" "[" "$resp"
|
||||
|
||||
# 3. Memory commit round-trip — scope=LOCAL so test data stays on this
|
||||
# tenant. Verifies encryption + scrubber + retrieval end-to-end.
|
||||
probe_id="canary-smoke-$(date +%s)-$i"
|
||||
body=$(printf '{"scope":"LOCAL","namespace":"canary-smoke","content":"probe-%s"}' "$probe_id")
|
||||
CANARY_ACURL_PATH="/memories/commit" resp=$(curl -sS --max-time 20 \
|
||||
-X POST -H "Content-Type: application/json" -H "Authorization: Bearer $token" \
|
||||
--data "$body" "$base/memories/commit" || true)
|
||||
check "memory commit accepted" '"id"' "$resp"
|
||||
|
||||
CANARY_ACURL_PATH="/memories/search?query=probe-${probe_id}" \
|
||||
resp=$(curl -sS --max-time 20 -H "Authorization: Bearer $token" \
|
||||
"$base/memories/search?query=probe-${probe_id}" || true)
|
||||
check "memory search finds the probe" "probe-${probe_id}" "$resp"
|
||||
|
||||
# 4. Events admin read — AdminAuth path (C4 fail-closed proof on SaaS).
|
||||
CANARY_ACURL_PATH="/events" resp=$(acurl "$base" "$token" || true)
|
||||
check "events endpoint returns JSON" "[" "$resp"
|
||||
|
||||
# 5. Negative: unauth'd admin call must 401 (C4 regression gate).
|
||||
unauth_code=$(curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "$base/admin/liveness" || echo "000")
|
||||
check "unauth'd /admin/liveness returns 401" "401" "$unauth_code"
|
||||
done
|
||||
|
||||
# ── Summary ──────────────────────────────────────────────────────────────
|
||||
|
||||
printf "\n=== CANARY SMOKE RESULTS ===\n"
|
||||
printf " PASS: %d\n FAIL: %d\n" "$PASS" "$FAIL"
|
||||
|
||||
if [ "$FAIL" -gt 0 ]; then
|
||||
exit 1
|
||||
fi
|
||||
Loading…
Reference in New Issue
Block a user