ci(tenant-image): add build-time smoke gate so broken image never becomes :staging-latest (P0 SEV) #3111
@@ -280,12 +280,22 @@ jobs:
|
||||
aws ecr get-login-password --region us-east-2 | \
|
||||
docker login --username AWS --password-stdin "${STAGING_ECR_REGISTRY}"
|
||||
|
||||
# push_refs is the single source of truth for the four image refs this
# job publishes: the primary and staging repositories, each tagged with
# the commit SHA tag and the rolling "latest" tag. The `docker push`
# loop further down iterates it directly (it needs bare refs, with no
# --tag flag).
push_refs=(
  "${TENANT_IMAGE_NAME}:${TAG_SHA}"
  "${TENANT_IMAGE_NAME}:${TAG_LATEST}"
  "${STAGING_TENANT_IMAGE_NAME}:${TAG_SHA}"
  "${STAGING_TENANT_IMAGE_NAME}:${TAG_LATEST}"
)

# build_tags is the buildx-argv form (--tag, ref, --tag, ref, ...) so we
# can pass "${build_tags[@]}" to `docker buildx build`. It is derived
# from push_refs rather than written out by hand, so the tags that get
# built and the refs that get pushed cannot drift apart.
build_tags=()
for ref in "${push_refs[@]}"; do
  build_tags+=(--tag "${ref}")
done
|
||||
|
||||
# Retry loop: buildkit EOF (internal#2468) is often transient on the
|
||||
# publish runner under memory pressure. Up to 3 attempts with a fresh
|
||||
@@ -295,8 +305,14 @@ jobs:
|
||||
# NEVER local cache here — cache-from gives retries AND the next run
|
||||
# a warm start. Cache lives on the PRIMARY ECR only (the staging
|
||||
# mirror is a push target, not a cache source).
|
||||
#
|
||||
# P0 SEV (2026-06-21): build uses --load (not --push) so the smoke
|
||||
# gate below can run the just-built image locally BEFORE it ever
|
||||
# touches ECR. A broken image can no longer become :staging-latest —
|
||||
# the gate fails the build (and the push step is skipped) if the
|
||||
# container does not reach /healthz=200 within the gate's timeout
# (180s for the full-env variant, 120s for the sidecar-disabled one).
|
||||
for attempt in 1 2 3; do
|
||||
echo "::notice::Tenant image build attempt ${attempt}/3 ..."
|
||||
echo "::notice::Tenant image build (--load) attempt ${attempt}/3 ..."
|
||||
builder="tenant-builder-${GITHUB_RUN_ID}-${attempt}"
|
||||
docker buildx create --name "${builder}" --use >/dev/null 2>&1 || true
|
||||
if docker buildx build \
|
||||
@@ -313,9 +329,9 @@ jobs:
|
||||
--cache-from "type=registry,ref=${TENANT_IMAGE_NAME}:buildcache" \
|
||||
--cache-to "type=registry,ref=${TENANT_IMAGE_NAME}:buildcache,mode=max,image-manifest=true,oci-mediatypes=true,ignore-error=true" \
|
||||
"${build_tags[@]}" \
|
||||
--push .; then
|
||||
--load .; then
|
||||
docker buildx rm "${builder}" >/dev/null 2>&1 || true
|
||||
echo "::notice::Tenant image build succeeded on attempt ${attempt}"
|
||||
echo "::notice::Tenant image build (--load) succeeded on attempt ${attempt}"
|
||||
break
|
||||
fi
|
||||
echo "::warning::Tenant image build attempt ${attempt} failed — cleaning builder and retrying"
|
||||
@@ -327,6 +343,200 @@ jobs:
|
||||
fi
|
||||
done
|
||||
|
||||
# ====== SMOKE GATE (P0 SEV hardening) ======
|
||||
# Run the just-built image locally and assert it reaches /healthz=200
|
||||
# BEFORE pushing to ECR. A broken image MUST NOT become
|
||||
# :staging-latest. The existing canary/staging-verify job is a
|
||||
# post-push safety net — this gate is the build-time pre-push net
|
||||
# that catches the same defect ~10x faster (no ECR round-trip, no
|
||||
# cloudflared tunnel, no full org provisioning).
|
||||
#
|
||||
# P0 UPDATE (RCA 107680, PM dispatch fb0ab22f): the original
|
||||
# "bare `docker run`" smoke did NOT exercise the
|
||||
# env-activated MEMORY_PLUGIN_URL sidecar branch of
|
||||
# entrypoint-tenant.sh — a bare run with no MEMORY_PLUGIN_URL
|
||||
# set skips the sidecar entirely, so a defect in that branch
|
||||
# (the actual prod-onboarding root cause was a malformed
|
||||
# templateRepoEnvLine that the sidecar boot path tripped on)
|
||||
# would have passed the bare smoke. The gate now runs TWO
|
||||
# variants per build:
|
||||
# (A) FULL ENV — DATABASE_URL + MEMORY_PLUGIN_URL set, so
|
||||
# the sidecar branch EXECUTES. Boots a local pgvector
|
||||
# container, points the tenant at it, asserts BOTH
|
||||
# /healthz=200 (platform) AND :9100/v1/health=200
|
||||
# (memory-plugin sidecar).
|
||||
# (B) SIDECAR-DISABLED — MEMORY_PLUGIN_DISABLE=1, no DATABASE_URL.
|
||||
# Verifies the "sidecar off" boot path still works
|
||||
# (e.g. for a CP template that explicitly opts out, or
|
||||
# for self-hosted tenants without the memory v2 stack).
|
||||
# Both must pass. If either fails, the build is failed and
|
||||
# NO push occurs.
|
||||
|
||||
# Names of the docker resources the smoke gate creates. Each one is
# suffixed with the run id.
SMOKE_NET="smoke-net-${GITHUB_RUN_ID}"
PGV_NAME="smoke-pgv-${GITHUB_RUN_ID}"
SMOKE_NAME_FULL="smoke-tenant-full-${GITHUB_RUN_ID}"
SMOKE_NAME_BARE="smoke-tenant-bare-${GITHUB_RUN_ID}"

# Best-effort teardown of every smoke-gate resource: force-removes the
# three containers, then the network. Errors are deliberately ignored
# (a resource may not exist yet, or may already be gone), so this is
# safe to call at any point and more than once.
cleanup_all() {
  docker rm -f "${SMOKE_NAME_FULL}" "${SMOKE_NAME_BARE}" "${PGV_NAME}" >/dev/null 2>&1 || true
  docker network rm "${SMOKE_NET}" >/dev/null 2>&1 || true
}
# Tear down on any exit path, including an early failure.
trap cleanup_all EXIT

# Pre-clean: if an earlier attempt that used the same run id was killed
# before its EXIT trap could run, its containers/network may still be
# around and `docker network create` below would fail on the name clash.
cleanup_all

# Create an isolated user-defined bridge network so the pgvector
# container and the tenant container can resolve each other by
# name (DNS baked into user-defined networks).
docker network create "${SMOKE_NET}" >/dev/null
|
||||
|
||||
# --- pgvector sidecar (FULL ENV only) -----------------------------
# Uses the official image (pgvector/pgvector:pg16), which bundles the
# vector extension. The pull is cached after the first run, so
# steady-state cost is ~0s; first-run cost is ~10-15s.
# Nominal worst-case budget for the whole gate: 60s (pgvector ready)
# + 180s (A: /healthz) + 30s (A: sidecar /v1/health) + 120s
# (B: /healthz), plus up to 2s per probe for curl's --max-time.
echo "::notice::Smoke gate (FULL ENV): starting pgvector sidecar ${PGV_NAME}"
docker run -d --rm \
  --name "${PGV_NAME}" \
  --network "${SMOKE_NET}" \
  -e POSTGRES_PASSWORD=smoketest \
  -e POSTGRES_USER=smoke \
  -e POSTGRES_DB=smoke \
  pgvector/pgvector:pg16 >/dev/null

# Wait for pgvector to accept connections (max 60s: 30 probes, 2s apart).
# The probe goes over TCP (-h 127.0.0.1) rather than the unix socket:
# during first-boot initialisation the postgres image runs a temporary
# server that answers on the socket only and is then shut down, so a
# socket probe can report "ready" just before the server restarts and
# the psql call below would race that restart.
pgv_ok=0
for i in $(seq 1 30); do
  if docker exec "${PGV_NAME}" pg_isready -h 127.0.0.1 -U smoke -d smoke >/dev/null 2>&1; then
    pgv_ok=1
    break
  fi
  sleep 2
done
if [ "${pgv_ok}" -ne 1 ]; then
  echo "::error::pgvector sidecar never reached ready in 60s — aborting smoke"
  trap - EXIT; cleanup_all; exit 1
fi

# Install the vector extension. The memory-plugin's 000_schema_bootstrap
# migration expects this to exist (per entrypoint-tenant.sh comment
# on search_path + public fallback). Created explicitly here so
# any future schema-bootstrap refactor still gets a clean test.
# Failure is handled explicitly so the job stops with a clear
# annotation instead of depending on the shell's errexit setting.
if ! docker exec "${PGV_NAME}" psql -U smoke -d smoke -c "CREATE EXTENSION IF NOT EXISTS vector;" >/dev/null; then
  echo "::error::pgvector sidecar: CREATE EXTENSION vector failed — aborting smoke"
  trap - EXIT; cleanup_all; exit 1
fi
echo "::notice::pgvector sidecar ready + vector extension installed"
|
||||
|
||||
# --- (A) FULL ENV smoke: sidecar branch must execute ---------------
# DATABASE_URL points at the pgvector container via DNS name
# (user-defined network). MEMORY_PLUGIN_URL points at the
# in-container sidecar on :9100 (localhost inside the tenant
# container's network namespace). Container port 9100 is published
# on host port 19100 so the host can probe the sidecar's /v1/health
# via curl; container port 8080 is published on host port 18080.
#
# No --rm on this container: a tenant that crashes on boot would be
# deleted the moment it exits, and the `docker logs` dumps in the
# failure paths below would have nothing to show. Removal is done by
# the explicit `docker rm -f` at the end of this variant and by
# cleanup_all.
echo "::notice::Smoke gate (A: FULL ENV): starting ${SMOKE_NAME_FULL}"
docker run -d \
  --name "${SMOKE_NAME_FULL}" \
  --network "${SMOKE_NET}" \
  -e PORT=8080 \
  -e MOLECULE_TENANT_MODE=smoke \
  -e DATABASE_URL="postgres://smoke:smoketest@${PGV_NAME}:5432/smoke?sslmode=disable" \
  -e MEMORY_PLUGIN_URL="http://localhost:9100" \
  -e MEMORY_PLUGIN_LISTEN_ADDR=":9100" \
  -p 18080:8080 \
  -p 19100:9100 \
  "${TENANT_IMAGE_NAME}:${TAG_SHA}" >/dev/null

# Poll the platform /healthz: up to 90 probes, 2s apart (~180s).
full_ok=0
full_exited=0
code="000"
for i in $(seq 1 90); do
  # curl prints the -w write-out even when the transfer fails, so the
  # fallback must REPLACE the captured value; appending via
  # `|| echo "000"` would produce "000000".
  code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 2 "http://localhost:18080/healthz" 2>/dev/null) || code="000"
  if [ "$code" = "200" ]; then
    full_ok=1
    break
  fi
  # A container that has already exited can never become healthy;
  # stop polling instead of burning the rest of the timeout.
  if [ "$(docker inspect -f '{{.State.Running}}' "${SMOKE_NAME_FULL}" 2>/dev/null)" != "true" ]; then
    full_exited=1
    break
  fi
  sleep 2
done
if [ "${full_ok}" -ne 1 ]; then
  if [ "${full_exited}" -eq 1 ]; then
    echo "::error::Smoke gate (A: FULL ENV) FAILED: tenant container exited before /healthz returned 200 (last code: ${code})"
  else
    echo "::error::Smoke gate (A: FULL ENV) FAILED: tenant /healthz never returned 200 in 180s (last code: ${code})"
  fi
  echo "::error::This means the entrypoint-tenant.sh MEMORY_PLUGIN_URL sidecar branch could not"
  echo "::error::boot to healthy — the same class of defect that caused the prod-outage."
  echo "::error::Last 120 lines of container logs:"
  docker logs --tail 120 "${SMOKE_NAME_FULL}" 2>&1 || true
  trap - EXIT; cleanup_all; exit 1
fi
echo "::notice::Smoke gate (A: FULL ENV) PASSED: platform /healthz=200 in ~$((i*2))s"

# Verify the memory-plugin sidecar itself is healthy on :9100.
# The port is published on host :19100 so the host can curl the
# sidecar directly (avoids the YAML-escaping problem of running a
# node script inside the tenant container). The sidecar is the
# regression point — the prod-outage was the sidecar boot path
# failing under malformed env, not the platform itself.
# Up to 30 probes, 1s apart.
sidecar_ok=0
sc="000"
for j in $(seq 1 30); do
  sc=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 2 "http://localhost:19100/v1/health" 2>/dev/null) || sc="000"
  if [ "$sc" = "200" ]; then
    sidecar_ok=1
    break
  fi
  sleep 1
done
if [ "${sidecar_ok}" -ne 1 ]; then
  echo "::error::Smoke gate (A: FULL ENV) FAILED: memory-plugin /v1/health did not return 200 in 30s (last code: ${sc})"
  echo "::error::Platform is healthy but the sidecar isn't — the sidecar is the regression point."
  echo "::error::Last 60 lines of container logs:"
  docker logs --tail 60 "${SMOKE_NAME_FULL}" 2>&1 || true
  trap - EXIT; cleanup_all; exit 1
fi
echo "::notice::Smoke gate (A: FULL ENV) PASSED: memory-plugin /v1/health=200 on :9100 in ~${j}s"
# Best-effort removal, same as cleanup_all: a failure to remove the
# container must not fail a gate that has already passed.
docker rm -f "${SMOKE_NAME_FULL}" >/dev/null 2>&1 || true
|
||||
|
||||
# --- (B) SIDECAR-DISABLED smoke: bare-equivalent path --------------
# Verifies the "sidecar off" boot path still works (no
# DATABASE_URL, MEMORY_PLUGIN_DISABLE=1 — entrypoint skips the
# sidecar branch entirely, just like a self-hosted tenant
# without the memory v2 stack). Container port 8080 is published on
# host port 18081.
#
# No --rm on this container: a tenant that crashes on boot would be
# deleted the moment it exits, and the `docker logs` dump in the
# failure path below would have nothing to show. Removal is done by
# the explicit `docker rm -f` at the end of this variant and by
# cleanup_all.
echo "::notice::Smoke gate (B: SIDECAR-DISABLED): starting ${SMOKE_NAME_BARE}"
docker run -d \
  --name "${SMOKE_NAME_BARE}" \
  -p 18081:8080 \
  -e PORT=8080 \
  -e MOLECULE_TENANT_MODE=smoke \
  -e MEMORY_PLUGIN_DISABLE=1 \
  "${TENANT_IMAGE_NAME}:${TAG_SHA}" >/dev/null

# Poll /healthz: up to 60 probes, 2s apart (~120s).
bare_ok=0
bare_exited=0
code="000"
for i in $(seq 1 60); do
  # curl prints the -w write-out even when the transfer fails, so the
  # fallback must REPLACE the captured value; appending via
  # `|| echo "000"` would produce "000000".
  code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 2 "http://localhost:18081/healthz" 2>/dev/null) || code="000"
  if [ "$code" = "200" ]; then
    bare_ok=1
    break
  fi
  # A container that has already exited can never become healthy;
  # stop polling instead of burning the rest of the timeout.
  if [ "$(docker inspect -f '{{.State.Running}}' "${SMOKE_NAME_BARE}" 2>/dev/null)" != "true" ]; then
    bare_exited=1
    break
  fi
  sleep 2
done
if [ "${bare_ok}" -ne 1 ]; then
  if [ "${bare_exited}" -eq 1 ]; then
    echo "::error::Smoke gate (B: SIDECAR-DISABLED) FAILED: container exited before /healthz returned 200 (last code: ${code})"
  else
    echo "::error::Smoke gate (B: SIDECAR-DISABLED) FAILED: /healthz never returned 200 in 120s (last code: ${code})"
  fi
  echo "::error::Last 80 lines of container logs:"
  docker logs --tail 80 "${SMOKE_NAME_BARE}" 2>&1 || true
  trap - EXIT; cleanup_all; exit 1
fi
echo "::notice::Smoke gate (B: SIDECAR-DISABLED) PASSED: /healthz=200 in ~$((i*2))s"
# Best-effort removal, same as cleanup_all: a failure to remove the
# container must not fail a gate that has already passed.
docker rm -f "${SMOKE_NAME_BARE}" >/dev/null 2>&1 || true
|
||||
|
||||
# Both variants passed: disarm the EXIT trap and tear the smoke
# resources down now, before the push.
trap - EXIT
cleanup_all
echo "::notice::Smoke gate PASSED both variants; pushing to ECR (4 tags)"

# Smoke passed — push the loaded image to ECR with all 4 tags. If any
# push fails, the build fails (the image was already loaded into the
# local daemon by --load, so each push is a network-only operation).
#
# CR2 RC 12948 / Researcher RC 12946: previous version iterated
# build_tags (the buildx --tag, ref alternating argv) which
# produced `docker push --tag` on the first iteration and
# failed before any image reached ECR. The fix: iterate the
# parallel push_refs list which holds only the bare image
# refs (no --tag flag).
#
# The failure check is explicit so the job stops at the first failed
# push with a clear annotation, whether or not the shell is running
# with errexit.
for ref in "${push_refs[@]}"; do
  echo "::notice::Pushing ${ref}"
  if ! docker push "${ref}"; then
    echo "::error::docker push failed for ${ref} — aborting; remaining tags were not pushed"
    exit 1
  fi
done
|
||||
|
||||
# Staging auto-deploy: every workspace-server image publish on main should
|
||||
# roll out to the staging fleet so code fixes reach staging without a
|
||||
# manual workflow_dispatch. Gitea 1.22.6 does not support workflow_run, so
|
||||
|
||||
Reference in New Issue
Block a user