Merge pull request 'test(e2e): staging coverage for every runtime + resume/hibernate lifecycle' (#2296) from harden/staging-saas-all-runtimes into main

2026-06-05 11:21:37 +00:00
parent ba78894858 2e31f27304
commit f78fef4c97
5 changed files with 429 additions and 8 deletions
@@ -124,7 +124,12 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    # mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
-    timeout-minutes: 45
+    # Raised 45→75: step 10b now exercises pause→resume→online +
+    # hibernate→wake→online, each of which RE-PROVISIONS the parent (CP
+    # re-provision + heartbeat recovery, not a fresh EC2 cold start, but still
+    # minutes). The base provision→online→A2A matrix fits in ~35 min; the two
+    # extra lifecycle reprovisions need headroom under WORKSPACE_ONLINE_TIMEOUT.
+    timeout-minutes: 75
    permissions:
      contents: read

@@ -184,6 +189,11 @@ jobs:
      E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'codex' && 'openai/gpt-4o' || github.event.inputs.runtime == 'google-adk' && 'google_genai:gemini-2.5-pro' || 'minimax:MiniMax-M2.7' }}
      E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
      E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
+      # Lifecycle transitions (step 10b): pause→resume→online +
+      # hibernate→wake→online on the provisioned parent. `auto` runs them in
+      # full mode (this job). Set `off` to skip the ~2x-reprovision cost on an
+      # ad-hoc dispatch. The timeout-minutes above is sized for this being on.
+      E2E_LIFECYCLE: auto
      # Fail-closed-on-skip: in CI the harness MUST prove ≥1 full
      # provision→online→A2A cycle. If it reaches the end having validated
      # nothing (a future short-circuit / skip path), it exits 5 rather than
@@ -83,7 +83,17 @@ pick_model_slug() {
  fi
  case "$runtime" in
    hermes)      printf 'openai/gpt-4o' ;;
-    claude-code)
+    # seo-agent is a claude-code-adapter template VARIANT selected by
+    # template name (template="seo-agent"), not a distinct registry runtime
+    # (it is absent from manifest.json + runtime_registry.go). Its config.yaml
+    # declares `runtime: claude-code` and copies the claude-code `providers:`
+    # block (providers.yaml:21 "The same block is copy-pasted into the seo-agent
+    # template"), so its model dispatch is IDENTICAL to claude-code's: the
+    # MiniMax BYOK colon id (the staging-default key path), else direct
+    # Anthropic, else the OAuth `sonnet` alias. Sharing the claude-code branch
+    # keeps the SSOT one place — a seo-agent run is just a claude-code run
+    # behind a productized template skin.
+    claude-code|seo-agent)
      if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
        # Namespaced (colon) BYOK id, not bare "MiniMax-M2" (#2263 deploy-skew):
        # bare ids can lag the deployed staging ws-server's compiled registry,
@@ -102,6 +112,20 @@ pick_model_slug() {
        printf 'sonnet'
      fi
      ;;
+    # google-adk: Gemini via two distinct provider arms in providers.yaml
+    # runtimes.google-adk:
+    #   * platform arm → `platform:gemini-2.5-pro` (keyless Vertex via the CP
+    #     LLM proxy + server-side WIF mint; the org-compliant PROD path). This
+    #     id is selected via E2E_LLM_PATH=platform above, NOT here.
+    #   * google arm (AI Studio BYOK) → bare `gemini-2.5-pro` with the tenant's
+    #     own GOOGLE_API_KEY. This is the staging-exercisable path (no WIF
+    #     provisioning needed) and is what this branch selects.
+    # The workflow may further override with E2E_MODEL_SLUG=google_genai:gemini-2.5-pro
+    # (the adapter's provider:model spelling) — E2E_MODEL_SLUG wins at the top
+    # of this function, so both forms are supported.
+    google-adk)
+      printf 'gemini-2.5-pro'
+      ;;
    *)           printf 'openai/gpt-4o' ;;  # safest fallback (matches hermes)
  esac
 }
@@ -57,6 +57,29 @@ assert_eq "claude-code + Anthropic API key → Anthropic API model" "$got" "clau
 got=$(unset E2E_MODEL_SLUG; E2E_MINIMAX_API_KEY="mx-priority" E2E_ANTHROPIC_API_KEY="sk-ant-loser" pick_model_slug claude-code)
 assert_eq "claude-code + both keys → MiniMax priority"            "$got" "minimax:MiniMax-M2.7"

+# ── seo-agent (claude-code-adapter template variant) ──
+# seo-agent shares the claude-code dispatch branch (it reuses the claude-code
+# adapter + the same copied providers block). Pin that it resolves IDENTICALLY
+# to claude-code for every key path so a future refactor can't accidentally
+# fork seo-agent's model selection from claude-code's.
+run_test "seo-agent → claude-code default alias"                  seo-agent   "sonnet"
+
+got=$(unset E2E_MODEL_SLUG E2E_ANTHROPIC_API_KEY; E2E_MINIMAX_API_KEY="mx-test" pick_model_slug seo-agent)
+assert_eq "seo-agent + MiniMax key → MiniMax model (==claude-code)"  "$got" "minimax:MiniMax-M2.7"
+
+got=$(unset E2E_MODEL_SLUG E2E_MINIMAX_API_KEY; E2E_ANTHROPIC_API_KEY="sk-ant-test" pick_model_slug seo-agent)
+assert_eq "seo-agent + Anthropic key → Anthropic model (==claude-code)" "$got" "claude-sonnet-4-6"
+
+# ── google-adk (Gemini) ──
+# AI-Studio BYOK arm → bare gemini-2.5-pro (providers.yaml runtimes.google-adk
+# `google` arm). The platform/Vertex arm is selected via E2E_LLM_PATH=platform
+# (a platform: id), not this dispatch. Pin the bare form so a drift to the
+# platform id (which would change billing/route) is caught.
+run_test "google-adk → AI-Studio bare gemini id"                  google-adk  "gemini-2.5-pro"
+
+got=$(E2E_MODEL_SLUG="google_genai:gemini-2.5-pro" pick_model_slug google-adk)
+assert_eq "google-adk + E2E_MODEL_SLUG override (adapter spelling)" "$got" "google_genai:gemini-2.5-pro"
+
 # ── Fallback for unknown runtime ──
 # Picks slash-form (hermes-shaped) since hermes is the historical
 # default and most third-party runtimes behave hermes-like. Pinning
@@ -26,7 +26,26 @@
 #      the workspace stuck on 'online' indefinitely.)
 #
 # Hibernation is intentionally NOT covered here — it has its own timing
-# model (idle threshold) and warrants a separate harness.
+# model (idle threshold) and warrants a separate harness. (The
+# pause→resume + hibernate→wake transitions for PLATFORM-compute runtimes
+# are covered by test_staging_full_saas.sh step 10b.)
+#
+# BYO meta-runtime arms (kimi, kimi-cli) — added 2026-06-05:
+#   kimi and kimi-cli are BYO-compute meta-runtimes (isExternalLikeRuntime:
+#   runtime_registry.go:141-147) that go through the SAME external/poll
+#   provisioning path as `external` — create with external:true →
+#   awaiting_agent, register → online — but with their runtime LABEL
+#   PRESERVED (workspace.go:752-770 normalizeExternalRuntime keeps the
+#   specific label, does NOT coerce to generic "external", so the canvas
+#   shows the right runtime). They had ONLY validation/unit coverage and
+#   were NEVER provisioned→online in any e2e. Step 9 adds, for EACH of
+#   {kimi, kimi-cli}: create → assert awaiting_agent + label-preserved →
+#   register(poll) → assert online + label-preserved → A2A → assert the
+#   poll-mode {status:"queued"} envelope (a2a_proxy.go:462-477). The A2A
+#   arm proves the a2a proxy routes a BYO meta-runtime to the poll queue
+#   (200 + queued) rather than 404/500 — the meaningful round-trip for a
+#   workspace with no standing live agent. A real BYO-agent COMPLETION
+#   needs a standing kimi BYO cell (flagged for the CTO in the PR body).
 #
 # Required env (mirrors test_staging_full_saas.sh):
 #   MOLECULE_CP_URL          default: https://staging-api.moleculesai.app
@@ -456,6 +475,108 @@ RECOVERED_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.lo
 ok "Re-register succeeded — awaiting_agent → online (operator-recoverable)"
 require_transition "re-register: awaiting_agent → online (recovery)"

+# ─── 7b. BYO meta-runtime arms: kimi + kimi-cli ─────────────────────────
+# kimi and kimi-cli are BYO-compute meta-runtimes (isExternalLikeRuntime).
+# They share the external/poll provisioning path but PRESERVE their runtime
+# label (workspace.go normalizeExternalRuntime). They had no provision→online
+# e2e until now. For EACH: create(external:true, runtime=<rt>) → assert
+# awaiting_agent + label preserved → register(poll) → assert online + label
+# preserved → A2A → assert the poll-mode {status:"queued"} envelope.
+#
+# Why poll-mode {queued} is the A2A assertion (not a real completion): there
+# is no standing live BYO agent in staging, so the meaningful round-trip is
+# that the a2a proxy ROUTES a BYO meta-runtime to the poll queue (HTTP 200 +
+# {status:"queued", delivery_mode:"poll"}, a2a_proxy.go:462-477) instead of
+# 404/500. A real BYO-agent COMPLETION needs a standing kimi BYO cell — see
+# the CTO flag in the PR body.
+byo_meta_runtime_arm() {  # $1 = runtime label (kimi | kimi-cli)
+  local rt="$1"
+  local resp wid status auth get_resp db_status reg_dm online_status
+  log "    [$rt] create (external:true, runtime=$rt)..."
+  resp=$(tenant_call POST /workspaces \
+    -d "$(printf '{"name":"ext-%s-e2e","runtime":"%s","external":true}' "$rt" "$rt")")
+  wid=$(echo "$resp" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))")
+  status=$(echo "$resp" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
+  auth=$(echo "$resp" | python3 -c "
+import json,sys
+try:
+    d=json.load(sys.stdin); conn=d.get('connection') or {}
+    print(conn.get('auth_token','') or d.get('auth_token',''))
+except Exception:
+    print('')
+")
+  [ -z "$wid" ] && fail "[$rt] create missing id: $resp"
+  [ "$status" = "awaiting_agent" ] || fail "[$rt] create status='$status' (expected awaiting_agent — external/poll path)"
+  [ -z "$auth" ] && fail "[$rt] create returned no workspace auth token — register impossible"
+
+  # Assert the runtime LABEL was preserved (NOT coerced to generic 'external').
+  get_resp=$(tenant_call GET "/workspaces/$wid")
+  db_status=$(echo "$get_resp" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
+  local db_runtime
+  db_runtime=$(echo "$get_resp" | python3 -c "import json,sys; print(json.load(sys.stdin).get('runtime',''))")
+  [ "$db_status" = "awaiting_agent" ] || fail "[$rt] DB row status=$db_status (expected awaiting_agent)"
+  [ "$db_runtime" = "$rt" ] || fail "[$rt] runtime label coerced to '$db_runtime' (expected '$rt' — normalizeExternalRuntime must PRESERVE the BYO meta-runtime label, workspace.go:752-770)"
+  ok "    [$rt] create → awaiting_agent, runtime label preserved ✓"
+
+  # register(poll) → online. Reuse register_with_retry by setting WS_AUTH_TOKEN
+  # (the helper reads it as a global). REGISTER_RESP is set by the helper.
+  WS_AUTH_TOKEN="$auth"
+  local body
+  body=$(printf '{"id":"%s","url":"https://example.invalid:443","delivery_mode":"poll","agent_card":{"name":"e2e-%s","skills":[{"id":"echo","name":"Echo"}]}}' "$wid" "$rt")
+  REGISTER_RESP=""
+  register_with_retry "[$rt] register" "$body" \
+    || fail "[$rt] register returned non-200 after bounded retries — body: $(printf '%s' "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
+  online_status=$(tenant_call GET "/workspaces/$wid" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
+  [ "$online_status" = "online" ] || fail "[$rt] expected online after register, got $online_status"
+  reg_dm=$(echo "$REGISTER_RESP" | head -n1 | python3 -c "import json,sys; print(json.load(sys.stdin).get('delivery_mode',''))" 2>/dev/null || echo "")
+  [ "$reg_dm" = "poll" ] || fail "[$rt] register response delivery_mode='$reg_dm' (expected poll)"
+  ok "    [$rt] register → online (delivery_mode=poll) ✓"
+
+  # A2A → assert poll-mode {status:"queued"} envelope. Bounded retry on the
+  # transient cold-edge 5xx class; a 4xx/non-queued 2xx is a real bug.
+  local a2a_payload a2a_tmp a2a_code a2a_rc a2a_status attempt
+  a2a_payload=$(python3 -c "
+import json, uuid
+print(json.dumps({
+    'jsonrpc':'2.0','method':'message/send','id':'e2e-byo-1',
+    'params':{'message':{'role':'user','messageId':f'e2e-{uuid.uuid4().hex[:8]}',
+        'parts':[{'kind':'text','text':'BYO meta-runtime poll-route smoke. Respond: OK'}]}}
+}))
+")
+  a2a_tmp=$(mktemp -t byo_a2a.XXXXXX)
+  for attempt in $(seq 1 8); do
+    : >"$a2a_tmp"
+    set +e
+    a2a_code=$(curl -sS --max-time 60 -X POST "$TENANT_URL/workspaces/$wid/a2a" \
+      -H "Authorization: Bearer $TENANT_TOKEN" \
+      -H "X-Molecule-Org-Id: $ORG_ID" \
+      -H "Content-Type: application/json" \
+      -d "$a2a_payload" -o "$a2a_tmp" -w '%{http_code}' 2>/dev/null)
+    a2a_rc=$?
+    set -e
+    a2a_code=${a2a_code:-000}
+    if [ "$a2a_rc" = "0" ] && [ "$a2a_code" = "200" ]; then break; fi
+    if echo "$a2a_code" | grep -Eq '^(502|503|504)$' && [ "$attempt" -lt 8 ]; then
+      log "    [$rt] A2A transient $a2a_code attempt $attempt/8"; sleep 10; continue
+    fi
+    break
+  done
+  a2a_status=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('status',''))" "$a2a_tmp" 2>/dev/null || echo "")
+  local a2a_dm
+  a2a_dm=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('delivery_mode',''))" "$a2a_tmp" 2>/dev/null || echo "")
+  rm -f "$a2a_tmp"
+  [ "$a2a_rc" = "0" ] && [ "$a2a_code" = "200" ] \
+    || fail "[$rt] A2A POST failed (rc=$a2a_rc, http=$a2a_code) — a BYO meta-runtime poll-mode A2A must 200 with a queued envelope, not error"
+  [ "$a2a_status" = "queued" ] && [ "$a2a_dm" = "poll" ] \
+    || fail "[$rt] A2A returned status='$a2a_status' delivery_mode='$a2a_dm' (expected queued/poll — a2a proxy must route a BYO meta-runtime to the poll queue, a2a_proxy.go:462-477)"
+  ok "    [$rt] A2A → poll-mode queued envelope ✓ (provision→online→A2A proven for $rt)"
+}
+
+log "7c/8 BYO meta-runtime arms (kimi, kimi-cli) — provision→online→A2A..."
+byo_meta_runtime_arm "kimi"
+byo_meta_runtime_arm "kimi-cli"
+ok "BYO meta-runtime arms passed for kimi + kimi-cli"
+
 # ─── 8. Done — cleanup runs in the EXIT trap ───────────────────────────
 # REQUIRE_LIVE belt-and-braces: assert here too (in addition to the EXIT
 # trap) so the failure surfaces in step order, not only post-teardown.
@@ -24,6 +24,19 @@
 #
 # Optional env:
 #   E2E_RUNTIME                  hermes (default) | claude-code | codex | openclaw
+#                                | seo-agent | google-adk
+#                                  - seo-agent: a claude-code-adapter template
+#                                    VARIANT (not a distinct registry runtime).
+#                                    Selected via the `template` field (config.yaml
+#                                    resolves runtime=claude-code); reuses the
+#                                    same MiniMax/claude-code key path. See the
+#                                    TEMPLATE derivation + SECRETS_JSON block.
+#                                  - google-adk: Gemini. The AI-Studio-keyed BYOK
+#                                    path (E2E_GOOGLE_API_KEY) is staging-
+#                                    exercisable here; the keyless Vertex PROD
+#                                    path needs WIF (see header note + the CTO
+#                                    flag in the PR body) and is selected via
+#                                    E2E_LLM_PATH=platform + a platform: model.
 #   E2E_PROVISION_TIMEOUT_SECS   default 900 (15 min cold EC2 budget)
 #   E2E_WORKSPACE_ONLINE_TIMEOUT_SECS  default 3600 (60 min — hermes
 #                                cold-boot worst-case + slack). Raised from
@@ -47,6 +60,18 @@
 #                                tear down cleanly (and exit 4 on leak).
 #                                Used by a dedicated sanity workflow
 #                                that verifies the safety net.
+#   E2E_LIFECYCLE                auto (default) | off
+#                                When auto + MODE=full, exercises the
+#                                pause→resume→online and hibernate→resume(wake)
+#                                state transitions on the provisioned parent
+#                                (step 10b). These are REAL transitions on the
+#                                live tenant (Pause stops the container + sets
+#                                status=paused; Resume re-provisions →
+#                                provisioning → online; Hibernate stops +
+#                                status=hibernated; the next A2A auto-wakes it).
+#                                Set `off` for a fast smoke that skips the
+#                                ~2x-reprovision cost. In smoke MODE it is
+#                                skipped regardless (no parent stability budget).
 #   E2E_REQUIRE_LIVE             1 → fail-closed-on-skip guard (CI sets this).
 #                                When set, the run MUST actually complete
 #                                ≥1 full provision→online→A2A cycle. A run
@@ -592,6 +617,24 @@ print(json.dumps({
    'ANTHROPIC_API_KEY': k,
 }))
 ")
+elif [ -n "${E2E_GOOGLE_API_KEY:-}" ]; then
+  # google-adk AI-Studio BYOK path. The `google` provider entry
+  # (providers.yaml:401-413) reads GEMINI_API_KEY / GOOGLE_API_KEY and dials
+  # generativelanguage.googleapis.com — the tenant's OWN key, distinct from the
+  # keyless-Vertex PROD path (which routes through the CP proxy + server-side
+  # WIF and carries NO tenant credential). This branch exercises google-adk
+  # being PROVISIONED AT ALL on staging; the Vertex-specific WIF path is flagged
+  # for the CTO (needs extra provisioning) and is NOT reachable here. Inject
+  # under both env names the provider accepts so the adapter resolves regardless
+  # of which one it reads first.
+  SECRETS_JSON=$(python3 -c "
+import json, os
+k = os.environ['E2E_GOOGLE_API_KEY']
+print(json.dumps({
+    'GOOGLE_API_KEY': k,
+    'GEMINI_API_KEY': k,
+}))
+")
 elif [ -n "${E2E_OPENAI_API_KEY:-}" ]; then
  SECRETS_JSON=$(python3 -c "
 import json, os
@@ -611,11 +654,79 @@ fi
 MODEL_SLUG=$(pick_model_slug "$RUNTIME")
 log "    MODEL_SLUG=$MODEL_SLUG"

-log "5/11 Provisioning parent workspace (runtime=$RUNTIME)..."
+# ─── runtime → provision-selector resolution ────────────────────────────
+# Most runtimes are selected directly by the `runtime` field. seo-agent is
+# the exception: it is NOT a registry runtime (absent from manifest.json +
+# runtime_registry.go knownRuntimes) — it is a claude-code-adapter template
+# VARIANT selected by the `template` field. The ws-server Create handler reads
+# the template's config.yaml, which declares `runtime: claude-code`, and
+# resolves the concrete runtime from there (workspace.go:290-336). So for
+# seo-agent we send template="seo-agent" and OMIT runtime, letting the
+# template resolve it — sending an explicit runtime="seo-agent" would
+# RUNTIME_UNSUPPORTED-422 at workspace.go:374-384 because it is not in
+# knownRuntimes. PROVISION_TEMPLATE is "" for every real registry runtime.
+PROVISION_TEMPLATE=""
+case "$RUNTIME" in
+  seo-agent) PROVISION_TEMPLATE="seo-agent" ;;
+esac
+
+# Build the create payload in Python so the optional `template`/`runtime`
+# fields are emitted conditionally and the secrets blob is embedded without
+# shell-escaping hazards. Args: name, [parent_id|""].
+build_create_payload() {
+  local name="$1" parent_id="${2:-}"
+  E2E_WS_NAME="$name" \
+  E2E_WS_PARENT_ID="$parent_id" \
+  E2E_WS_RUNTIME="$RUNTIME" \
+  E2E_WS_TEMPLATE="$PROVISION_TEMPLATE" \
+  E2E_WS_MODEL="$MODEL_SLUG" \
+  E2E_WS_SECRETS="$SECRETS_JSON" \
+  python3 -c "
+import json, os
+secrets = json.loads(os.environ['E2E_WS_SECRETS'] or '{}')
+payload = {
+    'name': os.environ['E2E_WS_NAME'],
+    'tier': 2,
+    'model': os.environ['E2E_WS_MODEL'],
+    'secrets': secrets,
+}
+tmpl = os.environ.get('E2E_WS_TEMPLATE', '')
+if tmpl:
+    # Template-selected variant (seo-agent): the template's config.yaml
+    # resolves runtime=claude-code server-side. Do NOT also send an explicit
+    # runtime — seo-agent is not a registry runtime and would 422.
+    payload['template'] = tmpl
+else:
+    payload['runtime'] = os.environ['E2E_WS_RUNTIME']
+pid = os.environ.get('E2E_WS_PARENT_ID', '')
+if pid:
+    payload['parent_id'] = pid
+print(json.dumps(payload))
+"
+}
+
+if [ -n "$PROVISION_TEMPLATE" ]; then
+  log "5/11 Provisioning parent workspace (runtime=$RUNTIME via template=$PROVISION_TEMPLATE → claude-code adapter)..."
+else
+  log "5/11 Provisioning parent workspace (runtime=$RUNTIME)..."
+fi
 PARENT_RESP=$(tenant_call POST /workspaces \
  -H "Content-Type: application/json" \
-  -d "{\"name\":\"E2E Parent\",\"runtime\":\"$RUNTIME\",\"tier\":2,\"model\":\"$MODEL_SLUG\",\"secrets\":$SECRETS_JSON}")
-PARENT_ID=$(echo "$PARENT_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['id'])")
+  -d "$(build_create_payload 'E2E Parent')")
+# Surface the workspace-create error CLEARLY instead of dying on a Python
+# KeyError when the response has no 'id'. The load-bearing cases this names:
+#   - google-adk: RUNTIME_UNSUPPORTED 422 if google-adk is absent from the
+#     deployed manifest.json's workspace_templates (the Create-handler
+#     allowlist is manifest-derived — runtime_registry.go). google-adk is in
+#     providers.yaml + provisioner/registry.go + registry_gen but NOT (yet) in
+#     manifest.json, so it cannot be provisioned by `runtime` until the
+#     manifest gains it. Flagged for the CTO — this arm REDS until then.
+#   - seo-agent: an "invalid template" 400 if the seo-agent template isn't
+#     present in the tenant's configs/cache dir (template-cache refresh gap).
+PARENT_ID=$(echo "$PARENT_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "")
+if [ -z "$PARENT_ID" ]; then
+  fail "Parent workspace create returned no 'id' (runtime=$RUNTIME, template=${PROVISION_TEMPLATE:-<none>}). Response: $(printf '%s' "$PARENT_RESP" | sanitize_http_body)"
+fi
 log "    PARENT_ID=$PARENT_ID"

 # ─── 6. Provision child (full mode only) ────────────────────────────────
@@ -624,8 +735,11 @@ if [ "$MODE" = "full" ]; then
  log "6/11 Provisioning child workspace..."
  CHILD_RESP=$(tenant_call POST /workspaces \
    -H "Content-Type: application/json" \
-    -d "{\"name\":\"E2E Child\",\"runtime\":\"$RUNTIME\",\"tier\":2,\"model\":\"$MODEL_SLUG\",\"parent_id\":\"$PARENT_ID\",\"secrets\":$SECRETS_JSON}")
-  CHILD_ID=$(echo "$CHILD_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['id'])")
+    -d "$(build_create_payload 'E2E Child' "$PARENT_ID")")
+  CHILD_ID=$(echo "$CHILD_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "")
+  if [ -z "$CHILD_ID" ]; then
+    fail "Child workspace create returned no 'id' (runtime=$RUNTIME, template=${PROVISION_TEMPLATE:-<none>}). Response: $(printf '%s' "$CHILD_RESP" | sanitize_http_body)"
+  fi
  log "    CHILD_ID=$CHILD_ID"
 else
  log "6/11 Canary mode — skipping child workspace"
@@ -1416,6 +1530,135 @@ except Exception:
  fi
 fi

+# ─── 10b. Pause/Resume + Hibernate/Resume lifecycle transitions ─────────
+# Exercise the REAL workspace lifecycle state machine on the provisioned
+# parent — the transitions that previously had only handler unit tests
+# (handlers_additional_test.go / hibernation_test.go) and NO real-infra
+# coverage. Each transition is asserted against the live DB-backed status the
+# GET /workspaces/:id endpoint returns, so a regression in the Pause/Resume/
+# Hibernate handlers (workspace_restart.go) or their CP stop/re-provision
+# wiring fails the gate instead of silently leaking an EC2 / wedging a tenant.
+#
+# Contract (workspace_restart.go):
+#   POST /pause     online → 'paused'  (container stopped, url cleared)  {"status":"paused"}
+#   POST /resume    paused → 'provisioning' → … → 'online' (re-provision) {"status":"provisioning"}
+#   POST /hibernate online → 'hibernating' → 'hibernated' (container stopped) {"status":"hibernated"}
+#   auto-wake       next A2A message/send on a hibernated ws → online
+#
+# Gated to full MODE (smoke has no parent-stability budget) + E2E_LIFECYCLE.
+# Runs LAST (after all read-only A2A/memory/peer checks) so the pause/stop
+# cycles don't disturb the earlier assertions. Skips are LOUD (logged), and
+# any broken transition hard-fails — never a silent pass.
+if [ "$MODE" = "full" ] && [ "${E2E_LIFECYCLE:-auto}" != "off" ]; then
+  log "10b/11 Lifecycle transitions: pause→resume→online, hibernate→resume(wake) on parent $PARENT_ID..."
+
+  lifecycle_status() {  # echoes the live workspace status
+    tenant_call GET "/workspaces/$PARENT_ID" 2>/dev/null \
+      | python3 -c "import json,sys; print(json.load(sys.stdin).get('status') or '')" 2>/dev/null || echo ""
+  }
+  # Bounded readiness-poll for a target status — same fail-closed shape as
+  # wait_workspaces_online_routable, but for an arbitrary terminal status.
+  wait_status() {  # $1=target $2=timeout_secs $3=label
+    local target="$1" timeout="$2" label="$3"
+    local deadline cur last=""
+    deadline=$(( $(date +%s) + timeout ))
+    while true; do
+      cur=$(lifecycle_status)
+      if [ "$cur" != "$last" ]; then log "    parent status → ${cur:-<empty>}"; last="$cur"; fi
+      [ "$cur" = "$target" ] && return 0
+      if [ "$(date +%s)" -gt "$deadline" ]; then
+        log "    [lifecycle] $label never reached '$target' within ${timeout}s (last='$cur')"
+        return 1
+      fi
+      sleep 10
+    done
+  }
+
+  # ── pause → paused ──
+  PAUSE_RESP=$(tenant_call POST "/workspaces/$PARENT_ID/pause" 2>/dev/null || echo '{}')
+  PAUSE_STATUS=$(echo "$PAUSE_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))" 2>/dev/null || echo "")
+  [ "$PAUSE_STATUS" = "paused" ] || fail "Pause: POST /pause returned status='$PAUSE_STATUS' (expected 'paused'). Body: ${PAUSE_RESP:0:200}"
+  # Poll the DB-backed status — the response body could lie; the GET proves the row.
+  wait_status "paused" 120 "pause" || fail "Pause: workspace $PARENT_ID never settled at status=paused (DB row) — Pause handler / CP stop regression (workspace_restart.go Pause)."
+  ok "    pause → paused (DB-verified)"
+
+  # ── resume → provisioning → online ──
+  RESUME_RESP=$(tenant_call POST "/workspaces/$PARENT_ID/resume" 2>/dev/null || echo '{}')
+  RESUME_STATUS=$(echo "$RESUME_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))" 2>/dev/null || echo "")
+  [ "$RESUME_STATUS" = "provisioning" ] || fail "Resume: POST /resume returned status='$RESUME_STATUS' (expected 'provisioning'). Body: ${RESUME_RESP:0:200}"
+  # Resume re-provisions from the preserved config volume; reuse the same
+  # online+routable readiness boundary the initial boot used (no fresh EC2
+  # cold-start, but CP re-provision + heartbeat recovery can still take minutes).
+  wait_workspaces_online_routable "    Waiting for parent to return online after resume (up to $((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min)..." "$PARENT_ID"
+  ok "    resume → provisioning → online (DB-verified)"
+
+  # ── hibernate → hibernated ──
+  HIB_RESP=$(tenant_call POST "/workspaces/$PARENT_ID/hibernate?force=true" 2>/dev/null || echo '{}')
+  HIB_STATUS=$(echo "$HIB_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))" 2>/dev/null || echo "")
+  [ "$HIB_STATUS" = "hibernated" ] || fail "Hibernate: POST /hibernate?force=true returned status='$HIB_STATUS' (expected 'hibernated'). Body: ${HIB_RESP:0:200}"
+  # The handler runs the claim→stop→'hibernated' sequence; poll the DB row to
+  # confirm it landed on 'hibernated' (not stuck mid-'hibernating').
+  wait_status "hibernated" 120 "hibernate" || fail "Hibernate: workspace $PARENT_ID never settled at status=hibernated (DB row) — Hibernate handler / CP stop regression (workspace_restart.go HibernateWorkspace)."
+  ok "    hibernate → hibernated (DB-verified)"
+
+  # ── resume-from-hibernate via auto-wake on next A2A ──
+  # A hibernated workspace auto-wakes on the next incoming A2A message/send
+  # (no explicit /resume — Resume only handles status=paused). Send a wake
+  # A2A and assert the workspace returns to online. We accept transient cold
+  # 5xx during wake (same edge class the PONG probe tolerates) and poll the
+  # status to the online boundary rather than asserting on the single A2A code.
+  log "    Hibernate auto-wake: sending A2A to wake hibernated parent..."
+  WAKE_PAYLOAD=$(python3 -c "
+import json, uuid
+print(json.dumps({
+    'jsonrpc': '2.0',
+    'method': 'message/send',
+    'id': 'e2e-wake-1',
+    'params': {
+        'message': {
+            'role': 'user',
+            'messageId': f'e2e-wake-{uuid.uuid4().hex[:8]}',
+            'parts': [{'kind': 'text', 'text': 'This is the platform lifecycle smoke test waking a hibernated workspace. No tools or memory are needed — please respond with exactly the single token: WOKE'}]
+        }
+    }
+}))
+")
+  WAKE_TMP=$(mktemp -t wake_a2a.XXXXXX)
+  for WAKE_ATTEMPT in $(seq 1 12); do
+    : >"$WAKE_TMP"
+    set +e
+    WAKE_CODE=$(tenant_call POST "/workspaces/$PARENT_ID/a2a" \
+      --max-time 90 \
+      -H "Content-Type: application/json" \
+      -d "$WAKE_PAYLOAD" \
+      -o "$WAKE_TMP" -w '%{http_code}' 2>/dev/null)
+    WAKE_RC=$?
+    set -e
+    WAKE_CODE=${WAKE_CODE:-000}
+    if [ "$WAKE_RC" = "0" ] && [ "$WAKE_CODE" -ge 200 ] && [ "$WAKE_CODE" -lt 300 ]; then
+      break
+    fi
+    WAKE_SAFE_BODY=$(cat "$WAKE_TMP" 2>/dev/null | sanitize_http_body)
+    # Wake legitimately returns transient 5xx while the container restarts —
+    # retry that class only (bounded), never a 4xx.
+    if echo "$WAKE_CODE" | grep -Eq '^(502|503|504)$' && [ "$WAKE_ATTEMPT" -lt 12 ]; then
+      log "    wake A2A cold/restart attempt $WAKE_ATTEMPT/12 returned $WAKE_CODE: ${WAKE_SAFE_BODY:0:120}"
+      sleep 15
+      continue
+    fi
+    break
+  done
+  rm -f "$WAKE_TMP"
+  # The auto-wake contract is the STATUS transition (hibernated → online), not
+  # the A2A body content — assert the live DB row, the real readiness signal.
+  wait_status "online" "$WORKSPACE_ONLINE_TIMEOUT_SECS" "hibernate-wake" \
+    || fail "Hibernate auto-wake: parent $PARENT_ID never returned to status=online after a wake A2A (last A2A http=$WAKE_CODE) — auto-wake-on-message regression (a hibernated ws must re-provision on the next A2A)."
+  ok "    hibernate → online via auto-wake A2A (DB-verified)"
+  ok "Lifecycle transitions passed: pause→resume→online + hibernate→wake→online"
+else
+  log "10b/11 Lifecycle transitions skipped (MODE=$MODE, E2E_LIFECYCLE=${E2E_LIFECYCLE:-auto}) — pause/resume/hibernate only run in full mode with E2E_LIFECYCLE!=off."
+fi
+
 # ─── 11. Teardown runs via trap ────────────────────────────────────────
 # Fail-closed-on-skip: before declaring PASS, assert (when CI demanded a live
 # run) that every load-bearing lifecycle milestone actually fired. A run that