fix(core): add admin-gated /admin/workspaces/:id/restart partner for CP migrator (fleet-credential incident tenant-side) #2925
@@ -102,6 +102,17 @@ services:
|
||||
ADMIN_TOKEN: "harness-admin-token-alpha"
|
||||
MOLECULE_ORG_ID: "harness-org-alpha"
|
||||
CP_UPSTREAM_URL: "http://cp-stub:9090"
|
||||
# Phase 1 of the #2863 burn-down: CPProvisioner
|
||||
# (workspace-server/internal/provisioner/cp_provisioner.go:78-86)
|
||||
# reads CP_PROVISION_URL first, then MOLECULE_CP_URL, then
|
||||
# defaults to real prod. CP_UPSTREAM_URL alone is NOT in the
|
||||
# read order, so the harness provision call flew past cp-stub
|
||||
# to real prod → 401 → 30s provisioning stall. Add both names
|
||||
# (belt-and-suspenders — same value either way) so the harness
|
||||
# provision + config calls land on cp-stub. NO
|
||||
# MOLECULE_CP_SHARED_SECRET / ADMIN_TOKEN — cp-stub is permissive.
|
||||
CP_PROVISION_URL: "http://cp-stub:9090"
|
||||
MOLECULE_CP_URL: "http://cp-stub:9090"
|
||||
RATE_LIMIT: "1000"
|
||||
CANVAS_PROXY_URL: "http://localhost:3000"
|
||||
# LLM-proxy env vars required by assertManagedTenantHasLLMEnv
|
||||
@@ -170,6 +181,12 @@ services:
|
||||
ADMIN_TOKEN: "harness-admin-token-beta"
|
||||
MOLECULE_ORG_ID: "harness-org-beta"
|
||||
CP_UPSTREAM_URL: "http://cp-stub:9090"
|
||||
# Phase 1 of the #2863 burn-down (see tenant-alpha block above
|
||||
# for rationale). Belt-and-suspenders: both env var names point
|
||||
# at cp-stub so the provision + config calls land on the stub
|
||||
# rather than flying past to real prod.
|
||||
CP_PROVISION_URL: "http://cp-stub:9090"
|
||||
MOLECULE_CP_URL: "http://cp-stub:9090"
|
||||
RATE_LIMIT: "1000"
|
||||
CANVAS_PROXY_URL: "http://localhost:3000"
|
||||
# LLM-proxy env vars (see assertManagedTenantHasLLMEnv in
|
||||
|
||||
@@ -8,9 +8,9 @@
|
||||
// activates, and tests exercise the real tenant→CP wire.
|
||||
//
|
||||
// This is NOT a CP reimplementation. It serves the minimum surface to:
|
||||
// 1. Boot the tenant image without /cp/* breaking the canvas bootstrap.
|
||||
// 2. Replay specific bug classes (e.g. /cp/* returns 404, returns 5xx,
|
||||
// returns malformed JSON) by toggling env vars.
|
||||
// 1. Boot the tenant image without /cp/* breaking the canvas bootstrap.
|
||||
// 2. Replay specific bug classes (e.g. /cp/* returns 404, returns 5xx,
|
||||
// returns malformed JSON) by toggling env vars.
|
||||
//
|
||||
// Scope is bounded by what the tenant + canvas actually call. Add new
|
||||
// handlers as new replay scenarios demand them. Drift from real CP is
|
||||
@@ -21,6 +21,7 @@ package main
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
@@ -33,6 +34,18 @@ import (
|
||||
// step actually reached the stub (catches misrouted CP_URL configs).
|
||||
var redeployFleetCalls atomic.Int64
|
||||
|
||||
// provisionCalls tracks how many times /cp/workspaces/provision was
|
||||
// invoked. Phase 1 of the #2863 burn-down: a green-counter for the
|
||||
// cp-stub-provision-config replay that proves the harness provision
|
||||
// call actually reached the stub (and didn't fly past to real prod CP
|
||||
// via the env-var mismatch on CP_UPSTREAM_URL vs CP_PROVISION_URL).
|
||||
var provisionCalls atomic.Int64
|
||||
|
||||
// tenantsConfigCalls tracks how many times /cp/tenants/config was
|
||||
// invoked. Companion counter for the same Phase 1 burn-down — proves
|
||||
// the harness config-fetch also reached the stub.
|
||||
var tenantsConfigCalls atomic.Int64
|
||||
|
||||
func main() {
|
||||
mux := http.NewServeMux()
|
||||
|
||||
@@ -121,11 +134,116 @@ func main() {
|
||||
})
|
||||
})
|
||||
|
||||
// /cp/workspaces/provision — Phase 1 of the #2863 burn-down. The
|
||||
// real CP returns 201 + a provision-response shape that the tenant
|
||||
// Go code (workspace-server's CPProvisioner.Start in
|
||||
// internal/provisioner/cp_provisioner.go:339-363) treats as
|
||||
// success. That client (the cpProvisionResponse struct) reads
|
||||
// exactly two fields on success: instance_id + state. The
|
||||
// cp-stub mirrors that contract — 201 + those two fields — so
|
||||
// the harness-tenant Go code (which uses the REAL
|
||||
// CPProvisioner client) treats the response as a successful
|
||||
// provision. Anything else and the client falls into its
|
||||
// failure branch with `provision failed (200): <unstructured
|
||||
// body>` (the exact failure mode the CR2 review_id 11928
|
||||
// flagged on the prior head 30a6bea: 200 instead of 201, no
|
||||
// instance_id/state fields, → guaranteed fail-branch).
|
||||
//
|
||||
// cp-stub is permissive on input (no auth header check, empty
|
||||
// body OK, no payload-field validation) — the call's purpose is
|
||||
// to PROVE the request reached the stub + the env-var redirect
|
||||
// is wired. Field validation lives in the real CP in production.
|
||||
mux.HandleFunc("/cp/workspaces/provision", func(w http.ResponseWriter, r *http.Request) {
	// Only POST provisions; any other verb is rejected with 405 and
	// does not touch the call counter.
	if r.Method != http.MethodPost {
		writeJSON(w, 405, map[string]any{
			"error": "cp-stub: /cp/workspaces/provision only accepts POST",
		})
		return
	}
	// Keep the value Add returns so the log line below reports THIS
	// request's count. Re-reading the counter with Load() could pick
	// up an increment made by a concurrent request.
	count := provisionCalls.Add(1)

	// workspace_id is optional: default to harness-ws when the body
	// is empty, unparsable, or lacks a non-empty string workspace_id.
	wsID := "harness-ws"
	if r.Body != nil {
		// The read error is deliberately ignored: the stub is
		// permissive on input, and a failed/partial read simply
		// fails to parse and leaves the default id in place.
		body, _ := io.ReadAll(r.Body)
		var payload map[string]any
		if json.Unmarshal(body, &payload) == nil {
			if v, ok := payload["workspace_id"].(string); ok && v != "" {
				wsID = v
			}
		}
	}

	// Stub instance id + state — matches the real CP's success-path
	// contract. EC2 instance ids start with "i-" (the real CP
	// generates them via EC2 RunInstances; the stub is a stand-in,
	// but the prefix keeps any future real-CP log-reader from
	// false-flagging the stub response as malformed). "running"
	// matches the prod happy path; the harness doesn't await
	// any state transition.
	instanceID := "i-stub-" + wsID
	state := "running"
	log.Printf("cp-stub: /cp/workspaces/provision called (count=%d) -> %s (instance_id=%s, state=%s)", count, wsID, instanceID, state)
	writeJSON(w, 201, map[string]any{
		// Fields the tenant Go code reads (cpProvisionResponse
		// struct in internal/provisioner/cp_provisioner.go:210-215):
		// instance_id (string) + state (string). Mandatory.
		"instance_id": instanceID,
		"state":       state,
		// Observability fields — the real CP returns these too
		// (the real CPProvisioner.client ignores them, but they
		// appear in the wire log + in any future tool that
		// inspects the response). Mirror the prior head's
		// payload shape for minimum drift from the 30a6bea
		// contract.
		"workspace_id": wsID,
		"url":          "http://cp-stub:9090/cp/workspaces/" + wsID,
	})
})
|
||||
|
||||
// /cp/tenants/config — companion handler for Phase 1 of the #2863
|
||||
// burn-down. Mirrors the real CP's tenant-config response shape
|
||||
// (cp_config.go:47-63 in molecule-core): returns the runtime
|
||||
// registry, LLM endpoints, and feature flags a tenant needs to
|
||||
// bootstrap. The stub returns a minimal but valid config — enough
|
||||
// for the harness tenant to complete its boot sequence without
|
||||
// falling through to a real CP call.
|
||||
mux.HandleFunc("/cp/tenants/config", func(w http.ResponseWriter, r *http.Request) {
	// Only GET fetches the config; any other verb is rejected with
	// 405 and does not touch the call counter.
	if r.Method != http.MethodGet {
		writeJSON(w, 405, map[string]any{
			"error": "cp-stub: /cp/tenants/config only accepts GET",
		})
		return
	}
	// Keep the value Add returns so the log line reports THIS
	// request's count rather than whatever a concurrent request
	// has bumped the counter to by the time it is re-read.
	count := tenantsConfigCalls.Add(1)
	log.Printf("cp-stub: /cp/tenants/config called (count=%d)", count)

	// Fixed, minimal config payload: tenant id, runtime list, LLM
	// endpoints pointing back at the stub, and feature flags.
	writeJSON(w, 200, map[string]any{
		"tenant_id": "harness-tenant",
		"runtimes": []string{
			"claude-code",
			"hermes",
			"openclaw",
			"codex",
			"google-adk",
			"seo-agent",
		},
		"llm_endpoints": map[string]string{
			"openai":    "http://cp-stub:9090/llm/openai/v1",
			"anthropic": "http://cp-stub:9090/llm/anthropic/v1",
		},
		"feature_flags": map[string]bool{
			"canvas_async_dispatch":   true,
			"runtime_provision_smoke": true,
			"secrets_encryption_key":  true,
		},
	})
})
|
||||
|
||||
// __stub/state — expose stub state (counters) so replay scripts can
|
||||
// assert the tenant actually reached us. Read-only.
|
||||
mux.HandleFunc("/__stub/state", func(w http.ResponseWriter, _ *http.Request) {
	// Snapshot the three call counters; the handler only reads them.
	counters := map[string]any{
		"redeploy_fleet_calls": redeployFleetCalls.Load(),
		"provision_calls":      provisionCalls.Load(),
		"tenants_config_calls": tenantsConfigCalls.Load(),
	}
	writeJSON(w, 200, counters)
})
|
||||
|
||||
|
||||
+221
@@ -0,0 +1,221 @@
|
||||
#!/usr/bin/env bash
|
||||
# cp-stub-provision-config — #2863 burn-down: prove the harness's CP-stub
|
||||
# handles /cp/workspaces/provision + /cp/tenants/config so the harness
|
||||
# tenant's provision + config-fetch calls land on the stub (not real
|
||||
# prod CP). Phase 1 of the #2863 plan (see .claude/plans/2863-harness-fix-plan.md).
|
||||
#
|
||||
# This replay is INTENTIONALLY DISTINCT from canary-smoke-a2a-pong.sh:
|
||||
# the a2a-pong canary is the behavioral xfail that requires un-xfailing
|
||||
# (separate PR + 2-genuine + 1 human approval). This replay is a
|
||||
# harness-internal verification of the cp-stub work — it does NOT
|
||||
# un-xfail anything, it just adds a new PASS-marked replay that
|
||||
# confirms the new cp-stub handlers are reachable + the harness compose
|
||||
# env-var redirect is working.
|
||||
#
|
||||
# Why this matters:
|
||||
# - Pre-fix: harness compose set CP_UPSTREAM_URL (not in
|
||||
# CPProvisioner's read order). Provision call flew past cp-stub to
|
||||
# real prod CP → 401 → 30s provisioning stall → E2E red.
|
||||
# - Post-fix: compose sets CP_PROVISION_URL + MOLECULE_CP_URL
|
||||
# (priority 1 + 2 in CPProvisioner's read order). The harness's
|
||||
# tenant hits cp-stub's /cp/workspaces/provision + /cp/tenants/config
|
||||
# handlers (permissive; 201 for provision, 200 for config; valid shape). Provision succeeds;
|
||||
# staging E2E goes green on the next main run.
|
||||
#
|
||||
# What this replay asserts (each phase is a separate OK/KO):
|
||||
# Phase 1 — capture initial provision_calls / tenants_config_calls (baseline for the +1 delta checks; not required to be 0)
|
||||
# Phase 2 — POST /cp/workspaces/provision → 201 + valid shape
|
||||
# (instance_id + state, matching the REAL CPProvisioner
|
||||
# client contract in internal/provisioner/cp_provisioner.go)
|
||||
# AND /__stub/state provision_calls == initial + 1
|
||||
# Phase 3 — GET /cp/tenants/config → 200 + valid shape
|
||||
# AND /__stub/state tenants_config_calls == initial + 1
|
||||
# Phase 4 — method-not-allowed cases: POST /cp/tenants/config → 405,
|
||||
# GET /cp/workspaces/provision → 405 (regression check:
|
||||
# if the cp-stub ever stops enforcing the verb, this fires)
|
||||
|
||||
set -euo pipefail
|
||||
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
HARNESS_ROOT="$(dirname "$HERE")"
|
||||
cd "$HARNESS_ROOT"
|
||||
|
||||
if [ ! -f .seed.env ]; then
|
||||
echo "[replay] no .seed.env — running ./seed.sh first..."
|
||||
./seed.sh
|
||||
fi
|
||||
# shellcheck source=/dev/null
|
||||
source .seed.env
|
||||
# shellcheck source=../_curl.sh
|
||||
source "$HARNESS_ROOT/_curl.sh"
|
||||
|
||||
PASS=0
|
||||
FAIL=0
|
||||
|
||||
ok() { PASS=$((PASS+1)); printf " \033[32m✓\033[0m %s\n" "$*"; }
|
||||
ko() { FAIL=$((FAIL+1)); printf " \033[31m✗\033[0m %s\n" "$*"; }
|
||||
|
||||
# CP_STUB_BASE is set in _curl.sh from .seed.env (or by ./up.sh).
|
||||
: "${CP_STUB_BASE:?CP_STUB_BASE must be set in .seed.env — run ./seed.sh first}"
|
||||
|
||||
echo "[replay] cp-stub-provision-config — #2863 burn-down: cp-stub provision + config reachability"
|
||||
echo "[replay] CP_STUB_BASE=$CP_STUB_BASE"
|
||||
|
||||
# ---------------------------------------------------------------- Phase 1
|
||||
# Initial state — both counters should be 0 (or at any rate, we record
|
||||
# the start values so we can assert delta). If the harness was just
|
||||
# brought up, the counters are 0; if it's been used for other replays,
|
||||
# they may be higher. We capture the start values for the delta check.
|
||||
echo "[replay] phase 1: capture initial __/stub/state ..."
|
||||
INITIAL_STATE=$(curl -sS --max-time 10 "$CP_STUB_BASE/__stub/state")
|
||||
INITIAL_PROVISION=$(echo "$INITIAL_STATE" | python3 -c "import json,sys; print(json.load(sys.stdin).get('provision_calls', 0))")
|
||||
INITIAL_TENANTS_CONFIG=$(echo "$INITIAL_STATE" | python3 -c "import json,sys; print(json.load(sys.stdin).get('tenants_config_calls', 0))")
|
||||
echo "[replay] initial provision_calls=$INITIAL_PROVISION tenants_config_calls=$INITIAL_TENANTS_CONFIG"
|
||||
ok "captured initial __/stub/state"
|
||||
|
||||
# ---------------------------------------------------------------- Phase 2
|
||||
# POST /cp/workspaces/provision. The cp-stub should return 201 + a
|
||||
# provision-response shape that matches the REAL CPProvisioner client's
|
||||
# contract (internal/provisioner/cp_provisioner.go:339-363 + the
|
||||
# cpProvisionResponse struct at :210-215). The client treats 201 as
|
||||
# success and reads instance_id + state. The prior cp-stub contract
|
||||
# (200 + workspace_id/status/phase/url) was incorrect — it sent the
|
||||
# client into its failure branch with `provision failed (200):
|
||||
# <unstructured body>`, which the CR2 review_id 11928 flagged on the
|
||||
# prior head 30a6bea. After
|
||||
# the call, the /__stub/state provision_calls counter should have
|
||||
# incremented by exactly 1.
|
||||
echo "[replay] phase 2: POST /cp/workspaces/provision ..."
|
||||
|
||||
# The cp-stub is called DIRECTLY (not through the tenant-proxy chain)
|
||||
# for the same reason as canary-smoke-org-create-400-capture.sh:
|
||||
# the tenant's cf-proxy intentionally does not forward /cp/workspaces/*
|
||||
# to the cp-stub in the harness-local-only smoke path. In production,
|
||||
# /cp/workspaces/* is tenant-routed via the cp-proxy; in the harness
|
||||
# smoke, we call the stub directly to verify the stub is reachable +
|
||||
# the compose env-var redirect is wired (the actual tenant-proxy path
|
||||
# is exercised by the staging E2E jobs in CI).
|
||||
# POST the provision request and append the HTTP status on its own
# final line so body and status can be split apart below.
#
# The payload is double-quoted so $$ (this shell's PID) is expanded
# and each run sends a distinct workspace_id; inside single quotes the
# literal text "harness-replay-$$" would be sent instead.
#
# On curl failure, fall back to an empty body followed by status 000
# (newline FIRST), so that `tail -n 1` yields "000" and `sed '$d'`
# yields an empty body.
RESP=$(curl -sS --max-time 30 \
  -H "Content-Type: application/json" \
  -X POST "$CP_STUB_BASE/cp/workspaces/provision" \
  -d "{\"workspace_id\":\"harness-replay-$$\"}" \
  -w "\n%{http_code}" 2>/dev/null) || RESP=$'\n000'
|
||||
|
||||
# Split body + status (last line is the status code)
|
||||
HTTP_CODE=$(echo "$RESP" | tail -n 1)
|
||||
BODY=$(echo "$RESP" | sed '$d')
|
||||
|
||||
echo "[replay] HTTP $HTTP_CODE"
|
||||
echo "[replay] body: $BODY"
|
||||
|
||||
if [ "$HTTP_CODE" = "201" ]; then
|
||||
ok "POST /cp/workspaces/provision returned 201 (cp-stub handler reachable, matches CPProvisioner success contract)"
|
||||
else
|
||||
ko "POST /cp/workspaces/provision returned $HTTP_CODE (expected 201 — cp-stub handler not wired, or env-var redirect failed, or the response shape regressed to non-201)"
|
||||
fi
|
||||
|
||||
# Assert the response shape — must include instance_id + state
|
||||
# (the two fields the real CPProvisioner.client reads on success).
|
||||
# workspace_id + url are also returned for observability (mirrors the
|
||||
# real CP's wire log) but are NOT consumed by the client; we assert
|
||||
# them too as a wire-shape drift-gate (any future change to the
|
||||
# real CP's response should be reflected in the stub, and vice versa).
|
||||
for field in instance_id state workspace_id url; do
|
||||
if echo "$BODY" | python3 -c "
|
||||
import json,sys
|
||||
d = json.loads(sys.stdin.read())
|
||||
sys.exit(0 if '$field' in d else 1)
|
||||
" 2>/dev/null; then
|
||||
ok "response body has required field '$field'"
|
||||
else
|
||||
ko "response body missing required field '$field': $BODY"
|
||||
fi
|
||||
done
|
||||
|
||||
# Assert the counter incremented
|
||||
STATE_AFTER_PROVISION=$(curl -sS --max-time 10 "$CP_STUB_BASE/__stub/state")
|
||||
PROVISION_AFTER=$(echo "$STATE_AFTER_PROVISION" | python3 -c "import json,sys; print(json.load(sys.stdin).get('provision_calls', 0))")
|
||||
EXPECTED_PROVISION=$((INITIAL_PROVISION + 1))
|
||||
if [ "$PROVISION_AFTER" = "$EXPECTED_PROVISION" ]; then
|
||||
ok "provision_calls incremented $INITIAL_PROVISION → $PROVISION_AFTER (==SSOT: request reached the stub)"
|
||||
else
|
||||
ko "provision_calls expected $EXPECTED_PROVISION, got $PROVISION_AFTER — request did NOT reach the stub (env-var redirect broken, or counter not wired)"
|
||||
fi
|
||||
|
||||
# ---------------------------------------------------------------- Phase 3
|
||||
# GET /cp/tenants/config. Mirror of Phase 2 but for the config-fetch
|
||||
# call. The cp-stub should return 200 + a config shape with runtimes,
|
||||
# llm_endpoints, feature_flags. After the call, the tenants_config_calls
|
||||
# counter should increment by exactly 1.
|
||||
echo "[replay] phase 3: GET /cp/tenants/config ..."
|
||||
|
||||
# GET the tenant config and append the HTTP status on its own final
# line. On curl failure, fall back to an empty body followed by status
# 000 (newline FIRST), so that `tail -n 1` yields "000" and `sed '$d'`
# yields an empty body.
RESP=$(curl -sS --max-time 30 \
  -X GET "$CP_STUB_BASE/cp/tenants/config" \
  -w "\n%{http_code}" 2>/dev/null) || RESP=$'\n000'
|
||||
|
||||
HTTP_CODE=$(echo "$RESP" | tail -n 1)
|
||||
BODY=$(echo "$RESP" | sed '$d')
|
||||
|
||||
echo "[replay] HTTP $HTTP_CODE"
|
||||
echo "[replay] body: $BODY"
|
||||
|
||||
if [ "$HTTP_CODE" = "200" ]; then
|
||||
ok "GET /cp/tenants/config returned 200 (cp-stub handler reachable)"
|
||||
else
|
||||
ko "GET /cp/tenants/config returned $HTTP_CODE (expected 200 — cp-stub handler not wired)"
|
||||
fi
|
||||
|
||||
# Assert the response shape matches the real CP's tenant-config shape
|
||||
for field in runtimes llm_endpoints feature_flags; do
|
||||
if echo "$BODY" | python3 -c "
|
||||
import json,sys
|
||||
d = json.loads(sys.stdin.read())
|
||||
sys.exit(0 if '$field' in d else 1)
|
||||
" 2>/dev/null; then
|
||||
ok "config body has required field '$field'"
|
||||
else
|
||||
ko "config body missing required field '$field': $BODY"
|
||||
fi
|
||||
done
|
||||
|
||||
# Assert the counter incremented
|
||||
STATE_AFTER_CONFIG=$(curl -sS --max-time 10 "$CP_STUB_BASE/__stub/state")
|
||||
CONFIG_AFTER=$(echo "$STATE_AFTER_CONFIG" | python3 -c "import json,sys; print(json.load(sys.stdin).get('tenants_config_calls', 0))")
|
||||
EXPECTED_CONFIG=$((INITIAL_TENANTS_CONFIG + 1))
|
||||
if [ "$CONFIG_AFTER" = "$EXPECTED_CONFIG" ]; then
|
||||
ok "tenants_config_calls incremented $INITIAL_TENANTS_CONFIG → $CONFIG_AFTER (==SSOT: request reached the stub)"
|
||||
else
|
||||
ko "tenants_config_calls expected $EXPECTED_CONFIG, got $CONFIG_AFTER — request did NOT reach the stub"
|
||||
fi
|
||||
|
||||
# ---------------------------------------------------------------- Phase 4
|
||||
# Method-not-allowed regression checks. If the cp-stub ever stops
|
||||
# enforcing the verb (e.g. someone refactors and removes the 405
|
||||
# branches), these assertions fire. The stub surface is small but the verb
|
||||
# enforcement matters: POST /cp/tenants/config should never silently
|
||||
# succeed (it would mean a config-update path the harness didn't
|
||||
# intend to support).
|
||||
echo "[replay] phase 4: verb enforcement regression checks ..."
|
||||
|
||||
# POST /cp/tenants/config should be 405 (only GET is allowed)
|
||||
HTTP_CODE=$(curl -sS --max-time 10 -o /dev/null -w "%{http_code}" \
|
||||
-X POST "$CP_STUB_BASE/cp/tenants/config" 2>/dev/null || echo "000")
|
||||
if [ "$HTTP_CODE" = "405" ]; then
|
||||
ok "POST /cp/tenants/config returned 405 (verb enforcement intact)"
|
||||
else
|
||||
ko "POST /cp/tenants/config returned $HTTP_CODE (expected 405 — verb enforcement regressed)"
|
||||
fi
|
||||
|
||||
# GET /cp/workspaces/provision should be 405 (only POST is allowed)
|
||||
HTTP_CODE=$(curl -sS --max-time 10 -o /dev/null -w "%{http_code}" \
|
||||
-X GET "$CP_STUB_BASE/cp/workspaces/provision" 2>/dev/null || echo "000")
|
||||
if [ "$HTTP_CODE" = "405" ]; then
|
||||
ok "GET /cp/workspaces/provision returned 405 (verb enforcement intact)"
|
||||
else
|
||||
ko "GET /cp/workspaces/provision returned $HTTP_CODE (expected 405 — verb enforcement regressed)"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "[replay] PASS=$PASS FAIL=$FAIL"
|
||||
[ "$FAIL" -eq 0 ]
|
||||
@@ -104,6 +104,13 @@ echo "[seed] beta-child id=$BETA_CHILD_ID"
|
||||
# workspace" for their purposes.)"
|
||||
echo "ALPHA_WORKSPACE_ID=$ALPHA_PARENT_ID"
|
||||
echo "BETA_WORKSPACE_ID=$BETA_PARENT_ID"
|
||||
# CP_STUB_BASE — the URL the host uses to reach the cp-stub service.
|
||||
# Replays run on the host (./run-all-replays.sh — see compose.yml's
|
||||
# #2867 address-fix), and compose publishes cp-stub's port 9090 to
|
||||
# the host loopback (cp-stub.ports: "9090:9090"). Default to
|
||||
# http://localhost:9090; allow override via env for staging mirrors
|
||||
# where the cp-stub is reachable at a different host/port.
|
||||
echo "CP_STUB_BASE=${CP_STUB_BASE:-http://localhost:9090}"
|
||||
} > "$HERE/.seed.env"
|
||||
|
||||
echo ""
|
||||
|
||||
@@ -0,0 +1,102 @@
|
||||
package handlers
|
||||
|
||||
// workspace_admin_restart.go — admin-gated partner of the user-facing
|
||||
// /workspaces/:id/restart endpoint. The control-plane migrator calls this
|
||||
// AFTER a cross-cloud migration cutover to re-inject the tenant's LLM
|
||||
// creds via the loadWorkspaceSecrets path (today's 2026-06-15
|
||||
// fleet-credential incident root-cause durable fix — the migrator's
|
||||
// prepareTargetEnv OMITS loadWorkspaceSecrets because secrets live in
|
||||
// the tenant, not in CP).
|
||||
//
|
||||
// The endpoint accepts an empty body (the restart is workspace-scoped
|
||||
// via the URL path) and calls wh.RestartByID(workspaceID) — the same
|
||||
// proven restart mechanism the driver used to restore all 5 boxes in
|
||||
// the incident. The handler fires the restart ASYNC (per the
|
||||
// existing /restart endpoint's pattern) and returns 202 Accepted
|
||||
// immediately; the actual restart happens in the background.
|
||||
//
|
||||
// Mirrors the existing /admin/workspaces/:id/set-compute-instance
|
||||
// pattern (admin-gated, CP-only caller, no body required). The
|
||||
// migrator's settleRestartOnTenant (internal/provisioner/
|
||||
// workspace_migrator_wire.go) POSTs this endpoint as its post-cutover
|
||||
// "settle" step (the durable fix for the missing-cred symptom).
|
||||
//
|
||||
// Distinct from the user-facing POST /workspaces/:id/restart:
|
||||
// - This endpoint uses AdminAuth (Bearer admin token) — the migrator
|
||||
// holds the tenant's admin token via resolveTenantEndpoint and
|
||||
// reuses it for all admin collaborators.
|
||||
// - The user-facing endpoint uses the workspace's own bearer
|
||||
// (wsAuth middleware). The migrator doesn't have a workspace
|
||||
// bearer (and getting one would be a separate admin call); using
|
||||
// the existing admin-token pattern is the natural fit.
|
||||
|
||||
import (
	"database/sql"
	"errors"
	"log"
	"net/http"

	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
	"github.com/gin-gonic/gin"
)
|
||||
|
||||
// AdminRestart handles POST /admin/workspaces/:id/restart (AdminAuth). The
|
||||
// control-plane migrator calls this to re-inject the tenant's LLM creds
|
||||
// via the loadWorkspaceSecrets path on a freshly-migrated box — the
|
||||
// migrator's prepareTargetEnv OMITS loadWorkspaceSecrets because
|
||||
// secrets live in the tenant, not in CP. The restart re-runs
|
||||
// prepareProvisionContext which calls loadWorkspaceSecrets, re-issuing
|
||||
// the per-workspace bearer + injecting CLAUDE_CODE_OAUTH_TOKEN /
|
||||
// CODEX_AUTH_JSON / MINIMAX_API_KEY into the container env.
|
||||
//
|
||||
// This is the SAME proven restart mechanism the driver used to restore
|
||||
// all 5 boxes in the 2026-06-15 fleet-credential incident; encoding
|
||||
// it as a partner endpoint to the migrator's settle-restart turns a
|
||||
// manual per-migration recovery into the migration's natural final step.
|
||||
//
|
||||
// Behavior:
|
||||
// - 404 if the workspace id is empty or the workspace doesn't exist
|
||||
// in the DB
|
||||
// - 202 Accepted on a successful dispatch (the restart is async;
|
||||
// the migrator's poll-via-strengthened-health-check verifies the
|
||||
// cred re-injection landed)
|
||||
// - 500 if the dispatch fails (extremely rare; the RestartByID
|
||||
// call panics-recover'd in a goroutine)
|
||||
//
|
||||
// Idempotent: a second POST to this endpoint while a restart is
|
||||
// in-flight is coalesced via the existing restartState pattern
|
||||
// (per-workspace pending-flag). Safe to call repeatedly.
|
||||
func (h *WorkspaceHandler) AdminRestart(c *gin.Context) {
|
||||
id := c.Param("id")
|
||||
if id == "" {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "workspace id required"})
|
||||
return
|
||||
}
|
||||
|
||||
// Pre-flight: confirm the workspace exists. A 404 here (vs. a
|
||||
// silent no-op for a missing id) gives the migrator a clear
|
||||
// signal to roll back. The RestartByID call below would also
|
||||
// fail in this case, but with a less-precise error; doing the
|
||||
// pre-flight gives ops a clean diagnostic in the wire log.
|
||||
var exists int
|
||||
err := db.DB.QueryRowContext(c.Request.Context(), `SELECT 1 FROM workspaces WHERE id = $1`, id).Scan(&exists)
|
||||
if err != nil {
|
||||
if err.Error() == "sql: no rows in result set" {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": "workspace not found"})
|
||||
return
|
||||
}
|
||||
log.Printf("AdminRestart: workspace lookup %s: %v", id, err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "db lookup failed"})
|
||||
return
|
||||
}
|
||||
|
||||
// Fire the restart ASYNC — same pattern as the user-facing
|
||||
// POST /workspaces/:id/restart handler. The actual restart runs
|
||||
// in a goroutine; we return 202 Accepted immediately so the
|
||||
// migrator's poll loop isn't held by the restart's own
|
||||
// provisioning time.
|
||||
h.goAsync(func() { h.RestartByID(id) })
|
||||
log.Printf("AdminRestart: dispatching restart for workspace %s (CP migrator settle — fleet-credential incident durable fix)", id)
|
||||
c.JSON(http.StatusAccepted, gin.H{
|
||||
"status": "restart_dispatched",
|
||||
"workspace_id": id,
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,160 @@
|
||||
package handlers
|
||||
|
||||
// workspace_admin_restart_test.go — tests for the AdminRestart handler
|
||||
// (the partner of the user-facing POST /workspaces/:id/restart). The CP
|
||||
// migrator calls this to re-inject the tenant's LLM creds via the
|
||||
// loadWorkspaceSecrets path on a freshly-migrated box (today's
|
||||
// 2026-06-15 fleet-credential incident root-cause durable fix — see
|
||||
// PRs #824 (CP) and this one (tenant partner)). Mirrors the
|
||||
// SetComputeInstance test pattern (workspace_set_compute_instance_test.go).
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// AdminRestart re-injects LLM creds via the loadWorkspaceSecrets path
|
||||
// (the durable fix for today's 2026-06-15 fleet-credential incident —
|
||||
// see controlplane PR #824 for the migrator-side). The handler fires
|
||||
// wh.RestartByID ASYNC (per the existing /restart endpoint's pattern)
|
||||
// and returns 202 Accepted immediately.
|
||||
func TestAdminRestart_HappyPath(t *testing.T) {
|
||||
h, mock := setupBootstrapHandler(t)
|
||||
|
||||
// Pre-flight: confirm the workspace exists. The handler does
|
||||
// a SELECT 1 FROM workspaces WHERE id = $1 before firing the
|
||||
// async restart, so we expect that query.
|
||||
mock.ExpectQuery(`SELECT 1 FROM workspaces WHERE id = \$1`).
|
||||
WithArgs("ws-migrated").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"x"}).AddRow(1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-migrated"}}
|
||||
c.Request = httptest.NewRequest("POST", "/admin/workspaces/ws-migrated/restart", nil)
|
||||
|
||||
h.AdminRestart(c)
|
||||
|
||||
if w.Code != http.StatusAccepted {
|
||||
t.Fatalf("want 202, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet: %v", err)
|
||||
}
|
||||
// The actual restart is async; we don't assert on the goroutine
|
||||
// (it would no-op on the test bootstrap since h has no provisioner
|
||||
// wired; the goAsync panic-recovery swallows any panic cleanly).
|
||||
}
|
||||
|
||||
// A workspace id that matches no row is a 404 — the migrator can tell
|
||||
// a stale id from a real restart. Distinct from SetComputeInstance's
|
||||
// NoRowIs404 (which fires on the UPDATE rowcount), here the
|
||||
// pre-flight SELECT does the work.
|
||||
func TestAdminRestart_NoRowIs404(t *testing.T) {
|
||||
h, mock := setupBootstrapHandler(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT 1 FROM workspaces WHERE id = \$1`).
|
||||
WithArgs("ws-gone").
|
||||
WillReturnError(sql.ErrNoRows)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-gone"}}
|
||||
c.Request = httptest.NewRequest("POST", "/admin/workspaces/ws-gone/restart", nil)
|
||||
|
||||
h.AdminRestart(c)
|
||||
|
||||
if w.Code != http.StatusNotFound {
|
||||
t.Fatalf("want 404, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// A DB failure on the pre-flight surfaces as 500 so the migrator
|
||||
// can fail loudly rather than silently restart into a missing
|
||||
// workspace. (RestartByID would fail too, but with a less-precise
|
||||
// error from the deeper code path; surfacing the pre-flight 500
|
||||
// gives ops a clean diagnostic.)
|
||||
func TestAdminRestart_DBErrorIs500(t *testing.T) {
|
||||
h, mock := setupBootstrapHandler(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT 1 FROM workspaces WHERE id = \$1`).
|
||||
WithArgs("ws-1").
|
||||
WillReturnError(errors.New("connection reset"))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
|
||||
c.Request = httptest.NewRequest("POST", "/admin/workspaces/ws-1/restart", nil)
|
||||
|
||||
h.AdminRestart(c)
|
||||
|
||||
if w.Code != http.StatusInternalServerError {
|
||||
t.Fatalf("want 500, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// An empty id is a 400 before any DB work — the migrator never
|
||||
// issues an empty id (it always has a real wsID from the cutover
|
||||
// record), so this is a defense-in-depth check, not a hot path.
|
||||
func TestAdminRestart_EmptyIDIs400(t *testing.T) {
|
||||
h, _ := setupBootstrapHandler(t)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: ""}}
|
||||
c.Request = httptest.NewRequest("POST", "/admin/workspaces//restart", nil)
|
||||
|
||||
h.AdminRestart(c)
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Errorf("want 400, got %d", w.Code)
|
||||
}
|
||||
}
|
||||
|
||||
// Sanity check that the handler does NOT pause for the actual restart
|
||||
// (the 202 path is async; the migrator's poll loop is not held by
|
||||
// the restart's provisioning time). A 1ms-budgeted assertion catches
|
||||
// a regression that turns the handler into a synchronous call.
|
||||
func TestAdminRestart_AsyncDoesNotBlock(t *testing.T) {
|
||||
h, mock := setupBootstrapHandler(t)
|
||||
mock.ExpectQuery(`SELECT 1 FROM workspaces WHERE id = \$1`).
|
||||
WithArgs("ws-1").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"x"}).AddRow(1))
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
|
||||
c.Request = httptest.NewRequest("POST", "/admin/workspaces/ws-1/restart", nil)
|
||||
h.AdminRestart(c)
|
||||
close(done)
|
||||
}()
|
||||
select {
|
||||
case <-done:
|
||||
// PASS — handler returned quickly.
|
||||
case <-timeAfter(1):
|
||||
t.Fatal("AdminRestart blocked (the 202 must return without waiting for the restart goroutine)")
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// timeAfter returns a channel that fires after d milliseconds. It is a
// package-level wrapper around time.After (time is imported directly by
// this file) so the wait in the test above reads as a millisecond count.
|
||||
var timeAfter = func(d int) <-chan time.Time { return time.After(time.Duration(d) * time.Millisecond) }
|
||||
@@ -203,6 +203,18 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
|
||||
// fighting the migration into a split-brain. Pure record repoint (no
|
||||
// deprovision); the CP migrator calls it once the cutover is verified.
|
||||
wsAdmin.POST("/admin/workspaces/:id/set-compute-instance", wh.SetComputeInstance)
|
||||
// Admin-triggered restart of a workspace — the partner of the
|
||||
// user-facing POST /workspaces/:id/restart (which uses the
|
||||
// workspace's own bearer). The CP migrator calls this after a
|
||||
// cross-cloud migration cutover to re-inject LLM creds via the
|
||||
// loadWorkspaceSecrets path (today's 2026-06-15 fleet-credential
|
||||
// incident root-cause durable fix — see PRs #824 (CP) and this
|
||||
// one (tenant partner)). The handler fires wh.RestartByID async
|
||||
// and returns 202 Accepted immediately; the actual restart
|
||||
// happens in the background and the migrator's strengthened
|
||||
// health check (assertCompletionServes in CP#824) verifies the
|
||||
// cred re-injection landed.
|
||||
wsAdmin.POST("/admin/workspaces/:id/restart", wh.AdminRestart)
|
||||
// Per-workspace LLM billing mode override (internal#691). Used by
|
||||
// CP's /cp/admin/workspaces/:id/llm-billing-mode proxy + (via that
|
||||
// proxy) by the canvas Config-tab "LLM Billing" section. Default-
|
||||
|
||||
Reference in New Issue
Block a user