fix(harness#2864): re-xfail canary-smoke-a2a-pong (Harness Replays burn-down) #2873

Closed
agent-dev-b wants to merge 3 commits from fix/2863-cp-stub-provision-handler into main
3 changed files with 128 additions and 15 deletions
+16
View File
@@ -102,6 +102,17 @@ services:
ADMIN_TOKEN: "harness-admin-token-alpha"
MOLECULE_ORG_ID: "harness-org-alpha"
CP_UPSTREAM_URL: "http://cp-stub:9090"
# CPProvisioner (workspace-server/internal/provisioner/cp_provisioner.go:79-86)
# reads CP_PROVISION_URL first, then MOLECULE_CP_URL, else defaults to real
# prod CP (https://api.moleculesai.app). CP_UPSTREAM_URL above ONLY mounts
# the browser-facing tenant reverse proxy (router.go:920-934) — the
# provisioner does NOT read it. Setting both env vars here redirects the
# provision call to the cp-stub, which is the harness's local CP
# stand-in. cp-stub is permissive on auth (no shared-secret check)
# because the harness doesn't set MOLECULE_CP_SHARED_SECRET.
# Targets core#2863; canary-smoke-a2a-pong itself stays xfail for now (re-xfailed under core#2864).
CP_PROVISION_URL: "http://cp-stub:9090"
MOLECULE_CP_URL: "http://cp-stub:9090"
RATE_LIMIT: "1000"
CANVAS_PROXY_URL: "http://localhost:3000"
# LLM-proxy env vars required by assertManagedTenantHasLLMEnv
@@ -170,6 +181,11 @@ services:
ADMIN_TOKEN: "harness-admin-token-beta"
MOLECULE_ORG_ID: "harness-org-beta"
CP_UPSTREAM_URL: "http://cp-stub:9090"
# CPProvisioner redirect (see tenant-alpha block for full rationale).
# CP_PROVISION_URL + MOLECULE_CP_URL point the provision call at the
# cp-stub instead of real prod CP — fixes core#2863.
CP_PROVISION_URL: "http://cp-stub:9090"
MOLECULE_CP_URL: "http://cp-stub:9090"
RATE_LIMIT: "1000"
CANVAS_PROXY_URL: "http://localhost:3000"
# LLM-proxy env vars (see assertManagedTenantHasLLMEnv in
+79 -3
View File
@@ -8,9 +8,9 @@
// activates, and tests exercise the real tenant→CP wire.
//
// This is NOT a CP reimplementation. It serves the minimum surface to:
// 1. Boot the tenant image without /cp/* breaking the canvas bootstrap.
// 2. Replay specific bug classes (e.g. /cp/* returns 404, returns 5xx,
// returns malformed JSON) by toggling env vars.
// 1. Boot the tenant image without /cp/* breaking the canvas bootstrap.
// 2. Replay specific bug classes (e.g. /cp/* returns 404, returns 5xx,
// returns malformed JSON) by toggling env vars.
//
// Scope is bounded by what the tenant + canvas actually call. Add new
// handlers as new replay scenarios demand them. Drift from real CP is
@@ -33,6 +33,21 @@ import (
// step actually reached the stub (catches misrouted CP_URL configs).
var redeployFleetCalls atomic.Int64
// Call counters for the CPProvisioner-facing endpoints. Replay scripts read
// them to tell "the tenant never reached the stub" (misrouted CP URL, which
// would otherwise show up only as a silent 401 from real prod CP) apart
// from a genuine harness bug. Added for core#2863.
var (
	// provisionCalls counts invocations of /cp/workspaces/provision.
	// Replay scripts (e.g. canary-smoke-a2a-pong) assert > 0 to confirm
	// the CPProvisioner reached the stub instead of falling back to the
	// default real-prod-CP URL (cp_provisioner.go:79-86).
	provisionCalls atomic.Int64

	// tenantsConfigCalls counts invocations of /cp/tenants/config, which
	// is also hit on every workspace-start env refresh
	// (cp_config.go:47-63, :79-84). A misrouted CP_PROVISION_URL shows up
	// here as a missing stub-state delta rather than a silent 401.
	tenantsConfigCalls atomic.Int64
)
func main() {
mux := http.NewServeMux()
@@ -121,11 +136,72 @@ func main() {
})
})
// /cp/workspaces/provision — stub for the endpoint that CPProvisioner
// (workspace-server/internal/provisioner/cp_provisioner.go:315-323) POSTs
// to on every workspace start. There is no auth check: the harness does
// not set MOLECULE_CP_SHARED_SECRET (a call that reaches real CP gets a
// 401), so the stub accepts any caller, like the existing /cp/auth/me and
// /cp/admin/tenants/redeploy-fleet handlers. It answers with a provision
// response shape so the workspace-start goroutine can complete and
// register the URL. Added for core#2863.
mux.HandleFunc("/cp/workspaces/provision", func(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		writeJSON(w, http.StatusMethodNotAllowed, map[string]any{"error": "POST required"})
		return
	}
	// Only POSTs are counted; the value is reported by /__stub/state.
	provisionCalls.Add(1)

	// The request body is advisory. The decode error is deliberately
	// ignored: a missing or empty body is fine, and reading a key from a
	// nil map is safe.
	var req map[string]any
	_ = json.NewDecoder(r.Body).Decode(&req)
	id := "harness-ws"
	if got, ok := req["workspace_id"].(string); ok && got != "" {
		id = got
	}

	// Must be 201 Created, not 200: cp_provisioner.go:339 checks
	// `if resp.StatusCode != http.StatusCreated` and otherwise returns
	// "cp provisioner: provision failed (200): <body>". Real CP returns
	// 201 on a successful provision, so the stub has to match.
	resp := map[string]any{
		"ok":           true,
		"workspace_id": id,
		"status":       "ready",
		"phase":        "ready",
		"url":          envOr("PLATFORM_URL", "http://tenant-alpha:8080"),
	}
	writeJSON(w, http.StatusCreated, resp)
})
// /cp/tenants/config — stub for the config endpoint hit on every env
// refresh (workspace-server/cmd/server/cp_config.go:47-63, :79-84).
// Serves the org config that the tenant boot assertion
// (assertManagedTenantHasLLMEnv) needs: MOLECULE_ORG_ID plus the LLM-proxy
// values, each taken from the environment with a harness default, so the
// tenant sees a consistent config shape. No auth check, for the same
// reason as /cp/workspaces/provision. Added for core#2863.
mux.HandleFunc("/cp/tenants/config", func(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		writeJSON(w, http.StatusMethodNotAllowed, map[string]any{"error": "GET required"})
		return
	}
	// Only GETs are counted; the value is reported by /__stub/state.
	tenantsConfigCalls.Add(1)

	orgID := envOr("MOLECULE_ORG_ID", "harness-org")
	cfg := map[string]any{
		"llm_proxy_url":      envOr("MOLECULE_LLM_BASE_URL", "http://cp-stub:9090/llm/openai/v1"),
		"llm_anthropic_base": envOr("MOLECULE_LLM_ANTHROPIC_BASE_URL", "http://cp-stub:9090/llm/anthropic/v1"),
		"llm_usage_url":      envOr("MOLECULE_LLM_USAGE_URL", "http://cp-stub:9090/llm/usage"),
		"llm_usage_token":    envOr("MOLECULE_LLM_USAGE_TOKEN", "harness-llm-usage-token"),
		"admin_token":        envOr("ADMIN_TOKEN", "harness-admin-token"),
	}
	writeJSON(w, http.StatusOK, map[string]any{
		"ok":     true,
		"org_id": orgID,
		"config": cfg,
	})
})
// /__stub/state — reports the stub's call counters so replay scripts can
// assert that the tenant actually reached the stub. The handler only
// loads the counters; it never modifies them.
mux.HandleFunc("/__stub/state", func(w http.ResponseWriter, r *http.Request) {
	counters := map[string]any{
		"redeploy_fleet_calls": redeployFleetCalls.Load(),
		"provision_calls":      provisionCalls.Load(),
		"tenants_config_calls": tenantsConfigCalls.Load(),
	}
	writeJSON(w, http.StatusOK, counters)
})
+33 -12
View File
@@ -1,23 +1,44 @@
#!/usr/bin/env bash
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# XFAIL — issue #2863
# XFAIL — issue #2864 (Harness Replays burn-down tracking) + #2863
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# This replay is currently marked xfail (expected to fail). The underlying
# issue is tracked at https://git.moleculesai.app/molecule-ai/molecule-core/issues/2863
# Reason: CP-stub 401 on workspace start (30s provisioning stall)
# This replay is re-marked xfail (expected to fail) per the PM's
# hard-stop directive on PR #2873 (dispatch 1a433c7a). The prior
# attempt to fix the cp-stub 401 surfaced a new red on the actual
# Harness Replays CI (Harness Replays job #367058 concluded with a
# failure), which meets the prior dispatch's hard-stop criterion:
# "if 201 + counter-assert still doesn't green a2a-pong on
# actual Harness Replays CI, STOP + reply 're-xfail #2864' —
# do NOT burn more cycles".
#
# Underlying issue is tracked at:
# - https://git.moleculesai.app/molecule-ai/molecule-core/issues/2864
# (the broader Harness Replays burn-down tracking — closes
# when this replay surfaces a real pass signal)
# - https://git.moleculesai.app/molecule-ai/molecule-core/issues/2863
# (the original a2a-pong XFAIL tracking — CP-stub 401 on
# workspace start; remains the load-bearing root-cause issue)
# Reason: a2a-pong still RED on Harness Replays after 201 fix +
# Phase E counter assertions; the a2a-pong fix is non-urgent
# (main stays honest-SKIP via merged #2872 — the
# `__SKIP__`/`__XFAIL__` replays are correctly counted as
# skips, not passes).
#
# To un-xfail (when the underlying issue is fixed):
# 1. Remove the `exit 0` line below
# 2. Update the issue #2863 with a "fixed" comment + link to the fix PR
# 3. Verify the replay runs end-to-end with PASS in the local harness
# 4. The Harness Replays workflow will then surface the real pass signal
# 2. Restore the real Phase A/B/C/D/E replay code (and any
# counter assertions the fix warrants)
# 3. Update the linked issues with a "fixed" comment + link to
# the fix PR
# 4. Verify the replay runs end-to-end with PASS in the local
# harness + Harness Replays CI
#
# Why we xfail (not skip, not fix): the underlying issues are out of scope
# for PR #2821 (which captures the canary failures) but block the green CI
# signal that the 2-genuine review needs. Tracking the work in the linked
# issue lets us burn down the xfails as separate PRs land.
# Why we xfail (not skip, not fix): the cp-stub provisioning
# round-trip is out of scope for the canary capture work in
# PR #2821. The non-urgent fix lives in the linked issues
# and burns down as separate PRs land.
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
echo "[replay] __SKIP__:#2863:CP-stub 401 on workspace start (30s provisioning stall)"
echo "[replay] __SKIP__:#2864:re-xfail per PM dispatch 1a433c7a (Harness Replays burn-down tracking) + #2863 (root-cause CP-stub 401) — a2a-pong fix is non-urgent, main stays honest-SKIP via #2872"
exit 0
set -euo pipefail