Compare commits

..

2 Commits

Author SHA1 Message Date
devops-engineer 81630a36f8 Merge branch 'main' into test/delegate-record-db-errors
ci-arm64-advisory / fast-checks (pull_request) Waiting to run
Block internal-flavored paths / Block forbidden paths (pull_request) Successful in 6s
CI / Python Lint & Test (pull_request) Successful in 5s
CI / Detect changes (pull_request) Successful in 13s
Handlers Postgres Integration / detect-changes (pull_request) Successful in 9s
CI / Canvas (Next.js) (pull_request) Successful in 4s
E2E Chat / detect-changes (pull_request) Successful in 16s
E2E API Smoke Test / detect-changes (pull_request) Successful in 21s
CI / Shellcheck (E2E scripts) (pull_request) Successful in 8s
Harness Replays / detect-changes (pull_request) Successful in 13s
CI / Canvas Deploy Status (pull_request) Has been skipped
Lint forbidden tenant-env keys / Scan for repo-host token write into tenant workspace surface (pull_request) Successful in 8s
Lint forbidden tenant-env keys / Scan workspace_secrets writers for forbidden env keys (pull_request) Successful in 17s
E2E Staging Canvas (Playwright) / detect-changes (pull_request) Successful in 24s
E2E Chat / E2E Chat (pull_request) Successful in 5s
Harness Replays / Harness Replays (pull_request) Successful in 6s
Lint shellcheck (arm64 pilot) / shellcheck-arm64 (pilot) (pull_request) Successful in 18s
security-review / approved (pull_request_target) Failing after 11s
gate-check-v3 / gate-check (pull_request_target) Failing after 16s
Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 19s
qa-review / approved (pull_request_target) Failing after 14s
sop-checklist / review-refire (pull_request_target) Has been skipped
sop-checklist / all-items-acked (pull_request) [info tier:low] acked: 0/7 — missing: comprehensive-testing, local-postgres-e2e, staging-smoke, +4 — body-unfilled: comprehensive-testing, l
E2E Staging Canvas (Playwright) / Canvas tabs E2E (pull_request) Successful in 8s
sop-checklist / na-declarations (pull_request) N/A: (none)
sop-checklist / all-items-acked (pull_request_target) Successful in 6s
sop-tier-check / tier-check (pull_request_target) Failing after 7s
lint-required-no-paths / lint-required-no-paths (pull_request) Successful in 1m8s
Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Successful in 1m17s
E2E API Smoke Test / E2E API Smoke Test (pull_request) Successful in 2m8s
CI / Platform (Go) (pull_request) Successful in 4m10s
CI / all-required (pull_request) Successful in 4s
2026-06-06 18:50:49 +00:00
fullstack-engineer a60033dc16 test(handlers): add missing DB-error tests for Record and SessionSearch
E2E API Smoke Test / E2E API Smoke Test (pull_request) Blocked by required conditions
Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Blocked by required conditions
Harness Replays / Harness Replays (pull_request) Blocked by required conditions
lint-continue-on-error-tracking / lint-continue-on-error-tracking (pull_request) Waiting to run
Lint curl status-code capture / Scan workflows for curl status-capture pollution (pull_request) Waiting to run
lint-mask-pr-atomicity / lint-mask-pr-atomicity (pull_request) Waiting to run
publish-runtime-autobump / bump-and-tag (pull_request) Waiting to run
MCP Stdio Transport Regression / MCP stdio with regular-file stdout (pull_request) Successful in 3m8s
publish-runtime-autobump / pr-validate (pull_request) Successful in 1m22s
Lint workflow YAML (Gitea-1.22.6-hostile shapes) / Lint workflow YAML for Gitea-1.22.6-hostile shapes (pull_request) Successful in 2m11s
Lint pre-flip continue-on-error / Verify continue-on-error flips have run-log proof (pull_request) Successful in 2m41s
lint-required-context-exists-in-bp / lint-required-context-exists-in-bp (pull_request) Successful in 3m6s
Ops Scripts Tests / Ops scripts (unittest) (pull_request) Failing after 1m59s
audit-force-merge / audit (pull_request) Waiting to run
E2E Staging Canvas (Playwright) / Canvas tabs E2E (pull_request) Blocked by required conditions
Runtime PR-Built Compatibility / PR-built wheel + import smoke (pull_request) Blocked by required conditions
Block internal-flavored paths / Block forbidden paths (pull_request) Successful in 34s
CI / Shellcheck (E2E scripts) (pull_request) Successful in 43s
CI / Detect changes (pull_request) Successful in 1m13s
Handlers Postgres Integration / detect-changes (pull_request) Successful in 24s
E2E API Smoke Test / detect-changes (pull_request) Successful in 1m16s
Harness Replays / detect-changes (pull_request) Successful in 53s
E2E Staging Canvas (Playwright) / detect-changes (pull_request) Successful in 1m57s
Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 38s
Runtime PR-Built Compatibility / detect-changes (pull_request) Successful in 58s
qa-review / approved (pull_request) Failing after 48s
gate-check-v3 / gate-check (pull_request) Failing after 55s
lint-required-no-paths / lint-required-no-paths (pull_request) Successful in 1m51s
security-review / approved (pull_request) Failing after 41s
sop-tier-check / tier-check (pull_request) Successful in 40s
CI / Python Lint & Test (pull_request) Successful in 8m14s
CI / Canvas (Next.js) (pull_request) Successful in 19m47s
CI / Platform (Go) (pull_request) Successful in 21m16s
CI / all-required (pull_request) Successful in 21m27s
CI / Canvas Deploy Reminder (pull_request) Successful in 8s
sop-checklist / all-items-acked (pull_request) [info tier:low] acked: 0/7 — missing: comprehensive-testing, local-postgres-e2e, staging-smoke, +4 — body-unfilled: comprehensive-testing, l
- TestDelegationRecord_DBInsertFails: verifies 500 on activity_logs insert failure
- TestSessionSearch_DBError: verifies 500 on WITH query failure

Both are regression coverage for error paths that lacked test coverage.

🤖 Generated with [Claude Code](https://claude.ai/claude-code)
2026-05-15 07:38:15 +00:00
43 changed files with 251 additions and 1364 deletions
+19 -60
View File
@@ -54,57 +54,32 @@ API="https://${GITEA_HOST}/api/v1"
AUTH="Authorization: token ${GITEA_TOKEN}"
# 1. Fetch the PR. If not merged, no-op.
# Fail-closed: verify HTTP 200 before parsing. A 401/403/404 means the token
# is invalid or the PR is inaccessible — we must NOT silently treat that as
# "not merged" and skip the audit.
PR_TMP=$(mktemp)
PR_HTTP=$(curl -sS -o "$PR_TMP" -w '%{http_code}' -H "$AUTH" \
"${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}")
PR=$(cat "$PR_TMP")
rm -f "$PR_TMP"
if [ "$PR_HTTP" != "200" ]; then
echo "::error::GET /pulls/${PR_NUMBER} returned HTTP ${PR_HTTP} — cannot evaluate merge state."
exit 1
fi
# FAIL-CLOSED: a 200 response with a missing/malformed `merged` field must
# NOT be treated as "not merged" (that would silently skip the audit).
# We verify both presence AND correct type for every field we consume.
PR_SCHEMA_OK=$(echo "$PR" | jq -r '
(.merged | type == "boolean") and
(.merge_commit_sha | type == "string") and
(.merged_by | type == "object") and (.merged_by.login | type == "string") and
(.base | type == "object") and (.base.ref | type == "string") and
(.head | type == "object") and (.head.sha | type == "string")
')
if [ "$PR_SCHEMA_OK" != "true" ]; then
echo "::error::GET /pulls/${PR_NUMBER} returned HTTP 200 but one or more required fields are missing, null, or of wrong type — cannot evaluate force-merge."
exit 1
fi
MERGED=$(echo "$PR" | jq -r '.merged')
PR=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}")
MERGED=$(echo "$PR" | jq -r '.merged // false')
if [ "$MERGED" != "true" ]; then
echo "::notice::PR #${PR_NUMBER} closed without merge — no audit emission."
exit 0
fi
MERGE_SHA=$(echo "$PR" | jq -r '.merge_commit_sha')
MERGED_BY=$(echo "$PR" | jq -r '.merged_by.login')
# NOTE: no || true — with set -euo pipefail, jq parse failures (e.g. field
# missing from API response) propagate as hard errors. Use jq's // operator
# for graceful defaults instead of bash || true guards. This was re-added by
# 8c343e3a ("fix(gitea): add || true guards to jq pipelines") — reverted
# here because the guards mask silent failures that hide malformed API responses.
MERGE_SHA=$(echo "$PR" | jq -r '.merge_commit_sha // empty')
MERGED_BY=$(echo "$PR" | jq -r '.merged_by.login // "unknown"')
TITLE=$(echo "$PR" | jq -r '.title // ""')
BASE_BRANCH=$(echo "$PR" | jq -r '.base.ref')
HEAD_SHA=$(echo "$PR" | jq -r '.head.sha')
BASE_BRANCH=$(echo "$PR" | jq -r '.base.ref // "main"')
HEAD_SHA=$(echo "$PR" | jq -r '.head.sha // empty')
if [ -z "$MERGE_SHA" ]; then
echo "::warning::PR #${PR_NUMBER} merged=true but no merge_commit_sha — cannot evaluate force-merge."
exit 0
fi
# 2. Required status checks — branch-aware JSON dict takes precedence.
if [ -n "${REQUIRED_CHECKS_JSON:-}" ]; then
# FAIL-CLOSED: if REQUIRED_CHECKS_JSON is set, the branch entry must exist
# and be an array. A missing branch or non-array value means the config is
# malformed or drifted — we must NOT silently treat it as "no checks".
_RC_JSON_OK=$(echo "$REQUIRED_CHECKS_JSON" | jq -r --arg branch "$BASE_BRANCH" '
has($branch) and (.[$branch] | type == "array")
')
if [ "$_RC_JSON_OK" != "true" ]; then
echo "::error::REQUIRED_CHECKS_JSON missing or non-array entry for branch '$BASE_BRANCH' — cannot evaluate required checks."
exit 1
fi
REQUIRED=$(echo "$REQUIRED_CHECKS_JSON" | jq -r --arg branch "$BASE_BRANCH" '.[$branch] | .[]')
REQUIRED=$(echo "$REQUIRED_CHECKS_JSON" | jq -r --arg branch "$BASE_BRANCH" '.[$branch] // [] | .[]')
else
REQUIRED="$REQUIRED_CHECKS"
fi
@@ -116,28 +91,12 @@ fi
# 3. Status-check state at the PR HEAD (where checks ran). The merge
# commit doesn't get its own checks; we evaluate the PR's last
# commit, which is what branch protection compared against.
# Fail-closed: verify HTTP 200. A 401/403/404 means the status is
# unreadable — we must NOT treat that as "no statuses" and skip checks.
STATUS_TMP=$(mktemp)
STATUS_HTTP=$(curl -sS -o "$STATUS_TMP" -w '%{http_code}' -H "$AUTH" \
STATUS=$(curl -sS -H "$AUTH" \
"${API}/repos/${OWNER}/${NAME}/commits/${HEAD_SHA}/status")
STATUS=$(cat "$STATUS_TMP")
rm -f "$STATUS_TMP"
if [ "$STATUS_HTTP" != "200" ]; then
echo "::error::GET /commits/${HEAD_SHA}/status returned HTTP ${STATUS_HTTP} — cannot evaluate required checks."
exit 1
fi
# FAIL-CLOSED: a 200 status response missing the 'statuses' array, or with
# 'statuses' set to a non-array type (null/string/object), must NOT be treated
# as "no checks" — that would silently declare all checks green.
if ! echo "$STATUS" | jq -e '(.statuses | type) == "array"' >/dev/null; then
echo "::error::GET /commits/${HEAD_SHA}/status returned HTTP 200 but 'statuses' is missing or not an array — cannot evaluate required checks."
exit 1
fi
declare -A CHECK_STATE
while IFS=$'\t' read -r ctx state; do
[ -n "$ctx" ] && CHECK_STATE[$ctx]="$state"
done < <(echo "$STATUS" | jq -r '.statuses | .[] | "\(.context)\t\(.status)"')
done < <(echo "$STATUS" | jq -r '.statuses // [] | .[] | "\(.context)\t\(.status)"')
# 4. For each required check, was it green at merge? YAML block scalars
# (`|`) leave a trailing newline; skip blank/whitespace-only lines.
+15 -26
View File
@@ -552,34 +552,23 @@ def find_open_issue(title: str) -> dict | None:
hourly; failing one cycle loudly is strictly better than silently
duplicating.
Paginates through all open issues (limit=50 per page) until the
title is found or the result set is exhausted. Previously only one
page was fetched, causing duplicate [ci-drift] issues when the
existing tracking issue fell beyond page 1.
Gitea issue search returns at most page=50 per page; one page is
enough as long as `[ci-drift]` issues are a tiny minority. (See
follow-up issue for Link-header pagination.)
"""
page = 1
while True:
_, results = api(
"GET",
f"/repos/{OWNER}/{NAME}/issues",
query={
"state": "open",
"type": "issues",
"limit": "50",
"page": str(page),
},
_, results = api(
"GET",
f"/repos/{OWNER}/{NAME}/issues",
query={"state": "open", "type": "issues", "limit": "50"},
)
if not isinstance(results, list):
raise ApiError(
f"issue search returned non-list body (got {type(results).__name__})"
)
if not isinstance(results, list):
raise ApiError(
f"issue search returned non-list body (got {type(results).__name__})"
)
for issue in results:
if issue.get("title") == title:
return issue
# Fewer than limit results means last page reached.
if len(results) < 50:
return None
page += 1
for issue in results:
if issue.get("title") == title:
return issue
return None
def render_body(branch: str, findings: list[str], debug: dict) -> str:
+27 -91
View File
@@ -9,33 +9,17 @@ queue. This script provides the missing serialized policy in user space:
candidate (REQUEST_CHANGES, mergeable!=True, insufficient genuine approvals,
or red required CI) is SKIPPED so it cannot head-of-line block newer ready
PRs; the scan continues to the next candidate.
2. Refuse to act unless main's BP-required contexts are green. This is also
the serialized backstop for direct-merge (see below): after a direct merge,
main re-runs push CI and this gate PAUSES the queue if main goes red, so no
merge piles onto an unverified/red main (issue #2358).
2. Refuse to act unless main's BP-required contexts are green.
3. Refuse fork PRs; the queue may only mutate same-repo branches.
4. DIRECT-MERGE when conflict-free (issue #2358). When Gitea reports the PR
conflict-free (mergeable is True) and the merge bar below is met, MERGE IT
DIRECTLY — even if its head does not contain current main. We do NOT call
/pulls/{n}/update first: branch protection does not require strict
up-to-date, so behind-main conflict-free PRs merge cleanly, and calling
/update would trigger Gitea dismiss_stale_approvals (dismissing the genuine
approvals and forcing a re-review every tick — the rebase-churn bottleneck).
The /update path is used ONLY when the PR is DEFINITIVELY not mergeable
(mergeable is literal False) AND its head lacks current main — refreshing the
branch may resolve a behind-main non-conflict; a real conflict returns HTTP
409 and the PR is HELD per #2352. mergeable=None/missing (Gitea STILL
COMPUTING conflict state) is a distinct fail-closed WAIT: never merged AND
never /update'd — calling /update during the compute window would dismiss the
PR's genuine approvals (dismiss_stale_approvals) and re-introduce the exact
rebase-churn this queue eliminates. None is re-checked next tick.
4. If the PR branch does not contain current main, call Gitea's
/pulls/{n}/update endpoint and stop. CI must rerun on the updated head.
5. Merge ONLY when, on the PR's CURRENT head sha:
- >= REQUIRED_APPROVALS distinct GENUINE official APPROVED reviews from
the recognised reviewer set (not stale, not dismissed, commit_id ==
current head), AND
- no open official REQUEST_CHANGES on the current head, AND
- every BP-required status context is green, AND
- the PR is mergeable (Gitea reports it conflict-free).
- the PR is mergeable.
Authoritative gates (fail-closed):
- The REQUIRED status contexts come from BRANCH PROTECTION
@@ -638,32 +622,29 @@ def evaluate_merge_readiness(
approvers: set[str],
request_changes: list[str],
pr_has_current_base: bool,
mergeable: bool | None,
mergeable: bool,
pr_labels: set[str] | None = None,
) -> MergeDecision:
# 1) Main's push-required contexts must be green. Combined state can be
# "failure" due to non-blocking jobs (continue-on-error: true) that do
# not gate merges, so check the explicit required set, not combined.
#
# This main-green gate is ALSO the serialized backstop that makes the
# direct-merge (no update) path safe (issue #2358): after a direct merge
# of a behind-main PR, main re-runs its push CI; if a semantic main-break
# slips through (PR green standalone but broken when combined with newer
# main), main's required contexts go red and this gate PAUSES the queue —
# no further merge piles onto an unverified/red main until it is green.
main_latest = latest_statuses_by_context(main_status.get("statuses") or [])
main_ok, main_bad = required_contexts_green(main_latest, push_required_contexts())
if not main_ok:
return MergeDecision(False, "pause", "main required contexts not green: " + ", ".join(main_bad))
# 2) No open official REQUEST_CHANGES on the current head.
# 2) PR head must contain current main.
if not pr_has_current_base:
return MergeDecision(False, "update", "PR head does not contain current main")
# 3) No open official REQUEST_CHANGES on the current head.
if request_changes:
return MergeDecision(
False, "wait",
"open REQUEST_CHANGES on current head from: " + ", ".join(sorted(request_changes)),
)
# 3) Enough distinct genuine official approvals on the current head.
# 4) Enough distinct genuine official approvals on the current head.
if len(approvers) < required_approvals:
return MergeDecision(
False, "wait",
@@ -672,7 +653,7 @@ def evaluate_merge_readiness(
f"need {required_approvals}",
)
# 4) Every BRANCH-PROTECTION-REQUIRED status context must be green. This is
# 5) Every BRANCH-PROTECTION-REQUIRED status context must be green. This is
# the authoritative status gate — NON-required reds (qa-review,
# security-review, sop-tier/sop-checklist when not BP-required, E2E Chat,
# Staging SaaS, ci-arm64-advisory, continue-on-error jobs) are NOT
@@ -682,53 +663,16 @@ def evaluate_merge_readiness(
if not ok:
return MergeDecision(False, "wait", "required contexts not green: " + ", ".join(missing_or_bad))
# 5) DIRECT-MERGE when conflict-free (issue #2358 — throughput fix).
# If Gitea reports the PR conflict-free (mergeable is True), MERGE IT
# DIRECTLY even if its head does not yet contain current main. Branch
# protection does NOT require strict up-to-date, so a behind-main but
# conflict-free PR merges cleanly. We deliberately do NOT call
# /pulls/{n}/update first: update triggers Gitea dismiss_stale_approvals,
# which would dismiss the PR's genuine approvals and force a full
# re-review every tick — the rebase-churn bottleneck that collapsed
# throughput to ~0/hr with dozens of mergeable PRs open.
#
# The merge bar is UNCHANGED: we only reach here with main green +
# >= required genuine approvals on the current head + no open
# REQUEST_CHANGES + every BP-required context green. The trade-off is
# that the PR's CI ran on a possibly-behind base, so a SEMANTIC main-break
# is caught by POST-merge main CI (step 1's pause backstop) rather than
# pre-merge. force_merge is used ONLY for missing-but-non-required
# governance reds (required are green + approvals genuine), never to
# bypass a failing required context or an approval shortfall.
if mergeable is True:
force = _non_required_red_present(latest, required_contexts)
return MergeDecision(True, "merge", "ready", force=force)
# 6) Gitea must consider the PR mergeable (no conflicts).
if not mergeable:
return MergeDecision(False, "wait", "PR is not mergeable (conflicts)")
# 6) NOT (yet) mergeable. TRI-STATE, fail-closed — never merge on an unknown.
# We MUST distinguish "still computing" (None/missing) from a "definitive
# conflict" (False); collapsing them would route a behind-main but
# STILL-COMPUTING PR into the /update path, whose dismiss_stale_approvals
# is the rebase-churn this change eliminates.
#
# mergeable is None → Gitea has NOT finished computing conflict state.
# WAIT: do nothing this tick — never /update (would dismiss genuine
# approvals during the compute window → churn), never merge. Re-check next
# tick once Gitea reports a decisive True/False.
if mergeable is None:
return MergeDecision(
False, "wait",
"PR mergeability is still being computed (mergeable=None) — waiting",
)
# mergeable is False → DEFINITIVE not-mergeable. If the head also does not
# contain current main, try the /update path to refresh the branch (this
# may resolve a behind-main non-conflict; a real conflict returns HTTP 409
# and process_once HOLDs the PR per #2352). If the head already contains
# current main yet Gitea still reports not-mergeable, there is nothing the
# queue can do (genuine conflict against current main) — WAIT.
if not pr_has_current_base:
return MergeDecision(False, "update", "PR not mergeable and head does not contain current main")
return MergeDecision(False, "wait", "PR is not mergeable (conflicts)")
# Ready. Use force_merge ONLY if the merge would otherwise be blocked by
# missing-but-non-required governance contexts. Required are green and
# approvals are genuine, so force only bypasses non-required reds — never a
# failing required context or missing approval.
force = _non_required_red_present(latest, required_contexts)
return MergeDecision(True, "merge", "ready", force=force)
def get_branch_head(branch: str) -> str:
@@ -1132,20 +1076,12 @@ def _evaluate_candidate(
# never treated as green).
pr_status = get_combined_status(head_sha)
pr_labels = label_names(pr)
# FAIL-CLOSED, TRI-STATE: Gitea returns mergeable=None (or omits the field)
# while it is still COMPUTING conflict state, mergeable=False for a definitive
# conflict, and mergeable=True only when it has proven the PR conflict-free.
# We preserve all THREE states (do NOT collapse None/missing into False):
# - True → direct-merge eligible (step 5).
# - None / missing → still computing → WAIT (never merge, never update,
# never dismiss approvals); re-check next tick.
# - False → definitive conflict → the update/hold path (step 6).
# Collapsing None→False would route a behind-main but STILL-COMPUTING PR into
# the /update path, which triggers dismiss_stale_approvals — the exact
# rebase-churn this change eliminates. Normalize only to the literal True /
# False / None set (some Gitea versions omit the key entirely → None).
raw_mergeable = pr.get("mergeable")
mergeable: bool | None = raw_mergeable if isinstance(raw_mergeable, bool) else None
# FAIL-CLOSED: Gitea returns mergeable=None (or omits the field) while it is
# still COMPUTING conflict state. Only the literal True is decisive proof the
# PR is conflict-free; None and False both mean "not (yet) mergeable". We must
# NOT autonomously merge on an unknown — treat anything but True as not-yet-
# mergeable so evaluate_merge_readiness returns a "wait" decision.
mergeable = pr.get("mergeable") is True
reviews = get_pull_reviews(pr_number)
approvers, request_changes = genuine_approvals(
@@ -305,9 +305,9 @@ def validate_tracker(
if status == "error":
sys.stderr.write(
f"::error::issue {slug}#{num} fetch errored — treating as "
f"unverified, FAILING CLOSED (do not skip on outage).\n"
f"unverified, skipping this check.\n"
)
return (False, f"{slug}#{num} fetch erroredcannot verify tracker")
return (True, "fetch-error — skipped")
assert payload is not None
state = payload.get("state", "")
+8 -4
View File
@@ -144,14 +144,18 @@ debug "tier=$TIER"
# as unachievable (would always fail) — operators notice the clear error
# and create the missing team.
#
# Current Gitea teams: ceo, engineers, managers, qa, security
# Current Gitea teams: ceo, engineers, managers
# Future teams (create before removing "???" fallback): qa, security, security-audit
declare -A TIER_EXPR=(
# tier:low — same as previous OR gate: any engineer, manager, or ceo.
["tier:low"]="engineers,managers,ceo"
# tier:medium — AND of (managers) AND (engineers) AND (qa,security)
# ≥1 approver from managers AND ≥1 from engineers AND ≥1 from qa OR security.
["tier:medium"]="managers AND engineers AND qa,security"
# tier:medium — AND of (managers) AND (engineers) AND (qa???,security???)
# The qa+security clause requires both teams to exist; when not yet
# created, the PR author is responsible for adding them before requesting
# approval on a tier:medium PR. Ops: create qa + security Gitea teams
# and update this map to remove the "???" markers (internal#189 follow-up).
["tier:medium"]="managers AND engineers AND qa???,security???"
# tier:high — ceo only. The AND-composition adds no value for a
# single-team gate, but the framework is wired for consistency.
@@ -1,119 +0,0 @@
#!/usr/bin/env bash
# test_audit_force_merge.sh — regression lock for audit-force-merge fail-closed
# behavior. Verifies every schema validation path via direct jq filter tests.
#
# Usage: bash test_audit_force_merge.sh
set -euo pipefail
fail() { echo "FAIL: $*" >&2; exit 1; }
pass() { echo "PASS: $*"; }
[ -x "$(command -v jq)" ] || { echo "SKIP: jq not on PATH"; exit 0; }
HEAD_SHA="deadbeef00000000000000000000000000000000"
# The schema validation jq expression from audit-force-merge.sh.
validate_pr_schema() {
jq -r '
(.merged | type == "boolean") and
(.merge_commit_sha | type == "string") and
(.merged_by | type == "object") and (.merged_by.login | type == "string") and
(.base | type == "object") and (.base.ref | type == "string") and
(.head | type == "object") and (.head.sha | type == "string")
'
}
validate_statuses_type() {
jq -r '(.statuses | type) == "array"'
}
# T1 — valid PR payload → true
T1=$(echo '{"merged":true,"merge_commit_sha":"abc","merged_by":{"login":"u"},"base":{"ref":"main"},"head":{"sha":"def"}}' | validate_pr_schema)
[ "$T1" = "true" ] || fail "T1: valid payload should pass schema"
pass "T1: valid payload passes schema"
# T2 — merged=false (valid types) → true (schema is about types, not values)
T2=$(echo '{"merged":false,"merge_commit_sha":"abc","merged_by":{"login":"u"},"base":{"ref":"main"},"head":{"sha":"def"}}' | validate_pr_schema)
[ "$T2" = "true" ] || fail "T2: merged=false with valid types should pass schema"
pass "T2: merged=false with valid types passes schema"
# T3 — missing merged field → false
T3=$(echo '{"merge_commit_sha":"abc","merged_by":{"login":"u"},"base":{"ref":"main"},"head":{"sha":"def"}}' | validate_pr_schema)
[ "$T3" = "false" ] || fail "T3: missing merged should fail schema"
pass "T3: missing merged fails schema"
# T4 — merged is string "true" instead of boolean → false
T4=$(echo '{"merged":"true","merge_commit_sha":"abc","merged_by":{"login":"u"},"base":{"ref":"main"},"head":{"sha":"def"}}' | validate_pr_schema)
[ "$T4" = "false" ] || fail "T4: merged as string should fail schema"
pass "T4: merged as string fails schema"
# T5 — merge_commit_sha is null → false
T5=$(echo '{"merged":true,"merge_commit_sha":null,"merged_by":{"login":"u"},"base":{"ref":"main"},"head":{"sha":"def"}}' | validate_pr_schema)
[ "$T5" = "false" ] || fail "T5: null merge_commit_sha should fail schema"
pass "T5: null merge_commit_sha fails schema"
# T6 — merged_by is null → false
T6=$(echo '{"merged":true,"merge_commit_sha":"abc","merged_by":null,"base":{"ref":"main"},"head":{"sha":"def"}}' | validate_pr_schema)
[ "$T6" = "false" ] || fail "T6: null merged_by should fail schema"
pass "T6: null merged_by fails schema"
# T7 — base.ref is number → false
T7=$(echo '{"merged":true,"merge_commit_sha":"abc","merged_by":{"login":"u"},"base":{"ref":123},"head":{"sha":"def"}}' | validate_pr_schema)
[ "$T7" = "false" ] || fail "T7: numeric base.ref should fail schema"
pass "T7: numeric base.ref fails schema"
# T8 — head is missing → false
T8=$(echo '{"merged":true,"merge_commit_sha":"abc","merged_by":{"login":"u"},"base":{"ref":"main"}}' | validate_pr_schema)
[ "$T8" = "false" ] || fail "T8: missing head should fail schema"
pass "T8: missing head fails schema"
# T9 — statuses missing → false
T9=$(echo '{}' | validate_statuses_type)
[ "$T9" = "false" ] || fail "T9: missing statuses should fail type check"
pass "T9: missing statuses fails type check"
# T10 — statuses is string → false
T10=$(echo '{"statuses":"unexpected"}' | validate_statuses_type)
[ "$T10" = "false" ] || fail "T10: string statuses should fail type check"
pass "T10: string statuses fails type check"
# T11 — statuses is null → false
T11=$(echo '{"statuses":null}' | validate_statuses_type)
[ "$T11" = "false" ] || fail "T11: null statuses should fail type check"
pass "T11: null statuses fails type check"
# T12 — statuses is array → true
T12=$(echo '{"statuses":[{"context":"c1","status":"success"}]}' | validate_statuses_type)
[ "$T12" = "true" ] || fail "T12: array statuses should pass type check"
pass "T12: array statuses passes type check"
# T13 — empty array statuses → true
T13=$(echo '{"statuses":[]}' | validate_statuses_type)
[ "$T13" = "true" ] || fail "T13: empty array statuses should pass type check"
pass "T13: empty array statuses passes type check"
# T14-T16: REQUIRED_CHECKS_JSON branch entry validation
validate_required_checks_json() {
local branch="$1"
local json="$2"
echo "$json" | jq -r --arg branch "$branch" 'has($branch) and (.[$branch] | type == "array")'
}
# T14 — branch exists and is array → true
T14=$(validate_required_checks_json "main" '{"main":["CI / all-required"]}')
[ "$T14" = "true" ] || fail "T14: existing array branch should pass"
pass "T14: existing array branch passes"
# T15 — branch missing → false
T15=$(validate_required_checks_json "staging" '{"main":["CI / all-required"]}')
[ "$T15" = "false" ] || fail "T15: missing branch should fail"
pass "T15: missing branch fails"
# T16 — branch entry is string instead of array → false
T16=$(validate_required_checks_json "main" '{"main":"CI / all-required"}')
[ "$T16" = "false" ] || fail "T16: string branch entry should fail"
pass "T16: string branch entry fails"
echo
echo "ALL AUDIT-FORCE-MERGE CHECKS PASSED"
+5 -193
View File
@@ -143,72 +143,13 @@ def test_merge_decision_requires_main_green_pr_green_and_current_base():
assert decision.force is False # no non-required reds present
def test_behind_main_but_mergeable_pr_merges_directly():
"""§SOP-22 (#2358): a behind-main but CONFLICT-FREE PR (mergeable is True)
merges DIRECTLY — no update step. Branch protection does not require strict
up-to-date, and calling /update would dismiss the genuine approvals
(dismiss_stale_approvals), forcing re-review every tick (the throughput
bottleneck). This replaces the old update-before-merge behavior."""
decision = mq.evaluate_merge_readiness(
**_ready_kwargs(pr_has_current_base=False, mergeable=True)
)
assert decision.ready is True
assert decision.action == "merge"
def test_behind_main_and_not_mergeable_pr_updates():
"""The /update path is reached ONLY when the PR is NOT mergeable AND its head
lacks current main — refreshing the branch may resolve a behind-main
non-conflict; a real conflict 409s and is held (#2352)."""
decision = mq.evaluate_merge_readiness(
**_ready_kwargs(pr_has_current_base=False, mergeable=False)
)
def test_merge_decision_updates_stale_pr_before_merge():
decision = mq.evaluate_merge_readiness(**_ready_kwargs(pr_has_current_base=False))
assert decision.ready is False
assert decision.action == "update"
def test_current_base_but_not_mergeable_pr_waits():
"""Up-to-date with main yet Gitea reports not-mergeable → genuine conflict
against current main (or still computing). The queue cannot act: WAIT,
never update (update would not help) and never merge (fail-closed)."""
decision = mq.evaluate_merge_readiness(
**_ready_kwargs(pr_has_current_base=True, mergeable=False)
)
assert decision.ready is False
assert decision.action == "wait"
assert "not mergeable" in decision.reason
def test_behind_main_and_mergeable_none_waits_not_update():
"""§SOP-22 (CR2 #2374) — the churn-residual fix. A BEHIND-MAIN PR whose
mergeability Gitea is STILL COMPUTING (mergeable is None) must WAIT, NOT take
the /update path. The old code collapsed None→False, so a behind-main +
None PR returned action="update" → /pulls/{n}/update → dismiss_stale_approvals
→ the exact rebase-churn this change eliminates, fired during the compute
window. None and False are now DISTINCT: None waits, False updates."""
decision = mq.evaluate_merge_readiness(
**_ready_kwargs(pr_has_current_base=False, mergeable=None)
)
assert decision.ready is False
assert decision.action == "wait" # NOT "update" — no churn during compute
assert "computed" in decision.reason
def test_current_base_and_mergeable_none_waits():
"""Up-to-date with main + mergeable None (still computing) → WAIT (unchanged
fail-closed; just confirming None is never merged regardless of base)."""
decision = mq.evaluate_merge_readiness(
**_ready_kwargs(pr_has_current_base=True, mergeable=None)
)
assert decision.ready is False
assert decision.action == "wait"
def test_MergePermissionError_inherits_from_ApiError():
assert issubclass(mq.MergePermissionError, mq.ApiError)
@@ -565,131 +506,6 @@ def test_process_once_merges_when_mergeable_is_true(monkeypatch):
assert calls["hold_label"] is None
def test_process_once_behind_main_mergeable_none_waits_no_update(monkeypatch):
"""§SOP-22 (CR2 #2374) — end-to-end churn-residual regression. A BEHIND-MAIN
PR (commits do NOT contain main_sha) whose mergeability Gitea is STILL
COMPUTING (mergeable=None) must WAIT: process_once returns 0 and NEVER calls
update_pull (which dismisses genuine approvals via dismiss_stale_approvals)
NOR merge_pull NOR hold. The old None→False collapse routed this exact case
into the /update path → approval-dismissing rebase churn during the compute
window. This proves the durable churn elimination: no update, approvals
preserved, re-checked next tick."""
calls = {"merge_attempts": 0, "hold_label": None, "updated": False}
_fully_ready_process_once_monkeypatch(monkeypatch, mergeable=None, calls=calls)
# Make the head BEHIND main: commits do NOT contain main_sha. This is the
# case the bug missed (the prior None test had current base, masking it).
behind_head = "a" * 40
monkeypatch.setattr(mq, "get_pull_commits", lambda n: [{"sha": behind_head}])
rc = mq.process_once(dry_run=False)
assert rc == 0
assert calls["updated"] is False # NO /update → approvals NOT dismissed
assert calls["merge_attempts"] == 0 # never merge on an unknown
assert calls["hold_label"] is None # transient → not held, retried next tick
# --------------------------------------------------------------------------
# §SOP-22: DIRECT-MERGE throughput fix (#2358). A conflict-free 2-genuine PR
# merges WITHOUT a pre-merge /update call, so its approvals are NOT dismissed by
# dismiss_stale_approvals. The merge bar (2-genuine-on-current-head +
# BP-required green + mergeable + no RC + opt-out) is UNCHANGED; only the
# unnecessary update-before-merge churn is removed. The /update path survives
# for the genuine case it is needed (not-mergeable + behind-main), where a real
# conflict 409s and is held per #2352. mergeable=None stays fail-closed.
# --------------------------------------------------------------------------
def test_process_once_merges_conflict_free_pr_without_update(monkeypatch):
"""§SOP-22(a) — the core throughput fix. A conflict-free, fully-approved PR
merges WITHOUT update_pull ever being called. The old behavior called
/update first whenever the head lacked current main, which dismissed the 2
genuine approvals (dismiss_stale_approvals) and forced re-review every tick.
Assert update_pull is NOT invoked and merge_pull IS invoked."""
calls = {"merge_attempts": 0, "hold_label": None, "updated": False}
_fully_ready_process_once_monkeypatch(monkeypatch, mergeable=True, calls=calls)
# Make the head BEHIND main: commits do NOT contain main_sha. Under the old
# logic this alone forced an update_pull; under the fix it merges directly.
head_sha = "a" * 40
monkeypatch.setattr(mq, "get_pull_commits", lambda n: [{"sha": head_sha}])
rc = mq.process_once(dry_run=False)
assert rc == 0
assert calls["merge_attempts"] == 1 # merged directly
assert calls["updated"] is False # NO update_pull → approvals NOT dismissed
assert calls["hold_label"] is None
def test_process_once_behind_main_conflict_free_merges_directly(monkeypatch):
"""§SOP-22(b) — explicit behind-main + conflict-free case: it still merges
directly (branch protection does not require strict up-to-date)."""
calls = {"merge_attempts": 0, "hold_label": None, "updated": False}
_fully_ready_process_once_monkeypatch(monkeypatch, mergeable=True, calls=calls)
behind_head = "a" * 40
monkeypatch.setattr(mq, "get_pull_commits", lambda n: [{"sha": behind_head}])
rc = mq.process_once(dry_run=False)
assert rc == 0
assert calls["merge_attempts"] == 1
assert calls["updated"] is False
def test_process_once_pauses_when_main_not_green_no_direct_merge(monkeypatch):
"""§SOP-22 backstop — the serialized safety that makes direct-merge safe:
when main's required push contexts are NOT green (e.g. a prior direct merge
introduced a semantic main-break caught by post-merge main CI), the queue
PAUSES — it does NOT merge the next PR onto an unverified/red main."""
calls = {"merge_attempts": 0, "hold_label": None, "updated": False}
_fully_ready_process_once_monkeypatch(monkeypatch, mergeable=True, calls=calls)
main_sha = "b" * 40
def red_main_combined(sha):
if sha == main_sha:
return {"state": "failure",
"statuses": [{"context": "CI / all-required (push)", "status": "failure"}]}
return {"state": "success",
"statuses": [{"context": "CI / all-required (pull_request)", "status": "success"}]}
monkeypatch.setattr(mq, "get_combined_status", red_main_combined)
rc = mq.process_once(dry_run=False)
assert rc == 0
assert calls["merge_attempts"] == 0 # paused — no merge onto red main
assert calls["updated"] is False
def test_direct_merge_bar_unchanged_behind_main(monkeypatch):
"""§SOP-22(d) — the merge bar is UNCHANGED on the new direct-merge path. A
behind-main + conflict-free PR is still rejected (no merge) when ANY gate
fails: insufficient genuine approvals, red required context, open
REQUEST_CHANGES, or opt-out label. Direct-merge removes the update churn, it
does NOT weaken the bar — fail-closed on every gate."""
head_sha = "a" * 40
behind_main = dict(pr_has_current_base=False, mergeable=True)
# <2 genuine approvals → wait, not merge.
d = mq.evaluate_merge_readiness(
**_ready_kwargs(approvers={"agent-researcher"}, **behind_main)
)
assert d.action == "wait" and d.ready is False
# Red required context → wait, not merge.
red_required = {"state": "failure", "statuses": [
{"context": "CI / all-required (pull_request)", "status": "failure"}]}
d = mq.evaluate_merge_readiness(
**_ready_kwargs(pr_status=red_required, **behind_main)
)
assert d.action == "wait" and d.ready is False
# Open REQUEST_CHANGES on current head → wait, not merge.
d = mq.evaluate_merge_readiness(
**_ready_kwargs(request_changes=["agent-reviewer-cr2"], **behind_main)
)
assert d.action == "wait" and d.ready is False
# --------------------------------------------------------------------------
# Fix 3: status fetch is fail-closed (failed fetch != green)
# --------------------------------------------------------------------------
@@ -891,17 +707,13 @@ def _stale_pr_update_409_monkeypatch(monkeypatch, queued_issues, calls):
# Scan-loop process_once enumerates candidates via list_candidate_issues.
monkeypatch.setattr(mq, "list_candidate_issues", lambda *, auto_discover: queued_issues)
monkeypatch.setattr(mq, "get_pull", lambda n: {
"state": "open", "number": n, "mergeable": False,
"state": "open", "number": n, "mergeable": True,
"base": {"ref": "main", "repo_id": 1},
"head": {"sha": head_sha, "repo_id": 1},
"labels": [{"name": "merge-queue"}],
})
# NOTE: mergeable is False (real conflict) AND commits do NOT contain
# main_sha → pr_has_current_base is False → decision.action == "update".
# Under the #2358 direct-merge fix the update path is reached ONLY when the
# PR is NOT mergeable; a mergeable=True behind-main PR would merge directly,
# so this fixture sets mergeable=False to exercise the #2352 409-on-update
# hold path.
# NOTE: commits do NOT contain main_sha → pr_has_current_base is False →
# decision.action == "update".
monkeypatch.setattr(mq, "get_pull_commits", lambda n: [{"sha": head_sha}])
monkeypatch.setattr(mq, "get_pull_reviews", lambda n: [
{"state": "APPROVED", "user": {"login": "agent-researcher"},
@@ -57,12 +57,12 @@ echo "test: tier:low OR-clause splits to 3 tokens"
assert_eq "tier:low" "engineers|managers|ceo" "$(split_clause "engineers,managers,ceo")"
echo "test: tier:medium AND-expression — bash word-split on \$EXPR yields 5 tokens"
EXPR="managers AND engineers AND qa,security"
EXPR="managers AND engineers AND qa???,security???"
out=""
for _raw in $EXPR; do
out="${out}${out:+ ; }$(split_clause "$_raw")"
done
assert_eq "tier:medium" "managers ; AND ; engineers ; AND ; qa|security" "$out"
assert_eq "tier:medium" "managers ; AND ; engineers ; AND ; qa???|security???" "$out"
echo "test: tier:high single-team OR-clause"
assert_eq "tier:high" "ceo" "$(split_clause "ceo")"
@@ -34,8 +34,6 @@ jobs:
check:
name: Block forbidden paths
runs-on: ubuntu-latest
# Hard gate — detected internal-path leaks fail the workflow.
# continue-on-error removed per directive (fail-open → fail-closed).
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
@@ -1,165 +0,0 @@
name: boot-to-registration-e2e (advisory)
# cp#455 — Minimal-cell boot-to-registration e2e.
# CTO directive 14eb4f07: "build the minimal claude-code+kimi cell,
# it should now go GREEN since the fix is live."
#
# Stage 1 of 5-stage rollout. Reuses the dispatch-only EC2
# provisioning path from test_staging_full_saas.sh but reduced to
# the minimum boot-to-registration surface:
#
# 1. Provision request accepted; workspace transitions to booting/running
# 2. Controlplane receives /registry/register for that workspace_id
# 3. JSON-RPC/completion route returns successful minimal response
# 4. Teardown terminates workspace even on failure (trap)
#
# Advisory (non-blocking) per Researcher Stage 2 design — RED on
# current main is expected pre-cp#469-cluster. After cp#477 deploy
# (888efceb) + PR #2167 merge, cell should turn GREEN. THAT green
# is the cluster-proof signal.
#
# Cost controls (mandatory):
# - SPOT instances (tagged run_id/workspace_id for cost attribution)
# - Fast teardown (~3-5 min wall-clock) even on assertion failure
# - Structured per-cell results JSON (runtime/provider/model/
# billing_mode/workspace_id/register_status/completion_status/
# teardown_status/elapsed_seconds)
#
# Inputs:
# runtime : default claude-code
# billing_mode : default platform_managed (the cp#469-cluster path)
# provider : default platform (vs direct-to-provider)
# model : default moonshot/kimi-k2.6 (CTO-specified)
#
# PR target: molecule-core (this file). Companion harness extension
# (test_minimal_boot_cell.sh) lives in tests/e2e/ alongside
# test_staging_full_saas.sh — same repo, same branch.
#
# Note: cp#455 was originally spec'd to live in molecule-controlplane
# (`.gitea/workflows/` path), but molecule-core's CI is the home for
# tenant-boot e2e tests in this stage. Stage 2 may move the path.
on:
workflow_dispatch:
# Note: Gitea 1.22.6 does not support workflow_dispatch.inputs
# (feedback_gitea_workflow_dispatch_inputs_unsupported). Defaults
# are hardcoded in the job env below. Stage 2 can add matrix/
# param support once the Gitea version supports it.
# Advisory: no cron schedule, manual dispatch only. Branch protection
# doesn't require this — RED on main is expected pre-cp#469-cluster
# deploy, GREEN signals the cluster is live.
permissions:
contents: read
# No issue-write; failures surface as red runs in workflow history.
concurrency:
group: boot-to-registration-e2e
cancel-in-progress: false
jobs:
# bp-exempt: advisory e2e — non-gating, manual dispatch only (cp#455 Stage 1)
minimal-cell:
name: Minimal cell (claude-code + platform + moonshot/kimi-k2.6)
runs-on: ubuntu-latest
# Bounded at 12 min. Wall-clock budget breakdown:
# - cold EC2 provision: ~3-4 min (SPOT)
# - /registry/register wait: ~30s
# - completion call: ~10s
# - teardown: ~30-60s
# - tail headroom: ~6-7 min
timeout-minutes: 12
env:
# Hardcoded defaults — Gitea 1.22.6 does not support workflow_dispatch.inputs
# (feedback_gitea_workflow_dispatch_inputs_unsupported). Stage 2 can add
# matrix/param support once the Gitea version supports it.
E2E_RUNTIME: claude-code
E2E_BILLING_MODE: platform_managed
E2E_PROVIDER: platform
E2E_MODEL: moonshot/kimi-k2.6
E2E_RUN_ID: cp455-${{ github.run_id }}
E2E_PROVISION_TIMEOUT_SECS: '300' # 5 min — fast teardown budget
MOLECULE_CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Verify required secrets present
run: |
if [ -z "${MOLECULE_ADMIN_TOKEN:-}" ]; then
echo "::error::CP_STAGING_ADMIN_API_TOKEN secret missing — minimal-cell e2e cannot run"
echo "::error::Set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
exit 1
fi
- name: Install required tools
run: |
for cmd in jq curl python3; do
command -v "$cmd" >/dev/null 2>&1 || {
echo "::error::required tool '$cmd' not on PATH — runner image regression?"
exit 1
}
done
- name: Run minimal-cell boot-to-registration harness
# The harness script handles its own teardown via EXIT trap;
# even on assertion failure (provision timeout, register
# timeout, completion failure), the workspace is deprovisioned
# and a leak is reported. Exit code propagates from the script.
# Structured per-cell results are emitted to ${GITHUB_STEP_SUMMARY}
# so operators see pass/fail per assertion without scrolling.
run: |
bash tests/e2e/test_minimal_boot_cell.sh
- name: Emit structured per-cell results
if: always()
# Always run (even on failure) so the structured results are
# visible in the workflow summary. The script writes a JSON
# file at /tmp/cell-result.json; this step renders it as a
# job summary.
run: |
if [ -f /tmp/cell-result.json ]; then
echo "## Minimal-cell results" >> "$GITHUB_STEP_SUMMARY"
echo "" >> "$GITHUB_STEP_SUMMARY"
echo '```json' >> "$GITHUB_STEP_SUMMARY"
cat /tmp/cell-result.json >> "$GITHUB_STEP_SUMMARY"
echo "" >> "$GITHUB_STEP_SUMMARY"
echo '```' >> "$GITHUB_STEP_SUMMARY"
else
echo "## Minimal-cell results: NO_RESULT_FILE" >> "$GITHUB_STEP_SUMMARY"
echo "" >> "$GITHUB_STEP_SUMMARY"
echo "Harness did not produce /tmp/cell-result.json — likely crashed before trap fired." >> "$GITHUB_STEP_SUMMARY"
fi
- name: Failure summary
if: failure()
run: |
{
echo "## cp#455 minimal-cell FAILED"
echo ""
echo "**Run ID:** ${{ github.run_id }}"
echo "**Runtime:** ${E2E_RUNTIME}"
echo "**Billing mode:** ${E2E_BILLING_MODE}"
echo "**Provider:** ${E2E_PROVIDER}"
echo "**Model:** ${E2E_MODEL}"
echo "**Slug:** ${E2E_RUN_ID}"
echo ""
echo "### What this means"
echo ""
echo "The minimal claude-code+kimi cell did not pass all 4 assertions:"
echo "1. Provision request accepted; workspace transitions to booting/running"
echo "2. Controlplane receives /registry/register for that workspace_id"
echo "3. JSON-RPC/completion route returns successful minimal response"
echo "4. Teardown terminates workspace even on failure (trap)"
echo ""
echo "RED is expected pre-cp#469-cluster. After cp#477 deploy (888efceb) + PR #2167 merge,"
echo "this should turn GREEN. Persistent RED after both merge = cluster bug, not e2e bug."
echo ""
echo "### Next steps"
echo ""
echo "1. Check the harness output above for the assertion that failed"
echo "2. If assertion 1 fails: provision path broken — check CP admin API + EC2 quota"
echo "3. If assertion 2 fails: /registry/register path broken — check workspace-server boot"
echo "4. If assertion 3 fails: LLM proxy / completion path broken — check cp#469 cluster"
echo "5. If assertion 4 fails: teardown trap broken — leak risk, fix immediately"
} >> "$GITHUB_STEP_SUMMARY"
+6 -7
View File
@@ -418,9 +418,10 @@ jobs:
# a manual action that determinism made obsolete.
name: Canvas Deploy Status
runs-on: docker-host
# Per-step no-op (not job-level `if:`) so the job reaches SUCCESS on PRs
# instead of skipped — skipped poisons the PR combined status (internal#817).
# Job-level `if:` so ci-required-drift.py's ci_job_names() detects this as
# github.ref-gated and skips it from the required-context F1 set (mc#1982).
# Step-level exit 0 handles the "not a canvas main push" case.
if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/staging' }}
needs: [changes, canvas-build]
steps:
- name: Record canvas ordered-deploy status
@@ -532,8 +533,9 @@ jobs:
# The `needs:` list MUST stay in lockstep with ci-required-drift.py's
# F1 check (`ci_job_names()` = every job MINUS the sentinel MINUS jobs
# whose `if:` gates on github.event_name/github.ref). canvas-deploy-
# status is per-step-gated (not job-level `if:`) so it reaches SUCCESS
# on PRs and is included here — internal#817. If a new always-running
# reminder is event-gated (`if: github.ref == refs/heads/{main,staging}`)
# so it is intentionally EXCLUDED — it skips on PRs and a `needs:` on a
# skipped job would never let the sentinel run. If a new always-running
# CI job is added, add it here too or ci-required-drift F1 will flag it.
#
# Stays on the dedicated `ci-meta` lane (no docker work, so the
@@ -547,7 +549,6 @@ jobs:
- canvas-build
- shellcheck
- python-lint
- canvas-deploy-status
continue-on-error: false
runs-on: ci-meta
timeout-minutes: 5
@@ -566,7 +567,6 @@ jobs:
CANVAS_RESULT: ${{ needs.canvas-build.result }}
SHELLCHECK_RESULT: ${{ needs.shellcheck.result }}
PYTHON_LINT_RESULT: ${{ needs.python-lint.result }}
CANVAS_DEPLOY_RESULT: ${{ needs.canvas-deploy-status.result }}
run: |
set -euo pipefail
fail=0
@@ -588,7 +588,6 @@ jobs:
check "Canvas (Next.js)" "$CANVAS_RESULT"
check "Shellcheck (E2E scripts)" "$SHELLCHECK_RESULT"
check "Python Lint & Test" "$PYTHON_LINT_RESULT"
check "Canvas Deploy Status" "$CANVAS_DEPLOY_RESULT"
if [ "$fail" -ne 0 ]; then
echo "::error::all-required: one or more aggregated CI jobs did not succeed"
exit 1
+3 -3
View File
@@ -131,9 +131,9 @@ jobs:
# on the per-runtime default ("sonnet" → routes to direct
# Anthropic, defeats the cost saving). Operators can override
# via workflow_dispatch by setting a different E2E_MODEL_SLUG
# input if they need to exercise a specific model. MiniMax-M2.7 is the
# stable staging MiniMax path used by the full-SaaS smoke (#1997).
E2E_MODEL_SLUG: ${{ github.event.inputs.model_slug || 'MiniMax-M2.7' }}
# input if they need to exercise a specific model. MiniMax-M2 is the
# stable staging MiniMax path used by the full-SaaS smoke.
E2E_MODEL_SLUG: ${{ github.event.inputs.model_slug || 'MiniMax-M2' }}
# Bound to 10 min so a stuck provision fails the run instead of
# holding up the next cron firing. 15-min default in the script
# is for the on-PR full lifecycle where we have more headroom.
-14
View File
@@ -250,20 +250,6 @@ jobs:
echo "CANVAS_PORT=${CANVAS_PORT}" >> "$GITHUB_ENV"
echo "Canvas host port: ${CANVAS_PORT}"
- name: Set deterministic admin token
if: needs.detect-changes.outputs.chat == 'true'
run: |
# PR #2291 made auth fail-closed everywhere (no dev-mode escape).
# The platform server requires ADMIN_TOKEN; the canvas requires the
# matching NEXT_PUBLIC_ADMIN_TOKEN or every API call 401s.
# We set a deterministic per-run value so the ephemeral platform
# and canvas are paired correctly.
E2E_ADMIN_TOKEN="e2e-chat-admin-${{ github.run_id }}-${{ github.run_attempt }}"
echo "ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
echo "MOLECULE_ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
echo "NEXT_PUBLIC_ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
echo "Admin token configured for e2e-chat platform + canvas."
- name: Start platform (background)
if: needs.detect-changes.outputs.chat == 'true'
working-directory: workspace-server
@@ -40,7 +40,6 @@ env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
# bp-exempt: informational lint enforcing docker-host/publish pin convention (internal#512), not a merge gate
lint-docker-host-pin:
name: Lint docker-host pin on docker-touching workflows
runs-on: docker-host
+3 -3
View File
@@ -112,9 +112,9 @@ jobs:
E2E_RUNTIME: claude-code
# Pin the smoke to a specific MiniMax model rather than relying
# on the per-runtime default (which could resolve to "sonnet" →
# direct Anthropic and defeat the cost saving). MiniMax-M2.7 is the
# stable staging MiniMax path used by the full-SaaS smoke (#1997).
E2E_MODEL_SLUG: MiniMax-M2.7
# direct Anthropic and defeat the cost saving). MiniMax-M2 is the
# stable staging MiniMax path used by the full-SaaS smoke.
E2E_MODEL_SLUG: MiniMax-M2
E2E_RUN_ID: "smoke-${{ github.run_id }}"
# Debug-only: when an operator dispatches with keep_on_failure=true,
# the smoke script's E2E_KEEP_ORG=1 path skips teardown so the
+4 -7
View File
@@ -34,10 +34,8 @@ name: Sweep stale Cloudflare DNS records
# scripts/ops/test_sweep_cf_decide.py (#2027) cover the rule
# classifier.
#
# Secrets: CF_API_TOKEN (preferred CI-scoped name) or CLOUDFLARE_API_TOKEN
# (operator-host canonical name) are accepted — the workflow falls back
# automatically. Same for CF_ZONE_ID / CLOUDFLARE_ZONE_ID. Confirmed
# existing per issue #425 §425 audit. CP_ADMIN_API_TOKEN and
# Secrets: CF_API_TOKEN, CF_ZONE_ID, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
# are confirmed existing per issue #425 §425 audit. CP_ADMIN_API_TOKEN and
# CP_STAGING_ADMIN_API_TOKEN are unconfirmed — if missing, the verify step
# (schedule → hard-fail, dispatch → soft-skip) surfaces it clearly.
@@ -81,8 +79,8 @@ jobs:
# each individually capped at 10s by the script's curl -m flag.
timeout-minutes: 3
env:
CF_API_TOKEN: ${{ secrets.CF_API_TOKEN || secrets.CLOUDFLARE_API_TOKEN }}
CF_ZONE_ID: ${{ secrets.CF_ZONE_ID || secrets.CLOUDFLARE_ZONE_ID }}
CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
CF_ZONE_ID: ${{ secrets.CF_ZONE_ID }}
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
@@ -131,7 +129,6 @@ jobs:
fi
echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
echo "::error::Cloudflare secrets accept either the CI-scoped name (CF_API_TOKEN / CF_ZONE_ID) or the operator-host canonical name (CLOUDFLARE_API_TOKEN / CLOUDFLARE_ZONE_ID)."
echo "::error::a silent skip masked an active CF DNS leak (152/200 zone records) caught only by a manual audit on 2026-04-28; this gate exists to make the gap visible."
exit 1
fi
+6 -8
View File
@@ -29,12 +29,10 @@ name: Sweep stale Cloudflare Tunnels
# the DNS sweep's 50% because tenant-shaped tunnels are mostly
# orphans by design) refuses to nuke past the threshold.
#
# Secrets: CF_API_TOKEN (preferred CI-scoped name) or CLOUDFLARE_API_TOKEN
# (operator-host canonical name) are accepted — the workflow falls back
# automatically. Same for CF_ACCOUNT_ID / CLOUDFLARE_ACCOUNT_ID. Confirmed
# existing per issue #425 §425 audit. CP_ADMIN_API_TOKEN and
# CP_STAGING_ADMIN_API_TOKEN are unconfirmed — if missing, the verify step
# (schedule → hard-fail, dispatch → soft-skip) surfaces it clearly.
# Secrets: CF_API_TOKEN, CF_ACCOUNT_ID are confirmed existing per
# issue #425 §425 audit. CP_ADMIN_API_TOKEN and CP_STAGING_ADMIN_API_TOKEN
# are unconfirmed — if missing, the verify step (schedule → hard-fail,
# dispatch → soft-skip) surfaces it clearly.
on:
schedule:
@@ -76,8 +74,8 @@ jobs:
# the sweep-cf-orphans companion job).
timeout-minutes: 30
env:
CF_API_TOKEN: ${{ secrets.CF_API_TOKEN || secrets.CLOUDFLARE_API_TOKEN }}
CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID || secrets.CLOUDFLARE_ACCOUNT_ID }}
CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '90' }}
+31 -51
View File
@@ -234,44 +234,30 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
"Authorization": `Bearer ${tenantToken}`,
"X-Molecule-Org-Id": orgID,
};
// Retry workspace creation on transient 5xx / timeout — staging CP can
// return 502/503/504 under load and a single-shot failure kills the
// entire E2E run. 3 attempts with 3s exponential backoff (3s, 6s, 12s)
// gives ~21s total budget, well inside the 20-min provision envelope.
let workspaceId = "";
for (let attempt = 1; attempt <= 3; attempt++) {
const ws = await jsonFetch(`${tenantURL}/workspaces`, {
method: "POST",
headers: tenantAuth,
body: JSON.stringify({
name: "E2E Canvas Test",
runtime: "hermes",
tier: 2,
// Provider-registry SSOT (internal#718) registers ONLY Kimi models for
// the hermes runtime — `moonshot/kimi-k2.6` is the platform-managed
// entry (workspace-server/internal/providers/providers.yaml, hermes ->
// platform). The old `gpt-4o` was never a registered hermes model and
// now 422s UNREGISTERED_MODEL_FOR_RUNTIME (core#2225). This workspace
// defaults closed to platform_managed (see the boot-shape note below),
// so a platform-namespaced model id is the registry-correct choice.
model: "moonshot/kimi-k2.6",
}),
});
if (ws.status >= 200 && ws.status < 300 && ws.body?.id) {
workspaceId = ws.body.id as string;
break;
}
const isTransient = ws.status >= 500 || ws.status === 0;
if (!isTransient || attempt === 3) {
throw new Error(`Workspace create ${ws.status} (attempt ${attempt}): ${JSON.stringify(ws.body)}`);
}
const backoff = 3000 * Math.pow(2, attempt - 1);
console.log(`[staging-setup] Workspace create transient ${ws.status}, retrying in ${backoff}ms...`);
await new Promise((r) => setTimeout(r, backoff));
const ws = await jsonFetch(`${tenantURL}/workspaces`, {
method: "POST",
headers: tenantAuth,
body: JSON.stringify({
name: "E2E Canvas Test",
runtime: "hermes",
tier: 2,
// Provider-registry SSOT (internal#718) registers ONLY Kimi models for
// the hermes runtime — `moonshot/kimi-k2.6` is the platform-managed
// entry (workspace-server/internal/providers/providers.yaml, hermes ->
// platform). The old `gpt-4o` was never a registered hermes model and
// now 422s UNREGISTERED_MODEL_FOR_RUNTIME (core#2225). This workspace
// defaults closed to platform_managed (see the boot-shape note below),
// so a platform-namespaced model id is the registry-correct choice.
model: "moonshot/kimi-k2.6",
}),
});
if (ws.status >= 400 || !ws.body?.id) {
throw new Error(`Workspace create ${ws.status}: ${JSON.stringify(ws.body)}`);
}
const workspaceId = ws.body.id as string;
console.log(`[staging-setup] Workspace created: ${workspaceId}`);
// 6. Wait for workspace online
// 6. Wait for workspace RENDERABLE.
//
// This harness exists to verify the canvas *tab UI* renders (staging-
// tabs.spec.ts: open each of the 13 workspace-panel tabs, assert no hard
@@ -280,16 +266,6 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
// it needs is a workspace ROW that the canvas lists so the node renders
// and the side-panel tabs open. A fully-`online` agent is NOT required.
//
// Hermes cold-boot takes 10-13 min on slow apt days (apt + uv + hermes
// install + npm browser-tools). The controlplane bootstrap-watcher
// deadline fires at 5 min and sets status=failed prematurely; heartbeat
// then transitions failed → online after install.sh finishes. The ONLY
// failed shape we tolerate is the pre-start credential-abort
// (uptime_seconds=0, no last_sample_error) — the agent never ran. Real
// boot regressions (image pull error, panic, PYTHONPATH, etc.) still
// hard-throw immediately so triage gets detail without waiting for a
// polling timeout. See test_staging_full_saas.sh step 7/11 and issue #2632.
//
// That distinction became load-bearing on 2026-06-03: workspace-server
// #2162 (fix(provision): platform-managed workspace must fail-closed when
// CP proxy env absent) made a platform_managed workspace ABORT AT BOOT
@@ -311,10 +287,8 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
// the node + tabs render, proceed. We do NOT mask a real boot regression:
// any `failed` carrying a last_sample_error, OR a non-zero uptime (the
// agent started then crashed — image pull, panic, PYTHONPATH, etc.),
// still hard-throws immediately so triage gets boot_stage / last_error /
// image fields without waiting for a polling timeout.
// Genuine *infra* provision failure is already caught loud one step
// earlier at the org level (instance_status === "failed").
// still hard-throws. Genuine *infra* provision failure is already caught
// loud one step earlier at the org level (instance_status === "failed").
await waitFor<boolean>(
async () => {
const r = await jsonFetch(`${tenantURL}/workspaces/${workspaceId}`, {
@@ -341,7 +315,13 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
);
return true;
}
// Real boot regression — hard-throw immediately with full detail.
// last_sample_error is often empty when the failure happens before
// the agent emits a sample (e.g. boot crash, image pull error,
// missing PYTHONPATH, OpenAI quota at startup). Dumping the full
// body gives triage the boot_stage / last_error / image fields it
// needs without a second probe. Otherwise this propagates as a
// bare "Workspace failed: " — the exact useless message that
// sent #2632 to the issue tracker.
const detail = sampleErr
? sampleErr
: `(no last_sample_error) full body: ${JSON.stringify(r.body)}`;
@@ -353,7 +333,7 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
10_000,
"workspace online",
);
console.log(`[staging-setup] Workspace online`);
console.log(`[staging-setup] Workspace renderable`);
// 7. Hand state off to tests + teardown — overwrite the slug-only
// bootstrap state with the full state spec tests need.
+1 -1
View File
@@ -370,7 +370,7 @@ test.describe("staging canvas tabs", () => {
// The tablist appears once the side panel mounts. Condition-based
// wait — no fixed delay.
const tablist = page.getByRole("tablist", { name: "Workspace panel tabs" });
const tablist = page.locator('[role="tablist"]');
await expect(
tablist,
"side panel tablist never appeared after clicking the workspace node",
+2 -2
View File
@@ -172,7 +172,7 @@ export function ContextMenu() {
const nodeId = contextMenu.nodeId;
closeContextMenu();
try {
await api.post(`/workspaces/${nodeId}/pause?cascade=true`, {});
await api.post(`/workspaces/${nodeId}/pause`, {});
updateNodeData(nodeId, { status: "paused" });
} catch (e) {
showToast("Pause failed", "error");
@@ -184,7 +184,7 @@ export function ContextMenu() {
const nodeId = contextMenu.nodeId;
closeContextMenu();
try {
await api.post(`/workspaces/${nodeId}/resume?cascade=true`, {});
await api.post(`/workspaces/${nodeId}/resume`, {});
updateNodeData(nodeId, { status: "provisioning" });
} catch (e) {
showToast("Resume failed", "error");
@@ -385,7 +385,7 @@ describe("ContextMenu — item actions", () => {
render(<ContextMenu />);
fireEvent.click(screen.getByRole("menuitem", { name: /pause/i }));
await act(async () => { /* flush */ });
expect(mockPost).toHaveBeenCalledWith("/workspaces/n1/pause?cascade=true", {});
expect(mockPost).toHaveBeenCalledWith("/workspaces/n1/pause", {});
expect(mockStoreState.updateNodeData).toHaveBeenCalledWith("n1", { status: "paused" });
});
@@ -395,7 +395,7 @@ describe("ContextMenu — item actions", () => {
render(<ContextMenu />);
fireEvent.click(screen.getByRole("menuitem", { name: /resume/i }));
await act(async () => { /* flush */ });
expect(mockPost).toHaveBeenCalledWith("/workspaces/n1/resume?cascade=true", {});
expect(mockPost).toHaveBeenCalledWith("/workspaces/n1/resume", {});
});
});
+1 -1
View File
@@ -324,7 +324,7 @@ export const useCanvasStore = create<CanvasState>((set, get) => ({
batchPause: async () => {
const ids = Array.from(get().selectedNodeIds);
const results = await Promise.allSettled(
ids.map((id) => api.post(`/workspaces/${id}/pause?cascade=true`))
ids.map((id) => api.post(`/workspaces/${id}/pause`))
);
const failed: string[] = [];
results.forEach((r, i) => {
+3 -3
View File
@@ -2,7 +2,7 @@
**Status:** living document — update when you ship a feature that touches one backend.
**Owner:** workspace-server + controlplane teams.
**Last audit:** 2026-05-31 (Claude agent — drift risk #6 verified resolved; nil guards present, contract tests run without Skip).
**Last audit:** 2026-05-07 (plugin install/uninstall closed for EC2 backend via EIC SSH push to the bind-mounted `/configs/plugins/<name>/`, mirroring the Files API PR #1702 pattern).
## Why this exists
@@ -93,12 +93,12 @@ For "do we have any backend?", use `HasProvisioner()`, never bare `h.provisioner
3. **Restart divergence on runtime changes.** Docker re-reads `/configs/config.yaml` from the container before stop, so a changed `runtime:` survives a restart even if the DB isn't synced. EC2 trusts the DB only. If you change the runtime via the Config tab and the handler races the restart, Docker will land on the new runtime, EC2 will land on the old one. **Fix path:** make the Config-tab save explicitly flush to DB before kicking off a restart, not deferred.
4. **Console-output asymmetry.** Users debugging a stuck workspace on Docker see `docker logs`; on EC2 they see `GetConsoleOutput`. The two outputs look nothing alike. **Fix path:** expose a unified `GET /workspaces/:id/boot-log` that proxies to whichever backend serves the data. Already partly there via `cp_provisioner.Console`.
5. **Template script drift.** `install.sh` and `start.sh` in each template repo do the same high-level work (install hermes-agent, write .env, write config.yaml, start gateway) but must be kept byte-level consistent on the provider-key forwarding block. Easy to forget. Enforced now by `tools/check-template-parity.sh` (see below) — run it in each template repo's CI.
6. ~~**Both backends panic when underlying client is nil.**~~ **RESOLVED**nil guards landed in `Provisioner` (`Start`, `Stop`, `IsRunning`, `ExecRead`, `RemoveVolume`, `VolumeHasFile`, `WriteAuthTokenToVolume`) and `CPProvisioner` (`Stop`, `IsRunning`), all returning `ErrNoBackend`. Contract tests (`TestDockerBackend_Contract`, `TestCPProvisionerBackend_Contract`, `TestZeroValuedBackends_NoPanic`) run in CI without `t.Skip`.
6. **Both backends panic when underlying client is nil.****Resolved** (`fix/provisioner-nil-guards-1813`). `Provisioner.{Stop,IsRunning}` and `CPProvisioner.{Stop,IsRunning}` now guard against nil clients with `ErrNoBackend`, so the contract-test runner executes scenarios against zero-valued backends without panic.
## Enforcement
- **`tools/check-template-parity.sh`** (this repo) — ensures `install.sh` and `start.sh` in a template repo forward identical sets of provider keys. Wire into each template repo's CI as `bash $MONOREPO/tools/check-template-parity.sh install.sh start.sh`.
- **Contract tests**`workspace-server/internal/provisioner/backend_contract_test.go` defines the behaviors every `provisioner.Provisioner` implementation must satisfy. Fails compile when a method drifts between `Docker` and `CPProvisioner`. Scenario-level runs (`TestDockerBackend_Contract`, `TestCPProvisionerBackend_Contract`, `TestZeroValuedBackends_NoPanic`) execute in CI — drift risk #6 resolved.
- **Contract tests**`workspace-server/internal/provisioner/backend_contract_test.go` defines the behaviors every `provisioner.Provisioner` implementation must satisfy. Fails compile when a method drifts between `Docker` and `CPProvisioner`. Scenario-level runs execute against zero-valued backends since drift risk #6 was resolved (`fix/provisioner-nil-guards-1813`).
- **Source-level dispatcher pins**`workspace_provision_auto_test.go` enforces the SoT pattern documented above:
- `TestNoCallSiteCallsDirectProvisionerExceptAuto` — no handler calls `.provisionWorkspace(` or `.provisionWorkspaceCP(` directly outside the dispatcher's allowlist.
- `TestNoCallSiteCallsBareStop` — no handler calls `.provisioner.Stop(` or `.cpProv.Stop(` directly outside the dispatcher's allowlist (strips Go comments before substring match so archaeology in code comments doesn't trip the gate).
+1 -10
View File
@@ -19,10 +19,7 @@
#
# Env vars required:
# CF_API_TOKEN — Cloudflare token with zone:dns:edit
# (falls back to CLOUDFLARE_API_TOKEN if CF_API_TOKEN is unset;
# the workflow YAML maps both secret names into CF_API_TOKEN)
# CF_ZONE_ID — the zone (moleculesai.app)
# (falls back to CLOUDFLARE_ZONE_ID if CF_ZONE_ID is unset)
# CP_ADMIN_API_TOKEN — CP admin bearer for api.moleculesai.app
# CP_STAGING_ADMIN_API_TOKEN — CP admin bearer for staging-api.moleculesai.app
# AWS_* — standard AWS creds (default region us-east-2)
@@ -59,12 +56,6 @@ need() {
exit 1
fi
}
# Fallback: operator-host canonical names → CI-scoped names.
# The workflow YAML already maps both, but direct script invocation
# (e.g. local ops) may only have the canonical names set.
CF_API_TOKEN="${CF_API_TOKEN:-${CLOUDFLARE_API_TOKEN:-}}"
CF_ZONE_ID="${CF_ZONE_ID:-${CLOUDFLARE_ZONE_ID:-}}"
need CF_API_TOKEN
need CF_ZONE_ID
need CP_ADMIN_API_TOKEN
@@ -130,7 +121,7 @@ if not payload.get("success", False) or not isinstance(payload.get("result"), li
print(f"ERROR: Cloudflare DNS list failed: {detail}", file=sys.stderr)
raise SystemExit(1)
'; then
log "Cloudflare DNS list failed; verify CF_API_TOKEN (or CLOUDFLARE_API_TOKEN) has Zone:DNS:Edit and CF_ZONE_ID (or CLOUDFLARE_ZONE_ID) is the moleculesai.app zone."
log "Cloudflare DNS list failed; verify CF_API_TOKEN has Zone:DNS:Edit and CF_ZONE_ID is the moleculesai.app zone."
exit 1
fi
TOTAL_CF=$(echo "$CF_JSON" | python3 -c "import json,sys; print(len(json.load(sys.stdin)['result']))")
-9
View File
@@ -29,11 +29,8 @@
# account:cloudflare_tunnel:edit scope.
# (Same secret as sweep-cf-orphans, but the
# token must include the tunnel scope.)
# (falls back to CLOUDFLARE_API_TOKEN if CF_API_TOKEN is unset;
# the workflow YAML maps both secret names into CF_API_TOKEN)
# CF_ACCOUNT_ID — the account that owns the tunnels (visible
# in dash.cloudflare.com URL path)
# (falls back to CLOUDFLARE_ACCOUNT_ID if CF_ACCOUNT_ID is unset)
# CP_ADMIN_API_TOKEN — CP admin bearer for api.moleculesai.app
# CP_STAGING_ADMIN_API_TOKEN — CP admin bearer for staging-api.moleculesai.app
#
@@ -73,12 +70,6 @@ need() {
exit 1
fi
}
# Fallback: operator-host canonical names → CI-scoped names.
# The workflow YAML already maps both, but direct script invocation
# (e.g. local ops) may only have the canonical names set.
CF_API_TOKEN="${CF_API_TOKEN:-${CLOUDFLARE_API_TOKEN:-}}"
CF_ACCOUNT_ID="${CF_ACCOUNT_ID:-${CLOUDFLARE_ACCOUNT_ID:-}}"
need CF_API_TOKEN
need CF_ACCOUNT_ID
need CP_ADMIN_API_TOKEN
-299
View File
@@ -1,299 +0,0 @@
#!/usr/bin/env bash
# cp#455 — Minimal-cell boot-to-registration harness.
# CTO directive 14eb4f07: "build the minimal claude-code+kimi cell,
# it should now go GREEN since the fix is live."
#
# Stage 1 of 5-stage rollout. Reduced to the minimum boot-to-
# registration surface so each cell run is ~3-5 min wall-clock.
#
# Four assertions (per Researcher Task #79 spec):
# 1. Provision request accepted; workspace transitions to booting/running
# 2. Controlplane receives /registry/register for that workspace_id
# 3. JSON-RPC/completion route returns successful minimal response
# 4. Teardown terminates workspace even on failure (trap)
#
# Cost controls (mandatory):
# - SPOT instances (via the dispatch-only EC2 provisioning path;
# we don't set instance type — that's the provisioner's call)
# - Fast teardown ~3-5 min wall-clock
# - Structured per-cell results JSON output
#
# Auth model (mirrors test_staging_full_saas.sh):
# Single MOLECULE_ADMIN_TOKEN drives everything.
# - POST /cp/admin/orgs to provision
# - GET /cp/admin/orgs/:slug/admin-token for per-tenant token
# - DELETE /cp/admin/tenants/:slug for teardown
# Per-tenant admin token drives tenant API calls (workspaces,
# /registry/register, JSON-RPC completion).
#
# Required env:
# MOLECULE_CP_URL default: https://staging-api.moleculesai.app
# MOLECULE_ADMIN_TOKEN CP admin bearer
#
# Optional env (passed from workflow_dispatch inputs):
# E2E_RUNTIME default claude-code
# E2E_BILLING_MODE default platform_managed
# E2E_PROVIDER default platform
# E2E_MODEL default moonshot/kimi-k2.6
# E2E_RUN_ID Slug suffix; CI: cp455-${GITHUB_RUN_ID}
# E2E_PROVISION_TIMEOUT_SECS default 300 (5 min — fast teardown budget)
# E2E_KEEP_ORG 1 → skip teardown (debugging only)
#
# Exit codes:
# 0 happy path
# 1 generic failure
# 2 missing required env
# 3 provisioning timed out (assertion 1)
# 4 register timeout (assertion 2)
# 5 completion failure (assertion 3)
# 6 teardown left orphan (assertion 4)
set -uo pipefail
CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"
ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}"
RUNTIME="${E2E_RUNTIME:-claude-code}"
BILLING_MODE="${E2E_BILLING_MODE:-platform_managed}"
PROVIDER="${E2E_PROVIDER:-platform}"
MODEL="${E2E_MODEL:-moonshot/kimi-k2.6}"
PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-300}"
KEEP_ORG="${E2E_KEEP_ORG:-}"
RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}"
SLUG="cp455-${RUNTIME}-${RUN_ID_SUFFIX}"
WORKSPACE_ID=""
TENANT_TOKEN=""
RESULT_JSON="/tmp/cell-result.json"
PROVISION_START_EPOCH=""
PROVISION_END_EPOCH=""
REGISTER_STATUS="not_attempted"
COMPLETION_STATUS="not_attempted"
TEARDOWN_STATUS="not_attempted"
EXIT_CODE=0
# Structured per-cell results writer. Emits JSON with all 4
# assertion statuses + elapsed timing. Called from EXIT trap so
# results are captured even on early failure.
write_result() {
local elapsed="${1:-0}"
cat > "${RESULT_JSON}" <<JSON
{
"runtime": "${RUNTIME}",
"billing_mode": "${BILLING_MODE}",
"provider": "${PROVIDER}",
"model": "${MODEL}",
"workspace_id": "${WORKSPACE_ID}",
"register_status": "${REGISTER_STATUS}",
"completion_status": "${COMPLETION_STATUS}",
"teardown_status": "${TEARDOWN_STATUS}",
"elapsed_seconds": ${elapsed},
"exit_code": ${EXIT_CODE},
"ts": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
JSON
}
# EXIT trap — ALWAYS run. Writes structured results, tears down
# workspace if we have one, never lets the script exit without
# emitting /tmp/cell-result.json.
on_exit() {
local exit_code=$?
EXIT_CODE=${exit_code}
local now
now=$(date +%s)
local elapsed=0
if [ -n "${PROVISION_START_EPOCH:-}" ] && [ "${PROVISION_START_EPOCH}" -gt 0 ] 2>/dev/null; then
elapsed=$(( now - PROVISION_START_EPOCH ))
fi
# Assertion 4: teardown terminates workspace even on failure.
if [ -z "${KEEP_ORG}" ] && [ -n "${SLUG:-}" ]; then
if [ -n "${WORKSPACE_ID:-}" ] || [ -n "${SLUG:-}" ]; then
echo "::group::Teardown (trap)"
echo "DELETE ${CP_URL}/cp/admin/tenants/${SLUG}"
local teardown_http_code
teardown_http_code=$(curl -sS -o /dev/null -w '%{http_code}' \
-X DELETE \
-H "Authorization: Bearer ${ADMIN_TOKEN}" \
--max-time 60 \
"${CP_URL}/cp/admin/tenants/${SLUG}" || echo "000")
if [ "${teardown_http_code}" = "200" ] || [ "${teardown_http_code}" = "204" ] || [ "${teardown_http_code}" = "404" ]; then
TEARDOWN_STATUS="ok"
echo "Teardown OK (HTTP ${teardown_http_code})"
else
TEARDOWN_STATUS="leak_risk_http_${teardown_http_code}"
echo "::error::Teardown returned HTTP ${teardown_http_code} — orphan risk"
# Bump exit code to 6 if teardown is the failure source.
if [ "${EXIT_CODE}" -eq 0 ]; then
EXIT_CODE=6
fi
fi
echo "::endgroup::"
fi
else
TEARDOWN_STATUS="skipped_keep_org"
fi
write_result "${elapsed}"
echo "Structured results written to ${RESULT_JSON}"
cat "${RESULT_JSON}"
exit "${EXIT_CODE}"
}
trap on_exit EXIT
trap 'echo "::error::Script aborted on signal"; exit 130' INT TERM
PROVISION_START_EPOCH=$(date +%s)
# Assertion 1: Provision request accepted; workspace transitions to
# booting/running.
echo "::group::Assertion 1: Provision"
echo "POST ${CP_URL}/cp/admin/orgs slug=${SLUG} runtime=${RUNTIME} billing_mode=${BILLING_MODE} provider=${PROVIDER} model=${MODEL}"
PROVISION_HTTP_CODE=$(curl -sS -o /tmp/provision-resp.json -w '%{http_code}' \
-X POST \
-H "Authorization: Bearer ${ADMIN_TOKEN}" \
-H "Content-Type: application/json" \
--max-time 30 \
-d "$(cat <<JSON
{
"slug": "${SLUG}",
"runtime": "${RUNTIME}",
"billing_mode": "${BILLING_MODE}",
"provider": "${PROVIDER}",
"model": "${MODEL}",
"tier": "spot",
"tags": {
"cp455_minimal_cell": "1",
"run_id": "${RUN_ID_SUFFIX}"
}
}
JSON
)" \
"${CP_URL}/cp/admin/orgs" || echo "000")
echo "HTTP ${PROVISION_HTTP_CODE}"
if [ "${PROVISION_HTTP_CODE}" != "202" ] && [ "${PROVISION_HTTP_CODE}" != "200" ]; then
echo "::error::Provision failed (HTTP ${PROVISION_HTTP_CODE})"
cat /tmp/provision-resp.json 2>/dev/null || true
EXIT_CODE=1
exit "${EXIT_CODE}"
fi
echo "::endgroup::"
# Wait for org to reach running + retrieve per-tenant token. Bounded
# at PROVISION_TIMEOUT_SECS. We poll the admin token endpoint; once
# the org is up, the endpoint returns 200 with the token, and the
# workspace_id is in the same response or in a follow-up /orgs/:slug
# call.
echo "::group::Wait for org to be ready (max ${PROVISION_TIMEOUT_SECS}s)"
WAIT_START=$(date +%s)
WAIT_DEADLINE=$(( WAIT_START + PROVISION_TIMEOUT_SECS ))
TENANT_TOKEN=""
while [ "$(date +%s)" -lt "${WAIT_DEADLINE}" ]; do
TOKEN_HTTP_CODE=$(curl -sS -o /tmp/token-resp.json -w '%{http_code}' \
-H "Authorization: Bearer ${ADMIN_TOKEN}" \
--max-time 10 \
"${CP_URL}/cp/admin/orgs/${SLUG}/admin-token" || echo "000")
if [ "${TOKEN_HTTP_CODE}" = "200" ]; then
TENANT_TOKEN=$(jq -r '.admin_token // .token // empty' /tmp/token-resp.json 2>/dev/null || echo "")
if [ -n "${TENANT_TOKEN}" ]; then
WORKSPACE_ID=$(jq -r '.workspace_id // .default_workspace_id // empty' /tmp/token-resp.json 2>/dev/null || echo "")
if [ -z "${WORKSPACE_ID}" ]; then
# Fallback: list orgs and find by slug
WORKSPACE_ID=$(curl -sS -H "Authorization: Bearer ${ADMIN_TOKEN}" \
"${CP_URL}/cp/admin/orgs/${SLUG}" | jq -r '.workspace_id // .default_workspace_id // empty' 2>/dev/null || echo "")
fi
if [ -n "${WORKSPACE_ID}" ]; then
PROVISION_END_EPOCH=$(date +%s)
echo "Org ready in $(( PROVISION_END_EPOCH - WAIT_START ))s — workspace_id=${WORKSPACE_ID}"
break
fi
fi
fi
sleep 5
done
if [ -z "${TENANT_TOKEN}" ] || [ -z "${WORKSPACE_ID}" ]; then
echo "::error::Provision timed out (org never reached running within ${PROVISION_TIMEOUT_SECS}s)"
EXIT_CODE=3
exit "${EXIT_CODE}"
fi
echo "::endgroup::"
# Assertion 2: Controlplane receives /registry/register for that
# workspace_id. The harness doesn't POST to /registry/register
# directly — that's the workspace-server's own job on boot. We
# verify the registration was received by polling the registry
# endpoint (or by checking that a /workspaces/:id call returns
# the expected fields).
echo "::group::Assertion 2: /registry/register for workspace_id=${WORKSPACE_ID}"
REGISTER_DEADLINE=$(( $(date +%s) + 60 ))
while [ "$(date +%s)" -lt "${REGISTER_DEADLINE}" ]; do
REG_HTTP_CODE=$(curl -sS -o /tmp/reg-resp.json -w '%{http_code}' \
-H "Authorization: Bearer ${TENANT_TOKEN}" \
--max-time 10 \
"${CP_URL}/cp/registry/workspaces/${WORKSPACE_ID}" || echo "000")
if [ "${REG_HTTP_CODE}" = "200" ]; then
REGISTERED=$(jq -r '.registered // .workspace_id // empty' /tmp/reg-resp.json 2>/dev/null || echo "")
if [ -n "${REGISTERED}" ]; then
REGISTER_STATUS="ok"
echo "Registry confirms workspace_id=${WORKSPACE_ID} registered"
break
fi
fi
sleep 3
done
if [ "${REGISTER_STATUS}" != "ok" ]; then
echo "::error::Registry did not confirm registration within 60s"
cat /tmp/reg-resp.json 2>/dev/null || true
EXIT_CODE=4
exit "${EXIT_CODE}"
fi
echo "::endgroup::"
# Assertion 3: JSON-RPC/completion route returns successful minimal
# response. One minimal completion call — keep payload small.
echo "::group::Assertion 3: JSON-RPC completion"
COMPLETION_HTTP_CODE=$(curl -sS -o /tmp/completion-resp.json -w '%{http_code}' \
-X POST \
-H "Authorization: Bearer ${TENANT_TOKEN}" \
-H "Content-Type: application/json" \
--max-time 30 \
-d "$(cat <<JSON
{
"jsonrpc": "2.0",
"id": 1,
"method": "completion",
"params": {
"workspace_id": "${WORKSPACE_ID}",
"model": "${MODEL}",
"messages": [{"role": "user", "content": "ping"}],
"max_tokens": 1
}
}
JSON
)" \
"${CP_URL}/cp/rpc" || echo "000")
echo "HTTP ${COMPLETION_HTTP_CODE}"
if [ "${COMPLETION_HTTP_CODE}" != "200" ]; then
echo "::error::Completion failed (HTTP ${COMPLETION_HTTP_CODE})"
cat /tmp/completion-resp.json 2>/dev/null || true
EXIT_CODE=5
exit "${EXIT_CODE}"
fi
# Verify JSON-RPC 2.0 success envelope
RPC_ERROR=$(jq -r '.error // empty' /tmp/completion-resp.json 2>/dev/null || echo "")
if [ -n "${RPC_ERROR}" ]; then
echo "::error::Completion returned JSON-RPC error: ${RPC_ERROR}"
cat /tmp/completion-resp.json 2>/dev/null || true
EXIT_CODE=5
exit "${EXIT_CODE}"
fi
RPC_RESULT=$(jq -r '.result // empty' /tmp/completion-resp.json 2>/dev/null || echo "")
if [ -z "${RPC_RESULT}" ] || [ "${RPC_RESULT}" = "null" ]; then
echo "::error::Completion response missing result field"
cat /tmp/completion-resp.json 2>/dev/null || true
EXIT_CODE=5
exit "${EXIT_CODE}"
fi
COMPLETION_STATUS="ok"
echo "Completion OK"
echo "::endgroup::"
echo "All 4 assertions passed for ${SLUG} (workspace_id=${WORKSPACE_ID})"
+11 -10
View File
@@ -53,9 +53,7 @@
# PV_RUNTIMES space list; default "hermes openclaw claude-code"
# E2E_PROVISION_TIMEOUT_SECS default 1800 (hermes/openclaw cold EC2 budget)
# E2E_MINIMAX_API_KEY / E2E_ANTHROPIC_API_KEY / E2E_OPENAI_API_KEY
# DEPRECATED for this script — platform-managed models
# use the CP LLM proxy; direct vendor keys are blocked
# by PR #2291. Kept in workflow env for other E2Es.
# LLM provider key injected so the runtime can boot
# PV_TOKEN_DIAGNOSTIC_ONLY
# 1 -> stop after create/token acquisition. Useful
# to classify Hermes-only vs shared auth-route issues.
@@ -224,14 +222,17 @@ else
fi
# ─── 4. Provision the parent + one sibling per runtime under test ──────
# Platform-managed models: Molecule owns billing via the CP LLM proxy, so
# the workspace needs NO tenant key. PR #2291 blocks direct vendor key writes
# (ANTHROPIC_API_KEY, ANTHROPIC_AUTH_TOKEN, MINIMAX_API_KEY, etc.) for
# platform-managed workspaces. We intentionally keep SECRETS_JSON empty so a
# stray E2E_*_API_KEY in the runner env cannot silently convert this into a
# BYOK run and mask the platform-managed path (mirrors
# test_staging_full_saas.sh's E2E_LLM_PATH=platform branch).
# Inject the LLM provider key so each runtime can authenticate at boot.
# Priority: MiniMax → direct-Anthropic → OpenAI (mirrors
# test_staging_full_saas.sh's secrets-injection chain).
SECRETS_JSON='{}'
if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
SECRETS_JSON=$(python3 -c "import json,os;k=os.environ['E2E_MINIMAX_API_KEY'];print(json.dumps({'ANTHROPIC_BASE_URL':'https://api.minimax.io/anthropic','ANTHROPIC_AUTH_TOKEN':k,'MINIMAX_API_KEY':k}))")
elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
SECRETS_JSON=$(python3 -c "import json,os;k=os.environ['E2E_ANTHROPIC_API_KEY'];print(json.dumps({'ANTHROPIC_API_KEY':k}))")
elif [ -n "${E2E_OPENAI_API_KEY:-}" ]; then
SECRETS_JSON=$(python3 -c "import json,os;k=os.environ['E2E_OPENAI_API_KEY'];print(json.dumps({'OPENAI_API_KEY':k,'OPENAI_BASE_URL':'https://api.openai.com/v1','MODEL_PROVIDER':'openai:gpt-4o','HERMES_INFERENCE_PROVIDER':'custom','HERMES_CUSTOM_BASE_URL':'https://api.openai.com/v1','HERMES_CUSTOM_API_KEY':k,'HERMES_CUSTOM_API_MODE':'chat_completions'}))")
fi
# Workspace-create now enforces the MODEL_REQUIRED contract: there is NO
# platform-side default model for a runtime (feedback_workspace_model_required_
-48
View File
@@ -584,54 +584,6 @@ def test_find_open_issue_raises_on_transient_error(drift_module, monkeypatch):
drift_module.find_open_issue("[ci-drift] foo")
# --------------------------------------------------------------------------
# Pagination: search beyond page 1 so an existing issue on any page is found
# --------------------------------------------------------------------------
def test_find_open_issue_paginates_to_page_2(drift_module, monkeypatch):
"""Issue exists on page 2 → paginate and find it."""
target = {"number": 99, "title": "[ci-drift] foo"}
filler = [{"number": i, "title": f"other-{i}"} for i in range(1, 51)]
class PaginatedStub:
def __init__(self):
self.calls = []
def __call__(self, method, path, *, body=None, query=None, expect_json=True):
self.calls.append((method, path, body, query))
page = int((query or {}).get("page", "1"))
if page == 1:
return 200, filler
if page == 2:
return 200, [target]
return 200, []
stub = PaginatedStub()
monkeypatch.setattr(drift_module, "api", stub)
assert drift_module.find_open_issue("[ci-drift] foo") == target
assert len(stub.calls) == 2
def test_find_open_issue_stops_at_last_page(drift_module, monkeypatch):
"""No match across pages → stop when a page has <50 results."""
filler = [{"number": i, "title": f"other-{i}"} for i in range(1, 51)]
class PaginatedStub:
def __init__(self):
self.calls = []
def __call__(self, method, path, *, body=None, query=None, expect_json=True):
self.calls.append((method, path, body, query))
page = int((query or {}).get("page", "1"))
if page == 1:
return 200, filler
return 200, []
stub = PaginatedStub()
monkeypatch.setattr(drift_module, "api", stub)
assert drift_module.find_open_issue("[ci-drift] foo") is None
assert len(stub.calls) == 2
# --------------------------------------------------------------------------
# Idempotent path: existing issue is PATCHed, NOT duplicated
# --------------------------------------------------------------------------
+1 -10
View File
@@ -351,17 +351,8 @@ func main() {
// (true, err) on any transient error, so a CP blip never flips a healthy
// workspace.
if cpProv != nil {
// Guard against double-reprovision thrash (internal#544): the restart
// debounce window must cover the reconciler interval so a workspace
// flipped offline by one reconcile tick isn't immediately reprovisioned
// again by the next tick before the debounce drops it. If the interval
// ever shrinks below the debounce window, the coupling silently breaks.
reconcileInterval := 60 * time.Second
if handlers.RestartDebounceWindow < reconcileInterval {
log.Fatalf("RestartDebounceWindow (%s) must be >= CP instance reconciler interval (%s) to prevent double-reprovision thrash (internal#544)", handlers.RestartDebounceWindow, reconcileInterval)
}
go supervised.RunWithRecover(ctx, "cp-instance-reconciler", func(c context.Context) {
registry.StartCPInstanceReconciler(c, cpProv, onWorkspaceOffline, reconcileInterval)
registry.StartCPInstanceReconciler(c, cpProv, onWorkspaceOffline, 60*time.Second)
})
}
@@ -271,11 +271,6 @@ func (m *Manager) Reload(ctx context.Context) {
ch.Config["_channel_id"] = ch.ID
go func(a ChannelAdapter, c ChannelRow, pCtx context.Context) {
defer func() {
if r := recover(); r != nil {
log.Printf("PANIC recovered in channel polling goroutine: %v", r)
}
}()
if err := a.StartPolling(pCtx, c.Config, m.onInboundMessage); err != nil {
log.Printf("Channels: polling error for %s/%s: %v", c.ChannelType, truncID(c.ID), err)
}
@@ -359,11 +354,6 @@ func (m *Manager) HandleInbound(ctx context.Context, ch ChannelRow, msg *Inbound
typingCtx, typingCancel := context.WithCancel(fireCtx)
defer typingCancel()
go func() {
defer func() {
if r := recover(); r != nil {
log.Printf("PANIC recovered in typing indicator goroutine: %v", r)
}
}()
typer.SendTyping(ch.Config, msg.ChatID)
ticker := time.NewTicker(4 * time.Second)
defer ticker.Stop()
@@ -63,6 +63,31 @@ func TestSessionSearchReturnsActivityAndMemory(t *testing.T) {
}
}
func TestSessionSearch_DBError(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewActivityHandler(broadcaster)
mock.ExpectQuery("WITH session_items AS").
WillReturnError(context.DeadlineExceeded)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Request = httptest.NewRequest("GET", "/workspaces/ws-123/session-search?q=test", bytes.NewBufferString(""))
c.Request.Header.Set("Content-Type", "application/json")
c.Params = gin.Params{{Key: "id", Value: "ws-123"}}
handler.SessionSearch(c)
if w.Code != http.StatusInternalServerError {
t.Errorf("expected 500 on DB error, got %d", w.Code)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// ---------- Activity List source filter ----------
func TestActivityList_SourceCanvas(t *testing.T) {
@@ -142,7 +142,7 @@ func ghcrAuthHeader() string {
log.Printf("workspace-images: failed to marshal GHCR auth: %v", err)
return ""
}
return base64.StdEncoding.EncodeToString(js)
return base64.URLEncoding.EncodeToString(js)
}
// Refresh pulls the requested runtimes' template images from GHCR and (if
@@ -47,9 +47,9 @@ func TestGHCRAuthHeader_EncodesDockerEnginePayload(t *testing.T) {
if got == "" {
t.Fatal("expected non-empty auth header")
}
raw, err := base64.StdEncoding.DecodeString(got)
raw, err := base64.URLEncoding.DecodeString(got)
if err != nil {
t.Fatalf("auth header is not valid base64: %v", err)
t.Fatalf("auth header is not valid base64-url: %v", err)
}
var payload map[string]string
if err := json.Unmarshal(raw, &payload); err != nil {
@@ -80,9 +80,9 @@ func TestGHCRAuthHeader_RespectsRegistryEnv(t *testing.T) {
if got == "" {
t.Fatal("expected non-empty auth header")
}
raw, err := base64.StdEncoding.DecodeString(got)
raw, err := base64.URLEncoding.DecodeString(got)
if err != nil {
t.Fatalf("auth header is not valid base64: %v", err)
t.Fatalf("auth header is not valid base64-url: %v", err)
}
var payload map[string]string
if err := json.Unmarshal(raw, &payload); err != nil {
@@ -220,7 +220,7 @@ func TestGHCRAuthHeader_TrimsWhitespace(t *testing.T) {
t.Setenv("GHCR_USER", " alice ")
t.Setenv("GHCR_TOKEN", "\tfake-tok-value\n")
got := ghcrAuthHeader()
raw, _ := base64.StdEncoding.DecodeString(got)
raw, _ := base64.URLEncoding.DecodeString(got)
var payload map[string]string
_ = json.Unmarshal(raw, &payload)
if payload["username"] != "alice" {
@@ -173,8 +173,20 @@ func (h *DelegationHandler) Delegate(c *gin.Context) {
// check_task_status returned status='queued' forever even after a
// real reply landed). messageId mirrors delegation_id so the
// platform's idempotency-key extraction also keys off the same id.
// Build A2A payload via helper so contract tests can assert the envelope shape.
a2aBody, marshalErr := buildDelegateA2ABody(delegationID, body.Task)
a2aBody, marshalErr := json.Marshal(map[string]interface{}{
"method": "message/send",
"params": map[string]interface{}{
"message": map[string]interface{}{
"role": "user",
"messageId": delegationID,
// A2A v0.3 Part discriminator is `kind`, NOT `type` (#2251) —
// a `type`-keyed Part is dropped by the receiver's v0.3
// validator, silently losing the delegated task.
"parts": []map[string]interface{}{{"kind": "text", "text": body.Task}},
"metadata": map[string]interface{}{"delegation_id": delegationID},
},
},
})
if marshalErr != nil {
log.Printf("Delegation %s: json.Marshal a2aBody failed: %v", delegationID, marshalErr)
}
@@ -362,27 +374,6 @@ func insertDelegationRow(ctx context.Context, c *gin.Context, sourceID string, b
return insertTrackingUnavailable
}
// buildDelegateA2ABody constructs the A2A JSON-RPC envelope for a delegation.
// The returned shape is a schema-valid SendMessageRequest with role="user",
// messageId, parts, and delegation metadata. Extracted to a pure function so
// unit tests can assert the envelope contract without standing up HTTP or DB.
func buildDelegateA2ABody(delegationID, task string) ([]byte, error) {
return json.Marshal(map[string]interface{}{
"method": "message/send",
"params": map[string]interface{}{
"message": map[string]interface{}{
"role": "user",
"messageId": delegationID,
// A2A v0.3 Part discriminator is `kind`, NOT `type` (#2251) —
// a `type`-keyed Part is dropped by the receiver's v0.3
// validator, silently losing the delegated task.
"parts": []map[string]interface{}{{"kind": "text", "text": task}},
"metadata": map[string]interface{}{"delegation_id": delegationID},
},
},
})
}
// executeDelegation runs in a goroutine — sends A2A and stores the result.
// Updates delegation status through: pending → dispatched → received → completed/failed
// delegationRetryDelay is the pause between the first failed proxy attempt
@@ -602,6 +602,33 @@ func TestDelegationRecord_RejectsInvalidUUID(t *testing.T) {
}
}
func TestDelegationRecord_DBInsertFails(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
wh := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
h := NewDelegationHandler(wh, broadcaster)
mock.ExpectExec("INSERT INTO activity_logs").
WillReturnError(fmt.Errorf("connection refused"))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "550e8400-e29b-41d4-a716-446655440000"}}
body := `{"target_id":"550e8400-e29b-41d4-a716-446655440001","task":"hello","delegation_id":"del-xyz"}`
c.Request = httptest.NewRequest("POST", "/delegations/record", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
h.Record(c)
if w.Code != http.StatusInternalServerError {
t.Errorf("expected 500 on DB insert failure, got %d", w.Code)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet expectations: %v", err)
}
}
func TestDelegationUpdateStatus_CompletedInsertsResultRow(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
@@ -1762,74 +1789,3 @@ func TestListDelegations_LedgerFailedIncludesErrorDetail(t *testing.T) {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// ---------- buildDelegateA2ABody: schema-valid SendMessageRequest ----------
// TestBuildDelegateA2ABody_SchemaValidSendMessageRequest pins the contract
// requested by issue #2251: delegate_task must produce a schema-valid A2A
// SendMessageRequest with role="user", messageId, parts, and metadata.
func TestBuildDelegateA2ABody_SchemaValidSendMessageRequest(t *testing.T) {
delegationID := "del-2251-test"
task := "write a contract test"
body, err := buildDelegateA2ABody(delegationID, task)
if err != nil {
t.Fatalf("buildDelegateA2ABody failed: %v", err)
}
var envelope map[string]interface{}
if err := json.Unmarshal(body, &envelope); err != nil {
t.Fatalf("body is not valid JSON: %v", err)
}
// Top-level envelope shape
if envelope["method"] != "message/send" {
t.Errorf("method = %v, want message/send", envelope["method"])
}
params, ok := envelope["params"].(map[string]interface{})
if !ok {
t.Fatalf("params missing or not a map: %T", envelope["params"])
}
msg, ok := params["message"].(map[string]interface{})
if !ok {
t.Fatalf("message missing or not a map: %T", params["message"])
}
// Issue #2251: role is required
if msg["role"] != "user" {
t.Errorf("message.role = %v, want \"user\"", msg["role"])
}
// messageId must be present and match delegationID
if msg["messageId"] != delegationID {
t.Errorf("message.messageId = %v, want %s", msg["messageId"], delegationID)
}
// parts must be a non-empty list with a text part
parts, ok := msg["parts"].([]interface{})
if !ok || len(parts) == 0 {
t.Fatalf("message.parts missing or empty: %T", msg["parts"])
}
firstPart, ok := parts[0].(map[string]interface{})
if !ok {
t.Fatalf("first part is not a map: %T", parts[0])
}
// A2A v0.3 Part discriminator is `kind`, NOT `type` (#2251)
if firstPart["kind"] != "text" {
t.Errorf("first part kind = %v, want text", firstPart["kind"])
}
if firstPart["text"] != task {
t.Errorf("first part text = %v, want %q", firstPart["text"], task)
}
// metadata.delegation_id must match
meta, ok := msg["metadata"].(map[string]interface{})
if !ok {
t.Fatalf("metadata missing or not a map: %T", msg["metadata"])
}
if meta["delegation_id"] != delegationID {
t.Errorf("metadata.delegation_id = %v, want %s", meta["delegation_id"], delegationID)
}
}
+9 -53
View File
@@ -54,55 +54,6 @@ func mcpPost(t *testing.T, h *MCPHandler, workspaceID string, body interface{})
return w
}
// assertA2ASendMessageSchema validates that body is a schema-valid A2A
// SendMessageRequest with role="user", messageId, and non-empty parts.
// Issue #2251 contract test: delegate_task must always produce this shape.
func assertA2ASendMessageSchema(t *testing.T, body []byte, wantTask string) {
t.Helper()
var envelope map[string]interface{}
if err := json.Unmarshal(body, &envelope); err != nil {
t.Fatalf("A2A body is not valid JSON: %v", err)
}
if envelope["jsonrpc"] != "2.0" {
t.Errorf("jsonrpc = %v, want 2.0", envelope["jsonrpc"])
}
if envelope["method"] != "message/send" {
t.Errorf("method = %v, want message/send", envelope["method"])
}
params, ok := envelope["params"].(map[string]interface{})
if !ok {
t.Fatalf("params missing or not a map: %T", envelope["params"])
}
msg, ok := params["message"].(map[string]interface{})
if !ok {
t.Fatalf("message missing or not a map: %T", params["message"])
}
if msg["role"] != "user" {
t.Errorf("message.role = %v, want \"user\"", msg["role"])
}
if msg["messageId"] == "" {
t.Error("message.messageId is empty")
}
parts, ok := msg["parts"].([]interface{})
if !ok || len(parts) == 0 {
t.Fatalf("message.parts missing or empty: %T", msg["parts"])
}
firstPart, ok := parts[0].(map[string]interface{})
if !ok {
t.Fatalf("first part is not a map: %T", parts[0])
}
// A2A v0.3 Part discriminator is `kind`, NOT `type` (#2251)
if firstPart["kind"] != "text" {
t.Errorf("first part kind = %v, want text", firstPart["kind"])
}
if firstPart["text"] != wantTask {
t.Errorf("first part text = %v, want %q", firstPart["text"], wantTask)
}
}
func expectCanCommunicateSiblings(mock sqlmock.Sqlmock, callerID, targetID, parentID string) {
mock.ExpectQuery(`SELECT id, parent_id FROM workspaces WHERE id = \$1`).
WithArgs(callerID).
@@ -258,7 +209,9 @@ func TestMCPHandler_DelegateTask_RoutesThroughPlatformA2AProxy(t *testing.T) {
if !logActivity {
t.Fatal("delegate_task should log through platform A2A proxy")
}
assertA2ASendMessageSchema(t, body, "do work")
if !strings.Contains(string(body), "do work") {
t.Fatalf("A2A body missing task text: %s", string(body))
}
return 200, []byte(`{"result":{"message":{"parts":[{"text":"done"}]}}}`), nil
}
@@ -299,7 +252,9 @@ func TestMCPHandler_DelegateTaskAsync_RoutesThroughPlatformA2AProxy(t *testing.T
if workspaceID != targetID || proxyCallerID != callerID {
t.Fatalf("unexpected proxy route target=%q caller=%q", workspaceID, proxyCallerID)
}
assertA2ASendMessageSchema(t, body, "async work")
if !strings.Contains(string(body), "async work") {
t.Fatalf("A2A body missing task text: %s", string(body))
}
called <- struct{}{}
return 200, []byte(`{"result":{"message":{"parts":[{"text":"accepted"}]}}}`), nil
}
@@ -349,8 +304,10 @@ func TestMCPHandler_DelegateTask_WithAttachments(t *testing.T) {
if workspaceID != targetID || proxyCallerID != callerID {
t.Fatalf("unexpected proxy route target=%q caller=%q", workspaceID, proxyCallerID)
}
assertA2ASendMessageSchema(t, body, "review this video")
bodyStr := string(body)
if !strings.Contains(bodyStr, `"text":"review this video"`) {
t.Fatalf("A2A body missing task text: %s", bodyStr)
}
if !strings.Contains(bodyStr, `"kind":"video"`) {
t.Fatalf("A2A body missing video attachment kind: %s", bodyStr)
}
@@ -429,7 +386,6 @@ func TestMCPHandler_DelegateTaskAsync_WithAttachments(t *testing.T) {
waitGlobalAsyncForTest()
select {
case body := <-called:
assertA2ASendMessageSchema(t, body, "async work with image")
bodyStr := string(body)
if !strings.Contains(bodyStr, `"kind":"image"`) {
t.Fatalf("A2A body missing image attachment kind: %s", bodyStr)
@@ -177,7 +177,7 @@ func isEnvIdentPart(c byte) bool {
return isEnvIdentStart(c) || (c >= '0' && c <= '9')
}
// loadWorkspaceEnv reads the org root .env and the workspace-specific .env files.
// loadWorkspaceEnv reads the org root .env and the workspace-specific .env
// (workspace overrides org root). Used by both secret injection and channel
// config expansion.
//
@@ -161,7 +161,7 @@ func (h *PluginsHandler) uninstallViaDocker(ctx context.Context, c *gin.Context,
// 1. Strip plugin's rule/fragment markers from CLAUDE.md (mirrors
// AgentskillsAdaptor.uninstall lines 184-188). Best-effort: if
// the user edited CLAUDE.md, our marker stays untouched.
h.stripPluginMarkersFromMemory(ctx, workspaceID, containerName, pluginName)
h.stripPluginMarkersFromMemory(ctx, containerName, pluginName)
// 2. Remove copied skill dirs declared in the plugin's plugin.yaml.
for _, skill := range skillNames {
@@ -171,11 +171,9 @@ func (h *PluginsHandler) uninstallViaDocker(ctx context.Context, c *gin.Context,
log.Printf("Plugin uninstall: skipping invalid skill name %q in %s: %v", skill, pluginName, err)
continue
}
if _, rmErr := h.execAsRoot(ctx, containerName, []string{
_, _ = h.execAsRoot(ctx, containerName, []string{
"rm", "-rf", "/configs/skills/" + skill,
}); rmErr != nil {
log.Printf("Plugin uninstall: failed to remove skill %s from %s: %v", skill, workspaceID, rmErr)
}
})
}
// 3. Delete the plugin directory itself (as root to handle file ownership).
@@ -393,7 +393,7 @@ func (h *PluginsHandler) readPluginSkillsFromContainer(ctx context.Context, cont
// `# Plugin: <name> /` — mirrors AgentskillsAdaptor.uninstall's stripping
// logic so install/uninstall are symmetric. Best-effort: silent on read or
// write failure, since the rest of uninstall must still succeed.
func (h *PluginsHandler) stripPluginMarkersFromMemory(ctx context.Context, workspaceID, containerName, pluginName string) {
func (h *PluginsHandler) stripPluginMarkersFromMemory(ctx context.Context, containerName, pluginName string) {
// Use sed via bash -c for atomic in-place delete: drop the marker line
// and the blank line that follows it (install adds a leading blank line
// before the marker via append_to_memory). Three sed passes mirror the
@@ -417,9 +417,7 @@ func (h *PluginsHandler) stripPluginMarkersFromMemory(ctx context.Context, works
`awk 'BEGIN{skip=0; blanks=0} /^%s/{skip=1; blanks=0; next} skip==1 && /^[[:space:]]*$/{blanks++; if(blanks>=2){skip=0; print; next} next} /^# Plugin: /{if(skip==1)skip=0} skip==1{next} {print}' /configs/CLAUDE.md > /tmp/claude.new && mv /tmp/claude.new /configs/CLAUDE.md`,
regexpEscapeForAwk(marker),
)
if _, awkErr := h.execAsRoot(ctx, containerName, []string{"bash", "-c", script}); awkErr != nil {
log.Printf("Plugin uninstall: failed to strip markers from CLAUDE.md for %s in %s: %v", pluginName, workspaceID, awkErr)
}
_, _ = h.execAsRoot(ctx, containerName, []string{"bash", "-c", script})
}
// regexpEscapeForAwk escapes characters that have special meaning inside an
@@ -14,7 +14,6 @@ import (
"net/http"
"os"
"path/filepath"
"runtime/debug"
"strings"
"sync"
"time"
@@ -114,11 +113,6 @@ func (h *WorkspaceHandler) goAsync(fn func()) {
h.asyncWG.Add(1)
go func() {
defer h.asyncWG.Done()
defer func() {
if r := recover(); r != nil {
log.Printf("PANIC recovered in goAsync goroutine: %v\n%s", r, debug.Stack())
}
}()
fn()
}()
}
@@ -157,11 +151,6 @@ func globalGoAsync(fn func()) {
globalAsync.Add(1)
go func() {
defer globalAsync.Done()
defer func() {
if r := recover(); r != nil {
log.Printf("PANIC recovered in globalGoAsync goroutine: %v\n%s", r, debug.Stack())
}
}()
fn()
}()
}
@@ -45,7 +45,7 @@ type restartState struct {
// flipped running=true. Used by the self-fire debounce (internal#544,
// the ws-server self-fire restart feedback loop seen in prod-Reviewer/
// Researcher 2026-05-19 ~00:05Z 4x reprov thrash): any RestartByID
// arriving within RestartDebounceWindow of this timestamp is silently
// arriving within restartDebounceWindow of this timestamp is silently
// dropped so a probe firing during the EC2-pending window can't
// re-trigger a fresh full cycle on the just-launched instance.
restartStartedAt time.Time
@@ -55,19 +55,13 @@ type restartState struct {
// its own entry so unrelated workspaces don't serialize on each other.
var restartStates sync.Map // map[workspaceID]*restartState
// RestartDebounceWindow is the silent-drop window for successive RestartByID
// restartDebounceWindow is the silent-drop window for successive RestartByID
// calls. Sized to cover the typical EC2 pending → online interval (20-30s)
// with a margin so a probe firing during the just-after-online but still-
// flaky heartbeat window also gets dropped. Bigger than that would block
// legitimate "Restart failed, retry" recoveries; smaller would let the
// 4x thrash class through. Package-level so tests can shrink it.
//
// COUPLING: this window MUST be >= the CP instance reconciler interval
// (cmd/server/main.go:355, currently 60s). If the interval ever shrinks
// below this window, a workspace flipped offline by one reconcile tick can
// be reprovisioned again by the next tick before the debounce drops it,
// reopening the double-reprovision thrash class (internal#544).
var RestartDebounceWindow = 60 * time.Second
var restartDebounceWindow = 60 * time.Second
// restartByIDDropCounter is incremented every time RestartByID drops a call
// inside the debounce window. Exposed as a package-level atomic counter so
@@ -542,7 +536,7 @@ func (h *WorkspaceHandler) RestartByID(workspaceID string) {
return
}
// Self-fire debounce: drop (not coalesce) successive RestartByID calls
// within RestartDebounceWindow of the most recent cycle's start. This
// within restartDebounceWindow of the most recent cycle's start. This
// is the load-bearing protection against the 4x reprov thrash class —
// coalesceRestart's pending-flag would otherwise drain by running
// ANOTHER full cycle of stop+provision on the just-launched EC2 (still
@@ -556,14 +550,14 @@ func (h *WorkspaceHandler) RestartByID(workspaceID string) {
if shouldDebounceRestart(workspaceID) {
restartByIDDropCounter.Add(1)
log.Printf("RestartByID: %s — dropped (within %s self-fire debounce window; total dropped=%d)",
workspaceID, RestartDebounceWindow, restartByIDDropCounter.Load())
workspaceID, restartDebounceWindow, restartByIDDropCounter.Load())
return
}
coalesceRestart(workspaceID, func() { h.runRestartCycle(workspaceID) })
}
// shouldDebounceRestart reports whether the most recent cycle for this
// workspace started within RestartDebounceWindow. Read-only on
// workspace started within restartDebounceWindow. Read-only on
// restartState; the actual restartStartedAt stamp is written in
// coalesceRestart when running flips false→true.
func shouldDebounceRestart(workspaceID string) bool {
@@ -577,7 +571,7 @@ func shouldDebounceRestart(workspaceID string) bool {
if state.restartStartedAt.IsZero() {
return false
}
return time.Since(state.restartStartedAt) < RestartDebounceWindow
return time.Since(state.restartStartedAt) < restartDebounceWindow
}
// coalesceRestart implements the pending-flag gate around an arbitrary cycle
@@ -600,7 +594,7 @@ func coalesceRestart(workspaceID string, cycle func()) {
}
state.running = true
// Stamp the start time so the RestartByID debounce can drop any
// self-fire probe that hits within RestartDebounceWindow. Only the
// self-fire probe that hits within restartDebounceWindow. Only the
// false→true edge stamps; the drain-loop's inner cycles re-use the
// same start (they're effectively one "restart event" from the
// debounce's POV).
@@ -203,9 +203,9 @@ func TestRestartByID_DebounceExpiresAfterWindow(t *testing.T) {
const wsID = "self-fire-ws-debounce-release"
resetSelfFireState(wsID)
orig := RestartDebounceWindow
RestartDebounceWindow = 5 * time.Millisecond
defer func() { RestartDebounceWindow = orig }()
orig := restartDebounceWindow
restartDebounceWindow = 5 * time.Millisecond
defer func() { restartDebounceWindow = orig }()
// Stamp inside the window.
sv, _ := restartStates.LoadOrStore(wsID, &restartState{})
@@ -199,11 +199,6 @@ func (s *Scheduler) Start(ctx context.Context) {
// entry/exit — those are kept as redundant signals but this pulse is the
// one that guarantees liveness freshness regardless of tick state.
go func() {
defer func() {
if r := recover(); r != nil {
log.Printf("PANIC recovered in scheduler heartbeat goroutine: %v", r)
}
}()
pulseTicker := time.NewTicker(10 * time.Second)
defer pulseTicker.Stop()
for {
@@ -643,11 +638,6 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
summary := s.extractResponseSummary(respBody)
if summary != "" {
go func(wsID, text string) {
defer func() {
if r := recover(); r != nil {
log.Printf("PANIC recovered in broadcast summary goroutine: %v", r)
}
}()
postCtx, postCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer postCancel()
s.channels.BroadcastToWorkspaceChannels(postCtx, wsID, text)