Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| bb52d43dc1 | |||
| d287eb56a6 |
+1
-1
@@ -51,7 +51,7 @@ MOLECULE_ENV=development # Environment label (development/
|
||||
# MOLECULE_IN_DOCKER= # Set when running the platform inside Docker (accepts 1/0, true/false). Triggers A2A proxy to rewrite 127.0.0.1:<port> agent URLs to Docker bridge hostnames. Auto-detected via /.dockerenv; only set if detection fails or to force off.
|
||||
|
||||
# GitHub
|
||||
# GITHUB_REPO=owner/repo # Target repo for agent initial_prompt clone (e.g. Molecule-AI/molecule-core). Read inside workspace containers.
|
||||
# GITHUB_REPO=owner/repo # Target repo for agent initial_prompt clone (e.g. Molecule-AI/molecule-monorepo). Read inside workspace containers.
|
||||
# GITHUB_TOKEN= # Personal access token / installation token used by agents that clone private repos. Register as a global secret via POST /admin/secrets for propagation to workspace env. Token is used in-URL during clone and then scrubbed from .git/config via `git remote set-url`.
|
||||
|
||||
# Webhooks
|
||||
|
||||
@@ -18,24 +18,15 @@
|
||||
# per §SOP-6 security model). No-op when merged=false.
|
||||
#
|
||||
# Required env (set by the workflow):
|
||||
# GITEA_TOKEN, GITEA_HOST, REPO, PR_NUMBER
|
||||
# plus one of REQUIRED_CHECKS_JSON (preferred) or REQUIRED_CHECKS (legacy)
|
||||
# GITEA_TOKEN, GITEA_HOST, REPO, PR_NUMBER, REQUIRED_CHECKS
|
||||
#
|
||||
# REQUIRED_CHECKS_JSON is a JSON object keyed by branch name. Each value
|
||||
# is an array of status-check context names that branch protection
|
||||
# requires for that branch. The script looks up the PR's base branch and
|
||||
# evaluates only the checks declared for that branch.
|
||||
#
|
||||
# {"main": ["CI / all-required (pull_request)", ...],
|
||||
# "staging": ["CI / all-required (pull_request)", ...]}
|
||||
#
|
||||
# REQUIRED_CHECKS (legacy) is a newline-separated list used when the
|
||||
# JSON variable is not set. Declared in the workflow YAML rather than
|
||||
# fetched from /branch_protections (which needs admin scope — sop-tier-bot
|
||||
# has read-only). Trade dynamism for simplicity: when the required-check
|
||||
# set changes, update both branch protection AND this env. Keeping them
|
||||
# in sync is less complexity than granting the audit bot admin perms on
|
||||
# every repo.
|
||||
# REQUIRED_CHECKS is a newline-separated list of status-check context
|
||||
# names that branch protection requires. Declared in the workflow YAML
|
||||
# rather than fetched from /branch_protections (which needs admin
|
||||
# scope — sop-tier-bot has read-only). Trade dynamism for simplicity:
|
||||
# when the required-check set changes, update both branch protection
|
||||
# AND this env. Keeping them in sync is less complexity than granting
|
||||
# the audit bot admin perms on every repo.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
@@ -43,10 +34,7 @@ set -euo pipefail
|
||||
: "${GITEA_HOST:?required}"
|
||||
: "${REPO:?required}"
|
||||
: "${PR_NUMBER:?required}"
|
||||
if [ -z "${REQUIRED_CHECKS_JSON:-}" ] && [ -z "${REQUIRED_CHECKS:-}" ]; then
|
||||
echo "::error::Either REQUIRED_CHECKS_JSON or REQUIRED_CHECKS must be set"
|
||||
exit 1
|
||||
fi
|
||||
: "${REQUIRED_CHECKS:?required (newline-separated context names)}"
|
||||
|
||||
OWNER="${REPO%%/*}"
|
||||
NAME="${REPO##*/}"
|
||||
@@ -77,14 +65,10 @@ if [ -z "$MERGE_SHA" ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# 2. Required status checks — branch-aware JSON dict takes precedence.
|
||||
if [ -n "${REQUIRED_CHECKS_JSON:-}" ]; then
|
||||
REQUIRED=$(echo "$REQUIRED_CHECKS_JSON" | jq -r --arg branch "$BASE_BRANCH" '.[$branch] // [] | .[]')
|
||||
else
|
||||
REQUIRED="$REQUIRED_CHECKS"
|
||||
fi
|
||||
# 2. Required status checks declared in the workflow env.
|
||||
REQUIRED="$REQUIRED_CHECKS"
|
||||
if [ -z "${REQUIRED//[[:space:]]/}" ]; then
|
||||
echo "::notice::REQUIRED_CHECKS empty for branch '$BASE_BRANCH' — force-merge not applicable."
|
||||
echo "::notice::REQUIRED_CHECKS empty — force-merge not applicable."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
||||
@@ -296,15 +296,7 @@ fi
|
||||
# 403 → token owner is not in this team (Gitea 1.22.6 'Must be a team
|
||||
# member' constraint — see follow-up issue for token-provisioning)
|
||||
# 404 → not a member
|
||||
# Track whether every candidate returned 403 (token owner not in team).
|
||||
# When this happens the root cause is a token-provisioning issue, not a
|
||||
# reviewer-eligibility issue — surface it clearly so ops don't waste time
|
||||
# verifying team roster (Bug C / RFC#324 follow-up).
|
||||
_ALL_CANDIDATES_403="yes"
|
||||
_CANDIDATE_COUNT=0
|
||||
|
||||
for U in $CANDIDATES; do
|
||||
_CANDIDATE_COUNT=$((_CANDIDATE_COUNT + 1))
|
||||
CODE=$(curl -sS -o "$TEAM_PROBE_TMP" -w '%{http_code}' \
|
||||
-K "$CURL_AUTH_FILE" "${API}/teams/${TEAM_ID}/members/${U}")
|
||||
debug "probe ${U} in team ${TEAM} (id=${TEAM_ID}) → HTTP ${CODE}"
|
||||
@@ -325,20 +317,14 @@ for U in $CANDIDATES; do
|
||||
continue
|
||||
;;
|
||||
404)
|
||||
_ALL_CANDIDATES_403="no"
|
||||
debug "${U} not a member of ${TEAM}"
|
||||
;;
|
||||
*)
|
||||
_ALL_CANDIDATES_403="no"
|
||||
echo "::warning::team-probe for ${U} in ${TEAM} returned unexpected HTTP ${CODE}"
|
||||
cat "$TEAM_PROBE_TMP" >&2
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [ "$_ALL_CANDIDATES_403" = "yes" ] && [ "$_CANDIDATE_COUNT" -gt 0 ]; then
|
||||
echo "::error::${TEAM}-review FAILED — every candidate returned 403 (token owner is not a member of the ${TEAM} team). This is a TOKEN PROVISIONING issue, not a reviewer-eligibility issue. Add the token owner to the '${TEAM}' Gitea team (id=${TEAM_ID}) or use a token whose owner is already in that team."
|
||||
else
|
||||
echo "::error::${TEAM}-review awaiting non-author APPROVE from ${TEAM} team (candidates: $(echo "$CANDIDATES" | tr '\n' ',' | sed 's/,$//') — none are in team)"
|
||||
fi
|
||||
echo "::error::${TEAM}-review awaiting non-author APPROVE from ${TEAM} team (candidates: $(echo "$CANDIDATES" | tr '\n' ',' | sed 's/,$//') — none are in team)"
|
||||
exit 1
|
||||
|
||||
@@ -13,26 +13,20 @@ set -euo pipefail
|
||||
OWNER="${REPO%%/*}"
|
||||
NAME="${REPO##*/}"
|
||||
API="https://${GITEA_HOST}/api/v1"
|
||||
# Branch-protection requires the (pull_request_target) context variant.
|
||||
# The refire path must post the EXACT BP-required name so the gate flips.
|
||||
CONTEXT="${TEAM}-review / approved (pull_request_target)"
|
||||
CONTEXT="${TEAM}-review / approved (pull_request)"
|
||||
TARGET_URL="https://${GITEA_HOST}/${OWNER}/${NAME}/pulls/${PR_NUMBER}"
|
||||
|
||||
authfile=$(mktemp)
|
||||
post_authfile=$(mktemp)
|
||||
prfile=$(mktemp)
|
||||
postfile=$(mktemp)
|
||||
# shellcheck disable=SC2329 # invoked by EXIT trap
|
||||
cleanup() {
|
||||
rm -f "$authfile" "$post_authfile" "$prfile" "$postfile"
|
||||
rm -f "$authfile" "$prfile" "$postfile"
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
chmod 600 "$authfile" "$post_authfile"
|
||||
chmod 600 "$authfile"
|
||||
printf 'header = "Authorization: token %s"\n' "$GITEA_TOKEN" > "$authfile"
|
||||
# STATUS_POST_TOKEN is narrow-scoped write:repository for explicit status POST.
|
||||
# Falls back to GITEA_TOKEN for backward compatibility (e.g. local test).
|
||||
printf 'header = "Authorization: token %s"\n' "${STATUS_POST_TOKEN:-$GITEA_TOKEN}" > "$post_authfile"
|
||||
|
||||
code=$(curl -sS -o "$prfile" -w '%{http_code}' -K "$authfile" \
|
||||
"${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}")
|
||||
@@ -74,7 +68,7 @@ body=$(jq -nc \
|
||||
'{state:$state, context:$context, description:$description, target_url:$target_url}')
|
||||
|
||||
code=$(curl -sS -o "$postfile" -w '%{http_code}' -X POST \
|
||||
-K "$post_authfile" -H "Content-Type: application/json" \
|
||||
-K "$authfile" -H "Content-Type: application/json" \
|
||||
-d "$body" \
|
||||
"${API}/repos/${OWNER}/${NAME}/statuses/${head_sha}")
|
||||
if [ "$code" != "200" ] && [ "$code" != "201" ]; then
|
||||
|
||||
@@ -6,8 +6,8 @@
|
||||
# RFC#351 Step 2 of 6 (implementation MVP).
|
||||
#
|
||||
# Invoked by .gitea/workflows/sop-checklist.yml on:
|
||||
# - pull_request_target: [opened, edited, synchronize, reopened, labeled, unlabeled]
|
||||
# - issue_comment: [created] # edited/deleted omitted (Gitea 1.22.6 job-parsing quirk)
|
||||
# - pull_request_target: [opened, edited, synchronize, reopened]
|
||||
# - issue_comment: [created, edited, deleted]
|
||||
#
|
||||
# Flow:
|
||||
# 1. Load .gitea/sop-checklist-config.yaml (from BASE ref — trusted).
|
||||
@@ -639,7 +639,9 @@ def load_config(path: str) -> dict[str, Any]:
|
||||
# yaml is an optional dep; the canonical loader is used when available,
|
||||
# but the SOP runs on runners that may not have PyYAML installed. The
|
||||
# fallback _load_config_minimal covers the same config shape without
|
||||
import yaml # type: ignore[import-not-found] # optional dep; fall back silently if absent
|
||||
# requiring the dep, so the ignore is safe: if yaml loads, we use it;
|
||||
# otherwise we fall back silently.
|
||||
import yaml # type: ignore[import-not-found]
|
||||
with open(path, encoding="utf-8") as f:
|
||||
return yaml.safe_load(f)
|
||||
except ImportError:
|
||||
@@ -895,47 +897,6 @@ def resolve_required_teams(item: dict[str, Any], high_risk: bool) -> list[str]:
|
||||
return list(item.get("required_teams") or [])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CI status validation for testing-class AI acks (internal#760 CTO hardening)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Slugs that require CI / all-required green before an AI ack is valid.
|
||||
_TESTING_CLASS_SLUGS = {"comprehensive-testing", "local-postgres-e2e", "staging-smoke"}
|
||||
|
||||
# Human-only carve-out: these items can NEVER be acked by AI, regardless
|
||||
# of config drift. Any item in this set MUST NOT have ai_ack_eligible.
|
||||
# migration / schema are future-proofing — not yet in config items, but
|
||||
# the code guard rejects them proactively (CTO hardening, msg 1388c76f).
|
||||
_HUMAN_ONLY_SLUGS = {"root-cause", "no-backwards-compat", "migration", "schema"}
|
||||
|
||||
|
||||
def get_ci_status(client: GiteaClient, owner: str, repo: str, sha: str) -> str:
|
||||
"""Return the state of CI / all-required (pull_request) for `sha`.
|
||||
|
||||
Looks through the commit statuses and returns the state string
|
||||
("success", "failure", "pending", "error") or "missing" if the
|
||||
context is not found. This prevents an AI agent from attesting
|
||||
"tests pass" independently of the actual CI run.
|
||||
"""
|
||||
code, data = client._req( # noqa: SLF001
|
||||
"GET", f"/repos/{owner}/{repo}/statuses/{sha}"
|
||||
)
|
||||
if code != 200:
|
||||
return "unknown"
|
||||
if not data or not isinstance(data, list):
|
||||
return "missing"
|
||||
# Gitea returns statuses newest-first. Find the latest for our context.
|
||||
for status in data:
|
||||
if status.get("context") == "CI / all-required (pull_request)":
|
||||
return status.get("state", "unknown")
|
||||
return "missing"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument("--owner", required=True)
|
||||
@@ -1029,9 +990,6 @@ def main(argv: list[str] | None = None) -> int:
|
||||
# one membership lookup per team.
|
||||
team_member_cache: dict[tuple[str, int], bool | None] = {}
|
||||
|
||||
# Pre-resolve the ai-sop-ack team id once (None if the team does not exist).
|
||||
ai_sop_ack_team_id = client.resolve_team_id(args.owner, "ai-sop-ack")
|
||||
|
||||
def probe(slug: str, users: list[str]) -> list[str]:
|
||||
# `slug` may be either an items-key (compute_ack_state caller) OR
|
||||
# an n/a-gate key (compute_na_state caller). Previously this hard
|
||||
@@ -1075,7 +1033,7 @@ def main(argv: list[str] | None = None) -> int:
|
||||
for t in data:
|
||||
if t.get("name") == tn:
|
||||
tid = t.get("id")
|
||||
client._team_id_cache[(args.owner, tn)] = tid # noqa: SLF001 # write-through cache; intentional side-effect for reuse across calls
|
||||
client._team_id_cache[(args.owner, tn)] = tid # noqa: SLF001 # internal write-through cache
|
||||
break
|
||||
if tid is not None:
|
||||
team_ids.append(tid)
|
||||
@@ -1086,18 +1044,14 @@ def main(argv: list[str] | None = None) -> int:
|
||||
file=sys.stderr,
|
||||
)
|
||||
approved: list[str] = []
|
||||
rejected_ai_ineligible: list[str] = []
|
||||
rejected_ci_not_green: list[str] = []
|
||||
for u in users:
|
||||
# 1) Human required_teams membership check
|
||||
in_human_team = False
|
||||
for tid in team_ids:
|
||||
cache_key = (u, tid)
|
||||
if cache_key not in team_member_cache:
|
||||
team_member_cache[cache_key] = client.is_team_member(tid, u)
|
||||
result = team_member_cache[cache_key]
|
||||
if result is True:
|
||||
in_human_team = True
|
||||
approved.append(u)
|
||||
break
|
||||
if result is None:
|
||||
print(
|
||||
@@ -1107,44 +1061,6 @@ def main(argv: list[str] | None = None) -> int:
|
||||
)
|
||||
# Treat as not-in-team for this user/team pair; loop
|
||||
# may still find membership in another team.
|
||||
if in_human_team:
|
||||
approved.append(u)
|
||||
continue
|
||||
|
||||
# 2) AI-sop-ack team membership check (only for items that allow it).
|
||||
if slug in items_by_slug:
|
||||
item = items_by_slug[slug]
|
||||
# Defensive: human-only carve-out is enforced in code, not just
|
||||
# config. Even if ai_ack_eligible were mistakenly added to a
|
||||
# migration/schema item, the AI path is rejected here.
|
||||
if slug in _HUMAN_ONLY_SLUGS:
|
||||
rejected_ai_ineligible.append(u)
|
||||
continue
|
||||
if item.get("ai_ack_eligible") and ai_sop_ack_team_id is not None:
|
||||
cache_key = (u, ai_sop_ack_team_id)
|
||||
if cache_key not in team_member_cache:
|
||||
team_member_cache[cache_key] = client.is_team_member(
|
||||
ai_sop_ack_team_id, u
|
||||
)
|
||||
result = team_member_cache[cache_key]
|
||||
if result is True:
|
||||
# 2a) Testing-class items require real CI artifact evidence.
|
||||
if slug in _TESTING_CLASS_SLUGS:
|
||||
ci_state = get_ci_status(
|
||||
client, args.owner, args.repo, head_sha
|
||||
)
|
||||
if ci_state != "success":
|
||||
print(
|
||||
f"::warning::AI ack for {slug} rejected: "
|
||||
f"CI / all-required is {ci_state}, not success",
|
||||
file=sys.stderr,
|
||||
)
|
||||
rejected_ci_not_green.append(u)
|
||||
continue
|
||||
approved.append(u)
|
||||
continue
|
||||
# If we get here, user is not approved for this slug.
|
||||
rejected_ai_ineligible.append(u)
|
||||
return approved
|
||||
|
||||
ack_state = compute_ack_state(
|
||||
|
||||
@@ -21,7 +21,6 @@ Scenarios:
|
||||
T16_comments_generic_approval — reviews empty; comments have "APPROVED" by team member → exit 0
|
||||
T17_comments_no_approval — reviews empty; comments have no approval keywords → exit 1
|
||||
T18_review_wrong_team_comment_right_team — review candidate 404s, comment candidate passes
|
||||
T19_ai_sop_ack_approved — ai-sop-ack member APPROVED review → team probe 404 → exit 1
|
||||
|
||||
Usage:
|
||||
FIXTURE_STATE_DIR=/tmp/x python3 _review_check_fixture.py 8080
|
||||
@@ -117,12 +116,6 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
{"state": "CHANGES_REQUESTED", "dismissed": False, "user": {"login": "bob"}, "commit_id": "abc1234"},
|
||||
{"state": "APPROVED", "dismissed": False, "user": {"login": "core-devops"}, "commit_id": "abc1234"},
|
||||
])
|
||||
if sc == "T19_ai_sop_ack_approved":
|
||||
# ai-sop-ack member submitted APPROVED review — must NOT count
|
||||
# toward qa-review (team_id=20) or security-review (team_id=21).
|
||||
return self._json(200, [
|
||||
{"state": "APPROVED", "dismissed": False, "user": {"login": "ai-reviewer"}, "commit_id": "abc1234"},
|
||||
])
|
||||
# Default: one non-author APPROVED
|
||||
return self._json(200, [
|
||||
{"state": "APPROVED", "dismissed": False, "user": {"login": "core-devops"}, "commit_id": "abc1234"},
|
||||
@@ -164,9 +157,6 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
return self._empty(403)
|
||||
if sc == "T18_review_wrong_team_comment_right_team" and login == "core-devops":
|
||||
return self._empty(404)
|
||||
if sc == "T19_ai_sop_ack_approved" and login == "ai-reviewer":
|
||||
# ai-sop-ack member is NOT in qa (20) or security (21).
|
||||
return self._empty(404)
|
||||
# T7_team_member: member
|
||||
return self._empty(204)
|
||||
|
||||
|
||||
@@ -11,100 +11,21 @@ def load_workflow(name: str) -> dict:
|
||||
return yaml.safe_load(f)
|
||||
|
||||
|
||||
def _all_required(workflow: dict) -> dict:
|
||||
return workflow["jobs"]["all-required"]
|
||||
|
||||
|
||||
def test_all_required_uses_dedicated_meta_runner_lane():
|
||||
workflow = load_workflow("ci.yml")
|
||||
all_required = _all_required(workflow)
|
||||
all_required = workflow["jobs"]["all-required"]
|
||||
|
||||
# Stays on the dedicated `ci-meta` lane (the sentinel does no docker
|
||||
# work, so it must NOT occupy the general docker-host pool).
|
||||
assert all_required["runs-on"] == "ci-meta"
|
||||
assert "needs" not in all_required
|
||||
|
||||
|
||||
def test_all_required_is_needs_aggregator_not_a_polling_gate():
|
||||
"""fix/ci-scheduler-fanout (2026-06-01): the sentinel was converted
|
||||
from a status-polling loop (which squatted a ci-meta executor slot for
|
||||
up to 40 min per PR) into a plain `needs:` aggregator that frees the
|
||||
slot immediately. Pin the new shape so a regression to the poller is
|
||||
caught.
|
||||
"""
|
||||
def test_all_required_reuses_path_filter_before_polling():
|
||||
workflow = load_workflow("ci.yml")
|
||||
all_required = _all_required(workflow)
|
||||
all_required = workflow["jobs"]["all-required"]
|
||||
rendered = str(all_required)
|
||||
|
||||
# The job MUST aggregate via `needs:` (the slot-freeing design).
|
||||
assert "needs" in all_required, "all-required must be a needs: aggregator"
|
||||
|
||||
# It MUST NOT reintroduce the polling loop / per-SHA status fetch that
|
||||
# was the throughput sink.
|
||||
assert "detect-changes.py" not in rendered, (
|
||||
"all-required must not run the detect-changes poller path"
|
||||
)
|
||||
assert "commits/" not in rendered and "statuses" not in rendered, (
|
||||
"all-required must not poll commit statuses (the slot-squat path)"
|
||||
)
|
||||
|
||||
|
||||
def test_all_required_does_not_use_if_always():
|
||||
"""Plain `needs:` works on Gitea 1.22.6 / act_runner v0.6.1; `needs:` +
|
||||
`if: always()` is BROKEN (feedback_gitea_needs_works_only_ifalways_broken)
|
||||
and would let a non-success need pass the gate. The sentinel must use
|
||||
plain `needs:` WITHOUT a job-level `if: always()`.
|
||||
"""
|
||||
workflow = load_workflow("ci.yml")
|
||||
all_required = _all_required(workflow)
|
||||
|
||||
job_if = all_required.get("if")
|
||||
assert not (isinstance(job_if, str) and "always()" in job_if), (
|
||||
"all-required must not combine needs: with if: always()"
|
||||
)
|
||||
|
||||
|
||||
def test_all_required_needs_matches_ci_required_drift_f1_set():
|
||||
"""The sentinel `needs:` list MUST equal ci-required-drift.py's
|
||||
`ci_job_names()` set: every job MINUS the sentinel itself MINUS jobs
|
||||
whose `if:` gates on github.event_name/github.ref (event-gated jobs
|
||||
skip on PRs and a `needs:` on a skipped job would never let the
|
||||
sentinel run). If they diverge, ci-required-drift F1 fires.
|
||||
"""
|
||||
workflow = load_workflow("ci.yml")
|
||||
jobs = workflow["jobs"]
|
||||
sentinel = "all-required"
|
||||
|
||||
expected = set()
|
||||
for key, body in jobs.items():
|
||||
if key == sentinel:
|
||||
continue
|
||||
gate = body.get("if") if isinstance(body, dict) else None
|
||||
if isinstance(gate, str) and (
|
||||
"github.event_name" in gate or "github.ref" in gate
|
||||
):
|
||||
# event-gated → legitimately skips on some triggers; excluded
|
||||
# from both `needs:` and the F1 set.
|
||||
continue
|
||||
expected.add(key)
|
||||
|
||||
needs = jobs[sentinel].get("needs", [])
|
||||
if isinstance(needs, str):
|
||||
needs = [needs]
|
||||
actual = set(needs)
|
||||
|
||||
assert actual == expected, (
|
||||
f"all-required needs: {sorted(actual)} != ci_job_names() "
|
||||
f"{sorted(expected)} — ci-required-drift F1 would fire"
|
||||
)
|
||||
|
||||
|
||||
def test_all_required_needs_reference_real_jobs():
|
||||
"""F1b guard: every entry in `needs:` must name an existing job."""
|
||||
workflow = load_workflow("ci.yml")
|
||||
jobs = workflow["jobs"]
|
||||
needs = jobs["all-required"].get("needs", [])
|
||||
if isinstance(needs, str):
|
||||
needs = [needs]
|
||||
job_keys = set(jobs)
|
||||
for dep in needs:
|
||||
assert dep in job_keys, f"all-required needs unknown job {dep!r}"
|
||||
assert "--profile ci" in rendered
|
||||
assert ".gitea/scripts/detect-changes.py" in rendered
|
||||
assert "REQUIRE_PLATFORM" in rendered
|
||||
assert "REQUIRE_CANVAS" in rendered
|
||||
assert "REQUIRE_SCRIPTS" in rendered
|
||||
|
||||
@@ -1,168 +0,0 @@
|
||||
"""Regression test #765 — gate auto-fire on real qa/security APPROVED review.
|
||||
|
||||
Validates the structural configuration of qa-review.yml and security-review.yml
|
||||
so that a real team-member APPROVED review fires the workflow and POSTs the
|
||||
exact branch-protection-required context name. This is the test #2020's
|
||||
stale-context failure would have caught.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[2]
|
||||
|
||||
|
||||
def load_workflow(name: str) -> dict:
|
||||
with (ROOT / "workflows" / name).open() as f:
|
||||
return yaml.safe_load(f)
|
||||
|
||||
|
||||
def _job_guard_string(workflow: dict) -> str:
|
||||
"""Return the raw job-level `if:` string for the single job."""
|
||||
jobs = workflow["jobs"]
|
||||
# Both qa-review and security-review have exactly one job named "approved".
|
||||
job = jobs["approved"]
|
||||
return str(job.get("if", ""))
|
||||
|
||||
|
||||
def _post_step(workflow: dict) -> dict:
|
||||
"""Return the explicit POST /statuses step from the job steps list."""
|
||||
jobs = workflow["jobs"]
|
||||
steps = jobs["approved"]["steps"]
|
||||
for step in steps:
|
||||
name = step.get("name", "")
|
||||
if "Post required status context" in name:
|
||||
return step
|
||||
raise AssertionError("No explicit POST status step found")
|
||||
|
||||
|
||||
class TestQaReviewDirectTrigger:
|
||||
def test_trigger_is_pull_request_review_submitted(self):
|
||||
wf = load_workflow("qa-review.yml")
|
||||
# PyYAML parses bare 'on' as boolean True.
|
||||
on = wf[True]
|
||||
assert "pull_request_review" in on, (
|
||||
"qa-review must trigger on pull_request_review"
|
||||
)
|
||||
types = on["pull_request_review"].get("types", [])
|
||||
assert "submitted" in types, (
|
||||
"pull_request_review must include 'submitted' type"
|
||||
)
|
||||
|
||||
def test_job_guard_requires_approved_state(self):
|
||||
wf = load_workflow("qa-review.yml")
|
||||
guard = _job_guard_string(wf)
|
||||
assert "github.event.review.state == 'APPROVED'" in guard, (
|
||||
"job guard must check review.state for 'APPROVED'"
|
||||
)
|
||||
assert "github.event.review.state == 'approved'" in guard, (
|
||||
"job guard must check review.state for 'approved' (case fallback per #2135)"
|
||||
)
|
||||
|
||||
def test_post_step_uses_status_post_token(self):
|
||||
wf = load_workflow("qa-review.yml")
|
||||
post = _post_step(wf)
|
||||
env = post.get("env", {})
|
||||
assert env.get("GITEA_TOKEN") == "${{ secrets.STATUS_POST_TOKEN }}", (
|
||||
"POST step must use STATUS_POST_TOKEN for write-scoped status POST"
|
||||
)
|
||||
|
||||
def test_post_step_context_name_exact(self):
|
||||
"""The context POSTed must byte-match the branch-protection requirement."""
|
||||
wf = load_workflow("qa-review.yml")
|
||||
post = _post_step(wf)
|
||||
run = post.get("run", "")
|
||||
assert '"qa-review / approved (pull_request_target)"' in run, (
|
||||
"POST step must emit exact BP-required context name"
|
||||
)
|
||||
|
||||
|
||||
class TestSecurityReviewDirectTrigger:
|
||||
def test_trigger_is_pull_request_review_submitted(self):
|
||||
wf = load_workflow("security-review.yml")
|
||||
# PyYAML parses bare 'on' as boolean True.
|
||||
on = wf[True]
|
||||
assert "pull_request_review" in on, (
|
||||
"security-review must trigger on pull_request_review"
|
||||
)
|
||||
types = on["pull_request_review"].get("types", [])
|
||||
assert "submitted" in types, (
|
||||
"pull_request_review must include 'submitted' type"
|
||||
)
|
||||
|
||||
def test_job_guard_requires_approved_state(self):
|
||||
wf = load_workflow("security-review.yml")
|
||||
guard = _job_guard_string(wf)
|
||||
assert "github.event.review.state == 'APPROVED'" in guard, (
|
||||
"job guard must check review.state for 'APPROVED'"
|
||||
)
|
||||
assert "github.event.review.state == 'approved'" in guard, (
|
||||
"job guard must check review.state for 'approved' (case fallback per #2135)"
|
||||
)
|
||||
|
||||
def test_post_step_uses_status_post_token(self):
|
||||
wf = load_workflow("security-review.yml")
|
||||
post = _post_step(wf)
|
||||
env = post.get("env", {})
|
||||
assert env.get("GITEA_TOKEN") == "${{ secrets.STATUS_POST_TOKEN }}", (
|
||||
"POST step must use STATUS_POST_TOKEN for write-scoped status POST"
|
||||
)
|
||||
|
||||
def test_post_step_context_name_exact(self):
|
||||
"""The context POSTed must byte-match the branch-protection requirement."""
|
||||
wf = load_workflow("security-review.yml")
|
||||
post = _post_step(wf)
|
||||
run = post.get("run", "")
|
||||
assert '"security-review / approved (pull_request_target)"' in run, (
|
||||
"POST step must emit exact BP-required context name"
|
||||
)
|
||||
|
||||
|
||||
class TestRefireScriptContextName:
|
||||
"""review-refire-status.sh must emit the BP-required (pull_request_target) context."""
|
||||
|
||||
def test_refire_script_context_is_pull_request_target(self):
|
||||
script = ROOT / "scripts" / "review-refire-status.sh"
|
||||
content = script.read_text()
|
||||
assert 'CONTEXT="${TEAM}-review / approved (pull_request_target)"' in content, (
|
||||
"refire script CONTEXT must be the exact BP-required (pull_request_target) variant"
|
||||
)
|
||||
assert 'approved (pull_request)"' not in content, (
|
||||
"refire script must NOT post bare (pull_request) context"
|
||||
)
|
||||
|
||||
|
||||
class TestRefireTokenSeparation:
|
||||
"""The /qa-recheck + /security-recheck backstop must also use STATUS_POST_TOKEN."""
|
||||
|
||||
def _refire_step(self, workflow_name: str, step_name_keyword: str) -> dict:
|
||||
wf = load_workflow(workflow_name)
|
||||
jobs = wf["jobs"]
|
||||
steps = jobs["review-refire"]["steps"]
|
||||
for step in steps:
|
||||
name = step.get("name", "")
|
||||
if step_name_keyword in name:
|
||||
return step
|
||||
raise AssertionError(f"No refire step matching {step_name_keyword!r}")
|
||||
|
||||
def test_qa_refire_uses_status_post_token(self):
|
||||
step = self._refire_step("sop-checklist.yml", "Refire qa-review")
|
||||
env = step.get("env", {})
|
||||
assert env.get("STATUS_POST_TOKEN") == "${{ secrets.STATUS_POST_TOKEN }}", (
|
||||
"qa refire must receive STATUS_POST_TOKEN env var"
|
||||
)
|
||||
# Evaluator stays on read token
|
||||
assert "SOP_TIER_CHECK_TOKEN" in env.get("GITEA_TOKEN", "") or "GITHUB_TOKEN" in env.get("GITEA_TOKEN", ""), (
|
||||
"qa refire evaluator must stay on read-scoped token"
|
||||
)
|
||||
|
||||
def test_security_refire_uses_status_post_token(self):
|
||||
step = self._refire_step("sop-checklist.yml", "Refire security-review")
|
||||
env = step.get("env", {})
|
||||
assert env.get("STATUS_POST_TOKEN") == "${{ secrets.STATUS_POST_TOKEN }}", (
|
||||
"security refire must receive STATUS_POST_TOKEN env var"
|
||||
)
|
||||
assert "SOP_TIER_CHECK_TOKEN" in env.get("GITEA_TOKEN", "") or "GITHUB_TOKEN" in env.get("GITEA_TOKEN", ""), (
|
||||
"security refire evaluator must stay on read-scoped token"
|
||||
)
|
||||
@@ -205,8 +205,6 @@ chmod +x "$FIXTURE_DIR/bin/curl"
|
||||
# Helper: run the script with fixture environment
|
||||
run_review_check() {
|
||||
local scenario="$1"
|
||||
local team="${2:-qa}"
|
||||
local team_id="${3:-20}"
|
||||
echo "$scenario" >"$FIX_STATE_DIR/scenario"
|
||||
local out
|
||||
set +e
|
||||
@@ -217,8 +215,8 @@ run_review_check() {
|
||||
REPO="molecule-ai/molecule-core" \
|
||||
PR_NUMBER="999" \
|
||||
DEFAULT_BRANCH="main" \
|
||||
TEAM="$team" \
|
||||
TEAM_ID="$team_id" \
|
||||
TEAM="qa" \
|
||||
TEAM_ID="20" \
|
||||
REVIEW_CHECK_DEBUG="0" \
|
||||
REVIEW_CHECK_STRICT="0" \
|
||||
bash "$SCRIPT" 2>&1
|
||||
@@ -374,25 +372,6 @@ assert_eq "T18 exit code 0 (comment approval still considered)" "0" "$T18_RC"
|
||||
assert_contains "T18 comment candidate notice" "comment-based approval" "$T18_OUT"
|
||||
assert_contains "T18 comment approver accepted" "APPROVED by core-qa-agent" "$T18_OUT"
|
||||
|
||||
# T19 — ai-sop-ack member APPROVED review must NOT count toward qa-review
|
||||
# or security-review (R1 hardening refinement, msg 1388c76f).
|
||||
echo
|
||||
echo "== T19 ai-sop-ack APPROVED review excluded from qa-review gate =="
|
||||
T19_OUT=$(run_review_check "T19_ai_sop_ack_approved" "qa" "20")
|
||||
T19_RC=$(cat "$FIX_STATE_DIR/last_rc")
|
||||
assert_eq "T19 exit code 1 (ai-sop-ack not in qa team)" "1" "$T19_RC"
|
||||
assert_contains "T19 ai-reviewer excluded from qa" "candidates: ai-reviewer" "$T19_OUT"
|
||||
assert_contains "T19 none are in qa team" "none are in team" "$T19_OUT"
|
||||
|
||||
# T20 — same ai-sop-ack member must also be excluded from security-review gate.
|
||||
echo
|
||||
echo "== T20 ai-sop-ack APPROVED review excluded from security-review gate =="
|
||||
T20_OUT=$(run_review_check "T19_ai_sop_ack_approved" "security" "21")
|
||||
T20_RC=$(cat "$FIX_STATE_DIR/last_rc")
|
||||
assert_eq "T20 exit code 1 (ai-sop-ack not in security team)" "1" "$T20_RC"
|
||||
assert_contains "T20 ai-reviewer excluded from security" "candidates: ai-reviewer" "$T20_OUT"
|
||||
assert_contains "T20 none are in security team" "none are in team" "$T20_OUT"
|
||||
|
||||
echo
|
||||
echo "------"
|
||||
echo "PASS=$PASS FAIL=$FAIL"
|
||||
|
||||
@@ -1003,299 +1003,3 @@ class TestComputeNaStateAcceptsGateNotInItems(unittest.TestCase):
|
||||
comments, "alice", na_gates, lambda *_: ["alice"]
|
||||
)
|
||||
self.assertFalse(na_state["security-review"]["declared"])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# internal#760 ceremony — ai-sop-ack team + ai_ack_eligible per-item flag
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestAIAckEligibleConfig(unittest.TestCase):
    """Pin the CTO-controlled AI-ack allowlist (msg 1388c76f) to the live config.

    ai_ack_eligible: comprehensive-testing, local-postgres-e2e, staging-smoke,
                     five-axis-review, memory-consulted
    human-only:      root-cause, no-backwards-compat
    """

    def _load_items(self):
        """Return the live config's items keyed by slug."""
        cfg = sop.load_config(CONFIG_PATH)
        return {it["slug"]: it for it in cfg["items"]}

    def test_ai_ack_eligible_items(self):
        items_by_slug = self._load_items()
        eligible = (
            "comprehensive-testing",
            "five-axis-review",
            "local-postgres-e2e",
            "memory-consulted",
            "staging-smoke",
        )
        for slug in eligible:
            # subTest reports every offending slug instead of stopping at
            # the first one; assertIn turns a missing slug into a readable
            # failure rather than a bare KeyError.
            with self.subTest(slug=slug):
                self.assertIn(slug, items_by_slug)
                self.assertTrue(
                    items_by_slug[slug].get("ai_ack_eligible"),
                    f"{slug} must be ai_ack_eligible",
                )

    def test_human_only_items(self):
        items_by_slug = self._load_items()
        human_only = ("no-backwards-compat", "root-cause")
        for slug in human_only:
            with self.subTest(slug=slug):
                self.assertIn(slug, items_by_slug)
                self.assertFalse(
                    items_by_slug[slug].get("ai_ack_eligible", False),
                    f"{slug} must NOT be ai_ack_eligible (human-only)",
                )

    def test_testing_class_slugs_constant(self):
        """_TESTING_CLASS_SLUGS must match the three testing items."""
        self.assertEqual(
            sop._TESTING_CLASS_SLUGS,
            {"comprehensive-testing", "local-postgres-e2e", "staging-smoke"},
        )

    def test_human_only_slugs_constant(self):
        """_HUMAN_ONLY_SLUGS holds the two live human-only items plus the
        reserved migration/schema slugs.

        If this set changes, the CTO must approve the widening.
        """
        self.assertEqual(
            sop._HUMAN_ONLY_SLUGS,
            {"root-cause", "no-backwards-compat", "migration", "schema"},
        )

    def test_human_only_invariant_enforced_in_code_and_config(self):
        """Every config-present slug in _HUMAN_ONLY_SLUGS must be human-only.

        This test fails if a migration/schema-class item accidentally
        acquires ai_ack_eligible via config drift. migration/schema are
        future-proofing slugs not yet in the live config; they are checked
        by the production probe closure but skipped here.
        """
        items_by_slug = self._load_items()
        for slug in sorted(sop._HUMAN_ONLY_SLUGS):
            if slug not in items_by_slug:
                # Future-proofing slug (e.g. migration, schema) — not yet
                # in config, but the code guard still rejects AI acks.
                continue
            with self.subTest(slug=slug):
                self.assertFalse(
                    items_by_slug[slug].get("ai_ack_eligible", False),
                    f"{slug} is in _HUMAN_ONLY_SLUGS and must NEVER be ai_ack_eligible",
                )
|
||||
|
||||
|
||||
class TestAIAckEligibilityProbe(unittest.TestCase):
    """Drive compute_ack_state with injected probes that imitate production.

    The real probe is a closure built in main(); these tests simulate its
    AI-ack path (human team first, then ai-sop-ack fallback) instead of
    calling it directly.
    """

    def setUp(self):
        self.items = _items_by_slug()
        self.aliases = _numeric_aliases()

    def _probe_human_then_ai(self, human_users, ai_users):
        """Return users in human_users immediately; users in ai_users only
        if the item is ai_ack_eligible."""
        def probe(slug, users):
            item = self.items.get(slug, {})
            approved = []
            for u in users:
                if u in human_users:
                    approved.append(u)
                elif u in ai_users and item.get("ai_ack_eligible"):
                    approved.append(u)
            return approved
        return probe

    def _probe_ai_gated_on_ci(self, ci_green):
        """Return a probe that approves only "ai-bot" on ai_ack_eligible items.

        Testing-class items additionally require CI to be green; ci_green
        is the simulated CI outcome. Replaces two copy-pasted closures, one
        of which carried a no-op ``if ...: pass`` branch.
        """
        def probe(slug, users):
            item = self.items.get(slug, {})
            approved = []
            for u in users:
                if u != "ai-bot" or not item.get("ai_ack_eligible"):
                    continue
                if slug in sop._TESTING_CLASS_SLUGS and not ci_green:
                    continue  # rejected: CI not green
                approved.append(u)
            return approved
        return probe

    def _ack_state(self, comments, probe):
        """Run compute_ack_state with "alice" as the PR author."""
        return sop.compute_ack_state(
            comments, "alice", self.items, self.aliases, probe
        )

    def test_ai_ack_passes_for_eligible_item(self):
        comments = [_comment("ai-bot", "/sop-ack five-axis-review")]
        probe = self._probe_human_then_ai(human_users=set(), ai_users={"ai-bot"})
        state = self._ack_state(comments, probe)
        self.assertEqual(state["five-axis-review"]["ackers"], ["ai-bot"])

    def test_ai_ack_rejected_for_human_only_item(self):
        comments = [_comment("ai-bot", "/sop-ack root-cause")]
        probe = self._probe_human_then_ai(human_users=set(), ai_users={"ai-bot"})
        state = self._ack_state(comments, probe)
        self.assertEqual(state["root-cause"]["ackers"], [])
        self.assertIn("ai-bot", state["root-cause"]["rejected"]["not_in_team"])

    def test_human_ack_still_works_for_ai_eligible_item(self):
        comments = [_comment("bob", "/sop-ack comprehensive-testing")]
        probe = self._probe_human_then_ai(human_users={"bob"}, ai_users=set())
        state = self._ack_state(comments, probe)
        self.assertEqual(state["comprehensive-testing"]["ackers"], ["bob"])

    def test_ai_ack_rejected_for_testing_item_when_ci_red(self):
        # Simulate the production probe that checks CI status for testing
        # items. When CI is not green, the ai-sop-ack member is rejected.
        comments = [_comment("ai-bot", "/sop-ack comprehensive-testing")]
        state = self._ack_state(comments, self._probe_ai_gated_on_ci(ci_green=False))
        self.assertEqual(state["comprehensive-testing"]["ackers"], [])

    def test_ai_ack_passes_for_testing_item_when_ci_green(self):
        # Simulate CI green → AI ack passes.
        comments = [_comment("ai-bot", "/sop-ack comprehensive-testing")]
        state = self._ack_state(comments, self._probe_ai_gated_on_ci(ci_green=True))
        self.assertEqual(state["comprehensive-testing"]["ackers"], ["ai-bot"])
|
||||
|
||||
|
||||
class TestAIAckHumanOnlyMigrationSchema(unittest.TestCase):
    """RC 8322: migration and schema stay human-only even if a future
    config accidentally marks them ai_ack_eligible.

    Neither slug exists in the live config items list yet, so these tests
    build synthetic items in order to exercise the guard directly.
    """

    def setUp(self):
        # Synthetic items, deliberately flagged ai_ack_eligible: if the live
        # config ever adds migration/schema they MUST stay human-only. The
        # probe below mirrors the production closure logic (human team
        # first, then AI fallback with the _HUMAN_ONLY_SLUGS guard).
        self.items = {
            name: {
                "slug": name,
                "ai_ack_eligible": True,
                "required_teams": ["engineers"],
            }
            for name in ("migration", "schema")
        }
        self.aliases = {}

    def _production_like_probe(self, human_users, ai_users):
        """Build a probe that mirrors the production closure's guard."""

        def probe(slug, users):
            item = self.items.get(slug, {})
            accepted = []
            for user in users:
                if user in human_users:
                    accepted.append(user)
                    continue
                if user not in ai_users:
                    continue
                # Production guard: _HUMAN_ONLY_SLUGS rejects AI acks
                # regardless of the ai_ack_eligible flag.
                if slug not in sop._HUMAN_ONLY_SLUGS and item.get("ai_ack_eligible"):
                    accepted.append(user)
            return accepted

        return probe

    def _state_after_ack(self, acker, slug, human_users, ai_users):
        """Return the ack state after `acker` posts "/sop-ack <slug>" on a PR by alice."""
        probe = self._production_like_probe(human_users=human_users, ai_users=ai_users)
        comments = [_comment(acker, "/sop-ack " + slug)]
        return sop.compute_ack_state(
            comments, "alice", self.items, self.aliases, probe
        )

    def test_ai_ack_rejected_for_migration(self):
        state = self._state_after_ack("ai-bot", "migration", set(), {"ai-bot"})
        self.assertEqual(state["migration"]["ackers"], [])
        self.assertIn("ai-bot", state["migration"]["rejected"]["not_in_team"])

    def test_ai_ack_rejected_for_schema(self):
        state = self._state_after_ack("ai-bot", "schema", set(), {"ai-bot"})
        self.assertEqual(state["schema"]["ackers"], [])
        self.assertIn("ai-bot", state["schema"]["rejected"]["not_in_team"])

    def test_human_ack_still_works_for_migration(self):
        # A human team member acking migration/schema is unaffected.
        state = self._state_after_ack("bob", "migration", {"bob"}, set())
        self.assertEqual(state["migration"]["ackers"], ["bob"])

    def test_human_ack_still_works_for_schema(self):
        state = self._state_after_ack("bob", "schema", {"bob"}, set())
        self.assertEqual(state["schema"]["ackers"], ["bob"])
|
||||
|
||||
|
||||
class TestGetCIStatus(unittest.TestCase):
    """Check that get_ci_status picks the right context out of commit statuses."""

    REQUIRED_CONTEXT = "CI / all-required (pull_request)"

    def _client_replying(self, code, payload):
        """Return a GiteaClient whose _req always answers (code, payload)."""
        client = sop.GiteaClient("git.example.com", "tok")

        def fake_req(method, path, body=None, ok_codes=(200, 201, 204)):
            return code, payload

        client._req = fake_req  # type: ignore[method-assign]
        return client

    def _client_with_statuses(self, statuses):
        """Return a client whose status request succeeds with `statuses`."""
        return self._client_replying(200, statuses)

    def test_ci_green_returns_success(self):
        client = self._client_with_statuses(
            [{"context": self.REQUIRED_CONTEXT, "state": "success"}]
        )
        result = sop.get_ci_status(client, "o", "r", "sha1")
        self.assertEqual(result, "success")

    def test_ci_red_returns_failure(self):
        client = self._client_with_statuses(
            [{"context": self.REQUIRED_CONTEXT, "state": "failure"}]
        )
        result = sop.get_ci_status(client, "o", "r", "sha1")
        self.assertEqual(result, "failure")

    def test_missing_context_returns_missing(self):
        client = self._client_with_statuses(
            [{"context": "some-other-context", "state": "success"}]
        )
        result = sop.get_ci_status(client, "o", "r", "sha1")
        self.assertEqual(result, "missing")

    def test_api_error_returns_unknown(self):
        client = self._client_replying(500, {"error": "boom"})
        result = sop.get_ci_status(client, "o", "r", "sha1")
        self.assertEqual(result, "unknown")
|
||||
|
||||
@@ -32,26 +32,6 @@
|
||||
# AUTHOR SELF-ACK IS FORBIDDEN regardless of which team contains them
|
||||
# — the gate script enforces commenter != PR author before checking
|
||||
# team membership.
|
||||
#
|
||||
# AI-SOP-ACK TEAM (internal#760 ceremony design, CTO-approved):
|
||||
# The `ai-sop-ack` team contains AI agent identities that can ack
|
||||
# SOP-checklist items ON BEHALF OF automated evidence. An AI ack is
|
||||
# only valid when:
|
||||
# 1. the item has `ai_ack_eligible: true`
|
||||
# 2. the item is NOT in the human-only carve-out (root-cause, no-backwards-compat, migration, schema)
|
||||
# 3. for testing-class items, CI / all-required (pull_request) is
|
||||
# green on the current head SHA
|
||||
#
|
||||
# AI acks NEVER count toward qa-review or security-review gates —
|
||||
# those remain human-team-only (enforced by review-check.sh team
|
||||
# probe against TEAM_ID 20/21).
|
||||
#
|
||||
# INITIAL ai_ack_eligible allowlist (CTO-controlled, msg 1388c76f):
|
||||
# comprehensive-testing, local-postgres-e2e, staging-smoke,
|
||||
# five-axis-review, memory-consulted
|
||||
# HUMAN-ONLY carve-out:
|
||||
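# root-cause, no-backwards-compat (live items); migration, schema (reserved slugs, not yet in this config, but AI acks for them are still rejected in code)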
|
||||
# Any widening requires an explicit config change reviewed by CTO.
|
||||
|
||||
version: 1
|
||||
|
||||
@@ -103,31 +83,25 @@ items:
|
||||
numeric_alias: 1
|
||||
pr_section_marker: "Comprehensive testing performed"
|
||||
required_teams: [qa, engineers]
|
||||
ai_ack_eligible: true
|
||||
description: >-
|
||||
What was tested, how, edge cases covered. Ack from any qa-team
|
||||
member (or engineers fallback while qa is small). AI ack valid
|
||||
only when CI / all-required (pull_request) is green.
|
||||
member (or engineers fallback while qa is small).
|
||||
|
||||
- slug: local-postgres-e2e
|
||||
numeric_alias: 2
|
||||
pr_section_marker: "Local-postgres E2E run"
|
||||
required_teams: [engineers]
|
||||
ai_ack_eligible: true
|
||||
description: >-
|
||||
Link to local CI artifact, or "N/A: pure-frontend change". Ack
|
||||
from any engineer who can verify the local DB test actually ran.
|
||||
AI ack valid only when CI / all-required (pull_request) is green.
|
||||
|
||||
- slug: staging-smoke
|
||||
numeric_alias: 3
|
||||
pr_section_marker: "Staging-smoke verified or pending"
|
||||
required_teams: [engineers]
|
||||
ai_ack_eligible: true
|
||||
description: >-
|
||||
Link to canary run, or "scheduled post-merge". Ack from any
|
||||
engineer (core-devops/infra-sre are members of engineers team).
|
||||
AI ack valid only when CI / all-required (pull_request) is green.
|
||||
|
||||
- slug: root-cause
|
||||
numeric_alias: 4
|
||||
@@ -146,7 +120,6 @@ items:
|
||||
numeric_alias: 5
|
||||
pr_section_marker: "Five-Axis review walked"
|
||||
required_teams: [engineers]
|
||||
ai_ack_eligible: true
|
||||
description: >-
|
||||
Correctness / readability / architecture / security / performance.
|
||||
Ack from any non-author engineer.
|
||||
@@ -167,7 +140,6 @@ items:
|
||||
numeric_alias: 7
|
||||
pr_section_marker: "Memory/saved-feedback consulted"
|
||||
required_teams: [engineers]
|
||||
ai_ack_eligible: true
|
||||
description: >-
|
||||
List of feedback memories applicable to this change. Ack from
|
||||
any engineer who has the same memory access.
|
||||
|
||||
@@ -47,25 +47,13 @@ jobs:
|
||||
REPO: ${{ github.repository }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
# Required-status-check contexts to evaluate at merge time.
|
||||
# Branch-aware JSON dict: keys are protected branch names,
|
||||
# values are arrays of context names that branch protection
|
||||
# requires for that branch. Mirror this against branch
|
||||
# protection (settings → branches → protected branch →
|
||||
# required checks) for each branch listed here.
|
||||
#
|
||||
# Newline-separated. Mirror this against branch protection
|
||||
# (settings → branches → protected branch → required checks).
|
||||
# Declared here rather than fetched from /branch_protections
|
||||
# because that endpoint requires admin write — sop-tier-bot is
|
||||
# read-only by design (least-privilege).
|
||||
REQUIRED_CHECKS_JSON: |
|
||||
{
|
||||
"main": [
|
||||
"CI / all-required (pull_request)",
|
||||
"E2E API Smoke Test / E2E API Smoke Test (pull_request)",
|
||||
"Handlers Postgres Integration / Handlers Postgres Integration (pull_request)"
|
||||
],
|
||||
"staging": [
|
||||
"CI / all-required (pull_request)",
|
||||
"sop-checklist / all-items-acked (pull_request)"
|
||||
]
|
||||
}
|
||||
REQUIRED_CHECKS: |
|
||||
CI / all-required (pull_request)
|
||||
E2E API Smoke Test / E2E API Smoke Test (pull_request)
|
||||
Handlers Postgres Integration / Handlers Postgres Integration (pull_request)
|
||||
run: bash .gitea/scripts/audit-force-merge.sh
|
||||
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
|
||||
# the PR. Follow-up PR flips this off after surfaced defects are
|
||||
# triaged.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
@@ -42,9 +42,11 @@ jobs:
|
||||
check:
|
||||
name: Migration version collision check
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 4 (RFC #219 §1): 22 days green since 2026-05-11 port.
|
||||
# mc#1982 mask removed — no surfaced defects in this lane.
|
||||
continue-on-error: false
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
|
||||
# the PR. Follow-up PR flips this off after surfaced defects are
|
||||
# triaged.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 5
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
@@ -101,7 +101,7 @@ jobs:
|
||||
# AND-set: only the Mac arm64 runner advertises macos-self-hosted.
|
||||
# See "RUNNER TARGETING" header note for why bare self-hosted is unsafe.
|
||||
runs-on: [self-hosted, macos-self-hosted]
|
||||
# ADVISORY: never blocks. See safety contract point 3. mc#1982
|
||||
# ADVISORY: never blocks. See safety contract point 3. mc#774
|
||||
# internal#418 — tracked: arm64 advisory pilot, non-gating by design.
|
||||
continue-on-error: true
|
||||
# event_name gate: functional (only meaningful on push/PR) AND keeps
|
||||
|
||||
+123
-85
@@ -106,7 +106,7 @@ jobs:
|
||||
name: Platform (Go)
|
||||
needs: changes
|
||||
runs-on: ubuntu-latest
|
||||
# mc#1982 (closed 2026-05-14): Phase 4 flip of the platform-build job.
|
||||
# mc#774 (closed 2026-05-14): Phase 4 flip of the platform-build job.
|
||||
# Phase 4 (#656) originally flipped this to continue-on-error: false based on
|
||||
# Phase-3-masked "green on main 2026-05-12". Two failure classes then surfaced:
|
||||
# (1) 4x delegation_test.go sqlmock gaps (PR #669 / #634 fix-forward, closed).
|
||||
@@ -161,7 +161,7 @@ jobs:
|
||||
echo "::group::pendinguploads exit=$pu_exit (last 100 lines)"
|
||||
tail -100 /tmp/test-pu.log
|
||||
echo "::endgroup::"
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
- if: ${{ needs.changes.outputs.platform == 'true' }}
|
||||
name: Run tests with coverage (blocking gate)
|
||||
@@ -392,7 +392,7 @@ jobs:
|
||||
canvas-deploy-reminder:
|
||||
name: Canvas Deploy Reminder
|
||||
runs-on: docker-host
|
||||
# mc#1982 root-fix: added job-level `if:` so ci-required-drift.py's
|
||||
# mc#774 root-fix: added job-level `if:` so ci-required-drift.py's
|
||||
# ci_job_names() detects this as github.ref-gated and skips it from F1.
|
||||
# The step-level exit 0 handles the "not main push" case; the job-level
|
||||
# `if:` makes the gating explicit so the drift script sees it.
|
||||
@@ -475,10 +475,10 @@ jobs:
|
||||
#
|
||||
# Emits `CI / all-required (<event>)` where <event> is the workflow trigger
|
||||
# (e.g. `CI / all-required (pull_request)`, `CI / all-required (push)`).
|
||||
# Branch protection requires the event-suffixed name —
|
||||
# Branch protection MUST be updated to require the event-suffixed name —
|
||||
# requiring `CI / all-required` (bare, no suffix) silently blocks all merges
|
||||
# because Gitea treats absent status contexts as pending (not skipped), and
|
||||
# no workflow emits the bare name. BP requires
|
||||
# no workflow emits the bare name. Fixed: BP now requires
|
||||
# `CI / all-required (pull_request)` per issue #1473.
|
||||
#
|
||||
# Closes the failure mode where status_check_contexts on molecule-core/main
|
||||
@@ -487,91 +487,129 @@ jobs:
|
||||
# red silently merged through. See internal#286 for the three concrete
|
||||
# tonight-of-2026-05-11 incidents that prompted the emergency bump.
|
||||
#
|
||||
# ── 2026-06-01 CI-scheduler-overload fix (fix/ci-scheduler-fanout) ──
|
||||
# PREVIOUS shape: a poll-gate that ran detect-changes then LOOPED on
|
||||
# `GET /commits/{sha}/statuses` every 15s for up to 40 min, occupying a
|
||||
# `ci-meta` executor slot the entire time it waited for upstream jobs.
|
||||
# With only 2 ci-meta runners, that poll-loop squatted half the lane on
|
||||
# every PR — a confirmed throughput sink in the live RCA (two concurrent
|
||||
# `JOB-all-required` containers observed pinning the lane). The polling
|
||||
# design existed only to dodge the Gitea `needs:` + `if: always()` bug,
|
||||
# where an always()-guarded sentinel could be marked skipped before
|
||||
# upstream jobs settled (leaving BP pending forever).
|
||||
# This job deliberately has no `needs:`. Gitea 1.22/act_runner can mark a
|
||||
# job-level `if: always()` + `needs:` sentinel as skipped before upstream
|
||||
# jobs settle, leaving branch protection with a permanent pending
|
||||
# `CI / all-required` context. Instead, this independent sentinel polls the
|
||||
# required commit-status contexts for this SHA and fails if any fail, skip,
|
||||
# or never emit. It runs the same path detector as `changes` and only waits
|
||||
# for path-relevant jobs; Gitea can otherwise leave needs/output-skipped
|
||||
# jobs permanently pending with "Blocked by required conditions". It runs on
|
||||
# the dedicated `ci-meta` lane so the poller does not occupy the same
|
||||
# general runner pool as the jobs it is waiting for.
|
||||
#
|
||||
# NEW shape: a plain `needs:` aggregator with NO polling loop. This is
|
||||
# safe here — and was NOT safe at the time the poller was written —
|
||||
# because every aggregated CI job now gates its real work PER-STEP
|
||||
# (`if: needs.changes.outputs.* != 'true'`) rather than at the JOB level.
|
||||
# A per-step-gated job always reaches a terminal SUCCESS (it no-ops its
|
||||
# expensive steps but the job itself still completes), so it is never
|
||||
# `skipped`. Plain `needs:` (WITHOUT `if: always()`) works correctly on
|
||||
# Gitea 1.22.6 / act_runner v0.6.1 — only `needs:` + `if: always()` is
|
||||
# broken (feedback_gitea_needs_works_only_ifalways_broken). We therefore
|
||||
# use plain `needs:` + an explicit per-need result check (NOT
|
||||
# `if: always()`); if any need fails/errors, Gitea never starts this job
|
||||
# and BP sees `CI / all-required` go red via the failed dependency
|
||||
# propagation — exactly the gate we want, with zero runner-squat.
|
||||
# canvas-deploy-reminder is intentionally NOT included in all-required.needs.
|
||||
# It is an informational main-push reminder, not a PR quality gate. Keeping
|
||||
# it in this dependency list lets a skipped reminder skip the required
|
||||
# sentinel before the `always()` guard can emit a branch-protection status.
|
||||
#
|
||||
# The `needs:` list MUST stay in lockstep with ci-required-drift.py's
|
||||
# F1 check (`ci_job_names()` = every job MINUS the sentinel MINUS jobs
|
||||
# whose `if:` gates on github.event_name/github.ref). canvas-deploy-
|
||||
# reminder is event-gated (`if: github.ref == refs/heads/{main,staging}`)
|
||||
# so it is intentionally EXCLUDED — it skips on PRs and a `needs:` on a
|
||||
# skipped job would never let the sentinel run. If a new always-running
|
||||
# CI job is added, add it here too or ci-required-drift F1 will flag it.
|
||||
#
|
||||
# Stays on the dedicated `ci-meta` lane (no docker work, so the
|
||||
# docker-host-pin lint does not apply), but now the job is sub-second:
|
||||
# it only inspects already-settled `needs.*.result` values, so it frees
|
||||
# the slot immediately instead of holding it for the whole CI duration.
|
||||
#
|
||||
needs:
|
||||
- changes
|
||||
- platform-build
|
||||
- canvas-build
|
||||
- shellcheck
|
||||
- python-lint
|
||||
continue-on-error: false
|
||||
runs-on: ci-meta
|
||||
timeout-minutes: 5
|
||||
timeout-minutes: 45
|
||||
steps:
|
||||
- name: Verify all aggregated CI jobs succeeded
|
||||
# NO polling, NO API call, NO checkout. Because this job lists the
|
||||
# aggregated jobs under `needs:` (without `if: always()`), Gitea only
|
||||
# starts it once every need has reached SUCCESS — a failed/errored
|
||||
# need short-circuits the job and propagates red to the
|
||||
# `CI / all-required` context. This explicit check is a
|
||||
# belt-and-suspenders assertion + a readable run summary; the real
|
||||
# gating is the `needs:` edge itself.
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- id: check
|
||||
env:
|
||||
CHANGES_RESULT: ${{ needs.changes.result }}
|
||||
PLATFORM_RESULT: ${{ needs.platform-build.result }}
|
||||
CANVAS_RESULT: ${{ needs.canvas-build.result }}
|
||||
SHELLCHECK_RESULT: ${{ needs.shellcheck.result }}
|
||||
PYTHON_LINT_RESULT: ${{ needs.python-lint.result }}
|
||||
PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
|
||||
PR_BASE_REF: ${{ github.event.pull_request.base.ref }}
|
||||
PUSH_BEFORE: ${{ github.event.before }}
|
||||
run: |
|
||||
python3 .gitea/scripts/detect-changes.py \
|
||||
--profile ci \
|
||||
--event-name "${{ github.event_name }}" \
|
||||
--pr-base-sha "$PR_BASE_SHA" \
|
||||
--base-ref "$PR_BASE_REF" \
|
||||
--push-before "${GITHUB_EVENT_BEFORE:-$PUSH_BEFORE}"
|
||||
- name: Wait for required CI contexts
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
API_ROOT: ${{ github.server_url }}/api/v1
|
||||
REPOSITORY: ${{ github.repository }}
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
EVENT_NAME: ${{ github.event_name }}
|
||||
REQUIRE_PLATFORM: ${{ steps.check.outputs.platform }}
|
||||
REQUIRE_CANVAS: ${{ steps.check.outputs.canvas }}
|
||||
REQUIRE_SCRIPTS: ${{ steps.check.outputs.scripts }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
fail=0
|
||||
check() {
|
||||
name="$1"; result="$2"
|
||||
printf 'CI / %s = %s\n' "$name" "$result"
|
||||
# `success` is the only green terminal state we accept. A plain
|
||||
# `needs:` job is only started when all needs succeed, so reaching
|
||||
# this step already implies success — but assert explicitly so a
|
||||
# future `if: always()` reintroduction (which WOULD let non-success
|
||||
# through) fails loudly instead of silently passing the gate.
|
||||
if [ "$result" != "success" ]; then
|
||||
echo "::error::aggregated CI job '${name}' did not succeed (result=${result})"
|
||||
fail=1
|
||||
fi
|
||||
}
|
||||
check "Detect changes" "$CHANGES_RESULT"
|
||||
check "Platform (Go)" "$PLATFORM_RESULT"
|
||||
check "Canvas (Next.js)" "$CANVAS_RESULT"
|
||||
check "Shellcheck (E2E scripts)" "$SHELLCHECK_RESULT"
|
||||
check "Python Lint & Test" "$PYTHON_LINT_RESULT"
|
||||
if [ "$fail" -ne 0 ]; then
|
||||
echo "::error::all-required: one or more aggregated CI jobs did not succeed"
|
||||
exit 1
|
||||
fi
|
||||
echo "OK: all aggregated CI jobs succeeded — CI / all-required green."
|
||||
python3 - <<'PY'
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
|
||||
# Connection details and target commit, all supplied by the workflow step's env.
token = os.environ["GITEA_TOKEN"]
api_root = os.environ["API_ROOT"].rstrip("/")
repo = os.environ["REPOSITORY"]
sha = os.environ["COMMIT_SHA"]
event = os.environ["EVENT_NAME"]

# Contexts that are always awaited, followed by the ones that are only
# awaited when the matching REQUIRE_* variable is exactly "true".
required = [f"CI / {job} ({event})" for job in ("Detect changes", "Python Lint & Test")]
for flag, job in (
    ("REQUIRE_PLATFORM", "Platform (Go)"),
    ("REQUIRE_CANVAS", "Canvas (Next.js)"),
    ("REQUIRE_SCRIPTS", "Shellcheck (E2E scripts)"),
):
    if os.environ.get(flag) == "true":
        required.append(f"CI / {job} ({event})")

# States that fail the gate immediately, the overall polling budget
# (40 minutes), and the last printed summary (used to avoid repeating
# identical log lines).
terminal_bad = {"failure", "error"}
deadline = time.time() + 40 * 60
last_summary = None
|
||||
|
||||
def fetch_statuses():
    """Return the newest commit-status entry per context for the target SHA.

    Reads up to five pages of the commit's statuses, stopping at the first
    empty page. Entries without a context are ignored. When a context shows
    up more than once, the entry with the greatest `updated_at` (falling
    back to `created_at`) wins; on a tie the later entry in the response
    replaces the earlier one.
    """
    def stamp(entry):
        # Timestamps are compared as plain strings.
        return entry.get("updated_at") or entry.get("created_at") or ""

    collected = []
    page = 1
    while page <= 5:
        url = f"{api_root}/repos/{repo}/commits/{sha}/statuses?page={page}&limit=100"
        request = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
        with urllib.request.urlopen(request, timeout=10) as resp:
            batch = json.load(resp)
        if not batch:
            break
        collected.extend(batch)
        page += 1

    newest = {}
    for entry in collected:
        context = entry.get("context")
        if not context:
            continue
        current = newest.get(context)
        if current is None or stamp(entry) >= stamp(current):
            newest[context] = entry
    return newest
|
||||
|
||||
# Poll every 15 seconds until every required context is green, one of them
# fails, or the deadline passes.
while True:
    try:
        latest = fetch_statuses()
    except (TimeoutError, OSError, urllib.error.URLError) as exc:
        # Network/API errors are retried until the deadline.
        if time.time() >= deadline:
            print(f"FAIL: status polling did not recover before deadline: {exc}", file=sys.stderr)
            sys.exit(1)
        print(f"WARN: status poll failed, retrying: {exc}", flush=True)
        time.sleep(15)
        continue

    # Resolve each required context to a state: prefer "status", fall back
    # to "state", and use "missing" when neither is present.
    states = {}
    for ctx in required:
        entry = latest.get(ctx) or {}
        states[ctx] = entry.get("status") or entry.get("state") or "missing"

    # Log only when the overall picture changes.
    summary = ", ".join(f"{ctx}={state}" for ctx, state in states.items())
    if summary != last_summary:
        print(summary, flush=True)
        last_summary = summary

    failed = {ctx: state for ctx, state in states.items() if state in terminal_bad}
    if failed:
        print("FAIL: required CI context failed:", file=sys.stderr)
        for ctx, state in failed.items():
            desc = (latest.get(ctx) or {}).get("description") or ""
            print(f" - {ctx}: {state} {desc}", file=sys.stderr)
        sys.exit(1)

    pending = [ctx for ctx, state in states.items() if state != "success"]
    if not pending:
        print(f"OK: all {len(required)} required CI contexts succeeded")
        sys.exit(0)

    if time.time() >= deadline:
        print("FAIL: timed out waiting for required CI contexts:", file=sys.stderr)
        for ctx, state in states.items():
            print(f" - {ctx}: {state}", file=sys.stderr)
        sys.exit(1)

    time.sleep(15)
|
||||
PY
|
||||
|
||||
@@ -102,7 +102,7 @@ jobs:
|
||||
name: Synthetic E2E against staging
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
# Bumped from 12 → 20 (2026-05-04). Tenant user-data install phase
|
||||
# (apt-get update + install docker.io/jq/awscli/caddy + snap install
|
||||
|
||||
@@ -123,7 +123,7 @@ jobs:
|
||||
# integration). See internal#512 for the class defect.
|
||||
runs-on: docker-host
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
outputs:
|
||||
api: ${{ steps.decide.outputs.api }}
|
||||
@@ -160,7 +160,7 @@ jobs:
|
||||
# detect-changes for the full rationale.
|
||||
runs-on: docker-host
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 15
|
||||
env:
|
||||
|
||||
@@ -48,7 +48,7 @@ jobs:
|
||||
# defect.
|
||||
runs-on: docker-host
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
outputs:
|
||||
chat: ${{ steps.decide.outputs.chat }}
|
||||
@@ -112,7 +112,7 @@ jobs:
|
||||
# Must land on operator-host Linux (docker-host).
|
||||
runs-on: docker-host
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 15
|
||||
env:
|
||||
|
||||
@@ -71,7 +71,7 @@ jobs:
|
||||
detect-changes:
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
outputs:
|
||||
canvas: ${{ steps.decide.outputs.canvas }}
|
||||
@@ -140,7 +140,7 @@ jobs:
|
||||
name: Canvas tabs E2E
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 40
|
||||
|
||||
|
||||
@@ -84,7 +84,7 @@ jobs:
|
||||
name: E2E Staging External Runtime
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 25
|
||||
|
||||
|
||||
@@ -94,20 +94,20 @@ jobs:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
fetch-depth: 1
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
|
||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
||||
with:
|
||||
python-version: "3.11"
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
|
||||
- name: YAML validation (best-effort)
|
||||
run: |
|
||||
echo "e2e-staging-saas.yml — PR validation: workflow YAML is valid."
|
||||
echo "E2E step runs only when provisioning-critical files change."
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
|
||||
# Actual E2E: runs on trunk pushes and PRs that touch provisioning-critical
|
||||
@@ -118,7 +118,7 @@ jobs:
|
||||
name: E2E Staging SaaS
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 45
|
||||
permissions:
|
||||
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
name: Intentional-failure teardown sanity
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 20
|
||||
|
||||
|
||||
@@ -66,7 +66,7 @@ jobs:
|
||||
# bp-exempt: PR advisory bot; merge blocking is enforced by CI status and branch protection.
|
||||
gate-check:
|
||||
runs-on: ubuntu-latest
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true # Never block on our own detector failing
|
||||
steps:
|
||||
- name: Check out BASE ref (never PR-head under pull_request_target)
|
||||
|
||||
@@ -87,8 +87,8 @@ jobs:
|
||||
# both jobs on the same label avoids workspace-volume cross-host
|
||||
# surprises and keeps the routing rule discoverable in one place.
|
||||
runs-on: docker-host
|
||||
# mc#1982 Phase 3 (RFC §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774 Phase 3 (RFC §1): surface broken workflows without blocking.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
outputs:
|
||||
handlers: ${{ steps.filter.outputs.handlers }}
|
||||
@@ -118,8 +118,8 @@ jobs:
|
||||
# mc#1529 §1: must run on operator-host (where `molecule-core-net`
|
||||
# exists). See detect-changes for the full routing rationale.
|
||||
runs-on: docker-host
|
||||
# mc#1982 Phase 3 (RFC §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774 Phase 3 (RFC §1): surface broken workflows without blocking.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
env:
|
||||
# Unique name per run so concurrent jobs don't collide on the
|
||||
|
||||
@@ -70,7 +70,7 @@ jobs:
|
||||
# of mc#1543; see internal#512 for class defect.
|
||||
runs-on: docker-host
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
outputs:
|
||||
run: ${{ steps.decide.outputs.run }}
|
||||
@@ -172,7 +172,7 @@ jobs:
|
||||
# beta containers. Must run on operator-host Linux (docker-host).
|
||||
runs-on: docker-host
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 30
|
||||
steps:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
name: lint-bp-context-emit-match
|
||||
|
||||
# Tier 2f scheduled lint (per mc#1982) — detects drift between
|
||||
# Tier 2f scheduled lint (per mc#774) — detects drift between
|
||||
# `branch_protections/<branch>.status_check_contexts` and the set of
|
||||
# contexts emitted by `.gitea/workflows/*.yml`.
|
||||
#
|
||||
@@ -60,7 +60,7 @@ name: lint-bp-context-emit-match
|
||||
#
|
||||
# Cross-links
|
||||
# -----------
|
||||
# - mc#1982 (the RFC that specs this lint)
|
||||
# - mc#774 (the RFC that specs this lint)
|
||||
# - internal#349 (cross-repo BP sweep)
|
||||
# - feedback_phantom_required_check_after_gitea_migration
|
||||
# - feedback_tier_label_ids_are_per_repo
|
||||
@@ -91,10 +91,10 @@ jobs:
|
||||
name: lint-bp-context-emit-match
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 5
|
||||
# Phase 4 (RFC #219 §1): 22 days green since 2026-05-11 port,
|
||||
# well past the 7-clean-run threshold. Scheduled failure is now
|
||||
# a hard CI signal.
|
||||
continue-on-error: false
|
||||
# Phase 3 (RFC #219 §1): surface drift without blocking. After 7
|
||||
# clean scheduled runs on main, flip to false so a scheduled
|
||||
# failure is a hard CI signal.
|
||||
continue-on-error: true # mc#774 Phase 3 — flip to false after 7 clean main runs
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
name: lint-continue-on-error-tracking
|
||||
|
||||
# Tier 2e hard-gate lint (per mc#1982) — every
|
||||
# Tier 2e hard-gate lint (per mc#774) — every
|
||||
# `continue-on-error: true` in `.gitea/workflows/*.yml` must carry a
|
||||
# `# mc#NNNN` or `# internal#NNNN` tracker comment within 2 lines,
|
||||
# the referenced issue must be OPEN, and ≤14 days old.
|
||||
@@ -8,7 +8,7 @@ name: lint-continue-on-error-tracking
|
||||
# Why this exists
|
||||
# ---------------
|
||||
# `continue-on-error: true` on `platform-build` had been hiding
|
||||
# mc#1982-class regressions for ~3 weeks before #656 surfaced them on
|
||||
# mc#774-class regressions for ~3 weeks before #656 surfaced them on
|
||||
# 2026-05-12. A 14-day cap on tracker age forces a review cycle and
|
||||
# surfaces mask-drift within at most 14 days of the original defect.
|
||||
# Each `continue-on-error: true` gets a paper trail — close or renew.
|
||||
@@ -45,12 +45,12 @@ name: lint-continue-on-error-tracking
|
||||
# close-and-flip, or document the deliberate keep-mask in a fresh
|
||||
# 14-day-renewable tracker. After main is clean for 3 days,
|
||||
# follow-up PR flips this workflow's continue-on-error to false.
|
||||
# Tracking: mc#1982.
|
||||
# Tracking: mc#774.
|
||||
#
|
||||
# Cross-links
|
||||
# -----------
|
||||
# - mc#1982 (the RFC that specs this lint)
|
||||
# - mc#1982 (the empirical masked-3-weeks case)
|
||||
# - mc#774 (the RFC that specs this lint)
|
||||
# - mc#774 (the empirical masked-3-weeks case)
|
||||
# - feedback_chained_defects_in_never_tested_workflows
|
||||
# - feedback_behavior_based_ast_gates
|
||||
# - feedback_strict_root_only_after_class_a
|
||||
@@ -97,9 +97,9 @@ jobs:
|
||||
# Phase 3 (RFC #219 §1): surface masked defects without blocking
|
||||
# PRs. Pre-existing continue-on-error: true directives on main
|
||||
# all violate this lint at first — intentional. Flip to false
|
||||
# follow-up after main is clean for 3 days. mc#1982.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true # mc#1982 Phase 3 mask — 14d forced-renewal cadence
|
||||
# follow-up after main is clean for 3 days. mc#774.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true # mc#774 Phase 3 mask — 14d forced-renewal cadence
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
||||
|
||||
@@ -48,9 +48,11 @@ jobs:
|
||||
scan:
|
||||
name: Scan workflows for curl status-capture pollution
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 4 (RFC #219 §1): 22 days green since 2026-05-11 port.
|
||||
# mc#1982 mask removed — no surfaced defects in this lane.
|
||||
continue-on-error: false
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking
|
||||
# the PR. Follow-up PR flips this off after surfaced defects are
|
||||
# triaged.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- name: Find curl ... -w '%{http_code}' ... || echo "000" subshells
|
||||
|
||||
@@ -25,21 +25,6 @@ name: Lint forbidden tenant-env keys
|
||||
# feedback_path_filtered_workflow_cant_be_required). The scan itself
|
||||
# targets workspace_secrets-writer paths via grep -r; it's fast
|
||||
# (sub-second) so unconditional run is fine.
|
||||
#
|
||||
# ── 2026-06-01 CI-scheduler-fanout consolidation (fix/ci-scheduler-fanout) ──
|
||||
# The RFC#523 sibling lint formerly in its own file
|
||||
# `lint-no-tenant-gitea-token.yml` (the broader "no repo-host token into
|
||||
# any tenant-writer surface" scan) is now a SECOND job in THIS workflow
|
||||
# (`scan-tenant-token-write`). Both are sub-second Go-source greps that
|
||||
# fired as two separate workflow runs on every PR — pure scheduler
|
||||
# fan-out. Folding the sibling in here drops one workflow run + one
|
||||
# checkout per PR while keeping BOTH scans firing unconditionally on
|
||||
# every PR (the no-paths discipline above is preserved — neither job is
|
||||
# paths-filtered). The moved job keeps its exact `name:` so its emitted
|
||||
# status context is unchanged in substance; its `# bp-exempt:` directive
|
||||
# moves with it (Tier 2g). The old `Lint no tenant GITEA or GITHUB token
|
||||
# write / …` context is retired (a disappearing context needs no
|
||||
# directive; only NEW emitters do).
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
@@ -181,126 +166,3 @@ jobs:
|
||||
fi
|
||||
|
||||
echo "OK No forbidden operator-scope env key names hardcoded in writer paths."
|
||||
|
||||
# bp-exempt: advisory RFC#523 lint; PR review gate is review-driven, not BP-driven.
|
||||
# (Carried with the workflow-name rename in PR mc#1593 so the renamed
|
||||
# context emission satisfies lint_required_context_exists_in_bp Tier 2g.)
|
||||
scan-tenant-token-write:
|
||||
name: Scan for repo-host token write into tenant workspace surface
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Find Go files referencing a tenant-writer surface AND a repo-host token
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Repo-host token NAMES — the threat-model subset. Operator-fleet
|
||||
# tokens (CP_ADMIN_API_TOKEN, RAILWAY_TOKEN, INFISICAL_*) are
|
||||
# caught by lint-forbidden-env-keys.yml's broader deny set; this
|
||||
# lint focuses on the git-host class so a single co-occurrence
|
||||
# match has a low false-positive rate.
|
||||
FORBIDDEN_KEYS=(
|
||||
"GITEA_TOKEN"
|
||||
"GITEA_PAT"
|
||||
"GITHUB_TOKEN"
|
||||
"GITHUB_PAT"
|
||||
"GH_TOKEN"
|
||||
)
|
||||
|
||||
# Tenant-writer surface markers. A file matches the surface set
|
||||
# if it references ANY of these strings. This is the "is this
|
||||
# code path writing into a tenant workspace?" heuristic.
|
||||
# Curated to catch the actual code shapes used in this repo
|
||||
# (verified by grep against current main 2026-05-19):
|
||||
# - "workspace_secrets" / "global_secrets" → DB table writes
|
||||
# - "seedAllowList" → CP-side seed table
|
||||
# - "/settings/secrets" → tenant HTTP API write
|
||||
# - "envVars[" → in-memory env map write
|
||||
# - "containerEnv" → docker-run env-set
|
||||
# - "userData" → EC2 user-data script
|
||||
# - "provisionPayload" / "provisionContext" → provision-request shape
|
||||
SURFACE_PATTERN='workspace_secrets|global_secrets|seedAllowList|/settings/secrets|envVars\[|containerEnv|userData|provisionPayload|provisionContext'
|
||||
|
||||
# Files that legitimately reference these names AND a surface
|
||||
# marker, but do so for guard / strip / test / doc-comment
|
||||
# reasons. New entries require reviewer signoff and a one-line
|
||||
# justification in the diff.
|
||||
EXEMPT_FILES=(
|
||||
# RFC#523 L1 deny-set source-of-truth + tests
|
||||
"workspace-server/internal/handlers/workspace_provision_forbidden_env.go"
|
||||
"workspace-server/internal/handlers/workspace_provision_forbidden_env_test.go"
|
||||
# Forensic-#145 silent-strip denylist (defense-in-depth, by design lists the names)
|
||||
"workspace-server/internal/provisioner/provisioner.go"
|
||||
"workspace-server/internal/provisioner/provisioner_test.go"
|
||||
# Pre-RFC#523 persona-fallback / org-helper paths. The L1
|
||||
# fail-closed runs BEFORE these writers; downstream silent-strip
|
||||
# also covers them. See applyAgentGitHTTPCreds doc-comment.
|
||||
"workspace-server/internal/handlers/agent_git_identity.go"
|
||||
"workspace-server/internal/handlers/org_helpers.go"
|
||||
"workspace-server/internal/handlers/org.go"
|
||||
# CP→platform admin auth (NOT a tenant env write).
|
||||
"workspace-server/internal/provisioner/cp_provisioner.go"
|
||||
)
|
||||
|
||||
# Build an extended-regex alternation of forbidden keys.
|
||||
KEY_ALT="$(IFS='|'; echo "${FORBIDDEN_KEYS[*]}")"
|
||||
|
||||
# Find candidate files: Go non-test sources that contain a
|
||||
# tenant-writer surface marker.
|
||||
mapfile -t CANDIDATES < <(
|
||||
grep -rlE --include='*.go' --exclude='*_test.go' \
|
||||
"${SURFACE_PATTERN}" . 2>/dev/null \
|
||||
| sed 's|^\./||' \
|
||||
| sort -u
|
||||
)
|
||||
|
||||
if [ "${#CANDIDATES[@]}" -eq 0 ]; then
|
||||
echo "OK No tenant-writer-surface files found in tree (unexpected, but not a lint failure)."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
HITS=""
|
||||
for f in "${CANDIDATES[@]}"; do
|
||||
# Skip exempt files.
|
||||
skip=0
|
||||
for ex in "${EXEMPT_FILES[@]}"; do
|
||||
if [ "$f" = "$ex" ]; then skip=1; break; fi
|
||||
done
|
||||
[ "$skip" = "1" ] && continue
|
||||
|
||||
# File contains a surface marker; now grep for a forbidden
|
||||
# key NAME. We require a QUOTED-literal match to avoid
|
||||
# firing on a comment like "// also handle GITEA_TOKEN".
|
||||
#
|
||||
# The literal form catches:
|
||||
# - os.Getenv("GITEA_TOKEN")
|
||||
# - envVars["GITEA_TOKEN"] = ...
|
||||
# - {envKey: "GITEA_TOKEN", tenantKey: "GITEA_TOKEN"}
|
||||
# but not:
|
||||
# - // see GITEA_TOKEN below (no quotes)
|
||||
found=$(grep -nE "\"(${KEY_ALT})\"" "$f" 2>/dev/null || true)
|
||||
if [ -n "$found" ]; then
|
||||
HITS="${HITS}--- ${f} ---\n${found}\n"
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -n "$HITS" ]; then
|
||||
echo "::error::Task #146 lint: repo-host token name(s) quoted in a tenant-writer-surface file:"
|
||||
printf "$HITS"
|
||||
echo ""
|
||||
echo "These files reference a tenant-writer surface (workspace_secrets,"
|
||||
echo "seedAllowList, /settings/secrets, containerEnv, userData, etc.)"
|
||||
echo "AND quote a repo-host token name (GITEA_TOKEN/GITHUB_TOKEN/…)."
|
||||
echo "Per RFC#523 threat model, tenant workspaces MUST NOT receive"
|
||||
echo "operator-scope repo-host tokens. If your code legitimately needs"
|
||||
echo "to reference one of these names in a tenant-writer file (e.g."
|
||||
echo "a deny-set definition or silent-strip list), add the file to"
|
||||
echo "EXEMPT_FILES with a one-line justification — reviewer signoff"
|
||||
echo "required."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "OK No tenant-writer-surface file co-mentions a repo-host token literal."
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
name: lint-mask-pr-atomicity
|
||||
|
||||
# Tier 2d hard-gate lint (per mc#1982) — blocks PRs that touch
|
||||
# Tier 2d hard-gate lint (per mc#774) — blocks PRs that touch
|
||||
# `.gitea/workflows/ci.yml` and modify ONLY ONE of {continue-on-error,
|
||||
# all-required.sentinel.needs} without a `Paired: #NNN` reference in
|
||||
# the PR body or in a commit message.
|
||||
@@ -37,13 +37,13 @@ name: lint-mask-pr-atomicity
|
||||
# This workflow lands at `continue-on-error: true` (Phase 3 — surface
|
||||
# regressions without blocking PRs while the rule beds in).
|
||||
# Follow-up PR flips to `false` once we have ≥3 days of clean runs on
|
||||
# `main` and no false-positives. Tracking issue: mc#1982.
|
||||
# `main` and no false-positives. Tracking issue: mc#774.
|
||||
#
|
||||
# Cross-links
|
||||
# -----------
|
||||
# - mc#1982 (the RFC that specs this lint)
|
||||
# - mc#774 (the RFC that specs this lint)
|
||||
# - PR#665 / PR#668 (the empirical split-pair)
|
||||
# - mc#1982 (the main-red incident the split caused)
|
||||
# - mc#774 (the main-red incident the split caused)
|
||||
# - feedback_strict_root_only_after_class_a
|
||||
# - feedback_behavior_based_ast_gates
|
||||
#
|
||||
@@ -92,8 +92,8 @@ jobs:
|
||||
# Phase 3 (RFC #219 §1): surface broken shapes without blocking
|
||||
# PRs. Follow-up PR flips this to `false` once recent runs on main
|
||||
# are confirmed clean (eat-our-own-dogfood discipline mirrors
|
||||
# PR#673's same-shape comment). Tracking: mc#1982.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# PR#673's same-shape comment). Tracking: mc#774.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Check out PR head with full history (need base SHA blobs)
|
||||
|
||||
@@ -0,0 +1,182 @@
|
||||
name: Lint no tenant GITEA or GITHUB token write
|
||||
|
||||
# Task #146 — CI guardrail companion to RFC#523's `lint-forbidden-env-keys.yml`.
|
||||
#
|
||||
# `lint-forbidden-env-keys.yml` (Layer 3) catches code that hardcodes a
|
||||
# forbidden env-var key NAME as a quoted literal in workspace_secrets
|
||||
# writer paths under workspace-server/internal/.
|
||||
#
|
||||
# This workflow catches a BROADER class: any code path that reads a
|
||||
# repo-host token (GITEA_TOKEN / GITHUB_TOKEN / GH_TOKEN) and then writes
|
||||
# it into a TENANT WORKSPACE's env, secret store, user-data, or
|
||||
# provision payload. This is the actual RFC#523 threat-model statement —
|
||||
# the goal is "no tenant workspace ever receives an operator-scope repo
|
||||
# token," not just "no _quoted_ literal `GITEA_TOKEN`." A future writer
|
||||
# could route the value via a variable, a struct field, or a config key
|
||||
# and slip past the existing literal scan; this lint catches those
|
||||
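# routing patterns at PR review time, provided the token name still
|
||||
# appears as a quoted literal in the same file as a surface marker.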
|
||||
#
|
||||
# Scope
|
||||
# Scans the WHOLE repo's Go sources (not just workspace-server/) for
|
||||
# co-occurrences of:
|
||||
# - a repo-host token NAME (GITEA_TOKEN / GITHUB_TOKEN / GH_TOKEN /
|
||||
# GITEA_PAT / GITHUB_PAT) used as os.Getenv argument or string
|
||||
# literal
|
||||
# - within a file that ALSO references a tenant-writer surface
|
||||
# (`workspace_secrets`, `global_secrets`, `seedAllowList`,
|
||||
# `/settings/secrets`, `userData`, `provisionPayload`,
|
||||
# `provisionContext`, `envVars[`, `containerEnv`).
|
||||
#
|
||||
# Co-occurrence (not single-line) is the false-positive control: a
|
||||
# file that just LOGS the variable name (e.g. "missing GITEA_TOKEN")
|
||||
# without touching any tenant surface won't fire.
|
||||
#
|
||||
# Drift contract with lint-forbidden-env-keys.yml
|
||||
# Both lints share the same FORBIDDEN_KEYS list (a subset — only the
|
||||
# repo-host tokens, since this lint's threat model is "tenant gets
|
||||
# write access to operator's git host"). If RFC#523's deny set grows,
|
||||
# update BOTH this file AND lint-forbidden-env-keys.yml AND the Go
|
||||
# source-of-truth in
|
||||
# workspace-server/internal/handlers/workspace_provision_forbidden_env.go.
|
||||
#
|
||||
# Open-source-template-friendly
|
||||
# The patterns scanned are generic (no MOLECULE_-prefix literals).
|
||||
# A fork can copy this workflow as-is and adjust FORBIDDEN_KEYS.
|
||||
#
|
||||
# Path-filter discipline
|
||||
# No `paths:` filter — required-status workflows must run on every PR
|
||||
# per `feedback_path_filtered_workflow_cant_be_required`. Scan is
|
||||
# sub-second.
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
push:
|
||||
branches: [main, staging]
|
||||
|
||||
env:
|
||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
||||
|
||||
jobs:
|
||||
# bp-exempt: advisory RFC#523 lint; PR review gate is review-driven, not BP-driven.
|
||||
# (Carried with the workflow-name rename in PR mc#1593 so the renamed
|
||||
# context emission satisfies lint_required_context_exists_in_bp Tier 2g.)
|
||||
scan:
|
||||
name: Scan for repo-host token write into tenant workspace surface
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Find Go files referencing a tenant-writer surface AND a repo-host token
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Repo-host token NAMES — the threat-model subset. Operator-fleet
|
||||
# tokens (CP_ADMIN_API_TOKEN, RAILWAY_TOKEN, INFISICAL_*) are
|
||||
# caught by lint-forbidden-env-keys.yml's broader deny set; this
|
||||
# lint focuses on the git-host class so a single co-occurrence
|
||||
# match has a low false-positive rate.
|
||||
FORBIDDEN_KEYS=(
|
||||
"GITEA_TOKEN"
|
||||
"GITEA_PAT"
|
||||
"GITHUB_TOKEN"
|
||||
"GITHUB_PAT"
|
||||
"GH_TOKEN"
|
||||
)
|
||||
|
||||
# Tenant-writer surface markers. A file matches the surface set
|
||||
# if it references ANY of these strings. This is the "is this
|
||||
# code path writing into a tenant workspace?" heuristic.
|
||||
# Curated to catch the actual code shapes used in this repo
|
||||
# (verified by grep against current main 2026-05-19):
|
||||
# - "workspace_secrets" / "global_secrets" → DB table writes
|
||||
# - "seedAllowList" → CP-side seed table
|
||||
# - "/settings/secrets" → tenant HTTP API write
|
||||
# - "envVars[" → in-memory env map write
|
||||
# - "containerEnv" → docker-run env-set
|
||||
# - "userData" → EC2 user-data script
|
||||
# - "provisionPayload" / "provisionContext" → provision-request shape
|
||||
SURFACE_PATTERN='workspace_secrets|global_secrets|seedAllowList|/settings/secrets|envVars\[|containerEnv|userData|provisionPayload|provisionContext'
|
||||
|
||||
# Files that legitimately reference these names AND a surface
|
||||
# marker, but do so for guard / strip / test / doc-comment
|
||||
# reasons. New entries require reviewer signoff and a one-line
|
||||
# justification in the diff.
|
||||
EXEMPT_FILES=(
|
||||
# RFC#523 L1 deny-set source-of-truth + tests
|
||||
"workspace-server/internal/handlers/workspace_provision_forbidden_env.go"
|
||||
"workspace-server/internal/handlers/workspace_provision_forbidden_env_test.go"
|
||||
# Forensic-#145 silent-strip denylist (defense-in-depth, by design lists the names)
|
||||
"workspace-server/internal/provisioner/provisioner.go"
|
||||
"workspace-server/internal/provisioner/provisioner_test.go"
|
||||
# Pre-RFC#523 persona-fallback / org-helper paths. The L1
|
||||
# fail-closed runs BEFORE these writers; downstream silent-strip
|
||||
# also covers them. See applyAgentGitHTTPCreds doc-comment.
|
||||
"workspace-server/internal/handlers/agent_git_identity.go"
|
||||
"workspace-server/internal/handlers/org_helpers.go"
|
||||
"workspace-server/internal/handlers/org.go"
|
||||
# CP→platform admin auth (NOT a tenant env write).
|
||||
"workspace-server/internal/provisioner/cp_provisioner.go"
|
||||
)
|
||||
|
||||
# Build an extended-regex alternation of forbidden keys.
|
||||
KEY_ALT="$(IFS='|'; echo "${FORBIDDEN_KEYS[*]}")"
|
||||
|
||||
# Find candidate files: Go non-test sources that contain a
|
||||
# tenant-writer surface marker.
|
||||
mapfile -t CANDIDATES < <(
|
||||
grep -rlE --include='*.go' --exclude='*_test.go' \
|
||||
"${SURFACE_PATTERN}" . 2>/dev/null \
|
||||
| sed 's|^\./||' \
|
||||
| sort -u
|
||||
)
|
||||
|
||||
if [ "${#CANDIDATES[@]}" -eq 0 ]; then
|
||||
echo "OK No tenant-writer-surface files found in tree (unexpected, but not a lint failure)."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
HITS=""
|
||||
for f in "${CANDIDATES[@]}"; do
|
||||
# Skip exempt files.
|
||||
skip=0
|
||||
for ex in "${EXEMPT_FILES[@]}"; do
|
||||
if [ "$f" = "$ex" ]; then skip=1; break; fi
|
||||
done
|
||||
[ "$skip" = "1" ] && continue
|
||||
|
||||
# File contains a surface marker; now grep for a forbidden
|
||||
# key NAME. We require a QUOTED-literal match to avoid
|
||||
# firing on a comment like "// also handle GITEA_TOKEN".
|
||||
#
|
||||
# The literal form catches:
|
||||
# - os.Getenv("GITEA_TOKEN")
|
||||
# - envVars["GITEA_TOKEN"] = ...
|
||||
# - {envKey: "GITEA_TOKEN", tenantKey: "GITEA_TOKEN"}
|
||||
# but not:
|
||||
# - // see GITEA_TOKEN below (no quotes)
|
||||
found=$(grep -nE "\"(${KEY_ALT})\"" "$f" 2>/dev/null || true)
|
||||
if [ -n "$found" ]; then
|
||||
HITS="${HITS}--- ${f} ---\n${found}\n"
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -n "$HITS" ]; then
|
||||
echo "::error::Task #146 lint: repo-host token name(s) quoted in a tenant-writer-surface file:"
|
||||
printf "$HITS"
|
||||
echo ""
|
||||
echo "These files reference a tenant-writer surface (workspace_secrets,"
|
||||
echo "seedAllowList, /settings/secrets, containerEnv, userData, etc.)"
|
||||
echo "AND quote a repo-host token name (GITEA_TOKEN/GITHUB_TOKEN/…)."
|
||||
echo "Per RFC#523 threat model, tenant workspaces MUST NOT receive"
|
||||
echo "operator-scope repo-host tokens. If your code legitimately needs"
|
||||
echo "to reference one of these names in a tenant-writer file (e.g."
|
||||
echo "a deny-set definition or silent-strip list), add the file to"
|
||||
echo "EXEMPT_FILES with a one-line justification — reviewer signoff"
|
||||
echo "required."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "OK No tenant-writer-surface file co-mentions a repo-host token literal."
|
||||
@@ -4,7 +4,7 @@ name: Lint pre-flip continue-on-error
|
||||
# on any job in `.gitea/workflows/*.yml` WITHOUT proof that the affected
|
||||
# job's recent runs on the target branch (PR base) are actually green.
|
||||
#
|
||||
# Empirical class: PR #656 / mc#1982. PR #656 (RFC internal#219 Phase 4)
|
||||
# Empirical class: PR #656 / mc#774. PR #656 (RFC internal#219 Phase 4)
|
||||
# flipped 5 platform-build-class jobs `continue-on-error: true → false`
|
||||
# on the basis of a "verified green on main via combined-status check".
|
||||
# But that "green" was the LIE the prior `continue-on-error: true`
|
||||
@@ -13,7 +13,7 @@ name: Lint pre-flip continue-on-error
|
||||
# job-level status. The precondition the PR claimed to verify was
|
||||
# structurally fooled by the bug being flipped.
|
||||
#
|
||||
# mc#1982 captured the surfaced defects (2 mutually-masked regressions):
|
||||
# mc#774 captured the surfaced defects (2 mutually-masked regressions):
|
||||
# - Class 1: sqlmock helper drift since 2f36bb9a (24 days old)
|
||||
# - Class 2: OFFSEC-001 contract collision since 7d1a189f (1 day old)
|
||||
#
|
||||
@@ -55,7 +55,7 @@ name: Lint pre-flip continue-on-error
|
||||
# - YAML parse error in one of the workflow files: warn-only,
|
||||
# don't block — the YAML lint workflows catch this separately.
|
||||
#
|
||||
# Cross-links: PR#656, mc#1982, PR#665 (interim re-mask),
|
||||
# Cross-links: PR#656, mc#774, PR#665 (interim re-mask),
|
||||
# Quirk #10 (internal#342 + dup #287), hongming-pc2 charter
|
||||
# §SOP-N rule (e), feedback_strict_root_only_after_class_a,
|
||||
# feedback_no_shared_persona_token_use.
|
||||
@@ -99,8 +99,8 @@ jobs:
|
||||
timeout-minutes: 8
|
||||
# Phase 3 (RFC internal#219 §1): surface broken flips without blocking
|
||||
# the PR yet. Follow-up flips this to `false` once the workflow itself
|
||||
# has clean recent runs on main. mc#1982 interim — remove when CoE→false.
|
||||
continue-on-error: true # mc#1982
|
||||
# has clean recent runs on main. mc#774 interim — remove when CoE→false.
|
||||
continue-on-error: true # mc#774
|
||||
steps:
|
||||
- name: Check out PR head (full history for base-SHA access)
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
name: lint-required-context-exists-in-bp
|
||||
|
||||
# Tier 2g hard-gate lint (per mc#1982) — diff-based PR-time
|
||||
# Tier 2g hard-gate lint (per mc#774) — diff-based PR-time
|
||||
# check. When a PR adds a NEW commit-status emission (workflow YAML
|
||||
# `name:` + job `name:`-or-key + on:-event), the workflow file must
|
||||
# carry one of three directives adjacent to the new job:
|
||||
@@ -16,7 +16,7 @@ name: lint-required-context-exists-in-bp
|
||||
# PR#656 added `CI / all-required (pull_request)` as a sentinel
|
||||
# context that workflows emit, but BP did NOT list it. When
|
||||
# platform-build failed, all-required failed, but BP let the PR
|
||||
# merge anyway → cascade to mc#1982. With this lint, PR#656 would
|
||||
# merge anyway → cascade to mc#774. With this lint, PR#656 would
|
||||
# have been blocked until either the BP PATCH ran alongside OR
|
||||
# the author added a `bp-required: pending` directive.
|
||||
#
|
||||
@@ -27,7 +27,7 @@ name: lint-required-context-exists-in-bp
|
||||
# share the workflow-context enumeration helpers
|
||||
# (`_event_map`, `workflow_contexts`, `_job_display`) but the
|
||||
# semantics are intentionally distinct so they're separate scripts.
|
||||
# Co-design is documented in mc#1982.
|
||||
# Co-design is documented in mc#774.
|
||||
#
|
||||
# Directive comment lives in the workflow file (NOT PR body)
|
||||
# ----------------------------------------------------------
|
||||
@@ -42,13 +42,13 @@ name: lint-required-context-exists-in-bp
|
||||
# Lands at `continue-on-error: true` (Phase 3 — surface the
|
||||
# pattern without blocking PRs while the directive convention
|
||||
# beds in). After 7 days of clean runs on `main` with no false
|
||||
# positives, follow-up flips to `false`. Tracking: mc#1982.
|
||||
# positives, follow-up flips to `false`. Tracking: mc#774.
|
||||
#
|
||||
# Cross-links
|
||||
# -----------
|
||||
# - mc#1982 (the RFC that specs this lint)
|
||||
# - mc#774 (the RFC that specs this lint)
|
||||
# - PR#656 (the empirical case)
|
||||
# - mc#1982 (the surfaced cascade)
|
||||
# - mc#774 (the surfaced cascade)
|
||||
# - feedback_phantom_required_check_after_gitea_migration (Tier 2f cousin)
|
||||
# - feedback_behavior_based_ast_gates
|
||||
#
|
||||
@@ -81,10 +81,10 @@ jobs:
|
||||
name: lint-required-context-exists-in-bp
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 5
|
||||
# Phase 4 (RFC #219 §1): 22 days green since 2026-05-11 port,
|
||||
# well past the 7-clean-day threshold. PR-time failure is now
|
||||
# a hard CI signal.
|
||||
continue-on-error: false
|
||||
# Phase 3 (RFC #219 §1): surface the pattern without blocking PRs
|
||||
# while the directive convention beds in. Follow-up flip to false
|
||||
# after 7 clean days on main. mc#774.
|
||||
continue-on-error: true # mc#774 Phase 3 — flip to false after 7 clean main runs
|
||||
steps:
|
||||
- name: Check out PR head with full history (need base SHA blobs)
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
@@ -55,7 +55,7 @@ jobs:
|
||||
# Phase 3 (RFC #219 §1): surface broken shapes without blocking PRs.
|
||||
# Follow-up PR flips this off after the 4 existing-on-main rule-2
|
||||
# (workflow_run) violations are migrated to a supported trigger.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
@@ -67,7 +67,7 @@ jobs:
|
||||
# in this rollout (internal#462) so the precondition holds.
|
||||
runs-on: publish
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
@@ -234,7 +234,7 @@ jobs:
|
||||
name: Production auto-deploy
|
||||
needs: build-and-push
|
||||
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
|
||||
# Side-effect deploy only; image publish success is the durable artifact. mc#1982
|
||||
# Side-effect deploy only; image publish success is the durable artifact. mc#774
|
||||
continue-on-error: true
|
||||
# Publish/release lane (internal#462) — production deploy of a merged
|
||||
# fix; reserved capacity, never queued behind PR-CI.
|
||||
|
||||
@@ -9,22 +9,10 @@
|
||||
# Triggers on:
|
||||
# - `pull_request_target`: opened, synchronize, reopened
|
||||
# → initial status posts when PR opens / re-pushes
|
||||
# - `pull_request_review` types: [submitted]
|
||||
# → re-evaluate when a team member submits an APPROVE review so
|
||||
# the gate flips immediately (no wait for the next push or
|
||||
# slash-command). Verified live: sop-tier-check.yml uses this
|
||||
# same event and provably fires (produces
|
||||
# `sop-tier-check / tier-check (pull_request_review)` contexts).
|
||||
# The job-level `if:` guard checks
|
||||
# `github.event.review.state == 'APPROVED' || 'approved'` so
|
||||
# only APPROVE reviews run the evaluator; COMMENT and
|
||||
# REQUEST_CHANGES are skipped at the job level.
|
||||
# Branch-protection requires the `(pull_request_target)`
|
||||
# context variant, so the review-event path EXPLICITLY POSTS
|
||||
# the required context via the API. Trust boundary preserved
|
||||
# (BASE ref, no PR-head).
|
||||
# - comment refires are handled by `sop-checklist.yml` review-refire job
|
||||
# → `/qa-recheck` slash-command re-evaluates this gate.
|
||||
# - comment refires are handled by `review-refire-comments.yml`
|
||||
# → a single issue_comment dispatcher prevents every SOP/review
|
||||
# comment from enqueueing separate qa/security/tier jobs on
|
||||
# Gitea 1.22.6 before job-level `if:` can skip them.
|
||||
# Workflow name = `qa-review` ; job name = `approved`.
|
||||
# The job's own pass/fail conclusion publishes the status context
|
||||
# `qa-review / approved (<event>)` — NO `POST /statuses` call → NO
|
||||
@@ -97,26 +85,21 @@ name: qa-review
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened, synchronize, reopened]
|
||||
pull_request_review:
|
||||
types: [submitted]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: read
|
||||
statuses: write
|
||||
secrets: read
|
||||
|
||||
jobs:
|
||||
# bp-exempt: PR review bot signal; required merge state is enforced by CI / all-required.
|
||||
approved:
|
||||
# Gate the job:
|
||||
# - On pull_request_target events: always run.
|
||||
# - On pull_request_review_approved events: run so the gate flips
|
||||
# immediately when a team member submits an APPROVE review.
|
||||
# Comment-triggered refires live in sop-checklist.yml review-refire job.
|
||||
# Comment-triggered refires live in review-refire-comments.yml. Keeping
|
||||
# this workflow PR-only avoids comment-triggered queue storms.
|
||||
if: |
|
||||
github.event_name == 'pull_request_target' ||
|
||||
(github.event_name == 'pull_request_review' &&
|
||||
(github.event.review.state == 'APPROVED' || github.event.review.state == 'approved'))
|
||||
github.event_name == 'pull_request_target'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Privilege check (A1.1 — INFORMATIONAL log only, NOT a gate)
|
||||
@@ -160,7 +143,6 @@ jobs:
|
||||
ref: ${{ github.event.repository.default_branch }}
|
||||
|
||||
- name: Evaluate qa-review
|
||||
id: eval
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
GITEA_HOST: git.moleculesai.app
|
||||
@@ -175,66 +157,3 @@ jobs:
|
||||
REVIEW_CHECK_DEBUG: '0'
|
||||
REVIEW_CHECK_STRICT: '0'
|
||||
run: bash .gitea/scripts/review-check.sh
|
||||
|
||||
- name: Post required status context on pull_request_review
|
||||
# Gitea Actions auto-publishes (pull_request_review) context
|
||||
# for this event, but branch-protection requires (pull_request_target).
|
||||
# We explicitly POST the BP-required context so the gate flips.
|
||||
# Trust boundary: same BASE-ref script result, no PR-head code.
|
||||
#
|
||||
# TOKEN FIX (RC 8326): uses STATUS_POST_TOKEN (CTO-granted,
|
||||
# msg d52cc72a). Dedicated narrow-scoped write:repository token
|
||||
# for the explicit status POST. Evaluator step stays on
|
||||
# SOP_TIER_CHECK_TOKEN (read-only) per deliberate security
|
||||
# separation: eval computes, POST writes, never the same cred.
|
||||
if: github.event_name == 'pull_request_review' && always()
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.STATUS_POST_TOKEN }}
|
||||
GITEA_HOST: git.moleculesai.app
|
||||
REPO: ${{ github.repository }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number || github.event.issue.number }}
|
||||
EVAL_OUTCOME: ${{ steps.eval.outcome }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
authfile=$(mktemp)
|
||||
chmod 600 "$authfile"
|
||||
printf 'header = "Authorization: token %s"\n' "$GITEA_TOKEN" > "$authfile"
|
||||
|
||||
prfile=$(mktemp)
|
||||
code=$(curl -sS -o "$prfile" -w '%{http_code}' -K "$authfile" \
|
||||
"https://${GITEA_HOST}/api/v1/repos/${REPO}/pulls/${PR_NUMBER}")
|
||||
if [ "$code" != "200" ]; then
|
||||
echo "::error::GET /pulls/${PR_NUMBER} returned HTTP ${code}"
|
||||
rm -f "$prfile" "$authfile"
|
||||
exit 1
|
||||
fi
|
||||
head_sha=$(jq -r '.head.sha // ""' "$prfile")
|
||||
rm -f "$prfile"
|
||||
|
||||
if [ "$EVAL_OUTCOME" = "success" ]; then
|
||||
status_state="success"
|
||||
description="Approved via pull_request_review trigger"
|
||||
else
|
||||
status_state="failure"
|
||||
description="Review check failed via pull_request_review trigger"
|
||||
fi
|
||||
|
||||
body=$(jq -nc \
|
||||
--arg state "$status_state" \
|
||||
--arg context "qa-review / approved (pull_request_target)" \
|
||||
--arg description "$description" \
|
||||
'{state:$state, context:$context, description:$description}')
|
||||
|
||||
post_code=$(curl -sS -o /dev/null -w '%{http_code}' -X POST \
|
||||
-K "$authfile" -H "Content-Type: application/json" \
|
||||
-d "$body" \
|
||||
"https://${GITEA_HOST}/api/v1/repos/${REPO}/statuses/${head_sha}")
|
||||
|
||||
rm -f "$authfile"
|
||||
|
||||
if [ "$post_code" != "200" ] && [ "$post_code" != "201" ]; then
|
||||
echo "::error::POST /statuses/${head_sha} returned HTTP ${post_code}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "::notice::posted ${status_state} for context=\"qa-review / approved (pull_request_target)\" on sha=${head_sha}"
|
||||
|
||||
@@ -51,7 +51,7 @@ jobs:
|
||||
name: Audit Railway env vars for drift-prone pins
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 10
|
||||
|
||||
|
||||
@@ -73,7 +73,7 @@ jobs:
|
||||
# it never queues behind PR-CI. `publish` -> molecule-runner-publish-*.
|
||||
runs-on: publish
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 25
|
||||
env:
|
||||
|
||||
@@ -80,7 +80,7 @@ jobs:
|
||||
# `publish` -> molecule-runner-publish-* sub-pool.
|
||||
runs-on: publish
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 25
|
||||
steps:
|
||||
|
||||
@@ -54,7 +54,7 @@ jobs:
|
||||
# runners with internet access to package mirrors). Falls back to GitHub
|
||||
# binary download. GitHub releases may be blocked on some runner networks
|
||||
# (infra#241 follow-up).
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
run: |
|
||||
if apt-get update -qq && apt-get install -y -qq jq; then
|
||||
|
||||
@@ -57,7 +57,7 @@ jobs:
|
||||
name: Detect SECRET_PATTERNS drift
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 5
|
||||
steps:
|
||||
|
||||
@@ -6,44 +6,25 @@
|
||||
#
|
||||
# See `qa-review.yml` header for the full A1-α / A1.1 / A4 / A5 design
|
||||
# rationale; everything below is identical in shape.
|
||||
#
|
||||
# A1-α addendum (internal#760): review-event trigger added so the security
|
||||
# gate flips immediately when a team member submits an APPROVE review.
|
||||
# Uses `pull_request_review` types: [submitted] — verified live via
|
||||
# sop-tier-check.yml which provably fires this event (produces
|
||||
# `sop-tier-check / tier-check (pull_request_review)` contexts).
|
||||
# The job-level `if:` guard checks
|
||||
# `github.event.review.state == 'APPROVED' || 'approved'` so only APPROVE
|
||||
# reviews run the evaluator; COMMENT and REQUEST_CHANGES are skipped at
|
||||
# the job level. Branch-protection requires the `(pull_request_target)`
|
||||
# context variant, so the review-event path EXPLICITLY POSTS the required
|
||||
# context via the API. Trust boundary preserved (BASE ref, no PR-head).
|
||||
|
||||
name: security-review
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened, synchronize, reopened]
|
||||
pull_request_review:
|
||||
types: [submitted]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: read
|
||||
statuses: write
|
||||
secrets: read
|
||||
|
||||
jobs:
|
||||
# bp-exempt: PR security review bot signal; required merge state is enforced by CI / all-required.
|
||||
approved:
|
||||
# Gate the job:
|
||||
# - On pull_request_target events: always run.
|
||||
# - On pull_request_review_approved events: run so the gate flips
|
||||
# immediately when a team member submits an APPROVE review.
|
||||
# Comment-triggered refires live in sop-checklist.yml review-refire job.
|
||||
# Comment-triggered refires live in review-refire-comments.yml. Keeping
|
||||
# this workflow PR-only avoids comment-triggered queue storms.
|
||||
if: |
|
||||
github.event_name == 'pull_request_target' ||
|
||||
(github.event_name == 'pull_request_review' &&
|
||||
(github.event.review.state == 'APPROVED' || github.event.review.state == 'approved'))
|
||||
github.event_name == 'pull_request_target'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Privilege check (A1.1 — INFORMATIONAL log only, NOT a gate)
|
||||
@@ -76,7 +57,6 @@ jobs:
|
||||
ref: ${{ github.event.repository.default_branch }}
|
||||
|
||||
- name: Evaluate security-review
|
||||
id: eval
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
GITEA_HOST: git.moleculesai.app
|
||||
@@ -88,66 +68,3 @@ jobs:
|
||||
REVIEW_CHECK_DEBUG: '0'
|
||||
REVIEW_CHECK_STRICT: '0'
|
||||
run: bash .gitea/scripts/review-check.sh
|
||||
|
||||
- name: Post required status context on pull_request_review
|
||||
# Gitea Actions auto-publishes (pull_request_review) context
|
||||
# for this event, but branch-protection requires (pull_request_target).
|
||||
# We explicitly POST the BP-required context so the gate flips.
|
||||
# Trust boundary: same BASE-ref script result, no PR-head code.
|
||||
#
|
||||
# TOKEN FIX (RC 8326): uses STATUS_POST_TOKEN (CTO-granted,
|
||||
# msg d52cc72a). Dedicated narrow-scoped write:repository token
|
||||
# for the explicit status POST. Evaluator step stays on
|
||||
# SOP_TIER_CHECK_TOKEN (read-only) per deliberate security
|
||||
# separation: eval computes, POST writes, never the same cred.
|
||||
if: github.event_name == 'pull_request_review' && always()
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.STATUS_POST_TOKEN }}
|
||||
GITEA_HOST: git.moleculesai.app
|
||||
REPO: ${{ github.repository }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number || github.event.issue.number }}
|
||||
EVAL_OUTCOME: ${{ steps.eval.outcome }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
authfile=$(mktemp)
|
||||
chmod 600 "$authfile"
|
||||
printf 'header = "Authorization: token %s"\n' "$GITEA_TOKEN" > "$authfile"
|
||||
|
||||
prfile=$(mktemp)
|
||||
code=$(curl -sS -o "$prfile" -w '%{http_code}' -K "$authfile" \
|
||||
"https://${GITEA_HOST}/api/v1/repos/${REPO}/pulls/${PR_NUMBER}")
|
||||
if [ "$code" != "200" ]; then
|
||||
echo "::error::GET /pulls/${PR_NUMBER} returned HTTP ${code}"
|
||||
rm -f "$prfile" "$authfile"
|
||||
exit 1
|
||||
fi
|
||||
head_sha=$(jq -r '.head.sha // ""' "$prfile")
|
||||
rm -f "$prfile"
|
||||
|
||||
if [ "$EVAL_OUTCOME" = "success" ]; then
|
||||
status_state="success"
|
||||
description="Approved via pull_request_review trigger"
|
||||
else
|
||||
status_state="failure"
|
||||
description="Review check failed via pull_request_review trigger"
|
||||
fi
|
||||
|
||||
body=$(jq -nc \
|
||||
--arg state "$status_state" \
|
||||
--arg context "security-review / approved (pull_request_target)" \
|
||||
--arg description "$description" \
|
||||
'{state:$state, context:$context, description:$description}')
|
||||
|
||||
post_code=$(curl -sS -o /dev/null -w '%{http_code}' -X POST \
|
||||
-K "$authfile" -H "Content-Type: application/json" \
|
||||
-d "$body" \
|
||||
"https://${GITEA_HOST}/api/v1/repos/${REPO}/statuses/${head_sha}")
|
||||
|
||||
rm -f "$authfile"
|
||||
|
||||
if [ "$post_code" != "200" ] && [ "$post_code" != "201" ]; then
|
||||
echo "::error::POST /statuses/${head_sha} returned HTTP ${post_code}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "::notice::posted ${status_state} for context=\"security-review / approved (pull_request_target)\" on sha=${head_sha}"
|
||||
|
||||
@@ -179,10 +179,10 @@ jobs:
|
||||
- name: Refire qa-review status
|
||||
if: steps.classify.outputs.run_qa == 'true'
|
||||
env:
|
||||
# Evaluator (review-check.sh + GET /pulls) stays on read-scoped token.
|
||||
# RFC_324_TEAM_READ_TOKEN is read-only (team membership read scope only).
|
||||
# review-refire-status.sh POSTs to /statuses — requires write scope.
|
||||
# SOP_TIER_CHECK_TOKEN carries write:repository + write:issue + read:organization.
|
||||
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
# Explicit POST /statuses uses narrow-scoped write:repository token.
|
||||
STATUS_POST_TOKEN: ${{ secrets.STATUS_POST_TOKEN }}
|
||||
GITEA_HOST: git.moleculesai.app
|
||||
REPO: ${{ github.repository }}
|
||||
PR_NUMBER: ${{ github.event.issue.number }}
|
||||
@@ -198,10 +198,10 @@ jobs:
|
||||
- name: Refire security-review status
|
||||
if: steps.classify.outputs.run_security == 'true'
|
||||
env:
|
||||
# Evaluator (review-check.sh + GET /pulls) stays on read-scoped token.
|
||||
# RFC_324_TEAM_READ_TOKEN is read-only (team membership read scope only).
|
||||
# review-refire-status.sh POSTs to /statuses — requires write scope.
|
||||
# SOP_TIER_CHECK_TOKEN carries write:repository + write:issue + read:organization.
|
||||
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
# Explicit POST /statuses uses narrow-scoped write:repository token.
|
||||
STATUS_POST_TOKEN: ${{ secrets.STATUS_POST_TOKEN }}
|
||||
GITEA_HOST: git.moleculesai.app
|
||||
REPO: ${{ github.repository }}
|
||||
PR_NUMBER: ${{ github.event.issue.number }}
|
||||
|
||||
@@ -36,7 +36,7 @@
|
||||
# window closed. continue-on-error: true has been removed from the
|
||||
# tier-check job; AND-composition is now fully enforced. If you need
|
||||
# to temporarily re-introduce a mask, file a tracker and follow the
|
||||
# mc#1982 protocol (Tier 2e lint requires a current tracker within
|
||||
# mc#774 protocol (Tier 2e lint requires a current tracker within
|
||||
# 2 lines of any continue-on-error: true).
|
||||
|
||||
name: sop-tier-check
|
||||
@@ -92,7 +92,7 @@ jobs:
|
||||
# runners). The sop-tier-check script has its own fallback as a
|
||||
# third line of defense. continue-on-error: true ensures this step
|
||||
# failing does not block the job.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
run: |
|
||||
# apt-get is the primary method — Ubuntu package mirrors are reliably
|
||||
@@ -113,7 +113,7 @@ jobs:
|
||||
# continue-on-error: true at step level — job-level is ignored by Gitea
|
||||
# Actions (quirk #10, internal runbooks). Belt-and-suspenders with
|
||||
# SOP_FAIL_OPEN=1 + || true below.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
|
||||
@@ -90,7 +90,7 @@ jobs:
|
||||
staging-smoke:
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
outputs:
|
||||
sha: ${{ steps.compute.outputs.sha }}
|
||||
@@ -212,7 +212,7 @@ jobs:
|
||||
if: ${{ needs.staging-smoke.result == 'success' && needs.staging-smoke.outputs.smoke_ran == 'true' }}
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
env:
|
||||
SHA: ${{ needs.staging-smoke.outputs.sha }}
|
||||
|
||||
@@ -71,7 +71,7 @@ jobs:
|
||||
name: Sweep CF orphans
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
# 3 min surfaces hangs (CF API stall, AWS describe-instances stuck)
|
||||
# within one cron interval instead of burning a full tick. Realistic
|
||||
|
||||
@@ -55,7 +55,7 @@ jobs:
|
||||
name: Sweep CF tunnels
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
# 30 min cap. Was 5 min on the theory that the only thing that
|
||||
# could take >5min is a CF-API hang — but on 2026-05-02 a backlog
|
||||
|
||||
@@ -49,7 +49,7 @@ jobs:
|
||||
name: Ops scripts (unittest)
|
||||
runs-on: ubuntu-latest
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
@@ -35,26 +35,8 @@ name: verify-providers-gen
|
||||
on:
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
# CI-scheduler-overload fix (fix/ci-scheduler-fanout, 2026-06-01):
|
||||
# this gate only verifies that the generated providers artifact is in
|
||||
# sync with the schema SSOT. Its verdict can ONLY change when one of
|
||||
# the codegen inputs/outputs changes, so firing the Go toolchain on
|
||||
# every unrelated PR (docs, canvas, scripts) is pure fan-out cost.
|
||||
# Scoped to the codegen surface. SAFE because this workflow is NOT a
|
||||
# branch-protection status_check_context (see header §ENFORCEMENT
|
||||
# GATING) — lint-required-no-paths only forbids paths filters on
|
||||
# REQUIRED workflows; this is advisory, so a paths filter is allowed.
|
||||
# Mirrors the sibling sync-providers-yaml.yml scoping convention.
|
||||
paths:
|
||||
- 'workspace-server/internal/providers/**'
|
||||
- 'workspace-server/cmd/gen-providers/**'
|
||||
- '.gitea/workflows/verify-providers-gen.yml'
|
||||
push:
|
||||
branches: [main, staging]
|
||||
paths:
|
||||
- 'workspace-server/internal/providers/**'
|
||||
- 'workspace-server/cmd/gen-providers/**'
|
||||
- '.gitea/workflows/verify-providers-gen.yml'
|
||||
|
||||
env:
|
||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
||||
|
||||
@@ -31,7 +31,7 @@ jobs:
|
||||
name: Weekly Platform-Go Surface
|
||||
runs-on: ubuntu-latest
|
||||
# continue-on-error: surface only, never block
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
# mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
defaults:
|
||||
run:
|
||||
|
||||
@@ -49,8 +49,8 @@
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-core.git
|
||||
cd molecule-core
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-monorepo.git
|
||||
cd molecule-monorepo
|
||||
./scripts/dev-start.sh
|
||||
```
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ export default function PricingPage() {
|
||||
<p className="mt-2 text-ink-mid">
|
||||
We publish the{" "}
|
||||
<a
|
||||
href="https://git.moleculesai.app/molecule-ai/molecule-core"
|
||||
href="https://git.moleculesai.app/molecule-ai/molecule-monorepo"
|
||||
className="text-accent underline hover:text-accent"
|
||||
>
|
||||
full source on GitHub
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Molecule AI — Comprehensive Technical Documentation
|
||||
|
||||
> Definitive technical reference for the Molecule AI Agent Team platform.
|
||||
> Based on a full non-invasive scan of the [molecule-core](https://git.moleculesai.app/molecule-ai/molecule-core) repository.
|
||||
> Based on a full non-invasive scan of the [molecule-monorepo](https://git.moleculesai.app/molecule-ai/molecule-monorepo) repository.
|
||||
|
||||
---
|
||||
|
||||
@@ -1131,11 +1131,11 @@ Molecule AI's workspace abstraction is **runtime-agnostic by design**. A workspa
|
||||
|
||||
## Links
|
||||
|
||||
- **GitHub**: https://git.moleculesai.app/molecule-ai/molecule-core
|
||||
- **Architecture Docs**: https://git.moleculesai.app/molecule-ai/molecule-core/src/branch/main/docs/architecture
|
||||
- **API Protocol**: https://git.moleculesai.app/molecule-ai/molecule-core/src/branch/main/docs/api-protocol
|
||||
- **Agent Runtime**: https://git.moleculesai.app/molecule-ai/molecule-core/src/branch/main/docs/agent-runtime
|
||||
- **Product Docs**: https://git.moleculesai.app/molecule-ai/molecule-core/src/branch/main/docs/product
|
||||
- **Source repository**: https://git.moleculesai.app/molecule-ai/molecule-monorepo
|
||||
- **Architecture Docs**: https://git.moleculesai.app/molecule-ai/molecule-monorepo/src/branch/main/docs/architecture
|
||||
- **API Protocol**: https://git.moleculesai.app/molecule-ai/molecule-monorepo/src/branch/main/docs/api-protocol
|
||||
- **Agent Runtime**: https://git.moleculesai.app/molecule-ai/molecule-monorepo/src/branch/main/docs/agent-runtime
|
||||
- **Product Docs**: https://git.moleculesai.app/molecule-ai/molecule-monorepo/src/branch/main/docs/product
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -82,7 +82,7 @@ DATABASE_URL=postgres://dev:dev@postgres:5432/molecule?sslmode=prefer
|
||||
REDIS_URL=redis://redis:6379
|
||||
PORT=8080
|
||||
SECRETS_ENCRYPTION_KEY=dev-key-change-in-production
|
||||
WORKSPACE_DIR=/path/to/molecule-core # Optional global fallback; prefer per-workspace workspace_dir in org.yaml or API
|
||||
WORKSPACE_DIR=/path/to/molecule-monorepo # Optional global fallback; prefer per-workspace workspace_dir in org.yaml or API
|
||||
```
|
||||
|
||||
### Canvas (Next.js)
|
||||
|
||||
@@ -16,9 +16,11 @@ workspace container running on it) over an [EC2 Instance Connect
|
||||
Endpoint](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-connect-setup-ec2-instance-connect-endpoint.html).
|
||||
End users see a terminal; no direct public SSH ingress is required.
|
||||
|
||||
Tracking: originally `molecule-core#1528` (resolved 2026-04-22). Future
|
||||
terminal work is tracked in `molecule-core` issues (workspace-server scope)
|
||||
and in `molecule-controlplane` issues for the EIC / per-tenant SG path.
|
||||
Tracking: originally `molecule-core#1528` (resolved 2026-04-22). The
|
||||
`molecule-core` repo has since been renamed to `molecule-monorepo` and no
|
||||
longer accepts new issues under the old name; future terminal work is
|
||||
tracked in `molecule-monorepo` issues (workspace-server scope) and in
|
||||
`molecule-controlplane` issues for the EIC / per-tenant SG path.
|
||||
|
||||
## Where things are
|
||||
|
||||
|
||||
@@ -64,7 +64,7 @@ When opencode connects to the Molecule MCP endpoint, the agent gains access to:
|
||||
"tool": "delegate_task",
|
||||
"arguments": {
|
||||
"target": "research-lead",
|
||||
"task": "Summarise the last 7 days of commits in Molecule-AI/molecule-core"
|
||||
"task": "Summarise the last 7 days of commits in Molecule-AI/molecule-monorepo"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Internal content policy
|
||||
|
||||
The `Molecule-AI/molecule-core` repo is **public**. Anything internal
|
||||
The `Molecule-AI/molecule-monorepo` repo is **public**. Anything internal
|
||||
(positioning, competitive briefs, sales playbooks, PMM/press drip, draft
|
||||
campaigns, raw research notes, ops runbooks, retrospectives) lives in
|
||||
**`Molecule-AI/internal`**.
|
||||
@@ -18,14 +18,14 @@ This page is the canonical decision tree.
|
||||
| Draft campaign asset (still iterating, not yet customer-visible) | `Molecule-AI/internal/marketing/campaigns/` |
|
||||
| Roadmap discussion, planning doc, retrospective | `Molecule-AI/internal/PLAN.md` or `Molecule-AI/internal/retrospectives/` |
|
||||
| Runbook, ops procedure, incident postmortem | `Molecule-AI/internal/runbooks/` |
|
||||
| **Public-ready** blog post (final draft, ready to ship to docs site) | `Molecule-AI/molecule-core/docs/blog/` |
|
||||
| **Public-ready** tutorial / quickstart | `Molecule-AI/molecule-core/docs/tutorials/` |
|
||||
| Public DevRel content (code samples, demos for users) | `Molecule-AI/molecule-core/docs/devrel/` |
|
||||
| API reference, architecture docs for external developers | `Molecule-AI/molecule-core/docs/api/` |
|
||||
| **Public-ready** blog post (final draft, ready to ship to docs site) | `Molecule-AI/molecule-monorepo/docs/blog/` |
|
||||
| **Public-ready** tutorial / quickstart | `Molecule-AI/molecule-monorepo/docs/tutorials/` |
|
||||
| Public DevRel content (code samples, demos for users) | `Molecule-AI/molecule-monorepo/docs/devrel/` |
|
||||
| API reference, architecture docs for external developers | `Molecule-AI/molecule-monorepo/docs/api/` |
|
||||
| Code, tests, infrastructure | wherever is appropriate inside this repo |
|
||||
|
||||
**Rule of thumb:** *"Would I be comfortable if a competitor / journalist / customer
|
||||
read this verbatim today?"* — yes → `molecule-core/docs/`. No / not yet → `internal/`.
|
||||
read this verbatim today?"* — yes → `molecule-monorepo/docs/`. No / not yet → `internal/`.
|
||||
|
||||
## Why
|
||||
|
||||
@@ -82,7 +82,7 @@ git push -u origin HEAD
|
||||
gh pr create --base main --fill
|
||||
```
|
||||
|
||||
Yes, this is more steps than `cd molecule-core && git add research/foo.md`.
|
||||
Yes, this is more steps than `cd molecule-monorepo && git add research/foo.md`.
|
||||
That cost is intentional: the friction is the point. Public space and
|
||||
internal space are different products with different audiences and
|
||||
different durability guarantees.
|
||||
|
||||
+4
-4
@@ -17,8 +17,8 @@ This path is aligned to the current repository and current UI. It gets you from
|
||||
## The one-command path
|
||||
|
||||
```bash
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-core.git
|
||||
cd molecule-core
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-monorepo.git
|
||||
cd molecule-monorepo
|
||||
./scripts/dev-start.sh
|
||||
```
|
||||
|
||||
@@ -42,8 +42,8 @@ If you'd rather run each component yourself — useful when you're iterating on
|
||||
### Step 1: Clone the repository
|
||||
|
||||
```bash
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-core.git
|
||||
cd molecule-core
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-monorepo.git
|
||||
cd molecule-monorepo
|
||||
```
|
||||
|
||||
### Step 2: Start the shared infrastructure
|
||||
|
||||
@@ -1,124 +0,0 @@
|
||||
# Engineer-Agent Gitea Token Scope Runbook
|
||||
|
||||
## Symptom
|
||||
|
||||
Engineer-class agents (e.g. `agent-dev-a`, `agent-dev-b`) fail swarm-pull issue discovery or receive HTTP 403 when calling Gitea issue-list APIs, while PR review and repository API operations continue to work.
|
||||
|
||||
Typical failing call:
|
||||
```bash
|
||||
GET /api/v1/repos/molecule-ai/molecule-core/issues?state=open&labels=approved&limit=50
|
||||
# => 403 Forbidden
|
||||
```
|
||||
|
||||
Typical working calls (same token):
|
||||
```bash
|
||||
GET /api/v1/repos/molecule-ai/molecule-core/pulls?state=open&limit=50
|
||||
POST /api/v1/repos/molecule-ai/molecule-core/pulls/1666/comments
|
||||
# => 200 OK
|
||||
```
|
||||
|
||||
## Root Cause
|
||||
|
||||
Gitea v1.22.6 routes issue-list under the `Issue` scope category (`routers/api/v1/api.go:1379-1491`), while PR routes live under repository/pull routing (`api.go:1278-1305`). The scope gate derives required read/write level from HTTP method (`api.go:309-313`), so `GET /issues?...` requires `read:issue`.
|
||||
|
||||
Engineer-class agent PATs were provisioned with repository and PR scopes but without `read:issue`, causing the asymmetric 403.
|
||||
|
||||
## Detection
|
||||
|
||||
1. **Agent-side**: swarm-pull workflow logs show `403 Forbidden` on issue enumeration but not on PR list/review.
|
||||
2. **Platform-side**: Gitea access logs show `GET /repos/{owner}/{repo}/issues` returning 403 for the affected token.
|
||||
3. **Reproduction** (from any workspace with a suspected token):
|
||||
```bash
|
||||
TOKEN=$(cat /configs/secrets.d/GITEA_TOKEN)
|
||||
PLATFORM="https://git.moleculesai.app"
|
||||
|
||||
# Should succeed — confirms token is live
|
||||
curl -s -o /dev/null -w "%{http_code}" \
|
||||
-H "Authorization: token $TOKEN" \
|
||||
"$PLATFORM/api/v1/user"
|
||||
|
||||
# Will 403 if the token lacks read:issue
|
||||
curl -s -o /dev/null -w "%{http_code}" \
|
||||
-H "Authorization: token $TOKEN" \
|
||||
"$PLATFORM/api/v1/repos/molecule-ai/molecule-core/issues?state=open&limit=1"
|
||||
```
|
||||
|
||||
## Immediate Fix
|
||||
|
||||
### Step 1: Issue fresh PATs with correct scopes
|
||||
|
||||
From a Gitea site-admin account (or via the Gitea web UI → Settings → Applications):
|
||||
|
||||
1. Navigate to the affected user's profile (e.g. `agent-dev-a`).
|
||||
2. Go to **Settings → Applications → Generate New Token**.
|
||||
3. Select scopes:
|
||||
- `read:repository` (existing)
|
||||
- `write:repository` (existing, if push is required)
|
||||
- `read:issue` (**add this**)
|
||||
- `write:issue` (add only if agents must comment/edit issues)
|
||||
- `read:pull-request` / `write:pull-request` (existing)
|
||||
- `read:comment` / `write:comment` (existing, if PR review is required)
|
||||
4. Copy the plaintext token immediately — it is shown only once.
|
||||
|
||||
### Step 2: Update workspace secrets
|
||||
|
||||
For each affected engineer workspace, update the Gitea token secret:
|
||||
|
||||
```bash
|
||||
# Via the platform API (admin auth required)
|
||||
PLATFORM="https://agents-team.moleculesai.app"
|
||||
ADMIN_TOKEN="<your-admin-token>"
|
||||
WORKSPACE_ID="<affected-workspace-id>"
|
||||
NEW_GITEA_TOKEN="<fresh-token-from-step-1>"
|
||||
|
||||
curl -X POST "$PLATFORM/workspaces/$WORKSPACE_ID/secrets" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{
|
||||
\"GITEA_TOKEN\": \"$NEW_GITEA_TOKEN\"
|
||||
}"
|
||||
```
|
||||
|
||||
Restart the workspace so the runtime re-reads secrets:
|
||||
```bash
|
||||
curl -X POST "$PLATFORM/workspaces/$WORKSPACE_ID/restart" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN"
|
||||
```
|
||||
|
||||
### Step 3: Smoke-test
|
||||
|
||||
From the restarted workspace, verify all three paths:
|
||||
|
||||
```bash
|
||||
# 1. Issue list (the previously failing path)
|
||||
curl -s -H "Authorization: token $GITEA_TOKEN" \
|
||||
"https://git.moleculesai.app/api/v1/repos/molecule-ai/molecule-core/issues?state=open&labels=approved&limit=1" | jq '.[0].number'
|
||||
|
||||
# 2. PR list (should still work)
|
||||
curl -s -H "Authorization: token $GITEA_TOKEN" \
|
||||
"https://git.moleculesai.app/api/v1/repos/molecule-ai/molecule-core/pulls?state=open&limit=1" | jq '.[0].number'
|
||||
|
||||
# 3. Swarm-pull discovery (end-to-end)
|
||||
# Trigger the agent's autonomous tick or delegate a task that enumerates open issues.
|
||||
```
|
||||
|
||||
## Long-Term Fix
|
||||
|
||||
Update the **workspace secret injection path** that writes `/configs/secrets.d/GITEA_TOKEN` for engineer-class agents. The provisioning template or secret-distribution job should request `read:issue` (and optionally `write:issue`) at token-creation time.
|
||||
|
||||
File locations to audit:
|
||||
- `.gitea/scripts/` — any token-provisioning automation
|
||||
- `infra/terraform/` or equivalent — IAM/secret-manager templates
|
||||
- `workspace-configs-templates/` — engineer-class workspace templates that declare required secrets
|
||||
|
||||
## Prevention
|
||||
|
||||
1. **Token scope checklist**: when provisioning new engineer-class agent tokens, verify the scope set includes `read:issue` before distributing the secret.
|
||||
2. **Monitoring**: add an agent health-check that probes `GET /repos/molecule-ai/molecule-core/issues?limit=1` and surfaces a non-fatal warning if it returns 403.
|
||||
3. **Documentation**: update the onboarding runbook for new engineer agents to include the full required scope list.
|
||||
|
||||
## References
|
||||
|
||||
- Gitea issue #1750: [RCA: engineer-token read:issue scope gap blocks swarm-pull workflow](https://git.moleculesai.app/molecule-ai/molecule-core/issues/1750)
|
||||
- Gitea source: `routers/api/v1/api.go:309-313` (scope gate), `api.go:1278-1305` (PR routing), `api.go:1379-1491` (issue routing)
|
||||
- Related: PR #1542 (provisioner git-creds injection), PR #1669 (auth_token inline mint)
|
||||
@@ -93,7 +93,9 @@ def _gitea_get(path: str, params: dict[str, str] | None = None) -> bytes | None:
|
||||
try:
|
||||
# S310 (trust boundary): this function IS the outbound HTTP client for
|
||||
# Gitea API calls. The call is intentional and controlled — we build
|
||||
with urllib.request.urlopen(req, timeout=20) as resp: # noqa: S310 # explicit timeout + error handling; bandit false positive
|
||||
# the request ourselves and handle errors explicitly. Timeout=20s
|
||||
# prevents indefinite hangs.
|
||||
with urllib.request.urlopen(req, timeout=20) as resp: # noqa: S310
|
||||
return resp.read()
|
||||
except urllib.error.HTTPError as e:
|
||||
sys.stderr.write(f"Gitea API HTTP {e.code} on {path}: {e.reason}\n")
|
||||
|
||||
@@ -27,9 +27,9 @@ def smoke_imports_and_invariants() -> None:
|
||||
import-rewrite mistakes (the 0.1.16 incident, where main.py loaded but
|
||||
main_sync was missing because the build script dropped a re-export).
|
||||
"""
|
||||
from molecule_runtime.main import main_sync # noqa: F401 # smoke-test re-export regression (mc#1769)
|
||||
from molecule_runtime import a2a_client, a2a_tools # noqa: F401 # smoke-test re-export regression (mc#1769)
|
||||
from molecule_runtime.builtin_tools import memory # noqa: F401 # smoke-test re-export regression (mc#1769)
|
||||
from molecule_runtime.main import main_sync # noqa: F401
|
||||
from molecule_runtime import a2a_client, a2a_tools # noqa: F401
|
||||
from molecule_runtime.builtin_tools import memory # noqa: F401
|
||||
from molecule_runtime.adapters import get_adapter, BaseAdapter, AdapterConfig
|
||||
|
||||
# cli_main + mcp_cli.main are the molecule-mcp console-script entry
|
||||
@@ -38,8 +38,8 @@ def smoke_imports_and_invariants() -> None:
|
||||
# rewrite here would break every external operator's MCP install on
|
||||
# the next wheel publish. Pin both names because pyproject points
|
||||
# at mcp_cli.main, which then imports a2a_mcp_server.cli_main.
|
||||
from molecule_runtime.a2a_mcp_server import cli_main # noqa: F401 # smoke-test re-export regression (mc#1769)
|
||||
from molecule_runtime.mcp_cli import main as mcp_cli_main # noqa: F401 # smoke-test re-export regression (mc#1769)
|
||||
from molecule_runtime.a2a_mcp_server import cli_main # noqa: F401
|
||||
from molecule_runtime.mcp_cli import main as mcp_cli_main # noqa: F401
|
||||
assert callable(cli_main), "a2a_mcp_server.cli_main must be callable"
|
||||
assert callable(mcp_cli_main), "mcp_cli.main must be callable"
|
||||
|
||||
@@ -48,7 +48,7 @@ def smoke_imports_and_invariants() -> None:
|
||||
# imports + activates these at startup; if a wheel ships without
|
||||
# them, the standalone agent silently loses the wait_for_message /
|
||||
# inbox_peek / inbox_pop tools and reverts to outbound-only.
|
||||
from molecule_runtime.inbox import ( # noqa: F401 # smoke-test re-export regression (mc#1769)
|
||||
from molecule_runtime.inbox import ( # noqa: F401
|
||||
InboxState,
|
||||
activate as inbox_activate,
|
||||
get_state as inbox_get_state,
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
#
|
||||
# Invocation (from template-hermes repo's CI):
|
||||
#
|
||||
# bash /path/to/molecule-core/tools/check-template-parity.sh \
|
||||
# bash /path/to/molecule-monorepo/tools/check-template-parity.sh \
|
||||
# install.sh start.sh
|
||||
#
|
||||
# Or inline via curl:
|
||||
|
||||
@@ -349,17 +349,6 @@ func main() {
|
||||
codexauth.StartCodexAuthRefresher(c, db.DB)
|
||||
})
|
||||
|
||||
// RFC internal#742 Part 2: wire the boot-failure rescue capture into
|
||||
// the provision-timeout sweep's failure verdict. When the sweep flips
|
||||
// a stuck workspace to `failed`, this hook captures a forensic rescue
|
||||
// bundle off the still-running (but boot-failed) EC2 and ships it to
|
||||
// obs/Loki before the control plane reaps the instance. Best-effort +
|
||||
// non-blocking (handlers.BootFailureRescueHook dispatches on its own
|
||||
// goroutine + timeout). The handler-side boot-failure path
|
||||
// (WorkspaceHandler.BootstrapFailed) wires its own capture inline.
|
||||
registry.BootFailureRescueHook = handlers.BootFailureRescueHook
|
||||
|
||||
|
||||
// Provision-timeout sweep — flips workspaces that have been stuck in
|
||||
// status='provisioning' past the timeout window to 'failed' and emits
|
||||
// WORKSPACE_PROVISION_TIMEOUT. Without this the UI banner is cosmetic
|
||||
|
||||
@@ -1,114 +0,0 @@
|
||||
# Molecule Platform OpenAPI specs
|
||||
|
||||
This directory holds the machine-readable API contracts for the Molecule
|
||||
platform.
|
||||
|
||||
| File | Spec | Scope | Status |
|
||||
|------|------|-------|--------|
|
||||
| `management.yaml` | OpenAPI **3.1** | The **management surface** across both services (orgs, billing, admin, provisioning, workspaces, secrets, templates, org-tokens, bundles). | **SSOT** — hand-authored. |
|
||||
| `swagger.yaml` / `swagger.json` | OpenAPI 2.0 | swaggo-generated stub, `/schedules` only (the per-workspace **runtime** surface). | Legacy stub; superseded for management by `management.yaml`. |
|
||||
|
||||
`management.yaml` is the **single source of truth** the management tooling
|
||||
derives from — the management MCP server, the management CLI (`molecule-cli`),
|
||||
and the human-facing API docs (RFC #1706, the gap closed by
|
||||
`PLATFORM-MANAGEMENT-API.md` §5c). Do not hand-edit those clients' route maps;
|
||||
change them here and regenerate/derive.
|
||||
|
||||
## The two-service split
|
||||
|
||||
One structural fact drives the whole spec: there are **two services with two
|
||||
auth stacks**, and the management surface spans both.
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────┐
|
||||
browser / CLI / MCP │ Control plane (CP) │
|
||||
│ │ molecule-controlplane @ api.moleculesai │
|
||||
│ session │ /api/v1/* (stable) [+ /cp/* sunset] │
|
||||
├───────────────▶│ orgs · members · billing · provisioning │
|
||||
│ admin bearer │ · fleet/admin ops · pins │
|
||||
│ provision sec │ │
|
||||
└────────────────┴──────────────┬───────────────────────────┘
|
||||
│ edge reverse-proxy
|
||||
│ (subdomain / X-Molecule-Org-Slug)
|
||||
▼
|
||||
┌─────────────────────────────────────────┐
|
||||
Org API Key / ws tok │ Tenant workspace-server │
|
||||
│ │ molecule-core/workspace-server │
|
||||
└───────────────▶│ ONE EC2 per org @ <slug>.moleculesai.app│
|
||||
│ workspaces · secrets · templates · │
|
||||
│ org-tokens · bundles │
|
||||
└─────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
- **Control plane (CP)** — `api.moleculesai.app`, routes modelled under
|
||||
`/api/v1/*` (the `/cp/*` mirror is identical but sunset-headed per RFC #61 and
|
||||
is not duplicated in the spec). Owns **orgs, members, billing, provisioning,
|
||||
fleet/admin ops**.
|
||||
- **Tenant workspace-server** — one EC2 per org at `<slug>.moleculesai.app`.
|
||||
Owns **workspaces, agents, secrets, templates, org-tokens, bundles**. Requests
|
||||
may also be sent to the CP host with an `X-Molecule-Org-Slug` header; the CP
|
||||
edge reverse-proxies them to the tenant host (the `Authorization`,
|
||||
`X-Molecule-Org-*`, and cookie headers pass through unchanged and the tenant's
|
||||
own middleware validates them).
|
||||
|
||||
The key consequence, called out in `PLATFORM-MANAGEMENT-API.md`: **the Org API
|
||||
Key is a TENANT credential, not a CP one.** It is full tenant-admin over its own
|
||||
org's workspace-server surface and reaches **nothing** on the CP (org
|
||||
create/delete, billing, members, provisioning all reject it with 401/403). That is why
|
||||
member/billing tools belong in a separate CP-admin MCP, not the org-key-authed
|
||||
management MCP.
|
||||
|
||||
## Security scheme → surface map (the tier matrix)
|
||||
|
||||
`management.yaml` defines these `securitySchemes`; each operation declares the
|
||||
one(s) it accepts. Mirror of `PLATFORM-MANAGEMENT-API.md` §1:
|
||||
|
||||
| Scheme | What it is | Where it applies |
|
||||
|--------|-----------|------------------|
|
||||
| `workosSession` | WorkOS AuthKit session cookie `mcp_session` (+ org membership/ownership checks) | CP `/api/v1/orgs/*`, `/api/v1/billing/*`. Also accepted on the tenant surface via the CP-session path. |
|
||||
| `cpAdminBearer` | CP `CP_ADMIN_API_TOKEN` operator bearer (AdminGate, constant-time) | CP `/api/v1/admin/*` — admin-create-org, tenant teardown, workspace env, ListOrgWorkspaces, redeploy, pins. |
|
||||
| `provisionSecret` | CP `PROVISION_SHARED_SECRET` bearer | CP `/api/v1/workspaces/provision`, `…/status`. Routes unmounted when the secret is unset. |
|
||||
| `tenantAdminToken` | Per-tenant admin_token (+ `X-Molecule-Org-Id`) | CP `DELETE /api/v1/workspaces/:id` (deprovision) — **in addition to** `provisionSecret` (issue #118). |
|
||||
| `orgApiKey` | Tenant Org API Key — `Authorization: Bearer <key>` + routing header; full tenant-admin, self-minting | **All** tenant routes: `/workspaces[/:id]`, `/workspaces/:id/secrets`, budget, billing-mode, `/settings/secrets`, `/org/import`, `/org/templates`, `/org/tokens`, `/templates`, `/bundles`. |
|
||||
| `workspaceToken` | Per-workspace bearer, bound to one workspace id (+ routing header) | Read/lifecycle/secrets on a single `/workspaces/:id/*`. **Rejected** on admin list/create/delete when ADMIN_TOKEN is set — use `orgApiKey`. |
|
||||
| `orgRoutingHeaderId` / `orgRoutingHeaderSlug` | `X-Molecule-Org-Id` / `X-Molecule-Org-Slug` | Required on every tenant-host request so the edge / TenantGuard route + authorize against the correct org. Send one of them alongside the bearer. |
|
||||
|
||||
### Guards worth knowing (modelled per-operation)
|
||||
|
||||
- **Dry-run:** `POST /api/v1/admin/orgs?dry_run=true` — validate + echo, no org
|
||||
created. (The only dry-run on the whole management API.)
|
||||
- **Confirm token:** `DELETE /api/v1/admin/tenants/:slug` and
|
||||
`…/scrub-artifacts` — body `confirm` MUST equal the URL slug, else `400`
|
||||
before any teardown.
|
||||
- **Force flag:** `POST /api/v1/admin/workspaces/:id/env` — keys matching the
|
||||
secret-keyword guard (`TOKEN`/`SECRET`/`KEY`/`PASSWORD`) require `force=true`.
|
||||
- **Runtime-pin gate:** `POST /api/v1/workspaces/provision` returns `422
|
||||
RUNTIME_PIN_MISSING` when no runtime image pin exists.
|
||||
- **Auto-restart side-effects:** writing a workspace or global secret
|
||||
auto-restarts the affected workspace(s).
|
||||
|
||||
## Security note (carried from the synthesis spec)
|
||||
|
||||
The Org API Key is **full tenant-admin and self-minting** — a management MCP
|
||||
holding one holds tenant root. There is no scope-down today (TODO in
|
||||
`orgtoken`). Per-role / per-workspace scoping should ship alongside the
|
||||
management MCP.
|
||||
|
||||
## Validate
|
||||
|
||||
```bash
|
||||
cd workspace-server/docs/openapi
|
||||
npx @redocly/cli lint management.yaml # must be clean (0 errors, 0 warnings)
|
||||
```
|
||||
|
||||
## Scope notes / best-effort flags
|
||||
|
||||
- The per-workspace **runtime** surface (schedules, agent, registry, a2a,
|
||||
memory, approvals, channels, terminal, files) is intentionally **out of
|
||||
scope** here — that's the runtime contract, not management.
|
||||
- A handful of bodies are **best-effort** from the handlers (org-import inline
|
||||
template, bundle import, list responses with open shapes) and are marked with
|
||||
`additionalProperties: true` in the schema. Tighten as the handler structs
|
||||
stabilise.
|
||||
- `/cp/*` deprecated mirrors are omitted (identical shapes; RFC #61
|
||||
Deprecation/Sunset). Build against `/api/v1/*`.
|
||||
File diff suppressed because it is too large
Load Diff
@@ -246,6 +246,20 @@ func MarkQueueItemFailed(ctx context.Context, id, errMsg string) {
|
||||
}
|
||||
}
|
||||
|
||||
// QueueDepth returns the number of currently-queued (not dispatched/completed)
|
||||
// items for a workspace. Used by the busy-return response body so callers
|
||||
// can see how many items are ahead of them.
|
||||
func QueueDepth(ctx context.Context, workspaceID string) int {
|
||||
var n int
|
||||
if err := db.DB.QueryRowContext(ctx,
|
||||
`SELECT COUNT(*) FROM a2a_queue WHERE workspace_id = $1 AND status = 'queued'`,
|
||||
workspaceID,
|
||||
).Scan(&n); err != nil {
|
||||
log.Printf("A2AQueue: QueueDepth query failed for workspace %s: %v", workspaceID, err)
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// DropStaleQueueItems marks queued items older than maxAge as 'dropped' with a
|
||||
// system-generated reason so PM agents stop processing stale post-incident noise.
|
||||
// Called with a workspaceID to scope cleanup to one workspace, or empty to sweep
|
||||
|
||||
@@ -243,12 +243,7 @@ func (h *AdminSchedulesHealthHandler) ReapOrphans(c *gin.Context) {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "re-point failed"})
|
||||
return
|
||||
}
|
||||
repointedN, err := repointed.RowsAffected()
|
||||
if err != nil {
|
||||
log.Printf("ReapOrphans: repointed rows affected: %v", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "re-point failed"})
|
||||
return
|
||||
}
|
||||
repointedN, _ := repointed.RowsAffected()
|
||||
|
||||
// 2. Disable any remaining schedules still bound to a removed/missing
|
||||
// workspace (no live successor, or template schedules on a dead row).
|
||||
@@ -266,12 +261,7 @@ func (h *AdminSchedulesHealthHandler) ReapOrphans(c *gin.Context) {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "disable failed"})
|
||||
return
|
||||
}
|
||||
disabledN, err := disabled.RowsAffected()
|
||||
if err != nil {
|
||||
log.Printf("ReapOrphans: disabled rows affected: %v", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "disable failed"})
|
||||
return
|
||||
}
|
||||
disabledN, _ := disabled.RowsAffected()
|
||||
|
||||
log.Printf("ReapOrphans: re-pointed %d, disabled %d orphaned schedule(s)", repointedN, disabledN)
|
||||
c.JSON(http.StatusOK, gin.H{"repointed": repointedN, "disabled": disabledN})
|
||||
|
||||
@@ -196,15 +196,10 @@ func resolveWorkspaceForwardCreds(c *gin.Context, ctx context.Context, workspace
|
||||
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace url not registered yet"})
|
||||
return "", "", false
|
||||
}
|
||||
// Defense-in-depth for #2316: workspaces.url is validated at
|
||||
// registration time, but the DB row can be stale/tampered and the
|
||||
// SSRF policy can tighten. Re-validate immediately before attaching
|
||||
// the inbound secret to an outbound forward.
|
||||
if err := isSafeURL(wsURL); err != nil {
|
||||
log.Printf("chat_files %s: unsafe workspace URL for %s rejected: %v", op, workspaceID, err)
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "workspace URL not allowed"})
|
||||
return "", "", false
|
||||
}
|
||||
// Trust note: workspaces.url passes validateAgentURL at /registry/
|
||||
// register write time, blocking SSRF-shaped URLs. We rely on that
|
||||
// upstream gate rather than re-validating here. Tracked at #2316
|
||||
// for follow-up: forward-time re-validation as defense-in-depth.
|
||||
|
||||
secret, healed, err := readOrLazyHealInboundSecret(ctx, workspaceID, "chat_files "+op)
|
||||
if err != nil {
|
||||
|
||||
@@ -414,56 +414,6 @@ func TestChatUpload_WorkspaceUnreachable(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestChatUpload_RejectsMetadataWorkspaceURL(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
restore := setSSRFCheckForTest(true)
|
||||
t.Cleanup(restore)
|
||||
|
||||
wsID := "00000000-0000-0000-0000-000000000047"
|
||||
expectURL(mock, wsID, "http://169.254.169.254/latest/meta-data")
|
||||
|
||||
h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil, nil))
|
||||
body, ct := uploadFixture(t)
|
||||
c, w := makeUploadRequest(t, wsID, body, ct)
|
||||
h.Upload(c)
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Fatalf("expected 400 for metadata workspace URL, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if !strings.Contains(w.Body.String(), "workspace URL not allowed") {
|
||||
t.Errorf("expected unsafe URL error, got: %s", w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations not met: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChatUpload_RejectsNonHTTPWorkspaceURL(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
restore := setSSRFCheckForTest(true)
|
||||
t.Cleanup(restore)
|
||||
|
||||
wsID := "00000000-0000-0000-0000-000000000048"
|
||||
expectURL(mock, wsID, "file:///etc/passwd")
|
||||
|
||||
h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil, nil))
|
||||
body, ct := uploadFixture(t)
|
||||
c, w := makeUploadRequest(t, wsID, body, ct)
|
||||
h.Upload(c)
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Fatalf("expected 400 for non-HTTP workspace URL, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if !strings.Contains(w.Body.String(), "workspace URL not allowed") {
|
||||
t.Errorf("expected unsafe URL error, got: %s", w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations not met: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestChatUpload_BodyUnderCap_Forwards pins the lower edge of the new
|
||||
// 100 MB body cap (CTO 2026-05-19 directive on forensic a99ab0a1).
|
||||
// A multipart payload comfortably under the cap must reach the
|
||||
@@ -696,54 +646,6 @@ func TestChatDownload_NoInboundSecret_LazyHealFailure(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestChatDownload_RejectsMetadataWorkspaceURL(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
restore := setSSRFCheckForTest(true)
|
||||
t.Cleanup(restore)
|
||||
|
||||
wsID := "00000000-0000-0000-0000-000000000054"
|
||||
expectURL(mock, wsID, "http://169.254.169.254/latest/meta-data")
|
||||
|
||||
h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil, nil))
|
||||
c, w := makeDownloadRequest(t, wsID, "/workspace/foo.txt")
|
||||
h.Download(c)
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Fatalf("expected 400 for metadata workspace URL, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if !strings.Contains(w.Body.String(), "workspace URL not allowed") {
|
||||
t.Errorf("expected unsafe URL error, got: %s", w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations not met: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChatDownload_RejectsNonHTTPWorkspaceURL(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
restore := setSSRFCheckForTest(true)
|
||||
t.Cleanup(restore)
|
||||
|
||||
wsID := "00000000-0000-0000-0000-000000000055"
|
||||
expectURL(mock, wsID, "file:///etc/passwd")
|
||||
|
||||
h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil, nil))
|
||||
c, w := makeDownloadRequest(t, wsID, "/workspace/foo.txt")
|
||||
h.Download(c)
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Fatalf("expected 400 for non-HTTP workspace URL, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if !strings.Contains(w.Body.String(), "workspace URL not allowed") {
|
||||
t.Errorf("expected unsafe URL error, got: %s", w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations not met: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChatDownload_ForwardsToWorkspace_HappyPath(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
@@ -294,9 +294,8 @@ func TestProxyA2A_CrossTenant_RoutingDenied(t *testing.T) {
|
||||
// A URL exists for the target; the guard must deny BEFORE it is used.
|
||||
mr.Set(fmt.Sprintf("ws:%s:url", target), "http://localhost:1")
|
||||
|
||||
// Post-#1955: CanCommunicate no longer has the root-sibling bypass.
|
||||
// Both root-level (parent_id NULL) but unrelated org roots → hierarchy
|
||||
// check DENIES with 403 BEFORE the org-scope guard or resolveAgentURL.
|
||||
// CanCommunicate: both root-level (parent_id NULL) → its weak "root-level
|
||||
// siblings" rule ALLOWS this. The org guard must catch it afterward.
|
||||
mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id = ").
|
||||
WithArgs(caller).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow(caller, nil))
|
||||
@@ -304,6 +303,15 @@ func TestProxyA2A_CrossTenant_RoutingDenied(t *testing.T) {
|
||||
WithArgs(target).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow(target, nil))
|
||||
|
||||
// #1953 org-scope guard: caller resolves to org-a-root, target to org-b-root
|
||||
// → different orgs → 403. (Each org root resolves to itself.)
|
||||
mock.ExpectQuery("WITH RECURSIVE org_chain AS").
|
||||
WithArgs(caller).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"root_id"}).AddRow(caller))
|
||||
mock.ExpectQuery("WITH RECURSIVE org_chain AS").
|
||||
WithArgs(target).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"root_id"}).AddRow(target))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: target}}
|
||||
@@ -321,8 +329,8 @@ func TestProxyA2A_CrossTenant_RoutingDenied(t *testing.T) {
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("body not JSON: %v", err)
|
||||
}
|
||||
if msg, _ := resp["error"].(string); !strings.Contains(msg, "cannot communicate") {
|
||||
t.Errorf("expected hierarchy denial message, got %v", resp["error"])
|
||||
if msg, _ := resp["error"].(string); !strings.Contains(msg, "different org") {
|
||||
t.Errorf("expected cross-org denial message, got %v", resp["error"])
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
|
||||
@@ -55,7 +55,6 @@ import (
|
||||
const integrationTestDelegationID = "del-159-test-integration"
|
||||
const integrationTestSourceID = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
|
||||
const integrationTestTargetID = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb"
|
||||
const integrationTestParentID = "cccccccc-cccc-cccc-cccc-cccccccccccc"
|
||||
|
||||
// rawHTTPServer starts a TCP listener, serves one HTTP response, and closes.
|
||||
// It runs in a background goroutine so the test can proceed immediately after
|
||||
|
||||
@@ -43,8 +43,6 @@ func TestWorkspaceCreate_WithParentID(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("INSERT INTO structure_events").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("INSERT INTO workspace_auth_tokens").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -81,8 +79,6 @@ func TestWorkspaceCreate_ExplicitClaudeCodeRuntime(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("INSERT INTO structure_events").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("INSERT INTO workspace_auth_tokens").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -304,8 +300,6 @@ func TestWorkspaceCreate_MaxConcurrentTasksOverride(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("INSERT INTO structure_events").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("INSERT INTO workspace_auth_tokens").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -582,14 +576,13 @@ func TestDiscover_TargetOffline(t *testing.T) {
|
||||
setupTestRedis(t)
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
// Share a parent so communication is allowed under post-#1955 rules
|
||||
sharedParent := "ws-parent"
|
||||
// Both root-level, access allowed
|
||||
mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id =").
|
||||
WithArgs("ws-caller").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-caller", sharedParent))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-caller", nil))
|
||||
mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id =").
|
||||
WithArgs("ws-off").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-off", sharedParent))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-off", nil))
|
||||
|
||||
// Name + runtime lookup (discovery now queries both)
|
||||
mock.ExpectQuery("SELECT COALESCE").
|
||||
@@ -629,14 +622,13 @@ func TestCheckAccess_SiblingsAllowed(t *testing.T) {
|
||||
setupTestRedis(t)
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
// Share a parent so communication is allowed under post-#1955 rules
|
||||
sharedParent := "ws-parent"
|
||||
// Both root-level siblings → allowed
|
||||
mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id =").
|
||||
WithArgs("ws-a").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-a", sharedParent))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-a", nil))
|
||||
mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id =").
|
||||
WithArgs("ws-b").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-b", sharedParent))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-b", nil))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
|
||||
@@ -374,14 +374,14 @@ func TestExtended_DiscoverWithCallerID(t *testing.T) {
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
// CanCommunicate needs to look up both workspaces
|
||||
// Share a parent so communication is allowed under post-#1955 rules
|
||||
sharedParent := "ws-parent"
|
||||
// Caller: root-level (no parent)
|
||||
mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id =").
|
||||
WithArgs("ws-caller").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-caller", sharedParent))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-caller", nil))
|
||||
// Target: also root-level (no parent) — root-level siblings are allowed
|
||||
mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id =").
|
||||
WithArgs("ws-target").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-target", sharedParent))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-target", nil))
|
||||
|
||||
// Discover handler looks up workspace name + runtime
|
||||
mock.ExpectQuery("SELECT COALESCE").
|
||||
@@ -515,14 +515,13 @@ func TestExtended_CheckAccess(t *testing.T) {
|
||||
handler := NewDiscoveryHandler()
|
||||
|
||||
// CanCommunicate will look up both workspaces
|
||||
// Share a parent so communication is allowed under post-#1955 rules
|
||||
sharedParent := "ws-parent"
|
||||
// Both root-level — should be allowed
|
||||
mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id =").
|
||||
WithArgs("ws-a").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-a", sharedParent))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-a", nil))
|
||||
mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id =").
|
||||
WithArgs("ws-b").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-b", sharedParent))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-b", nil))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
|
||||
@@ -386,8 +386,6 @@ func TestWorkspaceCreate(t *testing.T) {
|
||||
// Expect RecordAndBroadcast INSERT for WORKSPACE_PROVISIONING
|
||||
mock.ExpectExec("INSERT INTO structure_events").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("INSERT INTO workspace_auth_tokens").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -424,76 +422,6 @@ func TestWorkspaceCreate(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestWorkspaceCreate_ReturnsAuthToken_201 pins the inline-auth_token
|
||||
// behaviour added for #1644. Pre-fix, the 201 response was
|
||||
// {id, status, awareness_namespace, workspace_access} — callers had to
|
||||
// make a separate POST to /admin/workspaces/:id/tokens (AdminAuth-gated,
|
||||
// path-prefix differs in CP-admin deploys) OR fall back to the dev-only
|
||||
// GET /admin/workspaces/:id/test-token (deliberately 404s on
|
||||
// MOLECULE_ENV=production per feedback_no_dev_only_routes_in_e2e).
|
||||
//
|
||||
// Post-fix: every Create response includes an `auth_token` field with
|
||||
// the freshly-minted plaintext bearer (returned once, never recoverable).
|
||||
// This is the SSOT path — production E2E + canvas + org_import all
|
||||
// get the bearer they need in the same round trip.
|
||||
//
|
||||
// Failure path is non-fatal: if the IssueToken DB call fails, the 201
|
||||
// still goes out without auth_token + a fallback log line. That branch
|
||||
// is exercised by sqlmock returning a non-INSERT-INTO-workspace_auth_tokens
|
||||
// path here — the test asserts presence on the happy path.
|
||||
func TestWorkspaceCreate_ReturnsAuthToken_201(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", "/tmp/configs")
|
||||
|
||||
mock.ExpectBegin()
|
||||
mock.ExpectExec("INSERT INTO workspaces").
|
||||
WithArgs(sqlmock.AnyArg(), "Token Holder", nil, 3, "claude-code", (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectCommit()
|
||||
mock.ExpectExec("INSERT INTO canvas_layouts").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("INSERT INTO structure_events").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
// The inline mint added in #1644 Part B — wsauth.IssueToken issues
|
||||
// a new bearer via INSERT INTO workspace_auth_tokens (workspace_id,
|
||||
// token_hash, prefix). This is the assertion that the new code path
|
||||
// reaches the DB.
|
||||
mock.ExpectExec("INSERT INTO workspace_auth_tokens").
|
||||
WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
body := `{"name":"Token Holder","model":"anthropic:claude-opus-4-7"}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.Create(c)
|
||||
|
||||
if w.Code != http.StatusCreated {
|
||||
t.Fatalf("expected 201, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
var resp map[string]interface{}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("parse response: %v", err)
|
||||
}
|
||||
tok, ok := resp["auth_token"].(string)
|
||||
if !ok || tok == "" {
|
||||
t.Fatalf("expected non-empty auth_token in 201 response (the #1644 SSOT inline mint), got: %s", w.Body.String())
|
||||
}
|
||||
// Sanity: tokens are base64-RawURL encoded 32-byte payloads (per
|
||||
// wsauth/tokens.go::tokenPayloadBytes), so a meaningful lower bound
|
||||
// is ~40 chars. If this fails, IssueToken's contract drifted.
|
||||
if len(tok) < 40 {
|
||||
t.Errorf("auth_token suspiciously short (%d chars) — wsauth.IssueToken contract drift?", len(tok))
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations — inline mint path may have skipped IssueToken: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildProvisionerConfig_WorkspacePathFromPayload(t *testing.T) {
|
||||
setupTestDB(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
|
||||
@@ -378,7 +378,7 @@ func readWorkspaceDeriveInputs(ctx context.Context, workspaceID string) (runtime
|
||||
}
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
log.Printf("llm_billing_mode: read secrets rows error for %s: %v (deriving with partial model/auth-env)", workspaceID, err)
|
||||
log.Printf("llm_billing_mode: rows iteration error for %s: %v (deriving with partial model/auth-env)", workspaceID, err)
|
||||
}
|
||||
return runtime, model, availableAuthEnv
|
||||
}
|
||||
@@ -456,10 +456,7 @@ func SetWorkspaceLLMBillingMode(ctx context.Context, workspaceID, mode string) e
|
||||
if err != nil {
|
||||
return fmt.Errorf("clear workspace llm_billing_mode for %s: %w", workspaceID, err)
|
||||
}
|
||||
n, err := res.RowsAffected()
|
||||
if err != nil {
|
||||
return fmt.Errorf("clear workspace llm_billing_mode rows affected %s: %w", workspaceID, err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
if n == 0 {
|
||||
return sql.ErrNoRows
|
||||
}
|
||||
@@ -476,10 +473,7 @@ func SetWorkspaceLLMBillingMode(ctx context.Context, workspaceID, mode string) e
|
||||
if err != nil {
|
||||
return fmt.Errorf("set workspace llm_billing_mode for %s: %w", workspaceID, err)
|
||||
}
|
||||
n, err := res.RowsAffected()
|
||||
if err != nil {
|
||||
return fmt.Errorf("set workspace llm_billing_mode rows affected %s: %w", workspaceID, err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
if n == 0 {
|
||||
return sql.ErrNoRows
|
||||
}
|
||||
|
||||
@@ -372,78 +372,3 @@ func TestApplyPlatformManagedLLMEnv_WorkspaceOriginCredExemptFromStrip(t *testin
|
||||
t.Errorf("sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyPlatformManagedLLMEnv_MissingProxyEnvFailClosed is the #2162
|
||||
// regression guard. A platform-managed workspace whose CP proxy env is absent
|
||||
// must NOT start credential-less. The empty-proxy path must return
|
||||
// HasUsableLLMCred=false so the caller aborts with MISSING_PLATFORM_PROXY.
|
||||
//
|
||||
// Mutation: revert the early-return from HasUsableLLMCred=false to true
|
||||
// → workspace starts with zero credential → "container started but never
|
||||
// called /registry/register" (600s provision-timeout sweep) → this test RED.
|
||||
func TestApplyPlatformManagedLLMEnv_MissingProxyEnvFailClosed(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
const wsID = "29b95be9-811e-4857-be36-1dafdbf4f697" // adk-demo failure workspace
|
||||
|
||||
mock := setupTestDB(t)
|
||||
expectOverrideQuery(mock, wsID, "")
|
||||
|
||||
// No proxy env present — simulates the boot-race / misconfig path.
|
||||
envVars := map[string]string{}
|
||||
res := applyPlatformManagedLLMEnv(ctx, envVars, wsID, "claude-code", "moonshot/kimi-k2.6", nil)
|
||||
|
||||
if res.ResolvedMode != LLMBillingModePlatformManaged {
|
||||
t.Fatalf("platform-managed model must stay platform_managed, got %q (source=%s)", res.ResolvedMode, res.Source)
|
||||
}
|
||||
// THE FIX: must NOT report usable credential when none was injected.
|
||||
if res.HasUsableLLMCred {
|
||||
t.Fatalf("empty proxy env → HasUsableLLMCred must be false (fail-closed), got true — the #2162 dark-wedge class")
|
||||
}
|
||||
// No credential env must be present.
|
||||
if _, present := envVars["ANTHROPIC_API_KEY"]; present {
|
||||
t.Errorf("empty proxy env must NOT inject ANTHROPIC_API_KEY")
|
||||
}
|
||||
if _, present := envVars["MOLECULE_LLM_USAGE_TOKEN"]; present {
|
||||
t.Errorf("empty proxy env must NOT inject MOLECULE_LLM_USAGE_TOKEN")
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyPlatformManagedLLMEnv_ProxyEnvPresentInjectsCredential is the
|
||||
// positive-path pair to the #2162 regression guard: when the CP proxy env IS
|
||||
// present, the platform-managed path must inject ANTHROPIC_API_KEY +
|
||||
// ANTHROPIC_BASE_URL for an Anthropic-native runtime and report
|
||||
// HasUsableLLMCred=true.
|
||||
func TestApplyPlatformManagedLLMEnv_ProxyEnvPresentInjectsCredential(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
const wsID = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
|
||||
|
||||
mock := setupTestDB(t)
|
||||
expectOverrideQuery(mock, wsID, "")
|
||||
|
||||
envVars := map[string]string{}
|
||||
// Simulate the CP proxy env being present (as it is in production).
|
||||
t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.moleculesai.app/api/v1/internal/llm/openai/v1")
|
||||
t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "https://api.moleculesai.app/api/v1/internal/llm/anthropic/v1")
|
||||
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "PLATFORM-PROXY-TOKEN")
|
||||
|
||||
res := applyPlatformManagedLLMEnv(ctx, envVars, wsID, "claude-code", "moonshot/kimi-k2.6", nil)
|
||||
|
||||
if res.ResolvedMode != LLMBillingModePlatformManaged {
|
||||
t.Fatalf("expected platform_managed, got %q", res.ResolvedMode)
|
||||
}
|
||||
if !res.HasUsableLLMCred {
|
||||
t.Fatalf("proxy env present → HasUsableLLMCred must be true, got false")
|
||||
}
|
||||
if envVars["ANTHROPIC_API_KEY"] != "PLATFORM-PROXY-TOKEN" {
|
||||
t.Errorf("ANTHROPIC_API_KEY must be injected with the platform proxy token; got %q", envVars["ANTHROPIC_API_KEY"])
|
||||
}
|
||||
if envVars["ANTHROPIC_BASE_URL"] != "https://api.moleculesai.app/api/v1/internal/llm/anthropic/v1" {
|
||||
t.Errorf("ANTHROPIC_BASE_URL must be injected with the platform anthropic proxy; got %q", envVars["ANTHROPIC_BASE_URL"])
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -750,12 +750,7 @@ func (h *OrgHandler) migrateRuntimeSchedulesFromRemovedPredecessor(ctx context.C
|
||||
log.Printf("Org import: schedule migration %s -> %s (%q) failed: %v", predID, newID, name, err)
|
||||
return
|
||||
}
|
||||
n, err := res.RowsAffected()
|
||||
if err != nil {
|
||||
log.Printf("Org import: schedule migration rows affected %s -> %s: %v", predID, newID, err)
|
||||
return
|
||||
}
|
||||
if n > 0 {
|
||||
if n, _ := res.RowsAffected(); n > 0 {
|
||||
log.Printf("Org import: migrated %d runtime schedule(s) from removed predecessor %s to new workspace %s (%q)", n, predID, newID, name)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -141,7 +141,7 @@ func requireCallerOwnsOrg(c *gin.Context) (string, error) {
|
||||
orgID, err := orgtoken.OrgIDByTokenID(c.Request.Context(), db.DB, tokID)
|
||||
if err != nil {
|
||||
// DB error — deny by default rather than risk cross-org access.
|
||||
return "", fmt.Errorf("allowlist: requireCallerOwnsOrg: %w", err)
|
||||
return "", fmt.Errorf("allowlist: requireCallerOwnsOrg: %v", err)
|
||||
}
|
||||
return orgID, nil
|
||||
}
|
||||
|
||||
@@ -1,191 +0,0 @@
|
||||
package handlers
|
||||
|
||||
// Sqlmock-backed coverage for org_scope.go (orgRootID + sameOrg).
|
||||
// Security-critical path — cross-tenant isolation (#1953).
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"testing"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
|
||||
)
|
||||
|
||||
// ---------- orgRootID ----------
|
||||
|
||||
func TestOrgRootID_HappyPath_NonRoot(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
// CTE walks: ws-child → ws-parent → org-root (parent_id IS NULL)
|
||||
mock.ExpectQuery(`WITH RECURSIVE org_chain`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"root_id"}).AddRow(wsUUID3))
|
||||
|
||||
root, err := orgRootID(context.Background(), db.DB, wsUUID1)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if root != wsUUID3 {
|
||||
t.Errorf("root=%q, want %q", root, wsUUID3)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestOrgRootID_WorkspaceIsRoot(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
// One-row chain: the workspace itself is the org root.
|
||||
mock.ExpectQuery(`WITH RECURSIVE org_chain`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"root_id"}).AddRow(wsUUID1))
|
||||
|
||||
root, err := orgRootID(context.Background(), db.DB, wsUUID1)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if root != wsUUID1 {
|
||||
t.Errorf("root=%q, want %q", root, wsUUID1)
|
||||
}
|
||||
}
|
||||
|
||||
func TestOrgRootID_NoRows(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
mock.ExpectQuery(`WITH RECURSIVE org_chain`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"root_id"}))
|
||||
|
||||
_, err := orgRootID(context.Background(), db.DB, wsUUID1)
|
||||
if !errors.Is(err, errNoOrgRoot) {
|
||||
t.Fatalf("expected errNoOrgRoot, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestOrgRootID_DBError(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
mock.ExpectQuery(`WITH RECURSIVE org_chain`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnError(errors.New("conn lost"))
|
||||
|
||||
_, err := orgRootID(context.Background(), db.DB, wsUUID1)
|
||||
if err == nil || errors.Is(err, errNoOrgRoot) {
|
||||
t.Fatalf("expected DB error, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestOrgRootID_EmptyRoot(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
// Row present but root is empty string → treated as not-found.
|
||||
mock.ExpectQuery(`WITH RECURSIVE org_chain`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"root_id"}).AddRow(""))
|
||||
|
||||
_, err := orgRootID(context.Background(), db.DB, wsUUID1)
|
||||
if !errors.Is(err, errNoOrgRoot) {
|
||||
t.Fatalf("expected errNoOrgRoot for empty root, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ---------- sameOrg ----------
|
||||
|
||||
func TestSameOrg_SameWorkspace(t *testing.T) {
|
||||
// Fast path: identical IDs are same-org without touching DB.
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
ok, err := sameOrg(context.Background(), db.DB, wsUUID1, wsUUID1)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !ok {
|
||||
t.Error("same workspace must be same-org")
|
||||
}
|
||||
// No DB expectations → proves short-circuit.
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("DB was touched despite short-circuit: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSameOrg_SameOrg(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
mock.ExpectQuery(`WITH RECURSIVE org_chain`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"root_id"}).AddRow(wsUUID3))
|
||||
mock.ExpectQuery(`WITH RECURSIVE org_chain`).
|
||||
WithArgs(wsUUID2).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"root_id"}).AddRow(wsUUID3))
|
||||
|
||||
ok, err := sameOrg(context.Background(), db.DB, wsUUID1, wsUUID2)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !ok {
|
||||
t.Error("expected same-org")
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSameOrg_DifferentOrg(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
mock.ExpectQuery(`WITH RECURSIVE org_chain`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"root_id"}).AddRow(wsUUID3))
|
||||
mock.ExpectQuery(`WITH RECURSIVE org_chain`).
|
||||
WithArgs(wsUUID2).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"root_id"}).AddRow("org-b"))
|
||||
|
||||
ok, err := sameOrg(context.Background(), db.DB, wsUUID1, wsUUID2)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if ok {
|
||||
t.Error("expected different-org")
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSameOrg_OrgRootFails(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
mock.ExpectQuery(`WITH RECURSIVE org_chain`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnError(errors.New("conn lost"))
|
||||
|
||||
_, err := sameOrg(context.Background(), db.DB, wsUUID1, wsUUID2)
|
||||
if err == nil {
|
||||
t.Fatal("expected error when orgRootID fails")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSameOrg_OrgRootNotFound(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
mock.ExpectQuery(`WITH RECURSIVE org_chain`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"root_id"}))
|
||||
|
||||
_, err := sameOrg(context.Background(), db.DB, wsUUID1, wsUUID2)
|
||||
if !errors.Is(err, errNoOrgRoot) {
|
||||
t.Fatalf("expected errNoOrgRoot, got %v", err)
|
||||
}
|
||||
}
|
||||
@@ -1,168 +0,0 @@
|
||||
package handlers
|
||||
|
||||
// rescue_read.go — GET /workspaces/:id/rescue (RFC internal#742 Part 3).
|
||||
//
|
||||
// Serves the LATEST post-mortem rescue bundle captured for a
|
||||
// boot-failed/terminated workspace, so "why won't my agent boot" is
|
||||
// answerable WITHOUT a live instance. Powers the future canvas
|
||||
// "Why did this fail?" panel.
|
||||
//
|
||||
// Read-path: the bundle is read from the queryable rescue_bundles table
|
||||
// (internal/rescuestore), NOT from obs/Loki. Part 2 ships the bundle via
|
||||
// internal/audit (Loki-only); reading from Loki would require obs read
|
||||
// creds the tenant deliberately lacks. Part 3 persists the
|
||||
// already-redacted bundle on capture and serves it here — see the
|
||||
// migration header for the full rationale.
|
||||
//
|
||||
// Auth/scoping: registered on the WorkspaceAuth-guarded /workspaces/:id
|
||||
// group (same gate as /files/* and /exec), so the caller must hold a
|
||||
// valid per-workspace or org bearer token for :id. TenantGuard already
|
||||
// 404s cross-org requests at the routing layer; on top of that the store
|
||||
// read is org-scoped by MOLECULE_ORG_ID, so a row written under a
|
||||
// different org is never returned (defense in depth).
|
||||
//
|
||||
// Redaction: the stored sections were already scrubbed at capture time
|
||||
// (Part 2's SAFE-T1201 secret-scan). This handler returns them verbatim
|
||||
// — it never re-ships or re-derives secrets.
|
||||
|
||||
import (
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/rescuestore"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// maxResponseSections bounds how many sections the read response
|
||||
// returns. The fixed capture set is small (6), so this is a backstop
|
||||
// against a future capture set growth or a hand-written row — keeps the
|
||||
// JSON response bounded regardless of what's stored. Per-section content
|
||||
// is already clamped at persist time (rescuestore.maxSectionBytes).
|
||||
const maxResponseSections = 64
|
||||
|
||||
// RescueReadHandler serves GET /workspaces/:id/rescue. The store is
|
||||
// injected so tests fake it; production wires a Postgres store over
|
||||
// db.DB (see NewRescueReadHandler).
|
||||
type RescueReadHandler struct {
|
||||
store rescuestore.Store
|
||||
}
|
||||
|
||||
// NewRescueReadHandler builds the handler over the package db.DB. db.DB
|
||||
// is nil in some unit-test binaries; the handler tolerates that by
|
||||
// returning 503 rather than nil-deref (the store guards nil db).
|
||||
func NewRescueReadHandler() *RescueReadHandler {
|
||||
return &RescueReadHandler{store: rescuestore.NewPostgres(db.DB)}
|
||||
}
|
||||
|
||||
// WithStore overrides the store (test seam). Returns the handler for
|
||||
// chaining.
|
||||
func (h *RescueReadHandler) WithStore(s rescuestore.Store) *RescueReadHandler {
|
||||
h.store = s
|
||||
return h
|
||||
}
|
||||
|
||||
type (
	// rescueSection is a single labelled chunk of the read response.
	rescueSection struct {
		Name     string `json:"name"`
		Content  string `json:"content"`
		Redacted bool   `json:"redacted"`
	}

	// rescueReadResponse is the JSON body returned when a bundle is found.
	// `sections` is an ordered array rather than a map so the capture
	// reading order (config→logs→state→env) survives for the canvas panel.
	rescueReadResponse struct {
		WorkspaceID string          `json:"workspace_id"`
		CapturedAt  time.Time       `json:"captured_at"`
		Reason      string          `json:"reason"`
		InstanceID  string          `json:"instance_id"`
		Sections    []rescueSection `json:"sections"`
		// Truncated reports that the stored bundle held more than
		// maxResponseSections sections and the response was capped.
		Truncated bool `json:"truncated,omitempty"`
	}
)
|
||||
|
||||
// GetRescue handles GET /workspaces/:id/rescue.
|
||||
//
|
||||
// 200 — latest rescue bundle for the workspace (org-scoped).
|
||||
// 404 — no rescue bundle on file for this workspace (or wrong org).
|
||||
// 503 — store/datastore unavailable.
|
||||
func (h *RescueReadHandler) GetRescue(c *gin.Context) {
|
||||
workspaceID := c.Param("id")
|
||||
ctx := c.Request.Context()
|
||||
|
||||
if h.store == nil {
|
||||
log.Printf("GetRescue: store not configured for ws=%s", workspaceID)
|
||||
c.JSON(http.StatusServiceUnavailable, gin.H{
|
||||
"error": "rescue store unavailable",
|
||||
"code": "platform_unavailable",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// org_id is the tenant's configured org (one tenant = one org).
|
||||
// Fail closed: an empty org_id disables org isolation and must not
|
||||
// reach the store (#2020).
|
||||
orgID := os.Getenv("MOLECULE_ORG_ID")
|
||||
if orgID == "" {
|
||||
log.Printf("GetRescue: missing MOLECULE_ORG_ID for ws=%s", workspaceID)
|
||||
c.JSON(http.StatusServiceUnavailable, gin.H{
|
||||
"error": "rescue org not configured",
|
||||
"code": "platform_misconfigured",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
stored, err := h.store.GetLatest(ctx, workspaceID, orgID)
|
||||
if err != nil {
|
||||
// Per the Store contract a missing bundle is (nil, nil), NOT an
|
||||
// error — so any error here is a genuine datastore fault → 503,
|
||||
// never a masquerading 404 that would hide an outage.
|
||||
log.Printf("GetRescue: store query failed for ws=%s: %v", workspaceID, err)
|
||||
c.JSON(http.StatusServiceUnavailable, gin.H{
|
||||
"error": "rescue store query failed",
|
||||
"code": "platform_unavailable",
|
||||
})
|
||||
return
|
||||
}
|
||||
if stored == nil {
|
||||
// No bundle captured (workspace never boot-failed, or its grace
|
||||
// window lapsed). 404 — existence-non-inferring; a workspace in a
|
||||
// sibling org reaches the same 404 via the org filter.
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": "no rescue bundle for this workspace"})
|
||||
return
|
||||
}
|
||||
|
||||
resp := buildRescueResponse(workspaceID, stored)
|
||||
c.JSON(http.StatusOK, resp)
|
||||
}
|
||||
|
||||
// buildRescueResponse maps a stored bundle to the read response, bounding
|
||||
// the section count. Split out so the mapping/limit is unit-testable.
|
||||
func buildRescueResponse(workspaceID string, stored *rescuestore.StoredBundle) rescueReadResponse {
|
||||
secs := stored.Bundle.Sections
|
||||
truncated := false
|
||||
if len(secs) > maxResponseSections {
|
||||
secs = secs[:maxResponseSections]
|
||||
truncated = true
|
||||
}
|
||||
out := make([]rescueSection, 0, len(secs))
|
||||
for _, s := range secs {
|
||||
// rescue.Section and rescueSection are field-identical; the
|
||||
// explicit conversion keeps the handler's JSON shape independent
|
||||
// of the leaf package's struct (which could gain non-response
|
||||
// fields later).
|
||||
out = append(out, rescueSection(s))
|
||||
}
|
||||
return rescueReadResponse{
|
||||
WorkspaceID: workspaceID,
|
||||
CapturedAt: stored.CapturedAt,
|
||||
Reason: stored.Bundle.Reason,
|
||||
InstanceID: stored.Bundle.InstanceID,
|
||||
Sections: out,
|
||||
Truncated: truncated,
|
||||
}
|
||||
}
|
||||
@@ -1,238 +0,0 @@
|
||||
package handlers
|
||||
|
||||
// Tests for GET /workspaces/:id/rescue (RFC internal#742 Part 3).
|
||||
//
|
||||
// These exercise the handler against a FAKE store (no DB) so every path
|
||||
// is deterministic without external infra:
|
||||
// - returns the latest bundle in the documented shape
|
||||
// - 404 when no bundle exists for the workspace
|
||||
// - org-scoping: the handler passes the tenant's MOLECULE_ORG_ID to
|
||||
// the store, so a fake that returns nil for a mismatched org proves a
|
||||
// sibling org cannot read another org's bundle
|
||||
// - 503 on a store/datastore error (not a 404 masquerade)
|
||||
// - redaction/shape preserved: stored sections are returned verbatim,
|
||||
// no re-derivation
|
||||
//
|
||||
// WorkspaceAuth gating itself is covered by the middleware tests; here we
|
||||
// invoke the handler directly (the route is registered on the wsAuth
|
||||
// group in router.go).
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/rescue"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/rescuestore"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// init switches gin into test mode for this test binary.
func init() {
	gin.SetMode(gin.TestMode)
}
|
||||
|
||||
// fakeRescueStore records the args it was called with and returns a
|
||||
// scripted result. Implements rescuestore.Store.
|
||||
type fakeRescueStore struct {
|
||||
// gotWorkspaceID/gotOrgID capture what the handler passed.
|
||||
gotWorkspaceID string
|
||||
gotOrgID string
|
||||
// ret/err are the scripted GetLatest result.
|
||||
ret *rescuestore.StoredBundle
|
||||
err error
|
||||
}
|
||||
|
||||
func (f *fakeRescueStore) Persist(_ context.Context, _ rescue.Bundle) error { return nil }
|
||||
|
||||
func (f *fakeRescueStore) GetLatest(_ context.Context, workspaceID, orgID string) (*rescuestore.StoredBundle, error) {
|
||||
f.gotWorkspaceID = workspaceID
|
||||
f.gotOrgID = orgID
|
||||
return f.ret, f.err
|
||||
}
|
||||
|
||||
// doRescueGet runs the handler for ws against the given fake and returns
|
||||
// the recorder. orgEnv sets MOLECULE_ORG_ID for the duration.
|
||||
func doRescueGet(t *testing.T, ws, orgEnv string, fake *fakeRescueStore) *httptest.ResponseRecorder {
|
||||
t.Helper()
|
||||
t.Setenv("MOLECULE_ORG_ID", orgEnv)
|
||||
|
||||
h := (&RescueReadHandler{}).WithStore(fake)
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: ws}}
|
||||
c.Request = httptest.NewRequest("GET", "/workspaces/"+ws+"/rescue", nil)
|
||||
h.GetRescue(c)
|
||||
return w
|
||||
}
|
||||
|
||||
// sampleStored builds a representative stored bundle with a redacted +
|
||||
// a failure-marker section.
|
||||
func sampleStored() *rescuestore.StoredBundle {
|
||||
return &rescuestore.StoredBundle{
|
||||
CapturedAt: time.Date(2026, 5, 31, 12, 0, 0, 0, time.UTC),
|
||||
Bundle: rescue.Bundle{
|
||||
WorkspaceID: "ws-1",
|
||||
OrgID: "org-9",
|
||||
InstanceID: "i-abc123",
|
||||
Reason: "provision_timeout_sweep",
|
||||
Sections: []rescue.Section{
|
||||
{Name: "config.yaml", Content: "model: gpt-4\nANTHROPIC_API_KEY=[REDACTED]", Redacted: true},
|
||||
{Name: "docker-ps", Content: "(rescue: section collection failed: ssh blip)", Redacted: false},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// TestGetRescue_ReturnsLatestBundle — happy path: 200 with the full
|
||||
// documented shape, sections in order, redaction-preserved.
|
||||
func TestGetRescue_ReturnsLatestBundle(t *testing.T) {
|
||||
fake := &fakeRescueStore{ret: sampleStored()}
|
||||
w := doRescueGet(t, "ws-1", "org-9", fake)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, want 200; body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
var resp struct {
|
||||
WorkspaceID string `json:"workspace_id"`
|
||||
CapturedAt time.Time `json:"captured_at"`
|
||||
Reason string `json:"reason"`
|
||||
InstanceID string `json:"instance_id"`
|
||||
Sections []struct {
|
||||
Name string `json:"name"`
|
||||
Content string `json:"content"`
|
||||
Redacted bool `json:"redacted"`
|
||||
} `json:"sections"`
|
||||
}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("decode: %v; body=%s", err, w.Body.String())
|
||||
}
|
||||
if resp.WorkspaceID != "ws-1" {
|
||||
t.Errorf("workspace_id = %q, want ws-1", resp.WorkspaceID)
|
||||
}
|
||||
if resp.Reason != "provision_timeout_sweep" {
|
||||
t.Errorf("reason = %q", resp.Reason)
|
||||
}
|
||||
if resp.InstanceID != "i-abc123" {
|
||||
t.Errorf("instance_id = %q", resp.InstanceID)
|
||||
}
|
||||
if !resp.CapturedAt.Equal(time.Date(2026, 5, 31, 12, 0, 0, 0, time.UTC)) {
|
||||
t.Errorf("captured_at = %v", resp.CapturedAt)
|
||||
}
|
||||
if len(resp.Sections) != 2 {
|
||||
t.Fatalf("sections = %d, want 2", len(resp.Sections))
|
||||
}
|
||||
// Order preserved: config first, docker-ps second.
|
||||
if resp.Sections[0].Name != "config.yaml" || resp.Sections[1].Name != "docker-ps" {
|
||||
t.Errorf("section order wrong: %q, %q", resp.Sections[0].Name, resp.Sections[1].Name)
|
||||
}
|
||||
// Redaction-preserved: the redacted flag rides through untouched, and
|
||||
// the failure marker stays a non-redacted marker.
|
||||
if !resp.Sections[0].Redacted {
|
||||
t.Error("config.yaml section should be redacted=true")
|
||||
}
|
||||
if resp.Sections[1].Redacted {
|
||||
t.Error("failure-marker section should be redacted=false")
|
||||
}
|
||||
// Handler does NOT re-derive secrets; stored [REDACTED] verbatim.
|
||||
if want := "ANTHROPIC_API_KEY=[REDACTED]"; !strings.Contains(resp.Sections[0].Content, want) {
|
||||
t.Errorf("section content = %q, want it to contain %q", resp.Sections[0].Content, want)
|
||||
}
|
||||
}
|
||||
|
||||
// TestGetRescue_404WhenNone — no bundle on file → 404, not 500/200.
|
||||
func TestGetRescue_404WhenNone(t *testing.T) {
|
||||
fake := &fakeRescueStore{ret: nil} // store returns (nil, nil)
|
||||
w := doRescueGet(t, "ws-none", "org-9", fake)
|
||||
if w.Code != http.StatusNotFound {
|
||||
t.Fatalf("status = %d, want 404; body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
// TestGetRescue_OrgScopingPassedToStore — the handler must hand the
|
||||
// tenant's MOLECULE_ORG_ID to the store, and a store that returns nil for
|
||||
// a mismatched org yields 404. This is the sibling-org isolation: a
|
||||
// caller in org B (a different tenant process, MOLECULE_ORG_ID=org-B)
|
||||
// reading ws-1 (which belongs to org-9) gets the org filter applied → no
|
||||
// row → 404.
|
||||
func TestGetRescue_OrgScopingPassedToStore(t *testing.T) {
|
||||
// Tenant configured as a DIFFERENT org than the bundle's owner.
|
||||
// Fake mimics the Postgres org filter: returns nil because org-B
|
||||
// doesn't match the row's org-9.
|
||||
fake := &fakeRescueStore{ret: nil}
|
||||
w := doRescueGet(t, "ws-1", "org-B", fake)
|
||||
|
||||
if fake.gotOrgID != "org-B" {
|
||||
t.Errorf("store got org_id = %q, want the tenant's org-B", fake.gotOrgID)
|
||||
}
|
||||
if fake.gotWorkspaceID != "ws-1" {
|
||||
t.Errorf("store got workspace_id = %q, want ws-1", fake.gotWorkspaceID)
|
||||
}
|
||||
if w.Code != http.StatusNotFound {
|
||||
t.Fatalf("sibling-org read: status = %d, want 404", w.Code)
|
||||
}
|
||||
}
|
||||
|
||||
// TestGetRescue_EmptyOrgEnvRejected — empty MOLECULE_ORG_ID is a
|
||||
// fail-closed security violation (#2020). The handler must 503 before
|
||||
// calling the store, so the org filter cannot be bypassed.
|
||||
func TestGetRescue_EmptyOrgEnvRejected(t *testing.T) {
|
||||
fake := &fakeRescueStore{ret: sampleStored()}
|
||||
w := doRescueGet(t, "ws-1", "", fake)
|
||||
if fake.gotOrgID != "" {
|
||||
t.Errorf("store was called with org_id = %q; want no call when env empty", fake.gotOrgID)
|
||||
}
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Fatalf("status = %d, want 503; body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
if !strings.Contains(w.Body.String(), "platform_misconfigured") {
|
||||
t.Fatalf("body = %s, want platform_misconfigured code", w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
// TestGetRescue_StoreErrorIs503 — an actual datastore error must surface
|
||||
// as 503, never a 404 (which would hide an outage as "no bundle").
|
||||
func TestGetRescue_StoreErrorIs503(t *testing.T) {
|
||||
fake := &fakeRescueStore{err: errors.New("connection refused")}
|
||||
w := doRescueGet(t, "ws-1", "org-9", fake)
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Fatalf("status = %d, want 503; body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
// TestGetRescue_NilStoreIs503 — defensive: a handler with no store wired
|
||||
// (db.DB nil in a degraded boot) returns 503, never panics.
|
||||
func TestGetRescue_NilStoreIs503(t *testing.T) {
|
||||
t.Setenv("MOLECULE_ORG_ID", "org-9")
|
||||
h := &RescueReadHandler{} // store == nil
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
|
||||
c.Request = httptest.NewRequest("GET", "/workspaces/ws-1/rescue", nil)
|
||||
h.GetRescue(c)
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Fatalf("status = %d, want 503", w.Code)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBuildRescueResponse_BoundsSections — a stored bundle with more than
|
||||
// maxResponseSections sections is capped + flagged truncated.
|
||||
func TestBuildRescueResponse_BoundsSections(t *testing.T) {
|
||||
many := make([]rescue.Section, maxResponseSections+5)
|
||||
for i := range many {
|
||||
many[i] = rescue.Section{Name: "s", Content: "c", Redacted: true}
|
||||
}
|
||||
stored := &rescuestore.StoredBundle{
|
||||
CapturedAt: time.Now(),
|
||||
Bundle: rescue.Bundle{WorkspaceID: "ws-1", Sections: many},
|
||||
}
|
||||
resp := buildRescueResponse("ws-1", stored)
|
||||
if len(resp.Sections) != maxResponseSections {
|
||||
t.Errorf("sections = %d, want capped at %d", len(resp.Sections), maxResponseSections)
|
||||
}
|
||||
if !resp.Truncated {
|
||||
t.Error("truncated flag should be set when sections were capped")
|
||||
}
|
||||
}
|
||||
@@ -1,168 +0,0 @@
|
||||
package handlers
|
||||
|
||||
// rescue_wiring.go — bridges the leaf internal/rescue package to the
|
||||
// handlers package's EIC/SSH runner + secret redactor, and exposes the
|
||||
// boot-failure rescue hook used by both boot-failure verdict paths
|
||||
// (handlers.BootstrapFailed here, registry.sweepStuckProvisioning via
|
||||
// an injected hook wired in main.go).
|
||||
//
|
||||
// Why the indirection: internal/rescue is a leaf so registry (which
|
||||
// must NOT import handlers — that's an import cycle) can call it. The
|
||||
// two heavy dependencies live here in handlers — `withEICTunnel`
|
||||
// (the EIC keypair → push → tunnel → ssh dance) and `redactSecrets`
|
||||
// (the SAFE-T1201 secret-scan) — so we inject them into rescue's
|
||||
// package-level func vars at init().
|
||||
//
|
||||
// RFC internal#742 Part 2.
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/rescue"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/rescuestore"
|
||||
)
|
||||
|
||||
func init() {
|
||||
// Wire the leaf rescue package to handlers' EIC runner + redactor.
|
||||
// Done in init() (not main.go) so the binding is present for any
|
||||
// caller of rescue.Capture, including the registry sweeper hook and
|
||||
// the handler path, without each call site re-wiring it.
|
||||
rescue.RunRemote = rescueRunRemoteViaEIC
|
||||
rescue.Redact = func(workspaceID, content string) string {
|
||||
out, _ := redactSecrets(workspaceID, content)
|
||||
return out
|
||||
}
|
||||
// Part 3: persist the redacted bundle to the queryable store on
|
||||
// capture so GET /workspaces/:id/rescue can serve it without obs/Loki
|
||||
// read creds. db.DB is resolved per-call (rescuestore guards a nil
|
||||
// handle) so wiring at init() is safe even before InitPostgres has
|
||||
// run; a capture before the DB is up logs + skips the persist rather
|
||||
// than failing the boot-failure path.
|
||||
rescue.PersistBundle = func(ctx context.Context, b rescue.Bundle) error {
|
||||
return rescuestore.NewPostgres(db.DB).Persist(ctx, b)
|
||||
}
|
||||
}
|
||||
|
||||
// rescueRunRemoteViaEIC runs a single shell command on the still-running
|
||||
// (but boot-failed) workspace EC2 over an EIC tunnel and returns its
|
||||
// combined stdout+stderr. Reuses the same `withEICTunnel` dance as the
|
||||
// canvas file ops (ephemeral keypair → SendSSHPublicKey → open-tunnel →
|
||||
// ssh) so the rescue path inherits every fix to the EIC mechanism (e.g.
|
||||
// PR #2822's LogLevel=ERROR shim) for free.
|
||||
//
|
||||
// Combined output (2>&1) is intentional: a boot-failed box's most
|
||||
// useful signal is often on stderr (a panic, a missing-file error), and
|
||||
// the rescue bundle is a forensic blob, not a parsed value — we want
|
||||
// everything the command emitted.
|
||||
func rescueRunRemoteViaEIC(ctx context.Context, instanceID, command string) (string, error) {
|
||||
var combined []byte
|
||||
runErr := withEICTunnel(ctx, instanceID, func(s eicSSHSession) error {
|
||||
sshCmd := exec.CommandContext(ctx, "ssh", s.sshArgs(command)...)
|
||||
sshCmd.Env = os.Environ()
|
||||
var buf bytes.Buffer
|
||||
sshCmd.Stdout = &buf
|
||||
sshCmd.Stderr = &buf
|
||||
// A non-zero remote exit is NOT a transport error for the rescue
|
||||
// path — each section command already falls back to an
|
||||
// `|| echo '(...)'` marker, so a clean exit is expected. Only
|
||||
// surface an error when ssh/tunnel itself failed AND produced no
|
||||
// output to ship.
|
||||
err := sshCmd.Run()
|
||||
combined = buf.Bytes()
|
||||
if err != nil && len(combined) == 0 {
|
||||
return fmt.Errorf("rescue ssh exec: %w", err)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if runErr != nil {
|
||||
return "", runErr
|
||||
}
|
||||
return strings.TrimRight(string(combined), "\n"), nil
|
||||
}
|
||||
|
||||
// captureRescueBundle fires a best-effort, non-blocking rescue capture
|
||||
// for a boot-failed workspace. It is the single entry point both
|
||||
// boot-failure verdict paths funnel through.
|
||||
//
|
||||
// NON-BLOCKING: the actual collection runs in its own goroutine with
|
||||
// its own timeout (rescue.CaptureTimeout), detached from the caller's
|
||||
// request/sweep context so it can't add latency to — or be cancelled
|
||||
// by — the failure-handling path that triggered it. We snapshot the
|
||||
// identity into a fresh context.Background() for the same reason: a
|
||||
// gin request context is cancelled the instant the HTTP handler
|
||||
// returns, which would kill the EIC tunnel mid-collection.
|
||||
//
|
||||
// instanceID/orgID are resolved here (best-effort) so the two call
|
||||
// sites only need the workspace id. A missing instance id → rescue.Capture
|
||||
// no-ops (logged), so an early-failure workspace that never got an EC2
|
||||
// is handled cleanly.
|
||||
func captureRescueBundle(workspaceID, reason string) {
|
||||
rescueDispatch(func() {
|
||||
ctx := context.Background()
|
||||
instanceID, err := rescueResolveInstanceID(ctx, workspaceID)
|
||||
if err != nil {
|
||||
// Best-effort: a resolve failure is logged inside Capture's
|
||||
// caller chain; pass empty so Capture no-ops cleanly.
|
||||
instanceID = ""
|
||||
}
|
||||
rescue.Capture(ctx, rescue.Input{
|
||||
InstanceID: instanceID,
|
||||
WorkspaceID: workspaceID,
|
||||
OrgID: os.Getenv("MOLECULE_ORG_ID"),
|
||||
Reason: reason,
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// rescueDispatch decides how a rescue collection task is executed. The
// default starts the task on a new goroutine and returns immediately, so the
// boot-failure path never waits on it. It is a package variable so tests can
// substitute a synchronous runner and assert deterministically whether the
// capture fired.
var rescueDispatch = func(task func()) {
	go task()
}
|
||||
|
||||
// BootFailureRescueHook is the registry-facing adapter wired into
|
||||
// registry.BootFailureRescueHook from main.go. The registry sweeper
|
||||
// already resolved the instance id (it's in the candidate row), so this
|
||||
// path uses it directly rather than re-querying — symmetric with the
|
||||
// captureRescueBundle handler path but skipping the lookup.
|
||||
//
|
||||
// Best-effort + non-blocking: dispatches the capture on its own
|
||||
// goroutine with its own timeout, so the sweep loop is never slowed.
|
||||
func BootFailureRescueHook(workspaceID, instanceID, reason string) {
|
||||
go rescue.Capture(context.Background(), rescue.Input{
|
||||
InstanceID: instanceID,
|
||||
WorkspaceID: workspaceID,
|
||||
OrgID: os.Getenv("MOLECULE_ORG_ID"),
|
||||
Reason: reason,
|
||||
})
|
||||
}
|
||||
|
||||
// rescueResolveInstanceID looks up the EC2 instance id for a workspace.
|
||||
// Package var so tests can stub it without a sqlmock. Mirrors
|
||||
// provisioner.resolveInstanceID (same query) but lives here to keep the
|
||||
// rescue wiring self-contained and avoid widening the provisioner
|
||||
// surface.
|
||||
var rescueResolveInstanceID = func(ctx context.Context, workspaceID string) (string, error) {
|
||||
if db.DB == nil {
|
||||
return "", nil // nil in unit tests
|
||||
}
|
||||
var instanceID sql.NullString
|
||||
err := db.DB.QueryRowContext(ctx,
|
||||
`SELECT instance_id FROM workspaces WHERE id = $1`, workspaceID,
|
||||
).Scan(&instanceID)
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
return "", err
|
||||
}
|
||||
if !instanceID.Valid {
|
||||
return "", nil
|
||||
}
|
||||
return instanceID.String, nil
|
||||
}
|
||||
@@ -1,119 +0,0 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/rescue"
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// rescueTestHarness makes the otherwise-async rescue capture
|
||||
// deterministic + observable for handler tests:
|
||||
// - rescueDispatch runs synchronously (no goroutine race).
|
||||
// - rescueResolveInstanceID returns a fixed instance id.
|
||||
// - rescue.RunRemote / rescue.Redact are stubbed so no real EIC/SSH
|
||||
// fires; runCalls counts how many remote-command collections ran,
|
||||
// which is the proxy for "did the capture fire".
|
||||
//
|
||||
// All originals are restored on cleanup.
|
||||
func rescueTestHarness(t *testing.T, instanceID string) (runCalls *int) {
|
||||
t.Helper()
|
||||
n := 0
|
||||
runCalls = &n
|
||||
|
||||
prevDispatch := rescueDispatch
|
||||
rescueDispatch = func(fn func()) { fn() } // synchronous
|
||||
prevResolve := rescueResolveInstanceID
|
||||
rescueResolveInstanceID = func(_ context.Context, _ string) (string, error) { return instanceID, nil }
|
||||
prevRun, prevRedact := rescue.RunRemote, rescue.Redact
|
||||
rescue.RunRemote = func(_ context.Context, _ string, _ string) (string, error) { n++; return "out", nil }
|
||||
rescue.Redact = func(_ws, c string) string { return c }
|
||||
|
||||
t.Cleanup(func() {
|
||||
rescueDispatch = prevDispatch
|
||||
rescueResolveInstanceID = prevResolve
|
||||
rescue.RunRemote = prevRun
|
||||
rescue.Redact = prevRedact
|
||||
})
|
||||
return runCalls
|
||||
}
|
||||
|
||||
// TestBootstrapFailed_FiresRescueOnFlip — the RFC internal#742 handler
|
||||
// hook: when BootstrapFailed actually flips a workspace to `failed`
|
||||
// (affected==1), the rescue capture fires against the resolved instance.
|
||||
func TestBootstrapFailed_FiresRescueOnFlip(t *testing.T) {
|
||||
h, mock := setupBootstrapHandler(t)
|
||||
runCalls := rescueTestHarness(t, "i-failed01")
|
||||
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-crashed", sqlmock.AnyArg(), models.StatusFailed).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec(`INSERT INTO structure_events`).
|
||||
WithArgs("WORKSPACE_PROVISION_FAILED", "ws-crashed", sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-crashed"}}
|
||||
c.Request = httptest.NewRequest("POST", "/admin/workspaces/ws-crashed/bootstrap-failed",
|
||||
bytes.NewBufferString(`{"error":"codex provider derivation failed","log_tail":"panic"}`))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
h.BootstrapFailed(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("want 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if *runCalls != len(rescueBundleSectionCount()) {
|
||||
t.Errorf("rescue capture ran %d remote commands, want %d (one per bundle section)", *runCalls, len(rescueBundleSectionCount()))
|
||||
}
|
||||
}
|
||||
|
||||
// TestBootstrapFailed_NoRescueOnNoChange — an already-transitioned
|
||||
// workspace (affected==0: raced to online, or double-report) is NOT a
|
||||
// boot-failure verdict here, so the rescue capture must NOT fire.
|
||||
func TestBootstrapFailed_NoRescueOnNoChange(t *testing.T) {
|
||||
h, mock := setupBootstrapHandler(t)
|
||||
runCalls := rescueTestHarness(t, "i-online01")
|
||||
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-online", sqlmock.AnyArg(), models.StatusFailed).
|
||||
WillReturnResult(sqlmock.NewResult(0, 0)) // already transitioned
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-online"}}
|
||||
c.Request = httptest.NewRequest("POST", "/admin/workspaces/ws-online/bootstrap-failed",
|
||||
bytes.NewBufferString(`{"error":"late report","log_tail":""}`))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
h.BootstrapFailed(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("want 200, got %d", w.Code)
|
||||
}
|
||||
if *runCalls != 0 {
|
||||
t.Errorf("rescue capture fired (%d cmds) on a no-change report; it must only fire on a real flip", *runCalls)
|
||||
}
|
||||
}
|
||||
|
||||
// rescueBundleSectionCount returns the production rescue bundle section
|
||||
// list length by running a capture against a counting runner once. It's
|
||||
// a small indirection so the handler test stays decoupled from the exact
|
||||
// section set in internal/rescue (which has its own tests).
|
||||
func rescueBundleSectionCount() []struct{} {
|
||||
count := 0
|
||||
prevRun, prevRedact := rescue.RunRemote, rescue.Redact
|
||||
rescue.RunRemote = func(_ context.Context, _ string, _ string) (string, error) { count++; return "", nil }
|
||||
rescue.Redact = func(_ws, c string) string { return c }
|
||||
rescue.Capture(context.Background(), rescue.Input{InstanceID: "i-probe", WorkspaceID: "w", OrgID: "o"})
|
||||
rescue.RunRemote = prevRun
|
||||
rescue.Redact = prevRedact
|
||||
return make([]struct{}, count)
|
||||
}
|
||||
@@ -79,7 +79,7 @@ func isSafeURL(rawURL string) error {
|
||||
}
|
||||
addrs, err := net.LookupHost(host)
|
||||
if err != nil {
|
||||
return fmt.Errorf("DNS resolution blocked for hostname: %s (%w)", host, err)
|
||||
return fmt.Errorf("DNS resolution blocked for hostname: %s (%v)", host, err)
|
||||
}
|
||||
if len(addrs) == 0 {
|
||||
return fmt.Errorf("DNS returned no addresses for: %s", host)
|
||||
|
||||
@@ -1,15 +1,12 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/crypto"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
@@ -21,69 +18,30 @@ func NewTracesHandler() *TracesHandler {
|
||||
return &TracesHandler{}
|
||||
}
|
||||
|
||||
// langfuseConfig holds the three Langfuse settings resolved for the traces
// proxy: the base URL of the Langfuse host and the public/secret key pair
// used as HTTP basic-auth credentials.
type langfuseConfig struct {
	Host, Public, Secret string
}
|
||||
|
||||
// resolveLangfuseConfig resolves Langfuse connection settings from
|
||||
// admin-controlled global secrets first, then process env for legacy/dev use.
|
||||
// Workspace secrets are intentionally excluded: a workspace-controlled
|
||||
// LANGFUSE_HOST would allow SSRF with BasicAuth attached (#2029).
|
||||
func resolveLangfuseConfig(ctx context.Context) (*langfuseConfig, error) {
|
||||
cfg := &langfuseConfig{}
|
||||
|
||||
resolve := func(key string) string {
|
||||
var val []byte
|
||||
var ver int
|
||||
err := db.DB.QueryRowContext(ctx,
|
||||
`SELECT encrypted_value, encryption_version FROM global_secrets WHERE key = $1`,
|
||||
key).Scan(&val, &ver)
|
||||
if err == nil {
|
||||
decrypted, decErr := crypto.DecryptVersioned(val, ver)
|
||||
if decErr == nil {
|
||||
return string(decrypted)
|
||||
}
|
||||
}
|
||||
return os.Getenv(key)
|
||||
}
|
||||
|
||||
cfg.Host = resolve("LANGFUSE_HOST")
|
||||
cfg.Public = resolve("LANGFUSE_PUBLIC_KEY")
|
||||
cfg.Secret = resolve("LANGFUSE_SECRET_KEY")
|
||||
|
||||
if cfg.Host == "" || cfg.Public == "" || cfg.Secret == "" {
|
||||
return nil, nil
|
||||
}
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
// List handles GET /workspaces/:id/traces
|
||||
// Proxies to Langfuse API to get recent traces for a workspace.
|
||||
func (h *TracesHandler) List(c *gin.Context) {
|
||||
workspaceID := c.Param("id")
|
||||
|
||||
cfg, err := resolveLangfuseConfig(c.Request.Context())
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to resolve trace config"})
|
||||
return
|
||||
}
|
||||
if cfg == nil {
|
||||
langfuseHost := os.Getenv("LANGFUSE_HOST")
|
||||
langfusePublic := os.Getenv("LANGFUSE_PUBLIC_KEY")
|
||||
langfuseSecret := os.Getenv("LANGFUSE_SECRET_KEY")
|
||||
|
||||
if langfuseHost == "" || langfusePublic == "" || langfuseSecret == "" {
|
||||
c.JSON(http.StatusOK, []interface{}{})
|
||||
return
|
||||
}
|
||||
|
||||
// Fetch traces from Langfuse, filtered by workspace tag or name
|
||||
url := fmt.Sprintf("%s/api/public/traces?limit=20&orderBy=timestamp&orderDir=desc&tags=%s",
|
||||
cfg.Host, workspaceID)
|
||||
langfuseHost, workspaceID)
|
||||
|
||||
req, err := http.NewRequestWithContext(c.Request.Context(), "GET", url, nil)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to create request"})
|
||||
return
|
||||
}
|
||||
req.SetBasicAuth(cfg.Public, cfg.Secret)
|
||||
req.SetBasicAuth(langfusePublic, langfuseSecret)
|
||||
|
||||
resp, err := langfuseClient.Do(req)
|
||||
if err != nil {
|
||||
@@ -93,14 +51,10 @@ func (h *TracesHandler) List(c *gin.Context) {
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
body, readErr := io.ReadAll(resp.Body)
|
||||
if readErr != nil {
|
||||
c.JSON(http.StatusOK, []interface{}{})
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to read response body"})
|
||||
return
|
||||
}
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
c.JSON(http.StatusOK, []interface{}{})
|
||||
return
|
||||
}
|
||||
c.Data(http.StatusOK, "application/json", body)
|
||||
c.Data(resp.StatusCode, "application/json", body)
|
||||
}
|
||||
|
||||
@@ -1,31 +1,27 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/crypto"
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// ==================== GET /workspaces/:id/traces ====================
|
||||
|
||||
func TestTracesList_NoLangfuseConfig(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewTracesHandler()
|
||||
|
||||
// Ensure Langfuse env vars are not set
|
||||
os.Unsetenv("LANGFUSE_HOST")
|
||||
os.Unsetenv("LANGFUSE_PUBLIC_KEY")
|
||||
os.Unsetenv("LANGFUSE_SECRET_KEY")
|
||||
|
||||
expectMissingGlobalLangfuseSecrets(mock)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-traces"}}
|
||||
@@ -45,16 +41,14 @@ func TestTracesList_NoLangfuseConfig(t *testing.T) {
|
||||
if len(resp) != 0 {
|
||||
t.Errorf("expected empty list when Langfuse not configured, got %d items", len(resp))
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations not met: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTracesList_PartialLangfuseConfig(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewTracesHandler()
|
||||
|
||||
// Set only host, missing keys
|
||||
os.Setenv("LANGFUSE_HOST", "http://localhost:3000")
|
||||
os.Unsetenv("LANGFUSE_PUBLIC_KEY")
|
||||
os.Unsetenv("LANGFUSE_SECRET_KEY")
|
||||
@@ -62,8 +56,6 @@ func TestTracesList_PartialLangfuseConfig(t *testing.T) {
|
||||
os.Unsetenv("LANGFUSE_HOST")
|
||||
}()
|
||||
|
||||
expectMissingGlobalLangfuseSecrets(mock)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-traces-partial"}}
|
||||
@@ -80,13 +72,10 @@ func TestTracesList_PartialLangfuseConfig(t *testing.T) {
|
||||
if len(resp) != 0 {
|
||||
t.Errorf("expected empty list with partial config, got %d items", len(resp))
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations not met: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTracesList_LangfuseUnreachable(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewTracesHandler()
|
||||
|
||||
@@ -100,8 +89,6 @@ func TestTracesList_LangfuseUnreachable(t *testing.T) {
|
||||
os.Unsetenv("LANGFUSE_SECRET_KEY")
|
||||
}()
|
||||
|
||||
expectMissingGlobalLangfuseSecrets(mock)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-traces-down"}}
|
||||
@@ -119,171 +106,4 @@ func TestTracesList_LangfuseUnreachable(t *testing.T) {
|
||||
if len(resp) != 0 {
|
||||
t.Errorf("expected empty list when Langfuse unreachable, got %d items", len(resp))
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations not met: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTracesList_GlobalSecretsFallback(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewTracesHandler()
|
||||
|
||||
os.Unsetenv("LANGFUSE_HOST")
|
||||
os.Unsetenv("LANGFUSE_PUBLIC_KEY")
|
||||
os.Unsetenv("LANGFUSE_SECRET_KEY")
|
||||
|
||||
expectGlobalLangfuseSecret(mock, "LANGFUSE_HOST", "http://localhost:3000")
|
||||
expectGlobalLangfuseSecret(mock, "LANGFUSE_PUBLIC_KEY", "pk-global")
|
||||
expectGlobalLangfuseSecret(mock, "LANGFUSE_SECRET_KEY", "sk-global")
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-traces-global"}}
|
||||
c.Request = httptest.NewRequest("GET", "/workspaces/ws-traces-global/traces", nil)
|
||||
|
||||
handler.List(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
|
||||
var resp []interface{}
|
||||
json.Unmarshal(w.Body.Bytes(), &resp)
|
||||
if len(resp) != 0 {
|
||||
t.Errorf("expected empty list when Langfuse unreachable, got %d items", len(resp))
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations not met: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTracesList_GlobalPartialConfig(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewTracesHandler()
|
||||
|
||||
os.Unsetenv("LANGFUSE_HOST")
|
||||
os.Unsetenv("LANGFUSE_PUBLIC_KEY")
|
||||
os.Unsetenv("LANGFUSE_SECRET_KEY")
|
||||
|
||||
expectGlobalLangfuseSecret(mock, "LANGFUSE_HOST", "http://localhost:3000")
|
||||
mock.ExpectQuery(`SELECT encrypted_value, encryption_version FROM global_secrets WHERE key = \$1`).
|
||||
WithArgs("LANGFUSE_PUBLIC_KEY").
|
||||
WillReturnError(sql.ErrNoRows)
|
||||
mock.ExpectQuery(`SELECT encrypted_value, encryption_version FROM global_secrets WHERE key = \$1`).
|
||||
WithArgs("LANGFUSE_SECRET_KEY").
|
||||
WillReturnError(sql.ErrNoRows)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-traces-partial"}}
|
||||
c.Request = httptest.NewRequest("GET", "/workspaces/ws-traces-partial/traces", nil)
|
||||
|
||||
handler.List(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
|
||||
var resp []interface{}
|
||||
json.Unmarshal(w.Body.Bytes(), &resp)
|
||||
if len(resp) != 0 {
|
||||
t.Errorf("expected empty list with partial config, got %d items", len(resp))
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations not met: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTracesList_LangfuseUpstreamError(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewTracesHandler()
|
||||
|
||||
upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
w.Write([]byte("<html><body>Internal Server Error</body></html>"))
|
||||
}))
|
||||
defer upstream.Close()
|
||||
|
||||
os.Setenv("LANGFUSE_HOST", upstream.URL)
|
||||
os.Setenv("LANGFUSE_PUBLIC_KEY", "pk-test")
|
||||
os.Setenv("LANGFUSE_SECRET_KEY", "sk-test")
|
||||
defer func() {
|
||||
os.Unsetenv("LANGFUSE_HOST")
|
||||
os.Unsetenv("LANGFUSE_PUBLIC_KEY")
|
||||
os.Unsetenv("LANGFUSE_SECRET_KEY")
|
||||
}()
|
||||
|
||||
expectMissingGlobalLangfuseSecrets(mock)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-traces-500"}}
|
||||
c.Request = httptest.NewRequest("GET", "/workspaces/ws-traces-500/traces", nil)
|
||||
|
||||
handler.List(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
|
||||
var resp []interface{}
|
||||
json.Unmarshal(w.Body.Bytes(), &resp)
|
||||
if len(resp) != 0 {
|
||||
t.Errorf("expected empty list on upstream error, got %d items", len(resp))
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations not met: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTracesList_WorkspaceSecretsIgnored(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewTracesHandler()
|
||||
|
||||
os.Unsetenv("LANGFUSE_HOST")
|
||||
os.Unsetenv("LANGFUSE_PUBLIC_KEY")
|
||||
os.Unsetenv("LANGFUSE_SECRET_KEY")
|
||||
|
||||
expectMissingGlobalLangfuseSecrets(mock)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-traces-ssrf"}}
|
||||
c.Request = httptest.NewRequest("GET", "/workspaces/ws-traces-ssrf/traces", nil)
|
||||
|
||||
handler.List(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
|
||||
var resp []interface{}
|
||||
json.Unmarshal(w.Body.Bytes(), &resp)
|
||||
if len(resp) != 0 {
|
||||
t.Errorf("expected empty list when workspace secrets ignored, got %d items", len(resp))
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations not met: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func expectMissingGlobalLangfuseSecrets(mock sqlmock.Sqlmock) {
|
||||
for _, key := range []string{"LANGFUSE_HOST", "LANGFUSE_PUBLIC_KEY", "LANGFUSE_SECRET_KEY"} {
|
||||
mock.ExpectQuery(`SELECT encrypted_value, encryption_version FROM global_secrets WHERE key = \$1`).
|
||||
WithArgs(key).
|
||||
WillReturnError(sql.ErrNoRows)
|
||||
}
|
||||
}
|
||||
|
||||
func expectGlobalLangfuseSecret(mock sqlmock.Sqlmock, key, value string) {
|
||||
enc, _ := crypto.Encrypt([]byte(value))
|
||||
ver := crypto.CurrentEncryptionVersion()
|
||||
mock.ExpectQuery(`SELECT encrypted_value, encryption_version FROM global_secrets WHERE key = \$1`).
|
||||
WithArgs(key).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}).AddRow(enc, ver))
|
||||
}
|
||||
|
||||
@@ -856,38 +856,11 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
// Mint the workspace's first bearer token and return it inline
|
||||
// (#1644). Pre-fix, callers had to make a separate POST to
|
||||
// /admin/workspaces/:id/tokens (production path, AdminAuth-gated,
|
||||
// but the path-prefix differs in CP-admin deploys so staging E2E
|
||||
// got HTML 404) OR fall back to GET /admin/workspaces/:id/test-token
|
||||
// (dev-only — deliberately 404s on MOLECULE_ENV=production per
|
||||
// admin_test_token.go::TestTokensEnabled, which violates
|
||||
// feedback_no_dev_only_routes_in_e2e). Inlining the first token here
|
||||
// makes the create response the SSOT — every caller (canvas Save,
|
||||
// org_import, E2E, third-party API) gets the bearer they need to
|
||||
// authenticate /activity, /a2a, /memory etc. without an extra
|
||||
// round trip to a separate mint endpoint.
|
||||
//
|
||||
// Failure is non-fatal: the workspace row already committed; the
|
||||
// operator can recover via POST /admin/workspaces/:id/tokens
|
||||
// (canonical admin mint) or POST /workspaces/:id/external/rotate
|
||||
// (already-used for the external pre-register path above). We log
|
||||
// the failure and return 201 without the field — callers that need
|
||||
// the token will get a clear-shaped fallback (auth_token absent
|
||||
// from response = use the admin mint path).
|
||||
resp := gin.H{
|
||||
c.JSON(http.StatusCreated, gin.H{
|
||||
"id": id,
|
||||
"status": "provisioning",
|
||||
"workspace_access": workspaceAccess,
|
||||
}
|
||||
if authToken, tokErr := wsauth.IssueToken(ctx, db.DB, id); tokErr != nil {
|
||||
log.Printf("Create workspace %s: inline auth_token mint failed (non-fatal — caller can use POST /admin/workspaces/:id/tokens): %v", id, tokErr)
|
||||
} else {
|
||||
resp["auth_token"] = authToken
|
||||
}
|
||||
|
||||
c.JSON(http.StatusCreated, resp)
|
||||
})
|
||||
}
|
||||
|
||||
// addProvisionTimeoutMs decorates a workspace response map with the
|
||||
|
||||
@@ -1,200 +0,0 @@
|
||||
package handlers
|
||||
|
||||
// Sqlmock-backed coverage for workspace_abilities.go (PatchAbilities).
|
||||
// Closes #1312 — handler was at 0% coverage.
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
func patchAbilitiesReq(t *testing.T, wsID string, body string) *httptest.ResponseRecorder {
|
||||
t.Helper()
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: wsID}}
|
||||
c.Request = httptest.NewRequest("PATCH", "/workspaces/"+wsID+"/abilities", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
PatchAbilities(c)
|
||||
return w
|
||||
}
|
||||
|
||||
// ---------- Validation errors ----------
|
||||
|
||||
func TestPatchAbilities_InvalidWorkspaceID(t *testing.T) {
|
||||
w := patchAbilitiesReq(t, "not-a-uuid", `{"broadcast_enabled":true}`)
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Fatalf("expected 400, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPatchAbilities_InvalidJSON(t *testing.T) {
|
||||
w := patchAbilitiesReq(t, wsUUID1, `not json`)
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Fatalf("expected 400, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPatchAbilities_EmptyBody(t *testing.T) {
|
||||
w := patchAbilitiesReq(t, wsUUID1, `{}`)
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Fatalf("expected 400, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
// ---------- Not found ----------
|
||||
|
||||
func TestPatchAbilities_WorkspaceNotFound(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
mock.ExpectQuery(`SELECT EXISTS\(SELECT 1 FROM workspaces WHERE id = \$1 AND status != 'removed'\)`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(false))
|
||||
|
||||
w := patchAbilitiesReq(t, wsUUID1, `{"broadcast_enabled":true}`)
|
||||
if w.Code != http.StatusNotFound {
|
||||
t.Fatalf("expected 404, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPatchAbilities_ExistsQueryError(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
mock.ExpectQuery(`SELECT EXISTS\(SELECT 1 FROM workspaces WHERE id = \$1 AND status != 'removed'\)`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnError(errors.New("conn refused"))
|
||||
|
||||
w := patchAbilitiesReq(t, wsUUID1, `{"broadcast_enabled":true}`)
|
||||
if w.Code != http.StatusNotFound {
|
||||
t.Fatalf("expected 404 on exists query error, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
// ---------- Happy paths ----------
|
||||
|
||||
func TestPatchAbilities_BroadcastOnly(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
mock.ExpectQuery(`SELECT EXISTS\(SELECT 1 FROM workspaces WHERE id = \$1 AND status != 'removed'\)`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
|
||||
mock.ExpectExec(`UPDATE workspaces SET broadcast_enabled = \$2, updated_at = now\(\) WHERE id = \$1`).
|
||||
WithArgs(wsUUID1, true).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := patchAbilitiesReq(t, wsUUID1, `{"broadcast_enabled":true}`)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPatchAbilities_TalkToUserOnly(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
mock.ExpectQuery(`SELECT EXISTS\(SELECT 1 FROM workspaces WHERE id = \$1 AND status != 'removed'\)`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
|
||||
mock.ExpectExec(`UPDATE workspaces SET talk_to_user_enabled = \$2, updated_at = now\(\) WHERE id = \$1`).
|
||||
WithArgs(wsUUID1, false).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := patchAbilitiesReq(t, wsUUID1, `{"talk_to_user_enabled":false}`)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPatchAbilities_BothFields(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
mock.ExpectQuery(`SELECT EXISTS\(SELECT 1 FROM workspaces WHERE id = \$1 AND status != 'removed'\)`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
|
||||
mock.ExpectExec(`UPDATE workspaces SET broadcast_enabled = \$2, updated_at = now\(\) WHERE id = \$1`).
|
||||
WithArgs(wsUUID1, true).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec(`UPDATE workspaces SET talk_to_user_enabled = \$2, updated_at = now\(\) WHERE id = \$1`).
|
||||
WithArgs(wsUUID1, true).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := patchAbilitiesReq(t, wsUUID1, `{"broadcast_enabled":true,"talk_to_user_enabled":true}`)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ---------- DB errors on update ----------
|
||||
|
||||
func TestPatchAbilities_BroadcastUpdateError(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
mock.ExpectQuery(`SELECT EXISTS\(SELECT 1 FROM workspaces WHERE id = \$1 AND status != 'removed'\)`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
|
||||
mock.ExpectExec(`UPDATE workspaces SET broadcast_enabled = \$2, updated_at = now\(\) WHERE id = \$1`).
|
||||
WithArgs(wsUUID1, true).
|
||||
WillReturnError(errors.New("disk full"))
|
||||
|
||||
w := patchAbilitiesReq(t, wsUUID1, `{"broadcast_enabled":true}`)
|
||||
if w.Code != http.StatusInternalServerError {
|
||||
t.Fatalf("expected 500, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPatchAbilities_TalkToUserUpdateError(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
mock.ExpectQuery(`SELECT EXISTS\(SELECT 1 FROM workspaces WHERE id = \$1 AND status != 'removed'\)`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
|
||||
mock.ExpectExec(`UPDATE workspaces SET talk_to_user_enabled = \$2, updated_at = now\(\) WHERE id = \$1`).
|
||||
WithArgs(wsUUID1, false).
|
||||
WillReturnError(errors.New("disk full"))
|
||||
|
||||
w := patchAbilitiesReq(t, wsUUID1, `{"talk_to_user_enabled":false}`)
|
||||
if w.Code != http.StatusInternalServerError {
|
||||
t.Fatalf("expected 500, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPatchAbilities_BothFields_BroadcastFails(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
mock.ExpectQuery(`SELECT EXISTS\(SELECT 1 FROM workspaces WHERE id = \$1 AND status != 'removed'\)`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
|
||||
mock.ExpectExec(`UPDATE workspaces SET broadcast_enabled = \$2, updated_at = now\(\) WHERE id = \$1`).
|
||||
WithArgs(wsUUID1, true).
|
||||
WillReturnError(errors.New("disk full"))
|
||||
|
||||
w := patchAbilitiesReq(t, wsUUID1, `{"broadcast_enabled":true,"talk_to_user_enabled":true}`)
|
||||
if w.Code != http.StatusInternalServerError {
|
||||
t.Fatalf("expected 500, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
@@ -91,18 +91,6 @@ func (h *WorkspaceHandler) BootstrapFailed(c *gin.Context) {
|
||||
"log_tail": tail,
|
||||
"source": "bootstrap_watcher",
|
||||
})
|
||||
|
||||
// RFC internal#742 Part 2: this is one of the two boot-failure
|
||||
// verdict points. We've just flipped a still-running (but
|
||||
// unconfigured) workspace EC2 to `failed`; the control plane will
|
||||
// reap the instance shortly. Capture a forensic rescue bundle off
|
||||
// the live box NOW, before it's torn down, so a wedged workspace is
|
||||
// post-mortem-inspectable. Best-effort + non-blocking: runs in its
|
||||
// own goroutine with its own timeout, detached from this request's
|
||||
// context (which is cancelled the instant this handler returns).
|
||||
// Failure to capture never changes the boot-failure handling.
|
||||
captureRescueBundle(id, "bootstrap_watcher")
|
||||
|
||||
log.Printf("BootstrapFailed: marked %s failed (tail=%d bytes, err=%q)", id, len(tail), errMsg)
|
||||
c.JSON(http.StatusOK, gin.H{"ok": true})
|
||||
}
|
||||
|
||||
@@ -85,15 +85,15 @@ func (h *BroadcastHandler) Broadcast(c *gin.Context) {
|
||||
var orgRootID string
|
||||
err = db.DB.QueryRowContext(ctx, `
|
||||
WITH RECURSIVE org_chain AS (
|
||||
SELECT id, parent_id
|
||||
SELECT id, parent_id, id AS root_id
|
||||
FROM workspaces
|
||||
WHERE id = $1
|
||||
UNION ALL
|
||||
SELECT w.id, w.parent_id
|
||||
SELECT w.id, w.parent_id, c.root_id
|
||||
FROM workspaces w
|
||||
JOIN org_chain c ON w.id = c.parent_id
|
||||
)
|
||||
SELECT id AS root_id FROM org_chain WHERE parent_id IS NULL LIMIT 1
|
||||
SELECT root_id FROM org_chain WHERE parent_id IS NULL LIMIT 1
|
||||
`, senderID).Scan(&orgRootID)
|
||||
if err != nil {
|
||||
log.Printf("Broadcast: org root lookup for %s: %v", senderID, err)
|
||||
|
||||
@@ -1,144 +0,0 @@
|
||||
//go:build integration
|
||||
// +build integration
|
||||
|
||||
// workspace_broadcast_org_root_integration_test.go — REAL Postgres
|
||||
// regression test for #1959: the Broadcast org-root recursive CTE.
|
||||
//
|
||||
// Run with:
|
||||
//
|
||||
// INTEGRATION_DB_URL="postgres://postgres:test@localhost:55432/molecule?sslmode=disable" \
|
||||
// go test -tags=integration ./internal/handlers/ -run Integration_BroadcastOrgRoot -v
|
||||
//
|
||||
// CI: piggybacks on .github/workflows/handlers-postgres-integration.yml
|
||||
// (path-filter includes workspace-server/internal/handlers/**).
|
||||
//
|
||||
// Why this is NOT a sqlmock test
|
||||
// ------------------------------
|
||||
// The unit tests in workspace_broadcast_test.go use sqlmock, which
|
||||
// returns whatever rows the test stubs — it CANNOT execute the
|
||||
// recursive CTE, so it cannot catch the #1959 bug where the anchor
|
||||
// pinned `id AS root_id` to the SENDER's own id and carried it
|
||||
// unchanged up the chain. With that bug a non-root sender resolved
|
||||
// ITSELF as the org root (wrong broadcast scoping). Only a real
|
||||
// Postgres can prove the corrected CTE resolves UP to the true
|
||||
// null-parent ancestor.
|
||||
//
|
||||
// The query under test is copied verbatim from Broadcast() in
|
||||
// workspace_broadcast.go; if that query changes, this test must be
|
||||
// updated in lockstep (it is the real-artifact gate for the fix).
|
||||
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/google/uuid"
|
||||
_ "github.com/lib/pq"
|
||||
)
|
||||
|
||||
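// orgRootCTE is a verbatim copy of the org-root resolution query in
// Broadcast(). Because it is a copy, the test exercises the text below, not
// the handler itself: if the handler's query changes (for example back to
// the #1959 sender-id-pinned form), this constant must be updated in lockstep.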
|
||||
const orgRootCTE = `
|
||||
WITH RECURSIVE org_chain AS (
|
||||
SELECT id, parent_id
|
||||
FROM workspaces
|
||||
WHERE id = $1
|
||||
UNION ALL
|
||||
SELECT w.id, w.parent_id
|
||||
FROM workspaces w
|
||||
JOIN org_chain c ON w.id = c.parent_id
|
||||
)
|
||||
SELECT id AS root_id FROM org_chain WHERE parent_id IS NULL LIMIT 1
|
||||
`
|
||||
|
||||
// integrationDB_BroadcastOrgRoot opens the Postgres database named by the
// INTEGRATION_DB_URL environment variable, pings it, and returns the pool.
// The pool is closed automatically when the test finishes.
//
// The test is skipped when INTEGRATION_DB_URL is empty or unset, and fails
// immediately when the database cannot be opened or pinged.
func integrationDB_BroadcastOrgRoot(t *testing.T) *sql.DB {
	t.Helper()
	url := os.Getenv("INTEGRATION_DB_URL")
	if url == "" {
		t.Skip("INTEGRATION_DB_URL not set; skipping (see file header)")
	}
	conn, err := sql.Open("postgres", url)
	if err != nil {
		t.Fatalf("open: %v", err)
	}
	// Register the close before pinging: t.Fatalf on a failed ping exits the
	// test goroutine, and cleanups registered after that point never exist.
	t.Cleanup(func() { conn.Close() })
	if err := conn.Ping(); err != nil {
		t.Fatalf("ping: %v", err)
	}
	return conn
}
|
||||
|
||||
// TestIntegration_BroadcastOrgRoot_NonRootSenderResolvesToRoot builds a
|
||||
// real three-level org chain in Postgres:
|
||||
//
|
||||
// root (parent_id = NULL)
|
||||
// └── mid (parent_id = root)
|
||||
// └── leaf (parent_id = mid) ← non-root sender
|
||||
//
|
||||
// and runs the handler's org-root CTE for each node. Every node — root,
|
||||
// mid, and leaf — MUST resolve to `root`. Under the #1959 bug the leaf
|
||||
// (and mid) resolved to themselves; this test pins the fix.
|
||||
func TestIntegration_BroadcastOrgRoot_NonRootSenderResolvesToRoot(t *testing.T) {
|
||||
conn := integrationDB_BroadcastOrgRoot(t)
|
||||
ctx := context.Background()
|
||||
|
||||
prefix := fmt.Sprintf("itest-bcastroot-%s", uuid.New().String()[:8])
|
||||
t.Cleanup(func() {
|
||||
if _, err := conn.ExecContext(ctx,
|
||||
`DELETE FROM workspaces WHERE name LIKE $1`, prefix+"%"); err != nil {
|
||||
t.Logf("cleanup (non-fatal): %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
rootID := uuid.New().String()
|
||||
midID := uuid.New().String()
|
||||
leafID := uuid.New().String()
|
||||
|
||||
// root — parent_id NULL.
|
||||
if _, err := conn.ExecContext(ctx, `
|
||||
INSERT INTO workspaces (id, name, tier, runtime, status, parent_id)
|
||||
VALUES ($1, $2, 2, 'claude-code', 'online', NULL)
|
||||
`, rootID, prefix+"-root"); err != nil {
|
||||
t.Fatalf("seed root: %v", err)
|
||||
}
|
||||
// mid — child of root.
|
||||
if _, err := conn.ExecContext(ctx, `
|
||||
INSERT INTO workspaces (id, name, tier, runtime, status, parent_id)
|
||||
VALUES ($1, $2, 2, 'claude-code', 'online', $3)
|
||||
`, midID, prefix+"-mid", rootID); err != nil {
|
||||
t.Fatalf("seed mid: %v", err)
|
||||
}
|
||||
// leaf — child of mid (a non-root, non-direct-child sender).
|
||||
if _, err := conn.ExecContext(ctx, `
|
||||
INSERT INTO workspaces (id, name, tier, runtime, status, parent_id)
|
||||
VALUES ($1, $2, 2, 'claude-code', 'online', $3)
|
||||
`, leafID, prefix+"-leaf", midID); err != nil {
|
||||
t.Fatalf("seed leaf: %v", err)
|
||||
}
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
senderID string
|
||||
}{
|
||||
{"root sender resolves to itself", rootID},
|
||||
{"mid sender resolves to root", midID},
|
||||
{"leaf (deep non-root) sender resolves to root", leafID},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
var got string
|
||||
if err := conn.QueryRowContext(ctx, orgRootCTE, tc.senderID).Scan(&got); err != nil {
|
||||
t.Fatalf("org-root CTE for %s: %v", tc.senderID, err)
|
||||
}
|
||||
if got != rootID {
|
||||
t.Errorf("org root for sender %s = %s; want %s (the true null-parent ancestor) — #1959 regression: a non-root sender resolved to the wrong root",
|
||||
tc.senderID, got, rootID)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -168,8 +168,6 @@ func TestWorkspaceBudget_Create_WithLimit(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("INSERT INTO structure_events").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("INSERT INTO workspace_auth_tokens").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
|
||||
@@ -108,8 +108,6 @@ func TestWorkspaceCreate_WithCompute_PersistsComputeJSON(t *testing.T) {
|
||||
mock.ExpectCommit()
|
||||
mock.ExpectExec("INSERT INTO canvas_layouts").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("INSERT INTO workspace_auth_tokens").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
|
||||
@@ -93,16 +93,3 @@ func formatMissingBYOKCredentialError(mode string) string {
|
||||
mode,
|
||||
)
|
||||
}
|
||||
|
||||
// formatMissingPlatformProxyError returns the user-facing explanation for a
// provision failure on a platform-managed workspace whose control-plane proxy
// environment is missing (#2162).
//
// The text names MOLECULE_LLM_BASE_URL and MOLECULE_LLM_USAGE_TOKEN as the
// required variables, describes the condition as usually a transient
// boot-race, and advises retrying after 30 seconds before escalating to the
// platform team. The message is fixed; it takes no inputs.
func formatMissingPlatformProxyError() string {
	const (
		cause  = "this workspace is configured for platform-managed LLM billing but the control-plane proxy is not ready. "
		detail = "The required platform proxy env (MOLECULE_LLM_BASE_URL + MOLECULE_LLM_USAGE_TOKEN) is absent. "
		advice = "This is usually a transient boot-race; retry in 30 seconds. If it persists, verify the platform proxy " +
			"is configured for this tenant/runtime and contact the platform team."
	)
	return cause + detail + advice
}
|
||||
|
||||
@@ -1003,13 +1003,12 @@ func applyPlatformManagedLLMEnv(ctx context.Context, envVars map[string]string,
|
||||
anthropicBaseURL := firstNonEmptyEnv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "ANTHROPIC_BASE_URL")
|
||||
token := firstNonEmptyEnv("MOLECULE_LLM_USAGE_TOKEN", "OPENAI_API_KEY")
|
||||
if baseURL == "" || token == "" {
|
||||
// Proxy not configured (boot race / misconfig). The platform_managed
|
||||
// path REQUIRES the CP proxy env to inject a usable credential.
|
||||
// Reporting HasUsableLLMCred=true here would start the workspace
|
||||
// credential-less — the adk-demo dark-wedge class (#2162).
|
||||
// Return false so the caller's fail-closed branch aborts with
|
||||
// MISSING_PLATFORM_PROXY.
|
||||
return platformLLMEnvResult{ResolvedMode: res.ResolvedMode, HasUsableLLMCred: false, Source: res.Source}
|
||||
// Proxy not configured (boot race / misconfig). On the platform_managed
|
||||
// path the workspace IS entitled to platform creds, so we do NOT strip
|
||||
// here — but we report HasUsableLLMCred from whatever survived so the
|
||||
// caller's fail-closed branch (non-platform only) is never reached on
|
||||
// this path.
|
||||
return platformLLMEnvResult{ResolvedMode: res.ResolvedMode, HasUsableLLMCred: true, Source: res.Source}
|
||||
}
|
||||
stripPlatformManagedLLMBypassEnv(envVars)
|
||||
|
||||
|
||||
@@ -134,11 +134,6 @@ func TestProvisionWorkspaceAuto_NoBackendMarksFailed(t *testing.T) {
|
||||
// This is the regression-prevention test for the Design Director bug
|
||||
// where 7-of-7 sub-agents went down the Docker path on SaaS.
|
||||
func TestProvisionWorkspaceAuto_RoutesToCPWhenSet(t *testing.T) {
|
||||
// Supply the CP proxy env so the platform-managed default does not abort
|
||||
// with MISSING_PLATFORM_PROXY (molecule-core#2162).
|
||||
t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1")
|
||||
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token")
|
||||
|
||||
mock := setupTestDB(t)
|
||||
mock.MatchExpectationsInOrder(false)
|
||||
|
||||
@@ -602,11 +597,6 @@ func TestNoCallSiteCallsBareStop(t *testing.T) {
|
||||
// count without mocking out the retry helper itself, which would
|
||||
// invert the test contract — the retry IS the dispatcher's job here).
|
||||
func TestRestartWorkspaceAuto_RoutesToCPWhenSet(t *testing.T) {
|
||||
// Supply the CP proxy env so the platform-managed default does not abort
|
||||
// with MISSING_PLATFORM_PROXY (molecule-core#2162).
|
||||
t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1")
|
||||
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token")
|
||||
|
||||
rec := &trackingCPProv{}
|
||||
bcast := &concurrentSafeBroadcaster{}
|
||||
h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir())
|
||||
@@ -805,11 +795,6 @@ func TestResumeHandler_UsesProvisionWorkspaceAuto(t *testing.T) {
|
||||
// the async tests; the absence of `go` semantics is the load-bearing
|
||||
// distinction we're pinning.
|
||||
func TestProvisionWorkspaceAutoSync_RoutesToCPWhenSet(t *testing.T) {
|
||||
// Supply the CP proxy env so the platform-managed default does not abort
|
||||
// with MISSING_PLATFORM_PROXY (molecule-core#2162).
|
||||
t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1")
|
||||
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token")
|
||||
|
||||
mock := setupTestDB(t)
|
||||
mock.MatchExpectationsInOrder(false)
|
||||
// provisionWorkspaceCP runs prepareProvisionContext synchronously, which
|
||||
|
||||
@@ -98,11 +98,6 @@ func (r *recordingCPProv) startedSet() map[string]struct{} {
|
||||
func TestProvisionWorkspaceCP_ConcurrentBurst_NoSilentDrop(t *testing.T) {
|
||||
const numWorkspaces = 7
|
||||
|
||||
// Supply the CP proxy env so the platform-managed default does not abort
|
||||
// with MISSING_PLATFORM_PROXY (molecule-core#2162).
|
||||
t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1")
|
||||
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token")
|
||||
|
||||
mock := setupTestDB(t)
|
||||
|
||||
// Every goroutine runs prepareProvisionContext → mintWorkspaceSecrets
|
||||
|
||||
@@ -230,18 +230,6 @@ func (h *WorkspaceHandler) prepareProvisionContext(
|
||||
Extra: map[string]interface{}{"error": msg, "code": "MISSING_BYOK_CREDENTIAL", "billing_mode": llmRes.ResolvedMode, "issue": "1994"},
|
||||
}
|
||||
}
|
||||
// Fail closed for a platform-managed workspace whose CP proxy env is
|
||||
// absent: do NOT start it credential-less (adk-demo dark-wedge class,
|
||||
// #2162). The platform_managed path requires the proxy injection to
|
||||
// produce a usable credential.
|
||||
if llmRes.ResolvedMode == LLMBillingModePlatformManaged && !llmRes.HasUsableLLMCred {
|
||||
msg := formatMissingPlatformProxyError()
|
||||
log.Printf("Provisioner: ABORT workspace=%s — platform-managed billing mode but CP proxy env absent (MISSING_PLATFORM_PROXY, molecule-core#2162)", workspaceID)
|
||||
return nil, &provisionAbort{
|
||||
Msg: msg,
|
||||
Extra: map[string]interface{}{"error": msg, "code": "MISSING_PLATFORM_PROXY", "billing_mode": llmRes.ResolvedMode, "issue": "2162"},
|
||||
}
|
||||
}
|
||||
applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model)
|
||||
if payload.Role != "" {
|
||||
envVars["MOLECULE_AGENT_ROLE"] = payload.Role
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user