Compare commits
20 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 861ecc52ca | |||
| 7385a3a1c0 | |||
| 7219f3dc64 | |||
| 6a19b98918 | |||
| e2d7ff0df8 | |||
| 3870dd2dce | |||
| 59405ab775 | |||
| e5438c49ed | |||
| 556d57e09d | |||
| 54f43044f3 | |||
| 3fa4230b5a | |||
| 602c72f342 | |||
| 6c9cfdac3a | |||
| 0fd54c4272 | |||
| bf1f1750fa | |||
| b6342d4afd | |||
| db45ac45a7 | |||
| 894bd07285 | |||
| 00705c11cd | |||
| f1ccd3bb05 |
@@ -116,28 +116,65 @@ fi
|
||||
# 3. Status-check state at the PR HEAD (where checks ran). The merge
|
||||
# commit doesn't get its own checks; we evaluate the PR's last
|
||||
# commit, which is what branch protection compared against.
|
||||
# Fail-closed: verify HTTP 200. A 401/403/404 means the status is
|
||||
# unreadable — we must NOT treat that as "no statuses" and skip checks.
|
||||
STATUS_TMP=$(mktemp)
|
||||
STATUS_HTTP=$(curl -sS -o "$STATUS_TMP" -w '%{http_code}' -H "$AUTH" \
|
||||
"${API}/repos/${OWNER}/${NAME}/commits/${HEAD_SHA}/status")
|
||||
STATUS=$(cat "$STATUS_TMP")
|
||||
rm -f "$STATUS_TMP"
|
||||
if [ "$STATUS_HTTP" != "200" ]; then
|
||||
echo "::error::GET /commits/${HEAD_SHA}/status returned HTTP ${STATUS_HTTP} — cannot evaluate required checks."
|
||||
exit 1
|
||||
fi
|
||||
# FAIL-CLOSED: a 200 status response missing the 'statuses' array, or with
|
||||
# 'statuses' set to a non-array type (null/string/object), must NOT be treated
|
||||
# as "no checks" — that would silently declare all checks green.
|
||||
if ! echo "$STATUS" | jq -e '(.statuses | type) == "array"' >/dev/null; then
|
||||
echo "::error::GET /commits/${HEAD_SHA}/status returned HTTP 200 but 'statuses' is missing or not an array — cannot evaluate required checks."
|
||||
exit 1
|
||||
fi
|
||||
#
|
||||
# Pagination (status-pagination RCA, #2440-family): the combined
|
||||
# /commits/{sha}/status endpoint caps its embedded `statuses` array at the
|
||||
# Gitea default page size (~30). On a high-churn PR an older-but-still-current
|
||||
# required-context SUCCESS row is pushed PAST that cap, so reading the combined
|
||||
# view would record that context as `missing` and emit a FALSE-POSITIVE
|
||||
# force-merge. We instead page through the dedicated /commits/{sha}/statuses
|
||||
# list to EXHAUSTION (until a short/empty page), accumulating every row.
|
||||
#
|
||||
# Fail-closed is preserved end to end: any non-200 page, or a page whose body
|
||||
# is not a JSON array, aborts with exit 1 (we never treat an unreadable/partial
|
||||
# page as "no checks"). A genuinely-absent required context appears on NO page,
|
||||
# so CHECK_STATE has no entry for it → `${...:-missing}` below keeps it
|
||||
# `missing` → it is still counted as not-green. No fail-open path is added.
|
||||
PER_PAGE=100
|
||||
page=1
|
||||
ALL_STATUSES_TMP=$(mktemp)
|
||||
printf '[]' > "$ALL_STATUSES_TMP" # accumulator: a single JSON array of rows
|
||||
while :; do
|
||||
STATUS_TMP=$(mktemp)
|
||||
STATUS_HTTP=$(curl -sS -o "$STATUS_TMP" -w '%{http_code}' -H "$AUTH" \
|
||||
"${API}/repos/${OWNER}/${NAME}/commits/${HEAD_SHA}/statuses?page=${page}&limit=${PER_PAGE}")
|
||||
PAGE_BODY=$(cat "$STATUS_TMP")
|
||||
rm -f "$STATUS_TMP"
|
||||
if [ "$STATUS_HTTP" != "200" ]; then
|
||||
rm -f "$ALL_STATUSES_TMP"
|
||||
echo "::error::GET /commits/${HEAD_SHA}/statuses?page=${page} returned HTTP ${STATUS_HTTP} — cannot evaluate required checks."
|
||||
exit 1
|
||||
fi
|
||||
# FAIL-CLOSED: the /statuses endpoint returns a bare JSON array. A non-array
|
||||
# body (null/object/string) means the response is malformed — we must NOT
|
||||
# treat that as "no checks", which would silently declare all checks green.
|
||||
if ! echo "$PAGE_BODY" | jq -e 'type == "array"' >/dev/null 2>&1; then
|
||||
rm -f "$ALL_STATUSES_TMP"
|
||||
echo "::error::GET /commits/${HEAD_SHA}/statuses?page=${page} returned HTTP 200 but body is not a JSON array — cannot evaluate required checks."
|
||||
exit 1
|
||||
fi
|
||||
PAGE_COUNT=$(echo "$PAGE_BODY" | jq 'length')
|
||||
# Append this page's rows to the accumulator (insertion order is preserved
|
||||
# but NOT relied upon — the collapse below selects max-by-id per context).
|
||||
COMBINED=$(jq -s '.[0] + .[1]' "$ALL_STATUSES_TMP" <(echo "$PAGE_BODY"))
|
||||
printf '%s' "$COMBINED" > "$ALL_STATUSES_TMP"
|
||||
# Short page (fewer than PER_PAGE rows) ⇒ last page ⇒ stop.
|
||||
if [ "$PAGE_COUNT" -lt "$PER_PAGE" ]; then
|
||||
break
|
||||
fi
|
||||
page=$((page + 1))
|
||||
done
|
||||
STATUS=$(cat "$ALL_STATUSES_TMP")
|
||||
rm -f "$ALL_STATUSES_TMP"
|
||||
declare -A CHECK_STATE
|
||||
# Gitea's /commits/{sha}/statuses is roughly newest-first but NOT strictly
|
||||
# monotonic by id (observed first ids 157,155,156,… — local inversions from
|
||||
# re-runs and page boundaries), so neither first- nor last-occurrence reliably
|
||||
# yields the current row. Select the MAX-id row per context explicitly
|
||||
# (order-independent), matching prod-auto-deploy.py's latest_status_for_context.
|
||||
while IFS=$'\t' read -r ctx state; do
|
||||
[ -n "$ctx" ] && CHECK_STATE[$ctx]="$state"
|
||||
done < <(echo "$STATUS" | jq -r '.statuses | .[] | "\(.context)\t\(.status)"')
|
||||
done < <(echo "$STATUS" | jq -r 'group_by(.context) | map(max_by(.id)) | .[] | "\(.context)\t\(.status)"')
|
||||
|
||||
# 4. For each required check, was it green at merge? YAML block scalars
|
||||
# (`|`) leave a trailing newline; skip blank/whitespace-only lines.
|
||||
|
||||
@@ -30,6 +30,11 @@ PROFILES: dict[str, dict[str, str]] = {
|
||||
# workflow (they reuse its migrated Postgres), so changes to the
|
||||
# scheduler package must trigger the job too.
|
||||
r"|^workspace-server/internal/scheduler/"
|
||||
# #2150: the db package's real-PG migration-replay-from-scratch
|
||||
# + InitPostgres ping tests also run in this same workflow (they
|
||||
# reuse its sibling Postgres, against a separate `molecule_replay`
|
||||
# database). Changes to db must trigger the job too.
|
||||
r"|^workspace-server/internal/db/"
|
||||
r"|^workspace-server/migrations/"
|
||||
r"|^\.gitea/workflows/handlers-postgres-integration\.yml$"
|
||||
),
|
||||
|
||||
@@ -95,17 +95,27 @@ def build_plan(env: dict[str, str]) -> dict:
|
||||
|
||||
|
||||
def latest_status_for_context(statuses: list[dict], context: str) -> dict | None:
|
||||
"""Return the first matching status.
|
||||
"""Return the NEWEST status row for ``context`` (highest ``id``).
|
||||
|
||||
Gitea's combined-status response is newest-first in practice. The merge
|
||||
queue relies on the same contract; keeping the selector explicit makes
|
||||
stale duplicate contexts easy to test.
|
||||
This must work for BOTH orderings Gitea exposes: the combined
|
||||
``/status`` view is newest-first, but the exhaustively-paginated
|
||||
``/statuses`` list (see ``fetch_all_statuses``) is ascending id order
|
||||
(oldest-first). Selecting by max ``id`` collapses duplicate context rows
|
||||
to the current one regardless of input order, so a stale earlier run can
|
||||
never shadow the latest result. Rows without an ``id`` are treated as
|
||||
oldest (id -1) so a well-formed newer row always wins.
|
||||
"""
|
||||
|
||||
newest: dict | None = None
|
||||
newest_id = -1
|
||||
for status in statuses:
|
||||
if status.get("context") == context:
|
||||
return status
|
||||
return None
|
||||
if status.get("context") != context:
|
||||
continue
|
||||
raw_id = status.get("id")
|
||||
sid = raw_id if isinstance(raw_id, int) else -1
|
||||
if newest is None or sid >= newest_id:
|
||||
newest = status
|
||||
newest_id = sid
|
||||
return newest
|
||||
|
||||
|
||||
def ci_context_state(statuses: list[dict], context: str) -> str:
|
||||
@@ -351,6 +361,55 @@ def _api_json(url: str, token: str) -> dict:
|
||||
raise RuntimeError(f"GET {url} -> HTTP {exc.code}: {body}") from exc
|
||||
|
||||
|
||||
def _api_json_list(url: str, token: str) -> list:
|
||||
"""GET a Gitea list endpoint and return the JSON array.
|
||||
|
||||
Like ``_api_json`` but asserts the body is a list. Fail-closed: a non-list
|
||||
body (or HTTP error) raises so the caller never mistakes an unreadable page
|
||||
for "no more statuses" and silently truncates the required-context scan.
|
||||
"""
|
||||
req = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=20) as resp:
|
||||
body = json.loads(resp.read())
|
||||
except urllib.error.HTTPError as exc:
|
||||
detail = exc.read().decode("utf-8", errors="replace")[:500]
|
||||
raise RuntimeError(f"GET {url} -> HTTP {exc.code}: {detail}") from exc
|
||||
if not isinstance(body, list):
|
||||
raise RuntimeError(f"GET {url} -> expected JSON array, got {type(body).__name__}")
|
||||
return body
|
||||
|
||||
|
||||
def fetch_all_statuses(host: str, repo: str, sha: str, token: str, page_size: int = 100) -> list[dict]:
|
||||
"""Return EVERY commit-status row for ``sha``, paginating to exhaustion.
|
||||
|
||||
The combined ``/commits/{sha}/status`` endpoint caps its embedded
|
||||
``statuses`` array at the Gitea default page size (~30). On a high-churn
|
||||
commit, an older-but-still-current required-context SUCCESS row is pushed
|
||||
PAST that cap, so a reader of the combined view sees the required context
|
||||
as ``missing`` and either blocks (force-merge audit) or waits forever
|
||||
(this deploy gate). We instead walk ``/commits/{sha}/statuses`` page by
|
||||
page until a short/empty page, accumulating ALL rows.
|
||||
|
||||
Fail-closed: any page that errors or is not a list raises (see
|
||||
``_api_json_list``) — we never degrade to a partial list and call a deploy
|
||||
green. A genuinely-absent required context simply never appears on ANY
|
||||
page, so the caller's ``ci_context_state`` still reports ``missing`` and
|
||||
the gate stays closed.
|
||||
"""
|
||||
base = f"https://{host}/api/v1/repos/{repo}/commits/{sha}/statuses"
|
||||
results: list[dict] = []
|
||||
page = 1
|
||||
while True:
|
||||
page_url = f"{base}?page={page}&limit={page_size}"
|
||||
rows = _api_json_list(page_url, token)
|
||||
results.extend(r for r in rows if isinstance(r, dict))
|
||||
if len(rows) < page_size:
|
||||
break
|
||||
page += 1
|
||||
return results
|
||||
|
||||
|
||||
def _api_json_optional(url: str, token: str) -> tuple[int, dict | None]:
|
||||
req = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
|
||||
try:
|
||||
@@ -472,12 +531,19 @@ def wait_for_ci_context(env: dict[str, str]) -> str:
|
||||
if not token:
|
||||
raise ValueError("GITEA_TOKEN is required to wait for CI status")
|
||||
|
||||
url = f"https://{host}/api/v1/repos/{repo}/commits/{sha}/status"
|
||||
deadline = time.time() + timeout
|
||||
last_states: dict[str, str] = {}
|
||||
while time.time() <= deadline:
|
||||
body = _api_json(url, token)
|
||||
statuses = body.get("statuses") or []
|
||||
# Read the FULL, exhaustively-paginated /statuses list — NOT the
|
||||
# combined /status view, whose embedded `statuses` array is capped at
|
||||
# the Gitea page size (~30). On a high-churn commit a required-context
|
||||
# SUCCESS row lands past that cap and the combined view would report
|
||||
# it `missing`, so this gate would wait until timeout and refuse a
|
||||
# legitimate prod deploy. Fetching every page closes that hole.
|
||||
# Fail-closed is preserved: a genuinely-absent required context is on
|
||||
# NO page, so ci_context_state() still returns "missing" → never
|
||||
# satisfied → the deploy stays blocked.
|
||||
statuses = fetch_all_statuses(host, repo, sha, token)
|
||||
states = {context: ci_context_state(statuses, context) for context in contexts}
|
||||
for context, state in states.items():
|
||||
if state != last_states.get(context):
|
||||
|
||||
@@ -115,5 +115,79 @@ T16=$(validate_required_checks_json "main" '{"main":"CI / all-required"}')
|
||||
[ "$T16" = "false" ] || fail "T16: string branch entry should fail"
|
||||
pass "T16: string branch entry fails"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# T17+ — /statuses pagination (status-pagination RCA, #2440-family).
|
||||
# The reader now pages /commits/{sha}/statuses to exhaustion instead of reading
|
||||
# the capped combined /status view. These lock the page-accumulation,
|
||||
# newest-wins collapse, short-page stop, and fail-closed contracts.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Page-body type validator used per page (bare array, not an object).
|
||||
validate_page_is_array() { jq -e 'type == "array"' >/dev/null 2>&1 && echo true || echo false; }
|
||||
|
||||
# newest-wins collapse: mirror the script's max-by-id jq (order-independent).
|
||||
collapse_newest_per_context() {
|
||||
declare -A CS
|
||||
while IFS=$'\t' read -r ctx state; do
|
||||
[ -n "$ctx" ] && CS[$ctx]="$state"
|
||||
done < <(jq -r 'group_by(.context) | map(max_by(.id)) | .[] | "\(.context)\t\(.status)"')
|
||||
state="${CS[CI / all-required (push)]:-missing}"
|
||||
echo "$state"
|
||||
}
|
||||
|
||||
# T17 — a bare JSON array page passes the per-page array check.
|
||||
T17=$(echo '[{"context":"c1","status":"success"}]' | validate_page_is_array)
|
||||
[ "$T17" = "true" ] || fail "T17: bare array page should pass array check"
|
||||
pass "T17: bare array page passes array check"
|
||||
|
||||
# T18 — a non-array page (object) fails the per-page array check → fail-closed.
|
||||
T18=$(echo '{"statuses":[]}' | validate_page_is_array)
|
||||
[ "$T18" = "false" ] || fail "T18: object page should fail array check (fail-closed)"
|
||||
pass "T18: object page fails array check (fail-closed)"
|
||||
|
||||
# T19 — required SUCCESS on PAGE 2 is FOUND after accumulation (not missing).
|
||||
# page1: 100 noise rows (older ids); page2: the required-context success.
|
||||
PAGE1=$(jq -nc '[range(0;100) | {id:., context:("noise-\(.) (push)"), status:"pending"}]')
|
||||
PAGE2='[{"id":200,"context":"CI / all-required (push)","status":"success"}]'
|
||||
# Accumulation matching the script: two-arg `jq -s '.[0] + .[1]'` over the
|
||||
# running accumulator and the new page.
|
||||
ACCUM=$(jq -s '.[0] + .[1]' <(echo "$PAGE1") <(echo "$PAGE2"))
|
||||
LEN=$(echo "$ACCUM" | jq 'length')
|
||||
[ "$LEN" = "101" ] || fail "T19: accumulated length should be 101, got $LEN"
|
||||
RESULT=$(echo "$ACCUM" | collapse_newest_per_context)
|
||||
[ "$RESULT" = "success" ] || fail "T19: required success on page2 must be FOUND, got '$RESULT'"
|
||||
pass "T19: required success on page2 is found after pagination"
|
||||
|
||||
# T20 — genuinely-absent required context across all pages stays 'missing'
|
||||
# → fail-closed (counted as not-green, flags the force-merge).
|
||||
ABSENT=$(jq -nc '[range(0;100) | {id:., context:("noise-\(.) (push)"), status:"success"}]')
|
||||
RESULT2=$(echo "$ABSENT" | collapse_newest_per_context)
|
||||
[ "$RESULT2" = "missing" ] || fail "T20: absent required context must stay 'missing', got '$RESULT2'"
|
||||
pass "T20: genuinely-absent required context stays missing (fail-closed)"
|
||||
|
||||
# T21 — non-monotonic order: newest id (157, neither first nor last in list)
|
||||
# a NEWER success row (oldest-first append → last overwrite wins).
|
||||
DUP='[{"id":155,"context":"CI / all-required (push)","status":"pending"},
|
||||
{"id":157,"context":"CI / all-required (push)","status":"success"},
|
||||
{"id":125,"context":"CI / all-required (push)","status":"failure"}]'
|
||||
RESULT3=$(echo "$DUP" | collapse_newest_per_context)
|
||||
[ "$RESULT3" = "success" ] || fail "T21: newest (success) must win over older (failure), got '$RESULT3'"
|
||||
pass "T21: newest row per context wins after pagination collapse"
|
||||
|
||||
# T22 — short-page stop condition: a page with fewer than PER_PAGE rows ends
|
||||
# the loop. Emulate the numeric comparison the script uses.
|
||||
PER_PAGE=100
|
||||
PAGE_COUNT=$(echo "$PAGE2" | jq 'length') # 1 row
|
||||
if [ "$PAGE_COUNT" -lt "$PER_PAGE" ]; then SHORT=stop; else SHORT=continue; fi
|
||||
[ "$SHORT" = "stop" ] || fail "T22: short page should stop pagination"
|
||||
pass "T22: short page stops pagination loop"
|
||||
|
||||
# T23 — a full page (== PER_PAGE) continues the loop.
|
||||
FULL=$(jq -nc '[range(0;100) | {id:., context:"x", status:"success"}]')
|
||||
FULL_COUNT=$(echo "$FULL" | jq 'length')
|
||||
if [ "$FULL_COUNT" -lt "$PER_PAGE" ]; then CONT=stop; else CONT=continue; fi
|
||||
[ "$CONT" = "continue" ] || fail "T23: full page should continue pagination"
|
||||
pass "T23: full page continues pagination loop"
|
||||
|
||||
echo
|
||||
echo "ALL AUDIT-FORCE-MERGE CHECKS PASSED"
|
||||
|
||||
@@ -105,16 +105,25 @@ def test_build_plan_disable_flag_short_circuits_before_credentials():
|
||||
assert plan["disabled_reason"] == "PROD_AUTO_DEPLOY_DISABLED=true"
|
||||
|
||||
|
||||
def test_latest_status_for_context_uses_first_matching_status():
|
||||
def test_latest_status_for_context_picks_newest_by_id_regardless_of_order():
|
||||
# The exhaustively-paginated /statuses list is ascending id order
|
||||
# (oldest-first), the opposite of the combined /status view. The selector
|
||||
# must collapse duplicate context rows to the NEWEST (max id) so a stale
|
||||
# earlier run never shadows the current result, whichever way they arrive.
|
||||
statuses = [
|
||||
{"context": "CI / all-required (push)", "status": "pending"},
|
||||
{"context": "CI / all-required (pull_request)", "status": "success"},
|
||||
{"context": "CI / all-required (push)", "status": "success"},
|
||||
{"id": 10, "context": "CI / all-required (push)", "status": "pending"},
|
||||
{"id": 11, "context": "CI / all-required (pull_request)", "status": "success"},
|
||||
{"id": 12, "context": "CI / all-required (push)", "status": "success"},
|
||||
]
|
||||
|
||||
latest = prod.latest_status_for_context(statuses, "CI / all-required (push)")
|
||||
|
||||
assert latest == {"context": "CI / all-required (push)", "status": "pending"}
|
||||
assert latest == {"id": 12, "context": "CI / all-required (push)", "status": "success"}
|
||||
|
||||
# Same rows shuffled (newest-first, as the combined view would deliver)
|
||||
# must still resolve to the same newest row.
|
||||
latest_rev = prod.latest_status_for_context(list(reversed(statuses)), "CI / all-required (push)")
|
||||
assert latest_rev == {"id": 12, "context": "CI / all-required (push)", "status": "success"}
|
||||
|
||||
|
||||
def test_ci_context_state_handles_missing_and_gitea_status_key():
|
||||
@@ -612,3 +621,123 @@ def test_superseded_by_none_for_latest_job_so_it_still_rolls(monkeypatch):
|
||||
)
|
||||
is None
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# /statuses pagination — required-context SUCCESS on page 2+ must be FOUND,
|
||||
# genuinely-absent context must STILL fail-closed (no fail-open).
|
||||
# Regression for the single-page-status bug (#2440-family, pagination RCA):
|
||||
# the combined /status view caps `statuses` at ~30, so on a high-churn commit
|
||||
# the still-current required-context row is pushed past page 1 and the reader
|
||||
# falsely reports it `missing`.
|
||||
# ---------------------------------------------------------------------------
|
||||
def _paged_statuses_stub(pages):
|
||||
"""Return a fake _api_json_list that serves `pages` keyed by ?page=N."""
|
||||
def fake(url, _token):
|
||||
# url looks like .../statuses?page=N&limit=100
|
||||
page = 1
|
||||
for part in url.split("?", 1)[-1].split("&"):
|
||||
if part.startswith("page="):
|
||||
page = int(part.split("=", 1)[1])
|
||||
return pages.get(page, [])
|
||||
return fake
|
||||
|
||||
|
||||
def test_fetch_all_statuses_finds_required_success_on_page_two(monkeypatch):
|
||||
# Page 1 is a full 100 rows of unrelated/older churn; the required-context
|
||||
# SUCCESS only appears on page 2. A single-page reader would miss it.
|
||||
page1 = [
|
||||
{"id": i, "context": f"noise-{i} (push)", "status": "pending"}
|
||||
for i in range(100)
|
||||
]
|
||||
page2 = [
|
||||
{"id": 200, "context": "CI / all-required (push)", "status": "success"},
|
||||
{"id": 201, "context": "Secret scan / Scan diff for credential-shaped strings (push)",
|
||||
"status": "success"},
|
||||
]
|
||||
monkeypatch.setattr(prod, "_api_json_list", _paged_statuses_stub({1: page1, 2: page2}))
|
||||
|
||||
rows = prod.fetch_all_statuses("git.moleculesai.app", "molecule-ai/molecule-core", "a" * 40, "tok")
|
||||
# Must have walked to page 2 and accumulated every row.
|
||||
assert len(rows) == 102
|
||||
assert prod.ci_context_state(rows, "CI / all-required (push)") == "success"
|
||||
assert (
|
||||
prod.ci_context_state(
|
||||
rows, "Secret scan / Scan diff for credential-shaped strings (push)"
|
||||
)
|
||||
== "success"
|
||||
)
|
||||
|
||||
|
||||
def test_fetch_all_statuses_genuinely_absent_context_stays_missing(monkeypatch):
|
||||
# The required context is on NO page → fail-closed: ci_context_state must
|
||||
# report "missing", which context_is_satisfied() rejects → gate stays shut.
|
||||
page1 = [
|
||||
{"id": i, "context": f"noise-{i} (push)", "status": "success"}
|
||||
for i in range(100)
|
||||
]
|
||||
page2 = [{"id": 200, "context": "some-other (push)", "status": "success"}]
|
||||
monkeypatch.setattr(prod, "_api_json_list", _paged_statuses_stub({1: page1, 2: page2}))
|
||||
|
||||
rows = prod.fetch_all_statuses("git.moleculesai.app", "molecule-ai/molecule-core", "b" * 40, "tok")
|
||||
state = prod.ci_context_state(rows, "CI / all-required (push)")
|
||||
assert state == "missing"
|
||||
assert prod.context_is_satisfied(state) is False
|
||||
|
||||
|
||||
def test_fetch_all_statuses_fail_closed_on_page_error(monkeypatch):
|
||||
# A page that raises (unreadable) must propagate, never silently truncate
|
||||
# the scan and let the caller treat a partial list as complete.
|
||||
def boom(url, _token):
|
||||
if "page=2" in url:
|
||||
raise RuntimeError("GET .../statuses?page=2 -> HTTP 502: bad gateway")
|
||||
return [{"id": i, "context": f"n-{i}", "status": "success"} for i in range(100)]
|
||||
|
||||
monkeypatch.setattr(prod, "_api_json_list", boom)
|
||||
try:
|
||||
prod.fetch_all_statuses("h", "r", "c" * 40, "tok")
|
||||
except RuntimeError as exc:
|
||||
assert "502" in str(exc)
|
||||
else:
|
||||
raise AssertionError("expected page-2 error to propagate (fail-closed)")
|
||||
|
||||
|
||||
def test_wait_for_ci_context_succeeds_when_required_status_is_past_page_one(monkeypatch):
|
||||
# End-to-end: the gate reads the EXHAUSTIVE list, so a required SUCCESS that
|
||||
# only exists past page 1 lets the deploy proceed instead of timing out.
|
||||
full = [
|
||||
{"id": i, "context": f"noise-{i} (push)", "status": "success"}
|
||||
for i in range(100)
|
||||
] + [
|
||||
{"id": 500, "context": "CI / all-required (push)", "status": "success"},
|
||||
{"id": 501, "context": "Secret scan / Scan diff for credential-shaped strings (push)",
|
||||
"status": "success"},
|
||||
]
|
||||
monkeypatch.setattr(prod, "fetch_all_statuses", lambda *a, **k: full)
|
||||
result = prod.wait_for_ci_context(
|
||||
{"GITHUB_SHA": "d" * 40, "GITEA_TOKEN": "tok", "CI_STATUS_TIMEOUT_SECONDS": "30"}
|
||||
)
|
||||
assert result == "success"
|
||||
|
||||
|
||||
def test_wait_for_ci_context_times_out_fail_closed_when_required_absent(monkeypatch):
|
||||
# Genuinely-absent required context across all pages → never satisfied →
|
||||
# the gate times out rather than green-lighting the deploy (no fail-open).
|
||||
present_but_irrelevant = [
|
||||
{"id": 500, "context": "some-other (push)", "status": "success"},
|
||||
]
|
||||
monkeypatch.setattr(prod, "fetch_all_statuses", lambda *a, **k: present_but_irrelevant)
|
||||
# Zero timeout + 0 interval → single poll then TimeoutError.
|
||||
try:
|
||||
prod.wait_for_ci_context(
|
||||
{
|
||||
"GITHUB_SHA": "e" * 40,
|
||||
"GITEA_TOKEN": "tok",
|
||||
"CI_STATUS_TIMEOUT_SECONDS": "1",
|
||||
"CI_STATUS_POLL_INTERVAL_SECONDS": "1",
|
||||
}
|
||||
)
|
||||
except TimeoutError as exc:
|
||||
assert "missing" in str(exc)
|
||||
else:
|
||||
raise AssertionError("expected fail-closed TimeoutError, not a satisfied gate")
|
||||
|
||||
@@ -148,6 +148,11 @@ jobs:
|
||||
run: $(go env GOPATH)/bin/golangci-lint run --timeout 3m ./...
|
||||
- if: ${{ needs.changes.outputs.platform == 'true' }}
|
||||
name: Diagnostic — per-package verbose 60s
|
||||
# DIAGNOSTIC ONLY (continue-on-error below): this step exists to dump
|
||||
# verbose per-package output for triage, NOT to gate. The blocking gate
|
||||
# is "Run tests with coverage (blocking gate)" immediately below. The
|
||||
# `set +e` / swallowed exits here are intentional — do not "fix" them
|
||||
# like a gate; the real gate is the next step.
|
||||
run: |
|
||||
set +e
|
||||
go test -race -v -timeout 60s ./internal/handlers/... 2>&1 | tee /tmp/test-handlers.log
|
||||
|
||||
@@ -290,6 +290,33 @@ jobs:
|
||||
# / workspaces all landed by the migration replay step above).
|
||||
go test -tags=integration -timeout 5m -v ./internal/scheduler/ -run "^TestIntegration_"
|
||||
|
||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Migration replay-from-scratch gate (#2150)
|
||||
env:
|
||||
PGPASSWORD: test
|
||||
run: |
|
||||
# Issue #2150 (SOP internal#765): prove the FULL forward migration
|
||||
# chain (.up + legacy .sql) replays from a blank schema via the
|
||||
# PRODUCTION db.RunMigrations entrypoint — hard-fail on any error.
|
||||
#
|
||||
# This is the gap the psql apply loop above does NOT cover: that
|
||||
# loop deliberately SKIPS failing migrations (`⊘ skipped`), so it
|
||||
# stays green even if the chain stops replaying. The Go test below
|
||||
# uses the real boot-time runner with hard-fail semantics, catching
|
||||
# the #211 .down-wipe class and the 045 non-idempotent crash-loop
|
||||
# class (it runs the chain twice).
|
||||
#
|
||||
# Run against a SEPARATE database so the destructive
|
||||
# `DROP SCHEMA public CASCADE` inside the test never touches the
|
||||
# `molecule` DB the handlers integration tests above migrated. No
|
||||
# ordering coupling with the handlers step.
|
||||
createdb -h "${PG_HOST}" -U postgres molecule_replay 2>/dev/null || \
|
||||
psql -h "${PG_HOST}" -U postgres -d molecule \
|
||||
-c "CREATE DATABASE molecule_replay" >/dev/null 2>&1 || true
|
||||
INTEGRATION_DB_URL="postgres://postgres:test@${PG_HOST}:5432/molecule_replay?sslmode=disable" \
|
||||
go test -tags=integration -timeout 5m -v ./internal/db/ \
|
||||
-run '^TestIntegration_Migration|^TestIntegration_InitPostgres'
|
||||
|
||||
- if: failure() && needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Diagnostic dump on failure
|
||||
env:
|
||||
|
||||
@@ -74,6 +74,10 @@ jobs:
|
||||
env:
|
||||
PG_CONTAINER: pg-lpe2e-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
REDIS_CONTAINER: redis-lpe2e-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
# Hard-code dev mode at the job level so the platform server ALWAYS sees it,
|
||||
# even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
|
||||
MOLECULE_ENV: development
|
||||
SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!!
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
|
||||
@@ -124,12 +128,16 @@ jobs:
|
||||
|
||||
- name: Configure platform env (admin token + local Docker provisioner)
|
||||
run: |
|
||||
# Allocate an unused ephemeral port to avoid collision with concurrent
|
||||
# jobs or stale processes from prior cancelled runs (see #2450).
|
||||
PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
|
||||
echo "PORT=${PORT}" >> "$GITHUB_ENV"
|
||||
echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
|
||||
# Deterministic admin token: the script sends MOLECULE_ADMIN_TOKEN as the
|
||||
# bearer; the platform checks ADMIN_TOKEN. Set both to the same value.
|
||||
T="lpe2e-admin-${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
|
||||
echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
|
||||
echo "BASE=http://localhost:8080" >> "$GITHUB_ENV"
|
||||
# MOLECULE_ENV=development: dev posture. MOLECULE_ORG_ID is left UNSET so
|
||||
# main.go wires the LOCAL Docker provisioner (not the CP provisioner), and
|
||||
# MOLECULE_IMAGE_REGISTRY is left UNSET so image resolution uses
|
||||
@@ -143,21 +151,10 @@ jobs:
|
||||
|
||||
- name: Kill stale platform-server before start (issue #1046)
|
||||
run: |
|
||||
# ROOT CAUSE of the stub-gate red on docker-host: both this gating job
|
||||
# and the advisory lifecycle-real job bind the SAME fixed host port
|
||||
# :8080 (PORT=8080 ./platform-server). On the small docker-host runner
|
||||
# pool a prior cancelled/timeout run can leave a zombie platform-server
|
||||
# on :8080 (a cancelled run never reaches "Stop platform"), and — until
|
||||
# lifecycle-real was serialised behind this job via needs: — the two
|
||||
# jobs could also co-schedule on one runner and contend for :8080. A
|
||||
# second bind on :8080 is FATAL (the server exits), so "Wait for
|
||||
# /health" times out at 300s and this REQUIRED gate reds. Free the port
|
||||
# before binding — mirrors the e2e-api.yml #1046 fix for the identical
|
||||
# fixed-port-on-shared-runner class.
|
||||
#
|
||||
# /proc scan — works on any Linux without pkill/lsof/ss. comm is
|
||||
# truncated to 15 chars: "platform-serve" matches "platform-server".
|
||||
# Verify via cmdline to avoid false positives.
|
||||
# Dynamic port allocation (see #2450) eliminates the fixed-port race
|
||||
# that caused this gate to red when a prior run left a zombie process.
|
||||
# We still sweep by process name to avoid leaking platform-server
|
||||
# processes on the shared runner.
|
||||
killed=0
|
||||
for pid in $(grep -l "platform-serve" /proc/[0-9]*/comm 2>/dev/null); do
|
||||
kpid="${pid%/comm}"; kpid="${kpid##*/}"
|
||||
@@ -169,35 +166,28 @@ jobs:
|
||||
fi
|
||||
done
|
||||
if [ "$killed" -gt 0 ]; then echo "Killed $killed stale platform-server process(es)."; else echo "No platform-server-named process found."; fi
|
||||
# Belt-and-braces: also free :8080 from ANY holder regardless of process
|
||||
# name. A differently-named squatter (e.g. a leftover Fastify dev server
|
||||
# from another job) survives the comm-name scan above, makes our bind
|
||||
# FATAL, and can false-positive the /health probe below (no-flakes RCA;
|
||||
# tracked alongside #2430). fuser/lsof are present on the ubuntu runner;
|
||||
# if neither exists the name-scan above is the floor.
|
||||
if command -v fuser >/dev/null 2>&1; then fuser -k 8080/tcp 2>/dev/null || true; fi
|
||||
if command -v lsof >/dev/null 2>&1; then lsof -ti tcp:8080 2>/dev/null | xargs -r kill -9 2>/dev/null || true; fi
|
||||
sleep 2
|
||||
echo ":8080 freed (comm-scan + port-scan swept any squatter)."
|
||||
sleep 1
|
||||
|
||||
- name: Start platform (background)
|
||||
working-directory: workspace-server
|
||||
run: |
|
||||
# Bind to :8080 (the script's BASE). DATABASE_URL/REDIS_URL/ADMIN_TOKEN/
|
||||
# MOLECULE_ENV are inherited from $GITHUB_ENV.
|
||||
PORT=8080 ./platform-server > platform.log 2>&1 &
|
||||
# Bind to the dynamically allocated port (see #2450).
|
||||
# DATABASE_URL/REDIS_URL/ADMIN_TOKEN/MOLECULE_ENV are inherited from
|
||||
# $GITHUB_ENV.
|
||||
PORT=$PORT ./platform-server > platform.log 2>&1 &
|
||||
echo $! > platform.pid
|
||||
|
||||
- name: Wait for /health (+ migrations applied)
|
||||
run: |
|
||||
DEADLINE=300; PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"; start=$(date +%s)
|
||||
while :; do
|
||||
# Verify OUR server owns :8080 BEFORE trusting /health. Our server binds
|
||||
# :8080 or exits FATAL, so "our PID alive" <=> "we own :8080"; checking it
|
||||
# first stops a squatter that answers /health on :8080 (our bind having
|
||||
# failed) from false-positiving the gate (no-flakes RCA).
|
||||
# Verify OUR server is still alive before trusting /health. Our server
|
||||
# binds the allocated port or exits FATAL, so "our PID alive" <=>
|
||||
# "we own the port"; checking it first stops a squatter that answers
|
||||
# /health on the same port (our bind having failed) from false-positiving
|
||||
# the gate (no-flakes RCA).
|
||||
if [ -n "$PID" ] && ! kill -0 "$PID" 2>/dev/null; then
|
||||
echo "::error::platform-server exited early (failed to bind :8080 or crashed)"; cat workspace-server/platform.log || true; exit 1
|
||||
echo "::error::platform-server exited early (failed to bind or crashed)"; cat workspace-server/platform.log || true; exit 1
|
||||
fi
|
||||
if curl -sf "$BASE/health" >/dev/null; then
|
||||
tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \
|
||||
@@ -237,13 +227,13 @@ jobs:
|
||||
lifecycle-real:
|
||||
name: Local Provision Lifecycle E2E (real image + MiniMax LLM, advisory)
|
||||
runs-on: docker-host
|
||||
# Serialise behind the gating stub job: both jobs bind the SAME fixed host
|
||||
# port :8080, so co-scheduling them on one docker-host runner makes the
|
||||
# second platform-server fail to bind (fatal) and reds whichever lost the
|
||||
# race. `needs:` forces this advisory job to start only AFTER lifecycle-stub
|
||||
# finishes, so they never contend for :8080. continue-on-error keeps a real-
|
||||
# job miss non-blocking; `needs:` does NOT gate on the stub's success (a
|
||||
# failed required gate still lets this advisory dependent run).
|
||||
# Serialise behind the gating stub job: both jobs share the same docker-host
|
||||
# runner and provision sibling containers. `needs:` forces this advisory job
|
||||
# to start only AFTER lifecycle-stub finishes, avoiding resource contention.
|
||||
# (Dynamic ports eliminated the fixed-port race; serialisation remains for
|
||||
# docker-host capacity hygiene.) continue-on-error keeps a real-job miss
|
||||
# non-blocking; `needs:` does NOT gate on the stub's success (a failed
|
||||
# required gate still lets this advisory dependent run).
|
||||
needs: lifecycle-stub
|
||||
if: ${{ always() }}
|
||||
# Tracker for lint-continue-on-error-tracking (Tier 2e / internal#350): this
|
||||
@@ -254,6 +244,10 @@ jobs:
|
||||
env:
|
||||
PG_CONTAINER: pg-lpe2e-real-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
REDIS_CONTAINER: redis-lpe2e-real-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
# Hard-code dev mode at the job level so the platform server ALWAYS sees it,
|
||||
# even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
|
||||
MOLECULE_ENV: development
|
||||
SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!!
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
|
||||
@@ -299,10 +293,14 @@ jobs:
|
||||
|
||||
- name: Configure platform env
|
||||
run: |
|
||||
# Allocate an unused ephemeral port to avoid collision with concurrent
|
||||
# jobs or stale processes from prior cancelled runs (see #2450).
|
||||
PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
|
||||
echo "PORT=${PORT}" >> "$GITHUB_ENV"
|
||||
echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
|
||||
T="lpe2e-real-admin-${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
|
||||
echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
|
||||
echo "BASE=http://localhost:8080" >> "$GITHUB_ENV"
|
||||
echo "MOLECULE_ENV=development" >> "$GITHUB_ENV"
|
||||
echo "SECRETS_ENCRYPTION_KEY=lpe2e-test-encryption-key-32bytes!!" >> "$GITHUB_ENV"
|
||||
|
||||
@@ -312,8 +310,9 @@ jobs:
|
||||
|
||||
- name: Kill stale platform-server before start (issue #1046)
|
||||
run: |
|
||||
# Same fixed-:8080 hygiene as the stub job — free the port from any
|
||||
# zombie left by a cancelled run before this job binds it.
|
||||
# Dynamic port allocation (see #2450) eliminates the fixed-port race.
|
||||
# We still sweep by process name to avoid leaking platform-server
|
||||
# processes on the shared runner.
|
||||
killed=0
|
||||
for pid in $(grep -l "platform-serve" /proc/[0-9]*/comm 2>/dev/null); do
|
||||
kpid="${pid%/comm}"; kpid="${kpid##*/}"
|
||||
@@ -325,30 +324,23 @@ jobs:
|
||||
fi
|
||||
done
|
||||
if [ "$killed" -gt 0 ]; then echo "Killed $killed stale platform-server process(es)."; else echo "No platform-server-named process found."; fi
|
||||
# Belt-and-braces: free :8080 from ANY holder regardless of process name
|
||||
# (a differently-named squatter survives the comm-name scan above, makes
|
||||
# our bind FATAL, and can false-positive the /health probe). Mirrors the
|
||||
# stub job's no-flakes fix (tracked alongside #2430).
|
||||
if command -v fuser >/dev/null 2>&1; then fuser -k 8080/tcp 2>/dev/null || true; fi
|
||||
if command -v lsof >/dev/null 2>&1; then lsof -ti tcp:8080 2>/dev/null | xargs -r kill -9 2>/dev/null || true; fi
|
||||
sleep 2
|
||||
echo ":8080 freed (comm-scan + port-scan swept any squatter)."
|
||||
sleep 1
|
||||
|
||||
- name: Start platform (background)
|
||||
working-directory: workspace-server
|
||||
run: |
|
||||
PORT=8080 ./platform-server > platform.log 2>&1 &
|
||||
PORT=$PORT ./platform-server > platform.log 2>&1 &
|
||||
echo $! > platform.pid
|
||||
|
||||
- name: Wait for /health (+ migrations applied)
|
||||
run: |
|
||||
DEADLINE=300; PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"; start=$(date +%s)
|
||||
while :; do
|
||||
# Verify OUR server owns :8080 before trusting /health (no-flakes RCA):
|
||||
# our server binds :8080 or exits FATAL, so checking our PID first stops
|
||||
# a squatter answering /health on :8080 from false-positiving the gate.
|
||||
# Verify OUR server is still alive before trusting /health. Our server
|
||||
# binds the allocated port or exits FATAL, so checking our PID first
|
||||
# stops a squatter from false-positiving the gate (no-flakes RCA).
|
||||
if [ -n "$PID" ] && ! kill -0 "$PID" 2>/dev/null; then
|
||||
echo "::error::platform-server exited early (failed to bind :8080 or crashed)"; cat workspace-server/platform.log || true; exit 1
|
||||
echo "::error::platform-server exited early (failed to bind or crashed)"; cat workspace-server/platform.log || true; exit 1
|
||||
fi
|
||||
if curl -sf "$BASE/health" >/dev/null; then
|
||||
tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \
|
||||
|
||||
@@ -248,16 +248,36 @@ jobs:
|
||||
--tag "${STAGING_TENANT_IMAGE_NAME}:${TAG_LATEST}"
|
||||
)
|
||||
|
||||
docker buildx build \
|
||||
--file ./workspace-server/Dockerfile.tenant \
|
||||
--build-arg NEXT_PUBLIC_PLATFORM_URL= \
|
||||
--build-arg GIT_SHA="${GIT_SHA}" \
|
||||
--label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \
|
||||
--label "org.opencontainers.image.revision=${GIT_SHA}" \
|
||||
--label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
|
||||
--label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
|
||||
"${build_tags[@]}" \
|
||||
--push .
|
||||
# Retry loop: buildkit EOF (internal#2468) is often transient on the
|
||||
# publish runner under memory pressure. Up to 3 attempts with a fresh
|
||||
# builder each time so a crashed buildkit doesn't poison the next try.
|
||||
for attempt in 1 2 3; do
|
||||
echo "::notice::Tenant image build attempt ${attempt}/3 ..."
|
||||
builder="tenant-builder-${GITHUB_RUN_ID}-${attempt}"
|
||||
docker buildx create --name "${builder}" --use >/dev/null 2>&1 || true
|
||||
if docker buildx build \
|
||||
--builder "${builder}" \
|
||||
--file ./workspace-server/Dockerfile.tenant \
|
||||
--build-arg NEXT_PUBLIC_PLATFORM_URL= \
|
||||
--build-arg GIT_SHA="${GIT_SHA}" \
|
||||
--label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \
|
||||
--label "org.opencontainers.image.revision=${GIT_SHA}" \
|
||||
--label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
|
||||
--label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
|
||||
"${build_tags[@]}" \
|
||||
--push .; then
|
||||
docker buildx rm "${builder}" >/dev/null 2>&1 || true
|
||||
echo "::notice::Tenant image build succeeded on attempt ${attempt}"
|
||||
break
|
||||
fi
|
||||
echo "::warning::Tenant image build attempt ${attempt} failed — cleaning builder and retrying"
|
||||
docker buildx rm "${builder}" >/dev/null 2>&1 || true
|
||||
sleep 10
|
||||
if [ "$attempt" -eq 3 ]; then
|
||||
echo "::error::Tenant image build failed after 3 attempts"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
# bp-exempt: production deploy side-effect; merge is gated by CI / all-required and this job waits for push CI before acting.
|
||||
deploy-production:
|
||||
|
||||
@@ -58,22 +58,51 @@ jobs:
|
||||
python-version: '3.11'
|
||||
- name: Install .gitea script test dependencies
|
||||
run: python -m pip install --quiet 'pytest==9.0.2' 'PyYAML==6.0.2'
|
||||
- name: Run scripts/ unittests, if any
|
||||
- name: Run scripts/ unittests (fail-closed on 0 collected)
|
||||
# Top-level scripts/ tests live alongside their target file. The
|
||||
# runtime packaging tests moved to molecule-ai-workspace-runtime, so
|
||||
# this pass may legitimately find no tests.
|
||||
# this pass may legitimately find NO test files today.
|
||||
#
|
||||
# Gate-integrity fix: the previous guard keyed off `rc==5` to detect
|
||||
# "no tests collected", but Python 3.12's unittest exits 0 (not 5)
|
||||
# when discovery finds 0 tests ("NO TESTS RAN"). The guard therefore
|
||||
# never fired, so any test_*.py added here would silently run 0 tests
|
||||
# while this step stayed GREEN. A green step that runs 0 tests is
|
||||
# worse than a red one. We now fail-closed:
|
||||
# - genuinely NO test_*.py present -> loud SKIP (legitimate no-op)
|
||||
# - test_*.py present but 0 collected -> FAIL (broken import/empty)
|
||||
working-directory: scripts
|
||||
run: |
|
||||
set +e
|
||||
python -m unittest discover -t . -p 'test_*.py' -v
|
||||
rc=$?
|
||||
if [ "$rc" -eq 5 ]; then
|
||||
echo "No top-level scripts/ unittest files found; skipping."
|
||||
set -euo pipefail
|
||||
# Non-recursive count: scripts/ has no __init__.py, so unittest
|
||||
# discover does not recurse into subdirs (ops/ is run separately
|
||||
# below) — top-level files are the entire discovery scope here.
|
||||
nfiles=$(find . -maxdepth 1 -name 'test_*.py' | wc -l | tr -d ' ')
|
||||
if [ "$nfiles" -eq 0 ]; then
|
||||
echo "SKIP: no top-level scripts/ test_*.py files present (genuine no-op)."
|
||||
exit 0
|
||||
fi
|
||||
exit "$rc"
|
||||
echo "Found $nfiles top-level scripts/ test_*.py file(s); asserting they collect >0 tests."
|
||||
ncollected=$(python -c "import unittest; print(unittest.TestLoader().discover('.', pattern='test_*.py', top_level_dir='.').countTestCases())")
|
||||
echo "Collected $ncollected test case(s)."
|
||||
if [ "$ncollected" -eq 0 ]; then
|
||||
echo "FAIL: test_*.py file(s) present but 0 tests collected (broken import / empty file / discovery error)."
|
||||
exit 1
|
||||
fi
|
||||
python -m unittest discover -t . -p 'test_*.py' -v
|
||||
- name: Run scripts/ops/ unittests (sweep_cf_decide, ...)
|
||||
# Real gate: scripts/ops/ must always run tests. Assert >0 collected so
|
||||
# deleting all test files (or breaking an import) can't pass GREEN by
|
||||
# running 0 tests — same gate-integrity class as the scripts/ step.
|
||||
working-directory: scripts/ops
|
||||
run: python -m unittest discover -p 'test_*.py' -v
|
||||
run: |
|
||||
set -euo pipefail
|
||||
ncollected=$(python -c "import unittest; print(unittest.TestLoader().discover('.', pattern='test_*.py').countTestCases())")
|
||||
echo "scripts/ops/ collected $ncollected test case(s)."
|
||||
if [ "$ncollected" -eq 0 ]; then
|
||||
echo "FAIL: scripts/ops/ collected 0 tests — this gate must run real tests (deleted/broken import?)."
|
||||
exit 1
|
||||
fi
|
||||
python -m unittest discover -p 'test_*.py' -v
|
||||
- name: Run .gitea/scripts pytest suite
|
||||
run: python -m pytest .gitea/scripts/tests -q
|
||||
|
||||
@@ -3,13 +3,36 @@
|
||||
import { useEffect, useMemo, useState } from "react";
|
||||
import { api } from "@/lib/api";
|
||||
import { runtimeDisplayName } from "@/lib/runtime-names";
|
||||
import { isSaaSTenant } from "@/lib/tenant";
|
||||
import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
|
||||
import type { WorkspaceCompute } from "@/store/socket";
|
||||
|
||||
const INSTANCE_TYPES = ["t3.medium", "t3.large", "t3.xlarge", "t3.2xlarge", "m6i.large", "m6i.xlarge", "c6i.xlarge"];
|
||||
// Machine sizes keyed by cloud provider — an AWS t3.* is meaningless on Hetzner,
|
||||
// etc. MUST mirror the workspace-server workspaceComputeInstanceAllowlist (which
|
||||
// mirrors the CP provider configs); the PATCH validation rejects a mismatch 400.
|
||||
const INSTANCE_TYPES_BY_PROVIDER: Record<string, string[]> = {
|
||||
aws: ["t3.medium", "t3.large", "t3.xlarge", "t3.2xlarge", "m6i.large", "m6i.xlarge", "c6i.xlarge"],
|
||||
hetzner: ["cpx11", "cpx21", "cpx31", "cpx41", "cpx51", "cax11", "cax21", "cax31", "cax41"],
|
||||
gcp: ["e2-small", "e2-medium", "e2-standard-2", "e2-standard-4", "e2-standard-8"],
|
||||
};
|
||||
const DEFAULT_INSTANCE_BY_PROVIDER: Record<string, string> = {
|
||||
aws: "t3.medium", hetzner: "cpx31", gcp: "e2-standard-2",
|
||||
};
|
||||
const normalizeProvider = (p?: string): string => (p === "gcp" || p === "hetzner" ? p : "aws");
|
||||
const instanceTypesForProvider = (p?: string): string[] =>
|
||||
INSTANCE_TYPES_BY_PROVIDER[normalizeProvider(p)] ?? INSTANCE_TYPES_BY_PROVIDER.aws;
|
||||
const defaultInstanceForProvider = (p?: string): string =>
|
||||
DEFAULT_INSTANCE_BY_PROVIDER[normalizeProvider(p)] ?? "t3.medium";
|
||||
|
||||
// Editable cloud-provider options (multi-provider RFC) — mirrors CreateWorkspaceDialog.
|
||||
const CLOUD_PROVIDER_OPTIONS = [
|
||||
{ value: "aws", label: "AWS (default)" },
|
||||
{ value: "gcp", label: "GCP" },
|
||||
{ value: "hetzner", label: "Hetzner" },
|
||||
];
|
||||
|
||||
const RUNTIME_OPTIONS = ["claude-code", "codex", "hermes", "openclaw", "kimi", "kimi-cli", "external"];
|
||||
const RESOLUTIONS = ["1280x720", "1440x900", "1920x1080", "2560x1440"];
|
||||
const DEFAULT_HEADLESS_INSTANCE_TYPE = "t3.medium";
|
||||
const DEFAULT_HEADLESS_ROOT_GB = 30;
|
||||
|
||||
type Props = {
|
||||
@@ -23,6 +46,7 @@ type Props = {
|
||||
|
||||
type FormState = {
|
||||
runtime: string;
|
||||
provider: string; // cloud backend; editable in SaaS (in-place switch recreates the box)
|
||||
instanceType: string;
|
||||
rootGB: string;
|
||||
displayEnabled: boolean;
|
||||
@@ -38,16 +62,16 @@ const DATA_PERSISTENCE_OPTIONS = ["", "persist", "ephemeral"];
|
||||
const dataPersistenceLabel = (v: string): string =>
|
||||
v === "persist" ? "Always keep (persist)" : v === "ephemeral" ? "Don't keep (ephemeral)" : "Auto";
|
||||
|
||||
// Cloud/compute backend display name. The provider is chosen at create time and
|
||||
// is NOT editable here (changing a workspace's cloud requires a recreate), so
|
||||
// it renders as a read-only badge — but we must preserve it across Save (the
|
||||
// compute payload is rebuilt below, and dropping it would wipe the column).
|
||||
// Cloud/compute backend display name (read-only fallback for non-SaaS / legacy).
|
||||
const cloudProviderLabel = (v: string | undefined): string =>
|
||||
v === "gcp" ? "GCP" : v === "hetzner" ? "Hetzner" : "AWS";
|
||||
|
||||
export function ContainerConfigTab({ workspaceId, data }: Props) {
|
||||
// Provider is editable only in SaaS (CP-provisioned boxes). Local/Docker has no
|
||||
// cloud-provider concept, so we keep the read-only badge there.
|
||||
const isSaaS = useMemo(() => isSaaSTenant(), []);
|
||||
const runtime = data.runtime;
|
||||
const provider = data.compute?.provider; // read-only; set at create time
|
||||
const provider = data.compute?.provider;
|
||||
const instanceType = data.compute?.instance_type;
|
||||
const rootGB = data.compute?.volume?.root_gb;
|
||||
const displayMode = data.compute?.display?.mode;
|
||||
@@ -56,8 +80,8 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
|
||||
const displayHeight = data.compute?.display?.height;
|
||||
const dataPersistence = data.compute?.data_persistence;
|
||||
const initial = useMemo(
|
||||
() => formFromData({ runtime, instanceType, rootGB, displayMode, displayProtocol, displayWidth, displayHeight, dataPersistence }),
|
||||
[runtime, instanceType, rootGB, displayMode, displayProtocol, displayWidth, displayHeight, dataPersistence],
|
||||
() => formFromData({ runtime, provider, instanceType, rootGB, displayMode, displayProtocol, displayWidth, displayHeight, dataPersistence }),
|
||||
[runtime, provider, instanceType, rootGB, displayMode, displayProtocol, displayWidth, displayHeight, dataPersistence],
|
||||
);
|
||||
const [form, setForm] = useState<FormState>(initial);
|
||||
const [saving, setSaving] = useState(false);
|
||||
@@ -87,6 +111,21 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
|
||||
try {
|
||||
let applyTemplateOnRestart = data.applyTemplateOnRestart ?? false;
|
||||
if (dirty) {
|
||||
// In-place cloud switch is DESTRUCTIVE: changing the provider recreates the
|
||||
// box on the new cloud (the workspace-server deprovisions the old box on
|
||||
// its old cloud first, then the restart provisions on the new one). Confirm
|
||||
// before doing it — the current box and any non-persisted state are lost.
|
||||
const providerChanged = normalizeProvider(form.provider) !== normalizeProvider(initial.provider);
|
||||
if (providerChanged && typeof window !== "undefined") {
|
||||
const ok = window.confirm(
|
||||
`Switch this workspace to ${cloudProviderLabel(form.provider)}? This RECREATES the box on the new cloud — the current box and any non-persisted state are replaced.`,
|
||||
);
|
||||
if (!ok) {
|
||||
setSaving(false);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
const rootGB = parseInt(form.rootGB, 10);
|
||||
if (!Number.isFinite(rootGB)) {
|
||||
setError("Root volume must be a number");
|
||||
@@ -102,10 +141,11 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
|
||||
: { mode: "none" },
|
||||
// internal#734: omit when "auto" so the wire/default behavior is unchanged.
|
||||
...(form.dataPersistence ? { data_persistence: form.dataPersistence } : {}),
|
||||
// Preserve the create-time cloud provider — it's not editable here, but
|
||||
// this PATCH rebuilds the whole compute object, so omitting it would
|
||||
// wipe the persisted provider (and mislead the badge after a Save).
|
||||
...(provider ? { provider } : {}),
|
||||
// Cloud backend: send the (possibly switched) provider. Omit for the
|
||||
// default (aws) so a non-switching AWS save keeps the wire unchanged;
|
||||
// a switch TO aws (omit) vs FROM aws (explicit) both register correctly
|
||||
// because the workspace-server normalizes ""→aws when diffing.
|
||||
...(normalizeProvider(form.provider) !== "aws" ? { provider: normalizeProvider(form.provider) } : {}),
|
||||
};
|
||||
|
||||
const resp = await api.patch<{ needs_restart?: boolean }>(`/workspaces/${workspaceId}`, {
|
||||
@@ -140,15 +180,16 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
|
||||
<div className="mb-3 flex items-center justify-between gap-3">
|
||||
<div className="flex items-center gap-2">
|
||||
<h3 className="text-sm font-semibold text-ink">Container Config</h3>
|
||||
{/* Read-only cloud-provider badge — which cloud this workspace's box
|
||||
runs on (AWS/GCP/Hetzner). Defaults to AWS when unset (legacy
|
||||
rows). Set at create time in the Create Workspace dialog. */}
|
||||
<span
|
||||
title="Cloud provider for this workspace's compute (set at create time)"
|
||||
className="rounded-full border border-line/60 bg-surface-sunken px-2 py-0.5 font-mono text-[10px] uppercase tracking-wide text-ink-mid"
|
||||
>
|
||||
{cloudProviderLabel(provider)}
|
||||
</span>
|
||||
{/* Non-SaaS (local/Docker) has no cloud-provider concept → read-only
|
||||
badge. In SaaS the provider is an editable selector in the form. */}
|
||||
{!isSaaS && (
|
||||
<span
|
||||
title="Cloud provider for this workspace's compute"
|
||||
className="rounded-full border border-line/60 bg-surface-sunken px-2 py-0.5 font-mono text-[10px] uppercase tracking-wide text-ink-mid"
|
||||
>
|
||||
{cloudProviderLabel(provider)}
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
{data.needsRestart && <span className="text-[11px] text-warm">Restart required</span>}
|
||||
</div>
|
||||
@@ -162,11 +203,32 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
|
||||
optionLabel={runtimeDisplayName}
|
||||
onChange={(runtime) => setForm((s) => ({ ...s, runtime }))}
|
||||
/>
|
||||
{isSaaS && (
|
||||
<SelectField
|
||||
id="cloud-provider"
|
||||
label="Cloud provider"
|
||||
value={normalizeProvider(form.provider)}
|
||||
options={CLOUD_PROVIDER_OPTIONS.map((p) => p.value)}
|
||||
optionLabel={(v) => CLOUD_PROVIDER_OPTIONS.find((p) => p.value === v)?.label ?? v}
|
||||
// Switching cloud resets the instance type to the new provider's
|
||||
// default (an AWS t3.* is invalid on Hetzner, etc.) — also keeps the
|
||||
// instance-type dropdown below in sync with the provider's sizes.
|
||||
onChange={(provider) =>
|
||||
setForm((s) => ({
|
||||
...s,
|
||||
provider,
|
||||
instanceType: instanceTypesForProvider(provider).includes(s.instanceType)
|
||||
? s.instanceType
|
||||
: defaultInstanceForProvider(provider),
|
||||
}))
|
||||
}
|
||||
/>
|
||||
)}
|
||||
<SelectField
|
||||
id="instance-type"
|
||||
label="Instance type"
|
||||
value={form.instanceType}
|
||||
options={INSTANCE_TYPES}
|
||||
options={instanceTypesForProvider(form.provider)}
|
||||
onChange={(instanceType) => setForm((s) => ({ ...s, instanceType }))}
|
||||
/>
|
||||
<label className="grid gap-1" htmlFor="root-volume-gb">
|
||||
@@ -270,6 +332,7 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
|
||||
|
||||
function formFromData(data: {
|
||||
runtime?: string;
|
||||
provider?: string;
|
||||
instanceType?: string;
|
||||
rootGB?: number;
|
||||
displayMode?: string;
|
||||
@@ -281,9 +344,11 @@ function formFromData(data: {
|
||||
const width = data.displayWidth ?? 1920;
|
||||
const height = data.displayHeight ?? 1080;
|
||||
const resolution = `${width}x${height}`;
|
||||
const provider = normalizeProvider(data.provider);
|
||||
return {
|
||||
runtime: data.runtime || "claude-code",
|
||||
instanceType: data.instanceType || DEFAULT_HEADLESS_INSTANCE_TYPE,
|
||||
provider,
|
||||
instanceType: data.instanceType || defaultInstanceForProvider(provider),
|
||||
rootGB: String(data.rootGB || DEFAULT_HEADLESS_ROOT_GB),
|
||||
displayEnabled: !!data.displayMode && data.displayMode !== "none",
|
||||
displayMode: data.displayMode && data.displayMode !== "none" ? data.displayMode : "desktop-control",
|
||||
|
||||
@@ -23,6 +23,13 @@ vi.mock("@/store/canvas", () => ({
|
||||
),
|
||||
}));
|
||||
|
||||
// SaaS so the editable cloud-provider selector renders (non-SaaS shows a read-only
|
||||
// badge). Existing tests keep provider=aws (default), which is omitted from the
|
||||
// PATCH payload, so their assertions are unaffected.
|
||||
vi.mock("@/lib/tenant", () => ({
|
||||
isSaaSTenant: () => true,
|
||||
}));
|
||||
|
||||
import { ContainerConfigTab } from "../ContainerConfigTab";
|
||||
|
||||
afterEach(() => {
|
||||
@@ -314,4 +321,67 @@ describe("ContainerConfigTab", () => {
|
||||
await waitFor(() => expect(restartWorkspace).toHaveBeenCalledWith("ws-compute", { applyTemplate: true }));
|
||||
expect(apiPatch).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("switches cloud provider — keys the instance-type list to the provider, confirms the recreate, and PATCHes the new provider", async () => {
|
||||
const confirmSpy = vi.spyOn(window, "confirm").mockReturnValue(true);
|
||||
render(
|
||||
<ContainerConfigTab
|
||||
workspaceId="ws-switch"
|
||||
data={{
|
||||
runtime: "claude-code",
|
||||
status: "online",
|
||||
needsRestart: false,
|
||||
activeTasks: 0,
|
||||
maxConcurrentTasks: null,
|
||||
workspaceAccess: "read-write",
|
||||
deliveryMode: "push",
|
||||
compute: { instance_type: "t3.large", provider: "aws", volume: { root_gb: 30 } },
|
||||
}}
|
||||
/>,
|
||||
);
|
||||
|
||||
const providerSel = screen.getByLabelText("Cloud provider");
|
||||
expect(providerSel).toHaveProperty("value", "aws");
|
||||
expect(screen.getByLabelText("Instance type")).toHaveProperty("value", "t3.large");
|
||||
|
||||
// Switch to Hetzner → the instance type resets to the Hetzner default (an AWS
|
||||
// t3.* is invalid on Hetzner) and the options become Hetzner sizes.
|
||||
fireEvent.change(providerSel, { target: { value: "hetzner" } });
|
||||
expect(screen.getByLabelText("Instance type")).toHaveProperty("value", "cpx31");
|
||||
|
||||
fireEvent.click(screen.getByRole("button", { name: "Save" }));
|
||||
await waitFor(() => expect(apiPatch).toHaveBeenCalledTimes(1));
|
||||
expect(confirmSpy).toHaveBeenCalled(); // destructive recreate confirmed
|
||||
const body = apiPatch.mock.calls[0][1] as { compute: { provider?: string; instance_type?: string } };
|
||||
expect(body.compute.provider).toBe("hetzner");
|
||||
expect(body.compute.instance_type).toBe("cpx31");
|
||||
confirmSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("does not treat a non-provider edit as a recreate (no confirm; aws default omitted)", async () => {
|
||||
const confirmSpy = vi.spyOn(window, "confirm").mockReturnValue(true);
|
||||
render(
|
||||
<ContainerConfigTab
|
||||
workspaceId="ws-noswitch"
|
||||
data={{
|
||||
runtime: "claude-code",
|
||||
status: "online",
|
||||
needsRestart: false,
|
||||
activeTasks: 0,
|
||||
maxConcurrentTasks: null,
|
||||
workspaceAccess: "read-write",
|
||||
deliveryMode: "push",
|
||||
compute: { instance_type: "t3.large", provider: "aws", volume: { root_gb: 30 } },
|
||||
}}
|
||||
/>,
|
||||
);
|
||||
|
||||
fireEvent.change(screen.getByLabelText("Root volume"), { target: { value: "60" } });
|
||||
fireEvent.click(screen.getByRole("button", { name: "Save" }));
|
||||
await waitFor(() => expect(apiPatch).toHaveBeenCalledTimes(1));
|
||||
expect(confirmSpy).not.toHaveBeenCalled();
|
||||
const body = apiPatch.mock.calls[0][1] as { compute: { provider?: string } };
|
||||
expect(body.compute.provider).toBeUndefined(); // aws default omitted (wire unchanged)
|
||||
confirmSpy.mockRestore();
|
||||
});
|
||||
});
|
||||
|
||||
@@ -162,6 +162,11 @@ describe("DisplayTab", () => {
|
||||
controller: "user",
|
||||
ttl_seconds: 300,
|
||||
});
|
||||
// Defensive: the noVNC constructor is async (dynamic import), so wait
|
||||
// for it to be called before asserting arguments (prevents flake in CI).
|
||||
await waitFor(() => {
|
||||
expect(mockRFBConstructor).toHaveBeenCalled();
|
||||
});
|
||||
expect(mockRFBConstructor).toHaveBeenCalledWith(
|
||||
expect.any(HTMLElement),
|
||||
expect.stringContaining("/workspaces/ws-display/display/session/websockify"),
|
||||
|
||||
@@ -162,6 +162,27 @@ describe("hydrate", () => {
|
||||
useCanvasStore.getState().hydrate([ws]);
|
||||
expect(useCanvasStore.getState().nodes[0].data.currentTask).toBe("");
|
||||
});
|
||||
|
||||
it("preserves in-flight turn status after refresh (issue #2391)", () => {
|
||||
// Simulates a page refresh: the canvas re-hydrates from GET /workspaces
|
||||
// while the agent has an active in-flight turn. The store must reflect
|
||||
// "working" immediately — no dependence on a subsequent TASK_UPDATED
|
||||
// socket event. This prevents the "stuck idle" UX after reload.
|
||||
const ws = makeWS({
|
||||
id: "ws-1",
|
||||
status: "online",
|
||||
current_task: "Analyzing data",
|
||||
active_tasks: 2,
|
||||
});
|
||||
useCanvasStore.getState().hydrate([ws]);
|
||||
const node = useCanvasStore.getState().nodes[0];
|
||||
expect(node.data.currentTask).toBe("Analyzing data");
|
||||
expect(node.data.activeTasks).toBe(2);
|
||||
expect(node.data.status).toBe("online");
|
||||
// Defensive: the node must be considered "working" for any UI that
|
||||
// gates on currentTask (e.g. ChatTab thinking indicator).
|
||||
expect(!!node.data.currentTask).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("summarizeWorkspaceCapabilities", () => {
|
||||
|
||||
@@ -488,6 +488,12 @@ echo ""
|
||||
# Step 5 — proxy reach (ws-<id>:8000 Docker-DNS rewrite, end to end).
|
||||
# ----------------------------------------------------------------------------
|
||||
echo "--- Step 5: proxy reach (POST /workspaces/$WSID/a2a) ---"
|
||||
# Debug: print the workspace URL the platform stored so SSRF failures are
|
||||
# actionable (#2468 RCA).
|
||||
WS_DEBUG=$(admin_curl "$BASE/workspaces/$WSID")
|
||||
WS_URL_DEBUG=$(ws_field "$WS_DEBUG" "url")
|
||||
WS_STATUS_DEBUG=$(ws_field "$WS_DEBUG" "status")
|
||||
echo " workspace url=$WS_URL_DEBUG status=$WS_STATUS_DEBUG"
|
||||
# In minimax mode we send a DETERMINISTIC known-answer prompt and assert the
|
||||
# model echoes the answer back — proving a real LLM round-trip, not just
|
||||
# reachability. Otherwise a plain "ping".
|
||||
|
||||
@@ -0,0 +1,286 @@
|
||||
//go:build integration
|
||||
// +build integration
|
||||
|
||||
// postgres_replay_integration_test.go — REAL Postgres integration tests for
|
||||
// the boot-time migration runner (db.RunMigrations) and the connection
|
||||
// bootstrap (db.InitPostgres).
|
||||
//
|
||||
// Issue #2150 (SOP rule internal#765 regression-coverage). test_layer:
|
||||
// real-postgres.
|
||||
//
|
||||
// Run locally with:
|
||||
//
|
||||
// docker run --rm -d --name pg-replay \
|
||||
// -e POSTGRES_PASSWORD=test -e POSTGRES_DB=molecule \
|
||||
// -p 55432:5432 postgres:15-alpine
|
||||
// sleep 4
|
||||
// cd workspace-server
|
||||
// INTEGRATION_DB_URL="postgres://postgres:test@localhost:55432/molecule?sslmode=disable" \
|
||||
// go test -tags=integration ./internal/db/ -run '^TestIntegration_Migration|^TestIntegration_InitPostgres'
|
||||
//
|
||||
// In CI these run on .gitea/workflows/handlers-postgres-integration.yml,
|
||||
// which already provisions a real Postgres on the operator-host bridge and
|
||||
// triggers on workspace-server/migrations/** changes — the exact blast
|
||||
// radius this gate must cover.
|
||||
//
|
||||
// WHY A REAL DATABASE — and why the existing coverage is NOT enough
|
||||
// -----------------------------------------------------------------
|
||||
// postgres_migrate_test.go and postgres_schema_migrations_test.go are
|
||||
// sqlmock-only: they pin which SQL *statements* fire, but a mock cannot
|
||||
// execute SQL, so it cannot prove the 118-file (.up + legacy .sql) chain
|
||||
// actually REPLAYS FROM SCRATCH against a real Postgres. The CI psql loop
|
||||
// in handlers-postgres-integration.yml deliberately *skips* failing
|
||||
// migrations (`⊘ skipped`), so it would stay green even if the chain
|
||||
// stopped replaying — it is not a replay gate.
|
||||
//
|
||||
// This file closes that gap. It boots a Postgres, resets the public schema
|
||||
// to a blank slate, and runs the PRODUCTION db.RunMigrations entrypoint —
|
||||
// the same function platform boot calls — with hard-fail semantics. It
|
||||
// would FAIL (watch-fail intent) against:
|
||||
//
|
||||
// - Issue #211: if RunMigrations regresses to globbing `*.sql` and
|
||||
// sorting `.down.sql` before `.up.sql`, the rollback runs before the
|
||||
// forward for any pair (020_workspace_auth_tokens was the canary),
|
||||
// either erroring on the DROP or wiping the just-created table.
|
||||
//
|
||||
// - The 045 crash-loop class (cp#429 / project_cp_migration_045_*): the
|
||||
// runner re-applies every recorded-absent file every boot, so a
|
||||
// non-idempotent migration (bare CREATE / INSERT without IF NOT EXISTS
|
||||
// / ON CONFLICT) replays cleanly the first time and FAILS the second.
|
||||
// TestIntegration_MigrationReplay_IsIdempotent_DoubleApply runs the
|
||||
// full chain twice against the same DB to catch that at PR time.
|
||||
//
|
||||
// - A new migration that depends on a table a later migration drops, or
|
||||
// is mis-ordered in the lexicographic chain — it simply will not apply
|
||||
// from scratch and the replay errors.
|
||||
//
|
||||
// All assertions key off the OBSERVABLE database state after the real run,
|
||||
// not a proxy for "a statement fired".
|
||||
|
||||
package db
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
_ "github.com/lib/pq"
|
||||
)
|
||||
|
||||
// migrationsDir is the on-disk path to the forward+legacy migration chain
|
||||
// relative to this test file (workspace-server/internal/db → ../../migrations).
|
||||
const migrationsDir = "../../migrations"
|
||||
|
||||
// freshIntegrationDB opens $INTEGRATION_DB_URL (skipping the test if unset),
|
||||
// resets the `public` schema to an empty slate so the run is a true
|
||||
// replay-from-scratch regardless of what an earlier CI step applied, and
|
||||
// registers a Cleanup that closes the connection.
|
||||
//
|
||||
// It also points the package-global db.DB at this connection, because
|
||||
// RunMigrations operates on db.DB. NOT SAFE for t.Parallel() — it owns the
|
||||
// schema for the duration of the test.
|
||||
func freshIntegrationDB(t *testing.T) *sql.DB {
|
||||
t.Helper()
|
||||
url := os.Getenv("INTEGRATION_DB_URL")
|
||||
if url == "" {
|
||||
t.Skip("INTEGRATION_DB_URL not set; skipping real-PG replay test (local devs: see file header)")
|
||||
}
|
||||
conn, err := sql.Open("postgres", url)
|
||||
if err != nil {
|
||||
t.Fatalf("open: %v", err)
|
||||
}
|
||||
if err := conn.Ping(); err != nil {
|
||||
t.Fatalf("ping: %v", err)
|
||||
}
|
||||
// True from-scratch: blow away any schema a prior CI step (e.g. the
|
||||
// handlers psql apply-all loop) left behind, then start clean. This is
|
||||
// what makes the test a *replay-from-scratch* gate rather than a
|
||||
// re-apply-onto-existing test.
|
||||
if _, err := conn.Exec(`DROP SCHEMA public CASCADE; CREATE SCHEMA public`); err != nil {
|
||||
t.Fatalf("reset public schema: %v", err)
|
||||
}
|
||||
// gen_random_uuid() (used by 001_workspaces.sql et al.) lives in
|
||||
// pgcrypto on PG < 13 and core on PG 13+. postgres:15-alpine has it in
|
||||
// core, but create the extension defensively so the test does not pin a
|
||||
// specific PG minor.
|
||||
if _, err := conn.Exec(`CREATE EXTENSION IF NOT EXISTS pgcrypto`); err != nil {
|
||||
t.Fatalf("create pgcrypto: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { conn.Close() })
|
||||
return conn
|
||||
}
|
||||
|
||||
// forwardMigrationCount counts the files RunMigrations is expected to apply:
|
||||
// every *.sql that is NOT a *.down.sql. This is derived from the real
|
||||
// directory so the gate auto-tracks new migrations without an edit here.
|
||||
func forwardMigrationCount(t *testing.T) int {
|
||||
t.Helper()
|
||||
all, err := filepath.Glob(filepath.Join(migrationsDir, "*.sql"))
|
||||
if err != nil {
|
||||
t.Fatalf("glob migrations: %v", err)
|
||||
}
|
||||
n := 0
|
||||
for _, f := range all {
|
||||
if len(f) >= len(".down.sql") && f[len(f)-len(".down.sql"):] == ".down.sql" {
|
||||
continue
|
||||
}
|
||||
n++
|
||||
}
|
||||
if n == 0 {
|
||||
t.Fatalf("found zero forward migrations under %s — wrong path?", migrationsDir)
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// TestIntegration_InitPostgres_PingSucceeds proves the production connection
|
||||
// bootstrap actually establishes a usable pool against a real server. A
|
||||
// sqlmock test can never exercise the real DB.Ping() inside InitPostgres,
|
||||
// which is the line that turns a bad DSN / unreachable host into a boot
|
||||
// failure instead of a silently-broken pool.
|
||||
func TestIntegration_InitPostgres_PingSucceeds(t *testing.T) {
|
||||
url := os.Getenv("INTEGRATION_DB_URL")
|
||||
if url == "" {
|
||||
t.Skip("INTEGRATION_DB_URL not set; skipping")
|
||||
}
|
||||
if err := InitPostgres(url); err != nil {
|
||||
t.Fatalf("InitPostgres against real PG failed: %v", err)
|
||||
}
|
||||
if DB == nil {
|
||||
t.Fatal("InitPostgres returned nil error but db.DB is nil")
|
||||
}
|
||||
// The pool must be live, not just opened.
|
||||
if err := DB.Ping(); err != nil {
|
||||
t.Fatalf("db.DB.Ping after InitPostgres: %v", err)
|
||||
}
|
||||
// Round-trip a trivial query to prove the connection actually serves.
|
||||
var one int
|
||||
if err := DB.QueryRow("SELECT 1").Scan(&one); err != nil {
|
||||
t.Fatalf("SELECT 1 round-trip: %v", err)
|
||||
}
|
||||
if one != 1 {
|
||||
t.Fatalf("SELECT 1 returned %d", one)
|
||||
}
|
||||
}
|
||||
|
||||
// TestIntegration_InitPostgres_BadDSNFails proves InitPostgres surfaces an
|
||||
// unreachable/garbage DSN as an error (the ping path), rather than handing
|
||||
// back a half-open pool. Watch-fail: if someone drops the DB.Ping() check
|
||||
// from InitPostgres, this stops returning an error and fails.
|
||||
func TestIntegration_InitPostgres_BadDSNFails(t *testing.T) {
|
||||
if os.Getenv("INTEGRATION_DB_URL") == "" {
|
||||
t.Skip("INTEGRATION_DB_URL not set; skipping")
|
||||
}
|
||||
// Valid DSN shape, but nothing is listening on this port.
|
||||
err := InitPostgres("postgres://postgres:test@127.0.0.1:1/does_not_exist?sslmode=disable&connect_timeout=2")
|
||||
if err == nil {
|
||||
t.Fatal("expected InitPostgres to fail against an unreachable DSN, got nil (DB.Ping check removed?)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestIntegration_MigrationReplay_FromScratch is the core gate: run the
|
||||
// PRODUCTION RunMigrations over a blank public schema and assert the full
|
||||
// forward chain applies cleanly with zero skips.
|
||||
//
|
||||
// Watch-fail intent:
|
||||
// - #211 .down-wipe: a `.down.sql` leaking into the forward set would
|
||||
// run a DROP before its CREATE → error here (hard fail), or wipe a
|
||||
// table → the schema_migrations / table-presence assertions catch it.
|
||||
// - mis-ordered / dangling-dependency migration → RunMigrations returns
|
||||
// a non-nil error and this test fails.
|
||||
func TestIntegration_MigrationReplay_FromScratch(t *testing.T) {
|
||||
conn := freshIntegrationDB(t)
|
||||
DB = conn // RunMigrations operates on the package-global DB.
|
||||
|
||||
if err := RunMigrations(migrationsDir); err != nil {
|
||||
t.Fatalf("full-chain replay-from-scratch failed: %v", err)
|
||||
}
|
||||
|
||||
// Every forward migration must be recorded as applied — proves none was
|
||||
// silently skipped (the failure mode the CI psql loop tolerates).
|
||||
want := forwardMigrationCount(t)
|
||||
var got int
|
||||
if err := DB.QueryRow("SELECT COUNT(*) FROM schema_migrations").Scan(&got); err != nil {
|
||||
t.Fatalf("count schema_migrations: %v", err)
|
||||
}
|
||||
if got != want {
|
||||
t.Errorf("schema_migrations recorded %d migrations, expected %d (the full forward chain)", got, want)
|
||||
}
|
||||
|
||||
// No `.down.sql` may ever be recorded — that is the #211 signature.
|
||||
var downRecorded int
|
||||
if err := DB.QueryRow(
|
||||
"SELECT COUNT(*) FROM schema_migrations WHERE filename LIKE '%.down.sql'",
|
||||
).Scan(&downRecorded); err != nil {
|
||||
t.Fatalf("count down migrations: %v", err)
|
||||
}
|
||||
if downRecorded != 0 {
|
||||
t.Errorf("a .down.sql migration was applied (#211 regression): %d recorded", downRecorded)
|
||||
}
|
||||
|
||||
// Spot-check load-bearing tables that survive to HEAD of the chain.
|
||||
// workspaces is the root table; workspace_auth_tokens was the #211
|
||||
// canary (its data wipe regressed AdminAuth to fail-open).
|
||||
for _, tbl := range []string{"workspaces", "workspace_auth_tokens", "delegations", "activity_logs"} {
|
||||
var exists bool
|
||||
if err := DB.QueryRow(
|
||||
"SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_schema='public' AND table_name=$1)",
|
||||
tbl,
|
||||
).Scan(&exists); err != nil {
|
||||
t.Fatalf("check table %s: %v", tbl, err)
|
||||
}
|
||||
if !exists {
|
||||
t.Errorf("table %q missing after full replay — chain did not land it", tbl)
|
||||
}
|
||||
}
|
||||
|
||||
// agent_memories is CREATEd at 008 and DROPped at the end of the chain
|
||||
// (20260524110000_drop_agent_memories). Its absence proves the late
|
||||
// drop migration actually ran AFTER the early create — i.e. ordering
|
||||
// held. If the chain ever runs a drop before its create, this flips.
|
||||
var legacyExists bool
|
||||
if err := DB.QueryRow(
|
||||
"SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_schema='public' AND table_name='agent_memories')",
|
||||
).Scan(&legacyExists); err != nil {
|
||||
t.Fatalf("check agent_memories: %v", err)
|
||||
}
|
||||
if legacyExists {
|
||||
t.Error("agent_memories still present at HEAD — the late drop migration did not replay in order")
|
||||
}
|
||||
}
|
||||
|
||||
// TestIntegration_MigrationReplay_IsIdempotent_DoubleApply guards the 045
|
||||
// crash-loop class (cp#429 / project_cp_migration_045_crashloop_idempotency_guard):
|
||||
// the runner re-checks every file on every boot, so a non-idempotent
|
||||
// migration replays fine once and FAILS on the second pass. Here we run the
|
||||
// full chain twice. The second pass must apply ZERO new files (all recorded)
|
||||
// and must not error.
|
||||
//
|
||||
// NOTE: this runs against the SAME populated schema, so it also exercises
|
||||
// the "skip already-applied" tracking path end-to-end against real PG, which
|
||||
// the sqlmock tests only simulate.
|
||||
func TestIntegration_MigrationReplay_IsIdempotent_DoubleApply(t *testing.T) {
|
||||
conn := freshIntegrationDB(t)
|
||||
DB = conn
|
||||
|
||||
if err := RunMigrations(migrationsDir); err != nil {
|
||||
t.Fatalf("first replay failed: %v", err)
|
||||
}
|
||||
var afterFirst int
|
||||
if err := DB.QueryRow("SELECT COUNT(*) FROM schema_migrations").Scan(&afterFirst); err != nil {
|
||||
t.Fatalf("count after first: %v", err)
|
||||
}
|
||||
|
||||
// Second boot: nothing new should apply, and it must not error even
|
||||
// though the runner re-evaluates every file (the 045 failure mode).
|
||||
if err := RunMigrations(migrationsDir); err != nil {
|
||||
t.Fatalf("second replay failed (non-idempotent migration / 045 crash-loop class): %v", err)
|
||||
}
|
||||
var afterSecond int
|
||||
if err := DB.QueryRow("SELECT COUNT(*) FROM schema_migrations").Scan(&afterSecond); err != nil {
|
||||
t.Fatalf("count after second: %v", err)
|
||||
}
|
||||
if afterSecond != afterFirst {
|
||||
t.Errorf("second boot changed schema_migrations from %d to %d — re-application is not a clean no-op", afterFirst, afterSecond)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,291 @@
|
||||
package db
|
||||
|
||||
// redis_test.go — regression coverage for the workspace online-status and
|
||||
// URL-resolution Redis layer (redis.go), which previously had NO test.
|
||||
//
|
||||
// Issue #2150 (SOP rule internal#765). redis.go drives two fleet-wide
|
||||
// behaviours that break silently if a key name or TTL drifts:
|
||||
//
|
||||
// - online detection: SetOnline / RefreshTTL / IsOnline on `ws:<id>`.
|
||||
// A wrong key prefix or a TTL shorter than the heartbeat interval makes
|
||||
// live workspaces flap to "unreachable — restart" (the exact failure
|
||||
// LivenessTTL=180s was tuned to avoid). A TTL too long hides real
|
||||
// crashes.
|
||||
// - proxy URL resolution: CacheURL / GetCachedURL / CacheInternalURL /
|
||||
// GetCachedInternalURL on `ws:<id>:url` and `ws:<id>:internal_url`.
|
||||
// A2A forwarding resolves the target workspace through these keys; a
|
||||
// prefix collision (e.g. the liveness key overlapping the URL key)
|
||||
// would serve the wrong URL or a literal "online" string as a URL.
|
||||
//
|
||||
// These tests run against miniredis — an in-process Redis that speaks the
|
||||
// real RESP protocol and enforces real TTL/expiry semantics — so they
|
||||
// exercise the actual go-redis client calls and key/TTL behaviour, not a
|
||||
// mock that rubber-stamps them. miniredis is already a module dependency.
|
||||
//
|
||||
// Watch-fail intent: change any `ws:%s...` format string in redis.go, or
|
||||
// regress LivenessTTL below the heartbeat window, and a test here fails.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/alicebob/miniredis/v2"
|
||||
"github.com/redis/go-redis/v9"
|
||||
)
|
||||
|
||||
// withMiniRedis spins up an in-process Redis, points the package-global RDB
|
||||
// at it, and registers Cleanup. Returns the server handle so tests can drive
|
||||
// the clock (FastForward) to exercise TTL expiry deterministically.
|
||||
func withMiniRedis(t *testing.T) *miniredis.Miniredis {
|
||||
t.Helper()
|
||||
mr, err := miniredis.Run()
|
||||
if err != nil {
|
||||
t.Fatalf("miniredis.Run: %v", err)
|
||||
}
|
||||
RDB = redis.NewClient(&redis.Options{Addr: mr.Addr()})
|
||||
t.Cleanup(func() {
|
||||
RDB.Close()
|
||||
mr.Close()
|
||||
})
|
||||
return mr
|
||||
}
|
||||
|
||||
// TestLivenessTTL_ExceedsHeartbeatWindow pins the tuned TTL. The heartbeat
|
||||
// loop fires every 30s; LivenessTTL must allow several missed beats (the
|
||||
// comment in redis.go targets ~5) so a busy leader starved for 60-120s is
|
||||
// not falsely declared dead. 180s = 6×30s. Regressing this toward the old
|
||||
// 60s value reintroduces the false-positive restart cycle.
|
||||
func TestLivenessTTL_ExceedsHeartbeatWindow(t *testing.T) {
|
||||
const heartbeatInterval = 30 * time.Second
|
||||
const minMissedBeats = 5
|
||||
if LivenessTTL < heartbeatInterval*minMissedBeats {
|
||||
t.Errorf("LivenessTTL=%s is too short: must tolerate >=%d missed %s heartbeats (>= %s) to avoid false-positive restarts",
|
||||
LivenessTTL, minMissedBeats, heartbeatInterval, heartbeatInterval*minMissedBeats)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSetOnline_KeyAndTTL verifies SetOnline writes the canonical `ws:<id>`
|
||||
// key with the value "online" and the LivenessTTL — the exact contract
|
||||
// IsOnline and the a2a_proxy reactive check rely on.
|
||||
func TestSetOnline_KeyAndTTL(t *testing.T) {
|
||||
mr := withMiniRedis(t)
|
||||
ctx := context.Background()
|
||||
const ws = "ws-abc-123"
|
||||
|
||||
if err := SetOnline(ctx, ws); err != nil {
|
||||
t.Fatalf("SetOnline: %v", err)
|
||||
}
|
||||
|
||||
// Key name must be exactly ws:<id> — not, say, ws:<id>:online.
|
||||
if !mr.Exists("ws:" + ws) {
|
||||
t.Fatalf("expected key %q to exist; keys present: %v", "ws:"+ws, mr.Keys())
|
||||
}
|
||||
got, err := mr.Get("ws:" + ws)
|
||||
if err != nil {
|
||||
t.Fatalf("mr.Get: %v", err)
|
||||
}
|
||||
if got != "online" {
|
||||
t.Errorf("liveness value = %q, want %q", got, "online")
|
||||
}
|
||||
|
||||
// TTL must be the tuned LivenessTTL (allow miniredis's whole-second
|
||||
// granularity).
|
||||
ttl := mr.TTL("ws:" + ws)
|
||||
if ttl != LivenessTTL {
|
||||
t.Errorf("TTL = %s, want %s", ttl, LivenessTTL)
|
||||
}
|
||||
}
|
||||
|
||||
// TestIsOnline_TrueThenExpires drives the real TTL clock: a freshly-set
|
||||
// workspace is online; after the TTL elapses it is offline. This is the
|
||||
// behaviour online-detection depends on — proven against real expiry, not
|
||||
// asserted from a mock.
|
||||
func TestIsOnline_TrueThenExpires(t *testing.T) {
|
||||
mr := withMiniRedis(t)
|
||||
ctx := context.Background()
|
||||
const ws = "ws-expiry"
|
||||
|
||||
if err := SetOnline(ctx, ws); err != nil {
|
||||
t.Fatalf("SetOnline: %v", err)
|
||||
}
|
||||
online, err := IsOnline(ctx, ws)
|
||||
if err != nil {
|
||||
t.Fatalf("IsOnline: %v", err)
|
||||
}
|
||||
if !online {
|
||||
t.Fatal("expected workspace online immediately after SetOnline")
|
||||
}
|
||||
|
||||
// Fast-forward just past the TTL; the liveness key must expire.
|
||||
mr.FastForward(LivenessTTL + time.Second)
|
||||
|
||||
online, err = IsOnline(ctx, ws)
|
||||
if err != nil {
|
||||
t.Fatalf("IsOnline after expiry: %v", err)
|
||||
}
|
||||
if online {
|
||||
t.Error("expected workspace offline after TTL elapsed")
|
||||
}
|
||||
}
|
||||
|
||||
// TestRefreshTTL_ExtendsLiveness proves a heartbeat (RefreshTTL) keeps a
|
||||
// workspace alive across what would otherwise be an expiry. Without the
|
||||
// refresh the key expires; with it, IsOnline stays true. Watch-fail: if
|
||||
// RefreshTTL targets the wrong key, the refresh is a no-op and this fails.
|
||||
func TestRefreshTTL_ExtendsLiveness(t *testing.T) {
|
||||
mr := withMiniRedis(t)
|
||||
ctx := context.Background()
|
||||
const ws = "ws-refresh"
|
||||
|
||||
if err := SetOnline(ctx, ws); err != nil {
|
||||
t.Fatalf("SetOnline: %v", err)
|
||||
}
|
||||
// Advance most of the way to expiry, then heartbeat.
|
||||
mr.FastForward(LivenessTTL - 5*time.Second)
|
||||
if err := RefreshTTL(ctx, ws); err != nil {
|
||||
t.Fatalf("RefreshTTL: %v", err)
|
||||
}
|
||||
// Advance past where the ORIGINAL TTL would have expired. Still online.
|
||||
mr.FastForward(10 * time.Second)
|
||||
online, err := IsOnline(ctx, ws)
|
||||
if err != nil {
|
||||
t.Fatalf("IsOnline: %v", err)
|
||||
}
|
||||
if !online {
|
||||
t.Error("expected workspace still online after RefreshTTL heartbeat")
|
||||
}
|
||||
}
|
||||
|
||||
// TestIsOnline_UnknownWorkspace returns false (and no error) for a workspace
|
||||
// that was never set — the default for a never-registered / long-dead agent.
|
||||
func TestIsOnline_UnknownWorkspace(t *testing.T) {
|
||||
withMiniRedis(t)
|
||||
ctx := context.Background()
|
||||
online, err := IsOnline(ctx, "never-seen")
|
||||
if err != nil {
|
||||
t.Fatalf("IsOnline: %v", err)
|
||||
}
|
||||
if online {
|
||||
t.Error("expected unknown workspace to be offline")
|
||||
}
|
||||
}
|
||||
|
||||
// TestURLCache_RoundTrip pins the `ws:<id>:url` key and its 5-minute TTL,
|
||||
// and proves the value round-trips. A2A push resolves the target through
|
||||
// this key.
|
||||
func TestURLCache_RoundTrip(t *testing.T) {
|
||||
mr := withMiniRedis(t)
|
||||
ctx := context.Background()
|
||||
const ws = "ws-url"
|
||||
const url = "https://ws-url.workspaces.moleculesai.app"
|
||||
|
||||
if err := CacheURL(ctx, ws, url); err != nil {
|
||||
t.Fatalf("CacheURL: %v", err)
|
||||
}
|
||||
got, err := GetCachedURL(ctx, ws)
|
||||
if err != nil {
|
||||
t.Fatalf("GetCachedURL: %v", err)
|
||||
}
|
||||
if got != url {
|
||||
t.Errorf("GetCachedURL = %q, want %q", got, url)
|
||||
}
|
||||
if !mr.Exists("ws:" + ws + ":url") {
|
||||
t.Errorf("expected key %q; present: %v", "ws:"+ws+":url", mr.Keys())
|
||||
}
|
||||
if ttl := mr.TTL("ws:" + ws + ":url"); ttl != 5*time.Minute {
|
||||
t.Errorf("url cache TTL = %s, want 5m", ttl)
|
||||
}
|
||||
}
|
||||
|
||||
// TestInternalURLCache_RoundTrip pins the `ws:<id>:internal_url` key (the
|
||||
// Docker-internal address used for workspace-to-workspace discovery) and its
|
||||
// 5-minute TTL.
|
||||
func TestInternalURLCache_RoundTrip(t *testing.T) {
|
||||
mr := withMiniRedis(t)
|
||||
ctx := context.Background()
|
||||
const ws = "ws-int"
|
||||
const url = "http://ws-int:8080"
|
||||
|
||||
if err := CacheInternalURL(ctx, ws, url); err != nil {
|
||||
t.Fatalf("CacheInternalURL: %v", err)
|
||||
}
|
||||
got, err := GetCachedInternalURL(ctx, ws)
|
||||
if err != nil {
|
||||
t.Fatalf("GetCachedInternalURL: %v", err)
|
||||
}
|
||||
if got != url {
|
||||
t.Errorf("GetCachedInternalURL = %q, want %q", got, url)
|
||||
}
|
||||
if ttl := mr.TTL("ws:" + ws + ":internal_url"); ttl != 5*time.Minute {
|
||||
t.Errorf("internal url cache TTL = %s, want 5m", ttl)
|
||||
}
|
||||
}
|
||||
|
||||
// TestKeyNamespacesDoNotCollide is the prefix-collision regression: the
|
||||
// liveness key (ws:<id>), the URL key (ws:<id>:url), and the internal-URL
|
||||
// key (ws:<id>:internal_url) must be three DISTINCT keys for the same
|
||||
// workspace. If a future edit collapses the format strings, IsOnline would
|
||||
// read a URL as liveness (or vice versa) and online-detection / proxy
|
||||
// resolution would corrupt each other fleet-wide.
|
||||
func TestKeyNamespacesDoNotCollide(t *testing.T) {
|
||||
mr := withMiniRedis(t)
|
||||
ctx := context.Background()
|
||||
const ws = "ws-collide"
|
||||
|
||||
if err := SetOnline(ctx, ws); err != nil {
|
||||
t.Fatalf("SetOnline: %v", err)
|
||||
}
|
||||
if err := CacheURL(ctx, ws, "https://public"); err != nil {
|
||||
t.Fatalf("CacheURL: %v", err)
|
||||
}
|
||||
if err := CacheInternalURL(ctx, ws, "http://internal:8080"); err != nil {
|
||||
t.Fatalf("CacheInternalURL: %v", err)
|
||||
}
|
||||
|
||||
// Liveness value must still be "online", NOT a URL.
|
||||
if v, _ := mr.Get("ws:" + ws); v != "online" {
|
||||
t.Errorf("liveness key clobbered by a URL write: got %q", v)
|
||||
}
|
||||
if v, _ := mr.Get("ws:" + ws + ":url"); v != "https://public" {
|
||||
t.Errorf("url key = %q, want https://public", v)
|
||||
}
|
||||
if v, _ := mr.Get("ws:" + ws + ":internal_url"); v != "http://internal:8080" {
|
||||
t.Errorf("internal_url key = %q, want http://internal:8080", v)
|
||||
}
|
||||
}
|
||||
|
||||
// TestClearWorkspaceKeys_RemovesAllThree proves teardown removes the
|
||||
// liveness, URL, and internal-URL keys together — a leaked liveness key
|
||||
// after deletion would keep a dead workspace looking online; a leaked URL
|
||||
// key would let the proxy forward to a recycled address.
|
||||
func TestClearWorkspaceKeys_RemovesAllThree(t *testing.T) {
|
||||
mr := withMiniRedis(t)
|
||||
ctx := context.Background()
|
||||
const ws = "ws-clear"
|
||||
|
||||
if err := SetOnline(ctx, ws); err != nil {
|
||||
t.Fatalf("SetOnline: %v", err)
|
||||
}
|
||||
if err := CacheURL(ctx, ws, "https://x"); err != nil {
|
||||
t.Fatalf("CacheURL: %v", err)
|
||||
}
|
||||
if err := CacheInternalURL(ctx, ws, "http://x:8080"); err != nil {
|
||||
t.Fatalf("CacheInternalURL: %v", err)
|
||||
}
|
||||
|
||||
ClearWorkspaceKeys(ctx, ws)
|
||||
|
||||
for _, k := range []string{"ws:" + ws, "ws:" + ws + ":url", "ws:" + ws + ":internal_url"} {
|
||||
if mr.Exists(k) {
|
||||
t.Errorf("key %q survived ClearWorkspaceKeys", k)
|
||||
}
|
||||
}
|
||||
online, err := IsOnline(ctx, ws)
|
||||
if err != nil {
|
||||
t.Fatalf("IsOnline: %v", err)
|
||||
}
|
||||
if online {
|
||||
t.Error("workspace still online after ClearWorkspaceKeys")
|
||||
}
|
||||
}
|
||||
@@ -272,20 +272,6 @@ func MarkQueueItemFailed(ctx context.Context, id, errMsg string) {
|
||||
}
|
||||
}
|
||||
|
||||
// QueueDepth returns the number of currently-queued (not dispatched/completed)
|
||||
// items for a workspace. Used by the busy-return response body so callers
|
||||
// can see how many ahead of them.
|
||||
func QueueDepth(ctx context.Context, workspaceID string) int {
|
||||
var n int
|
||||
if err := db.DB.QueryRowContext(ctx,
|
||||
`SELECT COUNT(*) FROM a2a_queue WHERE workspace_id = $1 AND status = 'queued'`,
|
||||
workspaceID,
|
||||
).Scan(&n); err != nil {
|
||||
log.Printf("A2AQueue: QueueDepth query failed for workspace %s: %v", workspaceID, err)
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// DropStaleQueueItems marks queued items older than maxAge as 'dropped' with a
|
||||
// system-generated reason so PM agents stop processing stale post-incident noise.
|
||||
// Called with a workspaceID to scope cleanup to one workspace, or empty to sweep
|
||||
|
||||
@@ -31,14 +31,53 @@ type workspaceDisplayResponse struct {
|
||||
Status string `json:"status,omitempty"`
|
||||
}
|
||||
|
||||
var workspaceComputeInstanceAllowlist = map[string]struct{}{
|
||||
"t3.medium": {},
|
||||
"t3.large": {},
|
||||
"t3.xlarge": {},
|
||||
"t3.2xlarge": {},
|
||||
"m6i.large": {},
|
||||
"m6i.xlarge": {},
|
||||
"c6i.xlarge": {},
|
||||
// workspaceComputeInstanceAllowlist is keyed by cloud provider (multi-provider /
|
||||
// in-place switch): each provider's box accepts only that provider's machine
|
||||
// sizes (an AWS t3.* is meaningless on Hetzner, and vice-versa). Mirrors the CP
|
||||
// provider SSOT — keep in lock-step with the controlplane provider configs
|
||||
// (Hetzner ServerType cpx*/cax*, GCP MachineType e2-*, AWS EC2 t3*/m6i*/c6i*).
|
||||
// TestValidateWorkspaceCompute_Provider / _InstanceTypePerProvider pin the sets.
|
||||
// "" provider = AWS default.
|
||||
var workspaceComputeInstanceAllowlist = map[string]map[string]struct{}{
|
||||
"aws": {
|
||||
"t3.medium": {}, "t3.large": {}, "t3.xlarge": {}, "t3.2xlarge": {},
|
||||
"m6i.large": {}, "m6i.xlarge": {}, "c6i.xlarge": {},
|
||||
},
|
||||
"hetzner": {
|
||||
"cpx11": {}, "cpx21": {}, "cpx31": {}, "cpx41": {}, "cpx51": {},
|
||||
"cax11": {}, "cax21": {}, "cax31": {}, "cax41": {},
|
||||
},
|
||||
"gcp": {
|
||||
"e2-small": {}, "e2-medium": {},
|
||||
"e2-standard-2": {}, "e2-standard-4": {}, "e2-standard-8": {},
|
||||
},
|
||||
}
|
||||
|
||||
// normalizeCloudProvider maps "" → "aws" so the in-place switch comparison
|
||||
// treats the default and an explicit "aws" as the same cloud (no spurious switch).
|
||||
func normalizeCloudProvider(p string) string {
|
||||
if p == "" {
|
||||
return "aws"
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
// instanceTypeAllowedForProvider reports whether instanceType is valid for the
|
||||
// given provider ("" → aws). Empty instanceType is always allowed (CP defaults).
|
||||
func instanceTypeAllowedForProvider(provider, instanceType string) bool {
|
||||
if instanceType == "" {
|
||||
return true
|
||||
}
|
||||
p := provider
|
||||
if p == "" {
|
||||
p = "aws"
|
||||
}
|
||||
set, ok := workspaceComputeInstanceAllowlist[p]
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
_, ok = set[instanceType]
|
||||
return ok
|
||||
}
|
||||
|
||||
// workspaceComputeProviderAllowlist mirrors the controlplane cloud-provider SSOT
|
||||
@@ -56,11 +95,24 @@ var workspaceComputeProviderAllowlist = map[string]struct{}{
|
||||
}
|
||||
|
||||
func validateWorkspaceCompute(compute models.WorkspaceCompute) error {
|
||||
if compute.InstanceType != "" {
|
||||
if _, ok := workspaceComputeInstanceAllowlist[compute.InstanceType]; !ok {
|
||||
return fmt.Errorf("unsupported compute.instance_type")
|
||||
// Provider first (so the instance-type check below can be provider-scoped).
|
||||
// "" = default (AWS). CP fail-closes an unwired provider with a 422; validating
|
||||
// here gives a clean 400 before the round-trip and is the gate reused by the
|
||||
// switch-provider flow. Mirrors the controlplane cloudprovider SSOT.
|
||||
if compute.Provider != "" {
|
||||
if _, ok := workspaceComputeProviderAllowlist[compute.Provider]; !ok {
|
||||
return fmt.Errorf("unsupported compute.provider (want aws|gcp|hetzner)")
|
||||
}
|
||||
}
|
||||
// Instance type must belong to the chosen provider (an AWS t3.* is invalid on
|
||||
// Hetzner, etc.). Empty = CP default for the provider.
|
||||
if !instanceTypeAllowedForProvider(compute.Provider, compute.InstanceType) {
|
||||
prov := compute.Provider
|
||||
if prov == "" {
|
||||
prov = "aws"
|
||||
}
|
||||
return fmt.Errorf("unsupported compute.instance_type %q for provider %q", compute.InstanceType, prov)
|
||||
}
|
||||
if compute.Volume.RootGB != 0 {
|
||||
if compute.Volume.RootGB < workspaceComputeDiskFloorGB || compute.Volume.RootGB > workspaceComputeDiskCeilingGB {
|
||||
return fmt.Errorf("compute.volume.root_gb must be between %d and %d", workspaceComputeDiskFloorGB, workspaceComputeDiskCeilingGB)
|
||||
@@ -87,15 +139,6 @@ func validateWorkspaceCompute(compute models.WorkspaceCompute) error {
|
||||
default:
|
||||
return fmt.Errorf("unsupported compute.data_persistence (want persist|ephemeral)")
|
||||
}
|
||||
// Cloud backend for the box (multi-provider). "" = default (AWS). CP fail-
|
||||
// closes an unwired provider with a 422 (PROVIDER_UNAVAILABLE); validating
|
||||
// here gives a clean 400 before the round-trip and is the gate reused by the
|
||||
// switch-provider flow. Mirrors the controlplane cloudprovider SSOT.
|
||||
if compute.Provider != "" {
|
||||
if _, ok := workspaceComputeProviderAllowlist[compute.Provider]; !ok {
|
||||
return fmt.Errorf("unsupported compute.provider (want aws|gcp|hetzner)")
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -64,6 +64,40 @@ func TestValidateWorkspaceCompute_Provider(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// Multi-provider / in-place switch: an instance type must belong to the chosen
|
||||
// provider — an AWS t3.* is meaningless on Hetzner, a cpx* on AWS, etc. Pins the
|
||||
// provider-keyed allowlist (mirrors the CP provider configs).
|
||||
func TestValidateWorkspaceCompute_InstanceTypePerProvider(t *testing.T) {
|
||||
good := []struct{ provider, instance string }{
|
||||
{"", "t3.medium"}, {"aws", "t3.2xlarge"}, {"aws", "c6i.xlarge"},
|
||||
{"hetzner", "cpx31"}, {"hetzner", "cax41"},
|
||||
{"gcp", "e2-standard-2"}, {"gcp", "e2-small"},
|
||||
{"hetzner", ""}, {"gcp", ""}, // empty instance = CP default, always ok
|
||||
}
|
||||
for _, g := range good {
|
||||
c := models.WorkspaceCompute{Provider: g.provider, InstanceType: g.instance}
|
||||
if err := validateWorkspaceCompute(c); err != nil {
|
||||
t.Errorf("provider=%q instance=%q must be accepted: %v", g.provider, g.instance, err)
|
||||
}
|
||||
}
|
||||
bad := []struct{ provider, instance string }{
|
||||
{"hetzner", "t3.medium"}, // AWS type on Hetzner
|
||||
{"aws", "cpx31"}, // Hetzner type on AWS
|
||||
{"gcp", "t3.large"}, // AWS type on GCP
|
||||
{"hetzner", "e2-small"}, // GCP type on Hetzner
|
||||
{"", "cpx31"}, // default(aws) + Hetzner type
|
||||
}
|
||||
for _, b := range bad {
|
||||
c := models.WorkspaceCompute{Provider: b.provider, InstanceType: b.instance}
|
||||
if err := validateWorkspaceCompute(c); err == nil {
|
||||
t.Errorf("provider=%q instance=%q must be rejected (cross-provider instance type)", b.provider, b.instance)
|
||||
}
|
||||
}
|
||||
if normalizeCloudProvider("") != "aws" || normalizeCloudProvider("hetzner") != "hetzner" {
|
||||
t.Fatal("normalizeCloudProvider: \"\" must map to aws; explicit providers unchanged")
|
||||
}
|
||||
}
|
||||
|
||||
// internal#734: data_persistence enum. "" (auto), "persist", "ephemeral" are
|
||||
// the only accepted values; anything else is a clear 400 before the CP call.
|
||||
func TestValidateWorkspaceCompute_DataPersistence(t *testing.T) {
|
||||
|
||||
@@ -164,6 +164,7 @@ func (h *WorkspaceHandler) Update(c *gin.Context) {
|
||||
}
|
||||
}
|
||||
var computeJSON string
|
||||
var newComputeProvider string // hoisted: drives the cloud-provider switch detection below
|
||||
computePatch := false
|
||||
if rawCompute, ok := body["compute"]; ok {
|
||||
computePatch = true
|
||||
@@ -184,6 +185,7 @@ func (h *WorkspaceHandler) Update(c *gin.Context) {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
newComputeProvider = compute.Provider
|
||||
encoded, err := workspaceComputeJSON(compute)
|
||||
if err != nil {
|
||||
log.Printf("Update compute encode error for %s: %v", id, err)
|
||||
@@ -262,6 +264,55 @@ func (h *WorkspaceHandler) Update(c *gin.Context) {
|
||||
needsRestart = true
|
||||
}
|
||||
if computePatch {
|
||||
// Cloud-provider SWITCH (in-place): if the incoming provider differs from
|
||||
// the one currently stored, the existing box lives on the OLD cloud. We
|
||||
// MUST deprovision it on the OLD provider BEFORE overwriting compute —
|
||||
// otherwise the subsequent "Save & Restart" restart's provider-aware
|
||||
// deprovision (cpProv.Stop → resolveProvider reads compute->>'provider')
|
||||
// would target the NEW cloud and ORPHAN the old box (a silently-billing
|
||||
// leak). Cloud mode only (the local Docker provisioner has no cross-cloud
|
||||
// concept; provider stays "" there so this never fires). After this, the
|
||||
// canvas's restart provisions the box on the new cloud; its own Stop is a
|
||||
// safe no-op (the box is already gone).
|
||||
if h.cpProv != nil {
|
||||
var oldProvider sql.NullString
|
||||
err := db.DB.QueryRowContext(ctx, `SELECT compute->>'provider' FROM workspaces WHERE id = $1`, id).Scan(&oldProvider)
|
||||
// FAIL-CLOSED on the read. The earlier `err == nil` gate was fail-OPEN:
|
||||
// a transient/unexpected DB error here skipped the whole switch block and
|
||||
// fell through to the compute UPDATE — so during a real switch the later
|
||||
// provider-aware restart deprovision would target the NEW cloud and ORPHAN
|
||||
// the old box (silent billing, unrecoverable). We cannot tell whether this
|
||||
// is a cross-cloud switch without the old provider, so on any error other
|
||||
// than "no such row" we abort exactly like a failed deprovision: compute
|
||||
// untouched, old box still recoverable, user retries. (sql.ErrNoRows means
|
||||
// there is genuinely no prior box — nothing to orphan — so it's safe to
|
||||
// skip the switch and let the UPDATE proceed.)
|
||||
if err != nil && !errors.Is(err, sql.ErrNoRows) {
|
||||
log.Printf("Update: provider-switch precheck for %s ABORTED — could not read current cloud provider (provider left unchanged): %v", id, err)
|
||||
c.JSON(http.StatusBadGateway, gin.H{"error": "could not read the current cloud provider; provider unchanged — please retry"})
|
||||
return
|
||||
}
|
||||
if err == nil && normalizeCloudProvider(oldProvider.String) != normalizeCloudProvider(newComputeProvider) {
|
||||
log.Printf("Update: cloud-provider switch for %s: %q -> %q; deprovisioning old box on old provider before overwriting compute",
|
||||
id, normalizeCloudProvider(oldProvider.String), normalizeCloudProvider(newComputeProvider))
|
||||
// Use the ERROR-returning variant and ABORT before overwriting
|
||||
// compute if the old-box deprovision fails. If we proceeded, the
|
||||
// old box would keep running on the OLD cloud while the row now
|
||||
// records the NEW provider+instance — stranding it with no DB
|
||||
// pointer (an UNRECOVERABLE cross-cloud orphan that no reconciler
|
||||
// can map back). Aborting leaves the row pointing at the
|
||||
// still-recoverable old box; the user can retry the switch. (The
|
||||
// restart paths' void cpStopWithRetry is fine there because the
|
||||
// box stays on the SAME cloud, so the provider record is unchanged
|
||||
// and a provider-scoped sweep can still find it.)
|
||||
if err := h.cpStopWithRetryErr(ctx, id, "provider-switch", false); err != nil {
|
||||
log.Printf("Update: provider-switch for %s ABORTED — could not deprovision old box on %q (provider left unchanged, old box recoverable): %v",
|
||||
id, normalizeCloudProvider(oldProvider.String), err)
|
||||
c.JSON(http.StatusBadGateway, gin.H{"error": "could not deprovision the current cloud box; provider unchanged — please retry"})
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET compute = $2::jsonb, updated_at = now() WHERE id = $1`, id, computeJSON); err != nil {
|
||||
log.Printf("Update compute error for %s: %v", id, err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to save compute config"})
|
||||
|
||||
@@ -0,0 +1,180 @@
|
||||
package handlers
|
||||
|
||||
// workspace_provider_switch_test.go — deterministic coverage for the in-place
|
||||
// cloud-provider switch in the Update (PATCH /workspaces/:id) handler.
|
||||
//
|
||||
// The switch is DESTRUCTIVE (it recreates the box on a new cloud) and its
|
||||
// safety hinges on ORDER + ABORT, which these tests pin without touching a real
|
||||
// cloud (sqlmock DB + the scriptedCPStop fake from workspace_restart_stop_retry_test):
|
||||
//
|
||||
// 1. On a provider change, the OLD box is deprovisioned (cpProv.Stop) BEFORE
|
||||
// the compute row is overwritten — otherwise the later restart's
|
||||
// provider-aware deprovision would target the NEW cloud and ORPHAN the old
|
||||
// (still-billing) box. The sqlmock query ORDER pins "read old provider →
|
||||
// [Stop] → UPDATE compute".
|
||||
// 2. If the old-box deprovision FAILS, the handler ABORTS (502) and does NOT
|
||||
// overwrite compute — leaving the row pointed at the recoverable old box
|
||||
// (an unexpected UPDATE would fail sqlmock's expectations).
|
||||
// 3. A non-switch compute edit (same provider) does NOT deprovision anything.
|
||||
// 4. If the old-provider READ errors (transient DB fault, not sql.ErrNoRows),
|
||||
// the handler FAILS CLOSED: aborts (502), deprovisions nothing, and does NOT
|
||||
// overwrite compute — closing the fail-open read path that would otherwise
|
||||
// orphan the old box on a real switch (security review RC 9895).
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
func newPatchContext(t *testing.T, id, body string) (*gin.Context, *httptest.ResponseRecorder) {
|
||||
t.Helper()
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: id}}
|
||||
req := httptest.NewRequest("PATCH", "/workspaces/"+id, bytes.NewBufferString(body))
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
c.Request = req
|
||||
return c, w
|
||||
}
|
||||
|
||||
const switchTestWSID = "cccccccc-0001-0000-0000-000000000000"
|
||||
|
||||
func newSwitchTestHandler(t *testing.T, cp *scriptedCPStop) *WorkspaceHandler {
|
||||
t.Helper()
|
||||
h := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
|
||||
h.cpProv = cp
|
||||
return h
|
||||
}
|
||||
|
||||
// 1. aws → hetzner: deprovision the OLD box, THEN overwrite compute (200).
|
||||
func TestWorkspaceUpdate_ProviderSwitch_DeprovisionsOldBeforeUpdate(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
cp := &scriptedCPStop{} // Stop succeeds
|
||||
h := newSwitchTestHandler(t, cp)
|
||||
|
||||
// Ordered expectations pin: EXISTS → read OLD provider (aws) → UPDATE compute.
|
||||
// The cpProv.Stop deprovision runs (in code) AFTER the provider read and
|
||||
// BEFORE the UPDATE — exactly the orphan-safe order.
|
||||
mock.ExpectQuery("SELECT EXISTS").WithArgs(switchTestWSID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
|
||||
mock.ExpectQuery("compute->>'provider'").WithArgs(switchTestWSID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"provider"}).AddRow("aws"))
|
||||
mock.ExpectExec("UPDATE workspaces SET compute").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
c, w := newPatchContext(t, switchTestWSID,
|
||||
`{"compute":{"instance_type":"cpx31","provider":"hetzner","volume":{"root_gb":30}}}`)
|
||||
h.Update(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200 on a successful switch, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if cp.calls != 1 {
|
||||
t.Fatalf("expected the OLD box to be deprovisioned exactly once on a provider switch; got %d Stop calls", cp.calls)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet/unexpected DB queries (ordering broken?): %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Deprovision FAILS → abort (502) + compute NOT overwritten (no UPDATE).
|
||||
func TestWorkspaceUpdate_ProviderSwitch_AbortsWhenDeprovisionFails(t *testing.T) {
|
||||
shrinkRetryBackoff(t) // don't burn the 1s/2s/4s retry backoff
|
||||
mock := setupTestDB(t)
|
||||
// All retry attempts fail → cpStopWithRetryErr returns an error → abort.
|
||||
cp := &scriptedCPStop{errs: []error{
|
||||
fmt.Errorf("cp 503"), fmt.Errorf("cp 503"), fmt.Errorf("cp 503"),
|
||||
}}
|
||||
h := newSwitchTestHandler(t, cp)
|
||||
|
||||
mock.ExpectQuery("SELECT EXISTS").WithArgs(switchTestWSID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
|
||||
mock.ExpectQuery("compute->>'provider'").WithArgs(switchTestWSID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"provider"}).AddRow("aws"))
|
||||
// NO UPDATE expectation: if the handler overwrote compute after a failed
|
||||
// deprovision (the orphan bug), sqlmock would flag the unexpected query.
|
||||
|
||||
c, w := newPatchContext(t, switchTestWSID,
|
||||
`{"compute":{"instance_type":"cpx31","provider":"hetzner","volume":{"root_gb":30}}}`)
|
||||
h.Update(c)
|
||||
|
||||
if w.Code != http.StatusBadGateway {
|
||||
t.Fatalf("expected 502 when the old-box deprovision fails, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if cp.calls == 0 {
|
||||
t.Fatal("expected at least one Stop attempt before aborting")
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
// A failure here means an UNEXPECTED UPDATE ran — i.e. compute was
|
||||
// overwritten after a failed deprovision → the orphan bug is back.
|
||||
t.Fatalf("compute must NOT be overwritten when deprovision fails (orphan-prevention): %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Same provider (no switch): no deprovision; compute is updated normally.
|
||||
func TestWorkspaceUpdate_NoProviderSwitch_DoesNotDeprovision(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
cp := &scriptedCPStop{}
|
||||
h := newSwitchTestHandler(t, cp)
|
||||
|
||||
mock.ExpectQuery("SELECT EXISTS").WithArgs(switchTestWSID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
|
||||
mock.ExpectQuery("compute->>'provider'").WithArgs(switchTestWSID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"provider"}).AddRow("aws"))
|
||||
mock.ExpectExec("UPDATE workspaces SET compute").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// provider stays aws (only the instance size changes) → no switch, no Stop.
|
||||
c, w := newPatchContext(t, switchTestWSID,
|
||||
`{"compute":{"instance_type":"t3.large","provider":"aws","volume":{"root_gb":60}}}`)
|
||||
h.Update(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if cp.calls != 0 {
|
||||
t.Fatalf("a non-switching compute edit must NOT deprovision the box; got %d Stop calls", cp.calls)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet/unexpected DB queries: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Provider READ errors (transient DB fault) → fail-CLOSED: abort 502, no
|
||||
// deprovision, no compute overwrite. A fail-open read (the old `err == nil`
|
||||
// gate) would skip switch detection and overwrite compute → orphan the old
|
||||
// cloud box. sqlmock has NO UPDATE/Stop expectations, so either an overwrite
|
||||
// or a stray deprovision trips it.
|
||||
func TestWorkspaceUpdate_ProviderSwitch_AbortsOnProviderReadError(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
cp := &scriptedCPStop{}
|
||||
h := newSwitchTestHandler(t, cp)
|
||||
|
||||
mock.ExpectQuery("SELECT EXISTS").WithArgs(switchTestWSID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
|
||||
// The old-provider read hits a transient error (NOT sql.ErrNoRows).
|
||||
mock.ExpectQuery("compute->>'provider'").WithArgs(switchTestWSID).
|
||||
WillReturnError(fmt.Errorf("connection reset by peer"))
|
||||
|
||||
c, w := newPatchContext(t, switchTestWSID,
|
||||
`{"compute":{"instance_type":"cpx31","provider":"hetzner","volume":{"root_gb":30}}}`)
|
||||
h.Update(c)
|
||||
|
||||
if w.Code != http.StatusBadGateway {
|
||||
t.Fatalf("expected 502 when the provider read fails (fail-closed), got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if cp.calls != 0 {
|
||||
t.Fatalf("must NOT deprovision when the current provider can't be read; got %d Stop calls", cp.calls)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
// An unexpected UPDATE here = compute was overwritten despite an unreadable
|
||||
// provider → the fail-open orphan path is back.
|
||||
t.Fatalf("compute must NOT be overwritten on a provider read error (fail-closed): %v", err)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user