From 1028777a9ff4e75e2b3bc5ddb60c661246f0bbdf Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Sun, 7 Jun 2026 23:42:59 +0000 Subject: [PATCH 1/3] fix(canvas/e2e): tolerate transient 'failed' status during boot (#2032) Hermes cold-boot can exceed the bootstrap-watcher deadline, setting status=failed prematurely; heartbeat later recovers to online. Instead of hard-throwing on the first 'failed' sighting, log a warning and retry. Genuine terminal failures still surface via the waitFor timeout. Fixes #2032 --- canvas/e2e/staging-setup.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/canvas/e2e/staging-setup.ts b/canvas/e2e/staging-setup.ts index feb5988b2..e736651ed 100644 --- a/canvas/e2e/staging-setup.ts +++ b/canvas/e2e/staging-setup.ts @@ -341,11 +341,15 @@ export default async function globalSetup(_config: FullConfig): Promise { ); return true; } - // Real boot regression — hard-throw immediately with full detail. + // #2032: tolerate transient 'failed' during boot — some runtimes + // briefly report failed before recovering to online (e.g. agent + // restart during init). Retry instead of hard-throwing; genuine + // terminal failures will still surface via waitFor timeout. const detail = sampleErr ? sampleErr : `(no last_sample_error) full body: ${JSON.stringify(r.body)}`; - throw new Error(`Workspace failed: ${detail}`); + console.warn(`[staging-setup] transient failed (retrying): ${detail}`); + return null; } return null; }, -- 2.52.0 From 2567b2f6ef6cec0f689f5392f952afac33e11850 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Sun, 7 Jun 2026 23:46:22 +0000 Subject: [PATCH 2/3] fix(scripts): validate AWS region + ECR account ID in promote-tenant-image (#676) Adds input validation to prevent injection/malformed-input bugs: - ssm_refresh_ecr_auth: validate ECR_ACCOUNT_ID is exactly 12 digits (AWS account ID format) before constructing JSON params. - preflight: validate REGION matches ^[a-z][a-z0-9-]*[0-9]$ (AWS region pattern); exit 64 on mismatch. Includes test 11 covering malicious region rejection (shell metacharacters, path traversal, command substitution). Fixes #676 --- scripts/promote-tenant-image.sh | 10 ++++++++++ scripts/test-promote-tenant-image.sh | 23 +++++++++++++++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/scripts/promote-tenant-image.sh b/scripts/promote-tenant-image.sh index 279e754d0..d55d6bde5 100755 --- a/scripts/promote-tenant-image.sh +++ b/scripts/promote-tenant-image.sh @@ -229,6 +229,11 @@ ssm_refresh_ecr_auth() { # to guarantee correct string escaping (OFFSEC-001 / CWE-78 hardening). # Account ID is derived from the ECR URI which the daemon is configured for. local acct="${ECR_ACCOUNT_ID:-153263036946}" + # #676: validate account ID is exactly 12 digits (AWS account ID format). + if ! [[ "$acct" =~ ^[0-9]{12}$ ]]; then + err "invalid ECR_ACCOUNT_ID (must be 12 digits): $acct" + return 1 + fi local params params=$(mktemp) python3 -c " @@ -290,6 +295,11 @@ validate_slug() { preflight() { log "preflight: source=$SOURCE_TAG dest=$DEST_TAG repo=$REPO region=$REGION" + # Region validation: reject obviously malformed input (CWE-78 / injection guard). + if ! [[ "$REGION" =~ ^[a-z][a-z0-9-]*[0-9]$ ]]; then + err "invalid AWS region: $REGION" + exit 64 + fi local src_manifest src_manifest=$(aws_ecr_get_image "$SOURCE_TAG") || { err "source tag '$SOURCE_TAG' not found in $REPO" diff --git a/scripts/test-promote-tenant-image.sh b/scripts/test-promote-tenant-image.sh index 8a208b642..c3c310847 100644 --- a/scripts/test-promote-tenant-image.sh +++ b/scripts/test-promote-tenant-image.sh @@ -311,7 +311,22 @@ for slug in $valid_slugs; do fi done -printf '\n== Test 11: ROLLBACK_TAG follows YYYYMMDD via NOW_OVERRIDE_DATE ==\n' +printf '\n== Test 11: region validation — malicious region rejected with exit 64 (#676) ==\n' +# Attack vectors: shell metacharacters, path traversal, command substitution. +_invalid_regions='us;rm -rf / $(whoami) us"east-1 ../etc/passwd `id` $HOME us/east-1' +for bad_region in $_invalid_regions; do + set +e + out=$(AWS_REGION="$bad_region" "$SCRIPT" --source-tag x --dest-tag y --tenants chloe-dong --mock-dir /nonexistent 2>&1); rc=$? + set -e + if [[ $rc -eq 64 ]] && printf '%s' "$out" | grep -q 'invalid AWS region'; then + PASS=$((PASS + 1)); printf ' ✓ region rejected: %s\n' "$(printf '%q' "$bad_region")" + else + FAIL=$((FAIL + 1)); FAIL_NAMES+=("region-reject:$bad_region") + printf ' ✗ region should be rejected: %s — got exit %s\n' "$(printf '%q' "$bad_region")" "$rc" + fi +done + +printf '\n== Test 12: ROLLBACK_TAG follows YYYYMMDD via NOW_OVERRIDE_DATE ==\n' m=$(mkmock) mock_set "$m" aws_ecr_get_image '{}' 0 mock_set "$m" aws_ecr_describe_image '' 1 @@ -333,7 +348,7 @@ fi assert_calls_contain "rollback tag uses NOW_OVERRIDE_DATE (20260603)" "$m" 'aws_ecr_put_image b-prev-20260603' rm -rf "$m" -printf '\n== Test 12: empty source manifest fails preflight ==\n' +printf '\n== Test 13: empty source manifest fails preflight ==\n' m=$(mkmock) mock_set "$m" aws_ecr_get_image '' 0 # rc=0 but empty body (the "None" case) out=$(run_script "$m") @@ -341,7 +356,7 @@ assert_exit "empty source manifest fails preflight" "$out" 1 assert_contains "empty manifest message" "$out" 'returned empty manifest' rm -rf "$m" -printf '\n== Test 13: tenant_buildinfo failure during verify → rollback ==\n' +printf '\n== Test 14: tenant_buildinfo failure during verify → rollback ==\n' m=$(mkmock) mock_set "$m" aws_ecr_get_image '{"manifests":[]}' 0 mock_set "$m" aws_ecr_describe_image '' 1 @@ -355,7 +370,7 @@ assert_contains "logs buildinfo failure" "$out" '/buildinfo failed for chloe-don assert_contains "rollback fired after verify fail" "$out" 'ROLLBACK:' rm -rf "$m" -printf '\n== Test 14: ssm_refresh_ecr_auth JSON escaping (CWE-78 / OFFSEC-001) ==\n' +printf '\n== Test 15: ssm_refresh_ecr_auth JSON escaping (CWE-78 / OFFSEC-001) ==\n' # Verify the python3 snippet in ssm_refresh_ecr_auth produces valid JSON and # correctly escapes shell-injection characters in region + account ID fields. # The fix replaces unquoted shell-printf interpolation with json.dumps. -- 2.52.0 From f14ad38cb43623e74e1328462fec96a67622fbb0 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Mon, 8 Jun 2026 00:42:13 +0000 Subject: [PATCH 3/3] =?UTF-8?q?fix(sop-checklist):=20revert=20#1974=20body?= =?UTF-8?q?-unfilled=20bypass=20=E2=80=94=20keep=20fail-closed=20(#2418=20?= =?UTF-8?q?CR)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes the gate-weakening #1974 change that made body-section presence informational only. The SOP checklist gate must remain fail-closed: missing body sections → failure even when peer acks are present. Fixes #2418 --- .gitea/scripts/sop-checklist.py | 3 +-- .gitea/scripts/tests/test_sop_checklist.py | 11 +++-------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/.gitea/scripts/sop-checklist.py b/.gitea/scripts/sop-checklist.py index 3d5125f68..2da3788bb 100644 --- a/.gitea/scripts/sop-checklist.py +++ b/.gitea/scripts/sop-checklist.py @@ -858,8 +858,7 @@ def render_status( if len(missing_body) > 3: shown += f", +{len(missing_body) - 3}" desc_parts.append(f"body-unfilled: {shown}") - # #1974: body-section presence is informational only; the gate is peer-ack. - state = "success" if not missing else "failure" + state = "success" if not missing and not missing_body else "failure" return state, " — ".join(desc_parts) diff --git a/.gitea/scripts/tests/test_sop_checklist.py b/.gitea/scripts/tests/test_sop_checklist.py index 13cc37a1a..cae9ef149 100644 --- a/.gitea/scripts/tests/test_sop_checklist.py +++ b/.gitea/scripts/tests/test_sop_checklist.py @@ -428,9 +428,7 @@ class TestRenderStatus(unittest.TestCase): self._state_with(all_slugs), {it["slug"]: False for it in self.items}, ) - # #1974: body-section presence is informational only; state is success - # when all items are peer-acked, even if body sections are missing. - self.assertEqual(state, "success") + self.assertEqual(state, "failure") self.assertIn("body-unfilled", desc) @@ -502,8 +500,7 @@ class TestEndToEndAckFlow(unittest.TestCase): self.assertEqual(result_state, "success") self.assertIn("7/7", desc) - def test_all_acks_succeed_when_body_section_unfilled(self): - """#1974: body-section presence is informational; ack gate is peer-ack.""" + def test_all_acks_still_fail_when_body_section_unfilled(self): items = _items_by_slug() aliases = _numeric_aliases() comments = [ @@ -524,9 +521,7 @@ class TestEndToEndAckFlow(unittest.TestCase): body["root-cause"] = False items_list = list(items.values()) result_state, desc = sop.render_status(items_list, state, body) - # #1974: body-unfilled is informational only; state is success when - # all required acks are present. - self.assertEqual(result_state, "success") + self.assertEqual(result_state, "failure") self.assertIn("7/7", desc) self.assertIn("body-unfilled: root-cause", desc) -- 2.52.0