From 463316772bf8661b8b5cb0e8a84798329b3acfac Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 18:29:38 -0700 Subject: [PATCH] fix(workflows): rewrite curl status-capture to prevent exit-code pollution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 2026-05-04 redeploy-tenants-on-main run for sha 2b862f6 emitted "HTTP 000000" and failed the deploy. Root cause: when curl exits non- zero (connection reset → 56, --fail-with-body 4xx/5xx → 22), the `-w '%{http_code}'` already wrote a status to stdout; the inline `|| echo "000"` then fires AND appends another "000" to the captured substitution stdout. Result: HTTP_CODE="<000>" — fails string comparisons against "200" while looking superficially right. Same class of bug the synth-E2E §7c gate hit twice (PRs #2779/#2783 + #2797). Memory feedback_curl_status_capture_pollution.md. Mass fix in 8 workflows: route -w into a tempfile so curl's exit code can't pollute stdout. Wrap with set +e/-e so the non-zero curl exit doesn't trip the outer pipeline. redeploy-tenants-on-main.yml (production-critical, caught the bug) redeploy-tenants-on-staging.yml (sibling) sweep-stale-e2e-orgs.yml (cleanup loop) e2e-staging-sanity.yml (E2E safety-net teardown) e2e-staging-saas.yml e2e-staging-external.yml e2e-staging-canvas.yml canary-staging.yml Plus a new lint workflow `lint-curl-status-capture.yml` that runs on every PR/push touching `.github/workflows/**`. Multi-line aware: collapses bash `\` continuations, then matches the buggy $(curl ... -w '%{http_code}' ... || echo "000") subshell shape. Distinguishes from the SAFE $(cat tempfile || echo "000") shape (cat with missing file emits empty stdout, no pollution). Verified: - All 8 workflows pass the lint locally - A known-bad injection is caught - A known-safe cat-fallback passes through - yaml.safe_load clean on all changed files Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/canary-staging.yml | 10 +- .github/workflows/e2e-staging-canvas.yml | 10 +- .github/workflows/e2e-staging-external.yml | 10 +- .github/workflows/e2e-staging-saas.yml | 10 +- .github/workflows/e2e-staging-sanity.yml | 10 +- .../workflows/lint-curl-status-capture.yml | 94 +++++++++++++++++++ .../workflows/redeploy-tenants-on-main.yml | 18 +++- .../workflows/redeploy-tenants-on-staging.yml | 16 +++- .github/workflows/sweep-stale-e2e-orgs.yml | 9 +- 9 files changed, 166 insertions(+), 21 deletions(-) create mode 100644 .github/workflows/lint-curl-status-capture.yml diff --git a/.github/workflows/canary-staging.yml b/.github/workflows/canary-staging.yml index 5f1384dc..c2a4d7f4 100644 --- a/.github/workflows/canary-staging.yml +++ b/.github/workflows/canary-staging.yml @@ -295,12 +295,16 @@ jobs: # See molecule-controlplane#420. leaks=() for slug in $orgs; do - code=$(curl -sS -o /tmp/canary-cleanup.out -w "%{http_code}" \ + # Tempfile-routed -w + set +e/-e prevents curl-exit-code + # pollution of the captured status (lint-curl-status-capture.yml). + set +e + curl -sS -o /tmp/canary-cleanup.out -w "%{http_code}" \ -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ -H "Authorization: Bearer $ADMIN_TOKEN" \ -H "Content-Type: application/json" \ - -d "{\"confirm\":\"$slug\"}" \ - || echo "000") + -d "{\"confirm\":\"$slug\"}" >/tmp/canary-cleanup.code 2>/dev/null + set -e + code=$(cat /tmp/canary-cleanup.code 2>/dev/null || echo "000") if [ "$code" = "200" ] || [ "$code" = "204" ]; then echo "[teardown] deleted $slug (HTTP $code)" else diff --git a/.github/workflows/e2e-staging-canvas.yml b/.github/workflows/e2e-staging-canvas.yml index 6c59e72a..1c5b15ff 100644 --- a/.github/workflows/e2e-staging-canvas.yml +++ b/.github/workflows/e2e-staging-canvas.yml @@ -192,12 +192,16 @@ jobs: # cleanup miss shouldn't fail-flag the canvas test when the # actual smoke check passed; the sweeper is the safety net. # See molecule-controlplane#420. - code=$(curl -sS -o /tmp/canvas-cleanup.out -w "%{http_code}" \ + # Tempfile-routed -w + set +e/-e prevents curl-exit-code + # pollution of the captured status (lint-curl-status-capture.yml). + set +e + curl -sS -o /tmp/canvas-cleanup.out -w "%{http_code}" \ -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ -H "Authorization: Bearer $ADMIN_TOKEN" \ -H "Content-Type: application/json" \ - -d "{\"confirm\":\"$slug\"}" \ - || echo "000") + -d "{\"confirm\":\"$slug\"}" >/tmp/canvas-cleanup.code 2>/dev/null + set -e + code=$(cat /tmp/canvas-cleanup.code 2>/dev/null || echo "000") if [ "$code" = "200" ] || [ "$code" = "204" ]; then echo "[teardown] deleted $slug (HTTP $code)" else diff --git a/.github/workflows/e2e-staging-external.yml b/.github/workflows/e2e-staging-external.yml index 12ac4577..923423a1 100644 --- a/.github/workflows/e2e-staging-external.yml +++ b/.github/workflows/e2e-staging-external.yml @@ -159,12 +159,16 @@ jobs: # leaked. Sweeper catches the rest within ~45 min. leaks=() for slug in $orgs; do - code=$(curl -sS -o /tmp/external-cleanup.out -w "%{http_code}" \ + # Tempfile-routed -w + set +e/-e prevents curl-exit-code + # pollution of the captured status (lint-curl-status-capture.yml). + set +e + curl -sS -o /tmp/external-cleanup.out -w "%{http_code}" \ -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ -H "Authorization: Bearer $ADMIN_TOKEN" \ -H "Content-Type: application/json" \ - -d "{\"confirm\":\"$slug\"}" \ - || echo "000") + -d "{\"confirm\":\"$slug\"}" >/tmp/external-cleanup.code 2>/dev/null + set -e + code=$(cat /tmp/external-cleanup.code 2>/dev/null || echo "000") if [ "$code" = "200" ] || [ "$code" = "204" ]; then echo "[teardown] deleted $slug (HTTP $code)" else diff --git a/.github/workflows/e2e-staging-saas.yml b/.github/workflows/e2e-staging-saas.yml index 8cbe468b..aab10b96 100644 --- a/.github/workflows/e2e-staging-saas.yml +++ b/.github/workflows/e2e-staging-saas.yml @@ -224,12 +224,16 @@ jobs: leaks=() for slug in $orgs; do echo "Safety-net teardown: $slug" - code=$(curl -sS -o /tmp/saas-cleanup.out -w "%{http_code}" \ + # Tempfile-routed -w + set +e/-e prevents curl-exit-code + # pollution of the captured status (lint-curl-status-capture.yml). + set +e + curl -sS -o /tmp/saas-cleanup.out -w "%{http_code}" \ -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ -H "Authorization: Bearer $ADMIN_TOKEN" \ -H "Content-Type: application/json" \ - -d "{\"confirm\":\"$slug\"}" \ - || echo "000") + -d "{\"confirm\":\"$slug\"}" >/tmp/saas-cleanup.code 2>/dev/null + set -e + code=$(cat /tmp/saas-cleanup.code 2>/dev/null || echo "000") if [ "$code" = "200" ] || [ "$code" = "204" ]; then echo "[teardown] deleted $slug (HTTP $code)" else diff --git a/.github/workflows/e2e-staging-sanity.yml b/.github/workflows/e2e-staging-sanity.yml index e98b38fe..a89821a7 100644 --- a/.github/workflows/e2e-staging-sanity.yml +++ b/.github/workflows/e2e-staging-sanity.yml @@ -148,12 +148,16 @@ jobs: # safety net within ~45 min. leaks=() for slug in $orgs; do - code=$(curl -sS -o /tmp/sanity-cleanup.out -w "%{http_code}" \ + # Tempfile-routed -w + set +e/-e prevents curl-exit-code + # pollution of the captured status (lint-curl-status-capture.yml). + set +e + curl -sS -o /tmp/sanity-cleanup.out -w "%{http_code}" \ -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ -H "Authorization: Bearer $ADMIN_TOKEN" \ -H "Content-Type: application/json" \ - -d "{\"confirm\":\"$slug\"}" \ - || echo "000") + -d "{\"confirm\":\"$slug\"}" >/tmp/sanity-cleanup.code 2>/dev/null + set -e + code=$(cat /tmp/sanity-cleanup.code 2>/dev/null || echo "000") if [ "$code" = "200" ] || [ "$code" = "204" ]; then echo "[teardown] deleted $slug (HTTP $code)" else diff --git a/.github/workflows/lint-curl-status-capture.yml b/.github/workflows/lint-curl-status-capture.yml new file mode 100644 index 00000000..487b2eb4 --- /dev/null +++ b/.github/workflows/lint-curl-status-capture.yml @@ -0,0 +1,94 @@ +name: Lint curl status-code capture + +# Pins the workflow-bash anti-pattern that produced "HTTP 000000" on the +# 2026-05-04 redeploy-tenants-on-main run for sha 2b862f6: +# +# HTTP_CODE=$(curl ... -w '%{http_code}' ... || echo "000") +# +# When curl exits non-zero (connection reset → 56, --fail-with-body 4xx/5xx +# → 22), the `-w '%{http_code}'` already wrote a status to stdout — usually +# "000" for connection failures or the actual code for HTTP errors. The +# `|| echo "000"` then fires AND appends ANOTHER "000" to the captured +# stdout, producing values like "000000" or "409000" that fail string +# comparisons against "200" while looking superficially right. +# +# Same class of bug the synth-E2E §7c gate hit twice (PRs #2779/#2783 + +# #2797). Memory: feedback_curl_status_capture_pollution.md. +# +# Fix shape (route -w into a tempfile so curl's exit code can't pollute): +# +# set +e +# curl ... -w '%{http_code}' >code.txt 2>/dev/null +# set -e +# HTTP_CODE=$(cat code.txt 2>/dev/null) +# [ -z "$HTTP_CODE" ] && HTTP_CODE="000" + +on: + pull_request: + paths: ['.github/workflows/**'] + push: + branches: [main, staging] + paths: ['.github/workflows/**'] + merge_group: + types: [checks_requested] + +jobs: + scan: + name: Scan workflows for curl status-capture pollution + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Find curl ... -w '%{http_code}' ... || echo "000" subshells + run: | + set -uo pipefail + # Multi-line aware: look for `$(curl ... -w '%{http_code}' ... || echo "000")` + # subshell where the entire command-substitution wraps a curl that + # ends with `|| echo "000"`. Must distinguish from the SAFE shape + # `$(cat tempfile 2>/dev/null || echo "000")` — `cat` with a missing + # tempfile produces empty stdout, no pollution. + python3 <<'PY' + import os, re, sys, glob + + BAD_FILES = [] + + # Match the buggy substitution across newlines: $(curl ... -w '%{http_code}' ... || echo "000") + # The `\\n` is the bash line-continuation that lets curl flags span lines. + # We collapse continuation lines first, then look for the single-line bad pattern. + PATTERN = re.compile( + r'\$\(\s*curl\b[^)]*-w\s*[\'"]%\{http_code\}[\'"][^)]*\|\|\s*echo\s+"000"\s*\)', + re.DOTALL, + ) + + # Self-skip: this lint workflow contains the literal anti-pattern in + # its own docstring — that's intentional, not a bug. + SELF = ".github/workflows/lint-curl-status-capture.yml" + + for f in sorted(glob.glob(".github/workflows/*.yml")): + if f == SELF: + continue + with open(f) as fh: + content = fh.read() + # Collapse bash line-continuations (\\\n + leading whitespace) + # into a single logical line so the regex can see the full + # curl invocation as one chunk. + flat = re.sub(r'\\\s*\n\s*', ' ', content) + for m in PATTERN.finditer(flat): + BAD_FILES.append((f, m.group(0)[:120])) + + if not BAD_FILES: + print("✓ No curl-status-capture pollution patterns detected") + sys.exit(0) + + print(f"::error::Found {len(BAD_FILES)} curl-status-capture pollution site(s):") + for f, snippet in BAD_FILES: + print(f"::error file={f}::Curl status-capture pollution: '|| echo \"000\"' inside a $(curl ... -w '%{{http_code}}' ...) subshell. On non-2xx or connection failure, curl's -w writes a status, then exits non-zero, then the || echo appends another '000' — producing 'HTTP 000000' or '409000' that fails comparisons silently. Fix: route -w into a tempfile so the exit code can't pollute stdout. See memory feedback_curl_status_capture_pollution.md.") + print(f" matched: {snippet}…") + print() + print("Fix template:") + print(' set +e') + print(' curl ... -w \'%{http_code}\' >code.txt 2>/dev/null') + print(' set -e') + print(' HTTP_CODE=$(cat code.txt 2>/dev/null)') + print(' [ -z "$HTTP_CODE" ] && HTTP_CODE="000"') + sys.exit(1) + PY diff --git a/.github/workflows/redeploy-tenants-on-main.yml b/.github/workflows/redeploy-tenants-on-main.yml index 85acda60..7d68a6d9 100644 --- a/.github/workflows/redeploy-tenants-on-main.yml +++ b/.github/workflows/redeploy-tenants-on-main.yml @@ -184,12 +184,26 @@ jobs: echo " body: $BODY" HTTP_RESPONSE=$(mktemp) - HTTP_CODE=$(curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \ + HTTP_CODE_FILE=$(mktemp) + # Route -w into its own tempfile so curl's exit code (e.g. 56 + # on connection-reset, 22 on --fail-with-body 4xx/5xx) can't + # pollute the captured stdout. The previous inline-substitution + # shape produced "000000" on connection reset (curl wrote + # "000" via -w, then the inline echo-fallback appended another + # "000") — caught on the 2026-05-04 redeploy of sha 2b862f6. + # set +e/-e keeps the non-zero curl exit from tripping the + # outer pipeline. See lint-curl-status-capture.yml for the + # CI gate that pins this fix shape. + set +e + curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \ -m 1200 \ -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \ -H "Content-Type: application/json" \ -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \ - -d "$BODY" || echo "000") + -d "$BODY" >"$HTTP_CODE_FILE" 2>/dev/null + set -e + HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000") + [ -z "$HTTP_CODE" ] && HTTP_CODE="000" echo "HTTP $HTTP_CODE" cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE" diff --git a/.github/workflows/redeploy-tenants-on-staging.yml b/.github/workflows/redeploy-tenants-on-staging.yml index 97392172..433df238 100644 --- a/.github/workflows/redeploy-tenants-on-staging.yml +++ b/.github/workflows/redeploy-tenants-on-staging.yml @@ -146,12 +146,24 @@ jobs: echo " body: $BODY" HTTP_RESPONSE=$(mktemp) - HTTP_CODE=$(curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \ + HTTP_CODE_FILE=$(mktemp) + # Route -w into its own tempfile so curl's exit code (e.g. 56 + # on connection-reset) can't pollute the captured stdout. The + # previous inline-substitution shape produced "000000" on + # connection reset — caught on main variant 2026-05-04 + # redeploying sha 2b862f6. Same fix shape as the synth-E2E + # §9c gate (PR #2797). See lint-curl-status-capture.yml for + # the CI gate that pins this fix shape. + set +e + curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \ -m 1200 \ -H "Authorization: Bearer $CP_STAGING_ADMIN_API_TOKEN" \ -H "Content-Type: application/json" \ -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \ - -d "$BODY" || echo "000") + -d "$BODY" >"$HTTP_CODE_FILE" 2>/dev/null + set -e + HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000") + [ -z "$HTTP_CODE" ] && HTTP_CODE="000" echo "HTTP $HTTP_CODE" cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE" diff --git a/.github/workflows/sweep-stale-e2e-orgs.yml b/.github/workflows/sweep-stale-e2e-orgs.yml index d2fcb8be..1531f117 100644 --- a/.github/workflows/sweep-stale-e2e-orgs.yml +++ b/.github/workflows/sweep-stale-e2e-orgs.yml @@ -159,12 +159,17 @@ jobs: # The DELETE handler requires {"confirm": ""} matching # the URL slug — fat-finger guard. Idempotent: re-issuing # picks up via org_purges.last_step. - http_code=$(curl -sS -o /tmp/del_resp -w "%{http_code}" \ + # Tempfile-routed -w + set +e/-e prevents curl-exit-code + # pollution of the captured status (lint-curl-status-capture.yml). + set +e + curl -sS -o /tmp/del_resp -w "%{http_code}" \ --max-time 60 \ -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ -H "Authorization: Bearer $ADMIN_TOKEN" \ -H "Content-Type: application/json" \ - -d "{\"confirm\":\"$slug\"}" || echo "000") + -d "{\"confirm\":\"$slug\"}" >/tmp/del_code 2>/dev/null + set -e + http_code=$(cat /tmp/del_code 2>/dev/null || echo "000") if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then deleted=$((deleted+1)) echo " deleted: $slug"