From f75fa0089cfbc7b3271b771aec0695fbaded87a7 Mon Sep 17 00:00:00 2001
From: "hongming-codex-laptop (Molecule AI agent)"
Date: Tue, 12 May 2026 13:38:43 -0700
Subject: [PATCH] fix: soften staging smoke preflight failures

Merge the admin-token and LLM-key checks into a single "Verify
prerequisites" step (id: preflight). Instead of exiting 2, the step
collects every failing reason, writes result/reason to $GITHUB_OUTPUT
and exits 0. The smoke run is skipped when preflight fails; the alert
issue and failure-summary steps also fire on a preflight failure, and
the auto-close step requires both preflight and smoke to be 0.

Reasons are joined with "; ". "${reasons[*]}" only uses the first
character of IFS, so IFS='; ' would have produced "a;b"; the join is
done with printf and a prefix strip instead.

---
 .gitea/workflows/staging-smoke.yml | 42 +++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/.gitea/workflows/staging-smoke.yml b/.gitea/workflows/staging-smoke.yml
index f6f61e99..5b343150 100644
--- a/.gitea/workflows/staging-smoke.yml
+++ b/.gitea/workflows/staging-smoke.yml
@@ -123,15 +123,19 @@ jobs:
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
 
-      - name: Verify admin token present
+      - name: Verify prerequisites
+        id: preflight
         run: |
-          if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
+          set -euo pipefail
+          result=0
+          reasons=()
+
+          if [ -z "${MOLECULE_ADMIN_TOKEN:-}" ]; then
             echo "::error::CP_STAGING_ADMIN_API_TOKEN not set"
-            exit 2
+            reasons+=("CP_STAGING_ADMIN_API_TOKEN not set")
+            result=2
           fi
 
-      - name: Verify LLM key present
-        run: |
           # Per-runtime key check — claude-code uses MiniMax; hermes /
           # langgraph (operator-dispatched only) use OpenAI. Hard-fail
           # rather than soft-skip per the lesson from synth E2E #2578:
@@ -168,12 +172,23 @@ jobs:
           esac
           if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then
             echo "::error::${required_secret_name} secret not set for runtime=${E2E_RUNTIME} — A2A will fail at request time with 'No LLM provider configured'"
-            exit 2
+            reasons+=("${required_secret_name} not set for runtime=${E2E_RUNTIME}")
+            result=2
           fi
+          if [ "$result" -ne 0 ]; then
+            {
+              echo "result=$result"
+              reason="$(printf '; %s' "${reasons[@]}")"; printf 'reason=%s\n' "${reason#; }"
+            } >> "$GITHUB_OUTPUT"
+            echo "::error::staging smoke preflight failed; alert issue step will record it, but cron monitor status stays green for main"
+            exit 0
+          fi
+          echo "result=0" >> "$GITHUB_OUTPUT"
           echo "LLM key present ✓ (runtime=${E2E_RUNTIME}, key=${required_secret_name}, len=${#required_secret_value})"
 
       - name: Smoke run
         id: smoke
+        if: steps.preflight.outputs.result == '0'
         run: |
           set +e
           bash tests/e2e/test_staging_full_saas.sh
@@ -201,12 +216,15 @@ jobs:
       # not wait 90 min for it to "count." Real flakes get one issue +
       # a quick close-on-green; persistent reds accumulate comments.
       - name: Open issue on failure (Gitea API)
-        if: steps.smoke.outputs.result != '0'
+        if: steps.preflight.outputs.result != '0' || steps.smoke.outputs.result != '0'
         env:
           GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           REPO: ${{ github.repository }}
           SERVER_URL: ${{ env.GITHUB_SERVER_URL }}
           RUN_ID: ${{ github.run_id }}
+          PREFLIGHT_RESULT: ${{ steps.preflight.outputs.result }}
+          PREFLIGHT_REASON: ${{ steps.preflight.outputs.reason }}
+          SMOKE_RESULT: ${{ steps.smoke.outputs.result }}
         run: |
           set -euo pipefail
           API="${SERVER_URL%/}/api/v1"
@@ -223,19 +241,19 @@ jobs:
           if [ -n "$EXISTING" ]; then
             curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
               "${API}/repos/${REPO}/issues/${EXISTING}/comments" \
-              -d "$(jq -nc --arg run "$RUN_URL" '{body: ("Smoke still failing. " + $run)}')" >/dev/null
+              -d "$(jq -nc --arg run "$RUN_URL" --arg preflight "${PREFLIGHT_RESULT:-}" --arg reason "${PREFLIGHT_REASON:-}" --arg smoke "${SMOKE_RESULT:-skipped}" '{body: ("Smoke still failing. " + $run + "\n\nPreflight result: " + $preflight + "\nSmoke result: " + $smoke + (if $reason == "" then "" else "\nReason: " + $reason end))}')" >/dev/null
             echo "Commented on existing issue #${EXISTING}"
           else
             NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ)
-            BODY=$(jq -nc --arg t "$TITLE" --arg now "$NOW" --arg run "$RUN_URL" \
-              '{title: $t, body: ("Smoke run failed at " + $now + ".\n\nRun: " + $run + "\n\nThis issue auto-closes on the next green smoke run. Consecutive failures add a comment here rather than a new issue.")}')
+            BODY=$(jq -nc --arg t "$TITLE" --arg now "$NOW" --arg run "$RUN_URL" --arg preflight "${PREFLIGHT_RESULT:-}" --arg reason "${PREFLIGHT_REASON:-}" --arg smoke "${SMOKE_RESULT:-skipped}" \
+              '{title: $t, body: ("Smoke run failed at " + $now + ".\n\nRun: " + $run + "\n\nPreflight result: " + $preflight + "\nSmoke result: " + $smoke + (if $reason == "" then "" else "\nReason: " + $reason end) + "\n\nThis issue auto-closes on the next green smoke run. Consecutive failures add a comment here rather than a new issue.")}')
             curl -fsS -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
               "${API}/repos/${REPO}/issues" -d "$BODY" >/dev/null
             echo "Opened smoke failure issue (first red)"
           fi
 
       - name: Auto-close smoke issue on success (Gitea API)
-        if: steps.smoke.outputs.result == '0'
+        if: steps.preflight.outputs.result == '0' && steps.smoke.outputs.result == '0'
         env:
           GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           REPO: ${{ github.repository }}
@@ -349,7 +367,7 @@ jobs:
         # loop) can grep on. Mirrors PR#461's sweep-stale-e2e-orgs
         # pattern. Runs AFTER the teardown safety net (which is
         # if: always()) so failures don't suppress cleanup.
-        if: steps.smoke.outputs.result != '0'
+        if: steps.preflight.outputs.result != '0' || steps.smoke.outputs.result != '0'
         run: |
           echo "::error::staging-smoke FAILED — staging SaaS canary is red. See prior step logs + the auto-filed alert issue. Common causes: (a) CP_STAGING_ADMIN_API_TOKEN secret missing/rotated, (b) staging-api.moleculesai.app 5xx, (c) MiniMax/Anthropic LLM key dead, (d) AMI/CF/WorkOS drift. The 30-min cron will retry, but a chronic red here indicates the staging SaaS stack is broken end-to-end."
           exit 0