From 42ff6be15c1b86e4a09b815ba9858c180012488f Mon Sep 17 00:00:00 2001 From: dev-lead Date: Fri, 8 May 2026 10:52:09 -0700 Subject: [PATCH] =?UTF-8?q?fix(ci):=20canary=20alerting=20=E2=80=94=20drop?= =?UTF-8?q?=20Gitea-incompatible=20actions=20API=20call?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "Open issue on failure" step was failing on every canary run because Gitea 1.22.6 doesn't expose /api/v1/actions endpoints (per memory reference_gitea_actions_log_fetch). The threshold check called github.rest.actions.listWorkflowRuns() to count consecutive prior failures and gate issue creation behind 3 reds — that call ALWAYS 404'd on Gitea, breaking the entire alerting step. Net effect: the canary's own self-alerting was broken, so the underlying staging regression went unflagged for 38h+ (2026-05-07 02:30 UTC → 2026-05-08 17:34 UTC, every cron tick red, zero issues filed). Fix: drop the consecutive-failures threshold entirely. File a sticky issue on the FIRST failure; comment-on-existing handles deduplication for subsequent failures. The auto-close-on-success step is unchanged. Why not a Gitea-compatible threshold (e.g., walk recent commit statuses): comment-on-existing already gives ops a single accumulating issue per regression streak. The threshold's purpose was to avoid spamming on transient flakes — but with sticky issue + auto-close-on-green, transient flakes get one issue + one quick close, which is fine signal. Filing on first failure is also better UX: catches the regression in 30 min instead of 90 min. Also: rewrote runURL from hardcoded https://github.com/... to context.serverUrl so the link actually points at Gitea (https://git.moleculesai.app) — was always broken on Gitea but nobody noticed because the issue-filing step itself was broken. Net: 21 insertions, 40 deletions. Removes WORKFLOW_PATH + CONSECUTIVE_THRESHOLD env vars (no longer needed). Tracked in: molecule-core#129 (failure mode 3 of 3) Verification: yaml syntax-valid; no remaining github.rest.actions.* calls; only github.rest.issues.* (all Gitea-supported per memory feedback_persona_token_v2_scope). Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/canary-staging.yml | 61 ++++++++++------------------ 1 file changed, 21 insertions(+), 40 deletions(-) diff --git a/.github/workflows/canary-staging.yml b/.github/workflows/canary-staging.yml index b040b196..8f0d74ac 100644 --- a/.github/workflows/canary-staging.yml +++ b/.github/workflows/canary-staging.yml @@ -137,27 +137,28 @@ jobs: id: canary run: bash tests/e2e/test_staging_full_saas.sh - # Alerting: open an issue only after THREE consecutive failures so - # transient flakes (Cloudflare DNS hiccup, AWS API blip) don't spam - # the issue list. If an issue is already open, we still comment on - # every failure so ops sees the streak. Auto-close on next green. + # Alerting: open a sticky issue on the FIRST failure; comment on + # subsequent failures; auto-close on next green. Comment-on-existing + # de-duplicates so a single open issue accumulates the streak — + # ops sees one issue with N comments rather than N issues. # - # Threshold rationale: canary fires every 30 min, so 3 failures = - # ~90 min of consecutive red — well past any single-run flake but - # still tight enough that a real outage gets surfaced before the - # next deploy window. + # Why no consecutive-failures threshold (e.g., wait 3 runs before + # filing): the prior threshold check used + # `github.rest.actions.listWorkflowRuns()` which Gitea 1.22.6 does + # not expose (returns 404). On Gitea Actions the threshold call + # ALWAYS failed, breaking the entire alerting step and going days + # silent on real regressions (38h+ chronic red on 2026-05-07/08 + # before this fix; tracked in molecule-core#129). Filing on first + # failure is also better UX — we want to know about the first red, + # not wait 90 min for it to "count." Real flakes get one issue + + # a quick close-on-green; persistent reds accumulate comments. - name: Open issue on failure if: failure() uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 - env: - # Inject the workflow path explicitly — context.workflow is - # the *name*, not the file path the actions API needs. - WORKFLOW_PATH: '.github/workflows/canary-staging.yml' - CONSECUTIVE_THRESHOLD: '3' with: script: | const title = '🔴 Canary failing: staging SaaS smoke'; - const runURL = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; + const runURL = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; // Find an existing open canary issue (stable title match). // If one exists, this isn't a "first failure" — comment and exit. @@ -177,32 +178,12 @@ jobs: return; } - // No open issue yet — check the last N-1 runs' conclusions. - // We open the issue only if the last (THRESHOLD-1) runs ALSO - // failed (so this is the 3rd consecutive red). - const threshold = parseInt(process.env.CONSECUTIVE_THRESHOLD, 10); - const { data: runs } = await github.rest.actions.listWorkflowRuns({ - owner: context.repo.owner, repo: context.repo.repo, - workflow_id: process.env.WORKFLOW_PATH, - status: 'completed', - per_page: threshold, - // Skip the current in-progress run; it isn't 'completed' yet. - }); - // listWorkflowRuns returns recent first. We need (threshold-1) - // prior failures (current run is the threshold-th). - const priorFailures = (runs.workflow_runs || []) - .slice(0, threshold - 1) - .filter(r => r.id !== context.runId) - .filter(r => r.conclusion === 'failure') - .length; - if (priorFailures < threshold - 1) { - core.info(`Below threshold: ${priorFailures + 1}/${threshold} consecutive failures — not filing yet`); - return; - } - + // No open issue yet — file one on this first failure. The + // comment-on-existing branch above means subsequent failures + // accumulate as comments on this same issue, so we don't + // spam new issues per run. const body = - `Canary run failed at ${new Date().toISOString()}, ` + - `${threshold} consecutive runs red.\n\n` + + `Canary run failed at ${new Date().toISOString()}.\n\n` + `Run: ${runURL}\n\n` + `This issue auto-closes on the next green canary run. ` + `Consecutive failures add a comment here rather than a new issue.`; @@ -211,7 +192,7 @@ jobs: title, body, labels: ['canary-staging', 'bug'], }); - core.info(`Opened canary failure issue (${threshold} consecutive reds)`); + core.info('Opened canary failure issue (first red)'); - name: Auto-close canary issue on success if: success()