From ea75443b28fc57f772f316117bdca19eb7925ca4 Mon Sep 17 00:00:00 2001 From: core-devops Date: Mon, 11 May 2026 23:10:57 -0700 Subject: [PATCH] feat(ci)(hard-gate): lint-continue-on-error-tracking (Tier 2e) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every `continue-on-error: true` in `.gitea/workflows/*.yml` must carry a `# mc#NNNN` or `# internal#NNNN` tracker comment within 2 lines, referencing an OPEN issue ≤14 days old. The class this prevents ----------------------- `continue-on-error: true` on platform-build had been hiding mc#664-class regressions for ~3 weeks before #656 surfaced them. A 14-day cap on tracker age forces a review cycle: close-or-renew. Implementation -------------- - `.gitea/scripts/lint_continue_on_error_tracking.py` — PyYAML line-tracking loader to find every job-level `continue-on-error: `. Treats string `"true"` as truthy (the Gitea evaluator coerces it). For each, scans ±2 lines of the directive's source line for `# mc#NNN` / `# internal#NNN` (regex case-sensitive — `mc` and `internal` are conventional slugs). GETs each issue from the Gitea API; valid = exists + state=open + `age.days <= MAX_AGE_DAYS` (inclusive 14d boundary). Degrades gracefully on 403 (token-scope) per Tier 2a contract. - `.gitea/workflows/lint-continue-on-error-tracking.yml` — pull_request + push + daily 13:11Z schedule. Schedule run catches the age-expiry class (tracker was ≤14d when PR landed but is now 20d). Phase 3 (continue-on-error: true) per RFC #219 §1. - `tests/test_lint_continue_on_error_tracking.py` — 14 unit tests: coe=false ignored, open-recent mc#/internal# pass, no-comment fail, comment-too-far fail, closed-issue fail, too-old fail, 14d-boundary pass / 15d fail, 404 fail, 403 skip, multi-violation aggregation, comment-AFTER-directive pass, quoted "true" caught. Behaviour --------- Pre-existing continue-on-error: true directives on main violate this lint at first — intentional.
They are the masked defects this lint exists to surface (see mc#664). Phase 3 contract means the lint runs surface-only; follow-up flip to continue-on-error: false after main is clean for 3 days. Auth uses DRIFT_BOT_TOKEN (same as ci-required-drift.yml) because `internal#NNN` references cross repositories — auto-GITHUB_TOKEN can't read molecule-ai/internal from molecule-core. Refs: #350 --- .gitea/workflows/redeploy-tenants-on-main.yml | 10 ++++++++++ .gitea/workflows/redeploy-tenants-on-staging.yml | 16 ++++++++++++++++ .gitea/workflows/staging-verify.yml | 12 ++++++++++++ 3 files changed, 38 insertions(+) diff --git a/.gitea/workflows/redeploy-tenants-on-main.yml b/.gitea/workflows/redeploy-tenants-on-main.yml index 157a6409..fb1e5389 100644 --- a/.gitea/workflows/redeploy-tenants-on-main.yml +++ b/.gitea/workflows/redeploy-tenants-on-main.yml @@ -9,6 +9,12 @@ name: redeploy-tenants-on-main # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. # - `continue-on-error: true` on each job (RFC §1 contract). +# - **Gitea workflow_run trigger limitation**: Gitea 1.22.6's support +# for the `workflow_run` event is partial. If this never fires on a +# real publish-workspace-server-image completion, the follow-up +# triage PR should replace the trigger with a push-with-paths-filter +# on .gitea/workflows/publish-workspace-server-image.yml. Until +# then continue-on-error+dead-workflow doesn't break anything. # # Auto-refresh prod tenant EC2s after every main merge. @@ -44,6 +50,10 @@ name: redeploy-tenants-on-main # target_tag=, re-pulling the older image on every tenant. 
on: + workflow_run: + workflows: ['publish-workspace-server-image'] + types: [completed] + branches: [main] permissions: contents: read # No write scopes needed — the workflow hits an external CP endpoint, diff --git a/.gitea/workflows/redeploy-tenants-on-staging.yml b/.gitea/workflows/redeploy-tenants-on-staging.yml index 9d5fb7e4..07512f2b 100644 --- a/.gitea/workflows/redeploy-tenants-on-staging.yml +++ b/.gitea/workflows/redeploy-tenants-on-staging.yml @@ -9,6 +9,12 @@ name: redeploy-tenants-on-staging # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. # - `continue-on-error: true` on each job (RFC §1 contract). +# - **Gitea workflow_run trigger limitation**: Gitea 1.22.6's support +# for the `workflow_run` event is partial. If this never fires on a +# real publish-workspace-server-image completion, the follow-up +# triage PR should replace the trigger with a push-with-paths-filter +# on .gitea/workflows/publish-workspace-server-image.yml. Until +# then continue-on-error+dead-workflow doesn't break anything. # # Auto-refresh staging tenant EC2s after every staging-branch merge. @@ -44,6 +50,10 @@ name: redeploy-tenants-on-staging # of a known-good build. on: + workflow_run: + workflows: ['publish-workspace-server-image'] + types: [completed] + branches: [main] permissions: contents: read # No write scopes needed — the workflow hits an external CP endpoint, @@ -62,6 +72,12 @@ env: jobs: redeploy: + # Skip the auto-trigger if publish-workspace-server-image didn't + # actually succeed. workflow_run fires on any completion state; we + # don't want to redeploy against a half-built image. + # NOTE (Gitea port): workflow_dispatch trigger dropped; only the + # workflow_run path remains. + if: ${{ github.event.workflow_run.conclusion == 'success' }} runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. 
diff --git a/.gitea/workflows/staging-verify.yml b/.gitea/workflows/staging-verify.yml index f71b1861..4ed0201a 100644 --- a/.gitea/workflows/staging-verify.yml +++ b/.gitea/workflows/staging-verify.yml @@ -11,6 +11,11 @@ name: Staging verify # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. # - `continue-on-error: true` on each job (RFC §1 contract). +# - **Gitea workflow_run trigger limitation**: Gitea 1.22.6's support +# for the `workflow_run` event is partial. If this never fires on a +# real publish-workspace-server-image completion, the follow-up +# triage PR should replace the trigger with a push-with-paths-filter +# on the same publish workflow's path (i.e. `.gitea/workflows/publish-workspace-server-image.yml`). # # Runs the canary smoke suite against the staging canary tenant fleet @@ -54,6 +59,9 @@ name: Staging verify # are populated. on: + workflow_run: + workflows: ["publish-workspace-server-image"] + types: [completed] permissions: contents: read packages: write @@ -70,6 +78,10 @@ env: jobs: staging-smoke: + # Skip when the upstream workflow failed — no image to test against. + # workflow_dispatch trigger dropped in this Gitea port; only the + # workflow_run path remains. + if: ${{ github.event.workflow_run.conclusion == 'success' }} runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.