From 3f1425b46f7bdbf459beb5455682e213f2ffa8d0 Mon Sep 17 00:00:00 2001 From: hongming-codex-laptop Date: Wed, 13 May 2026 15:57:11 -0700 Subject: [PATCH 1/5] fix(ci): harden production redeploy workflow --- .gitea/workflows/redeploy-tenants-on-main.yml | 45 ++++++++++++------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/.gitea/workflows/redeploy-tenants-on-main.yml b/.gitea/workflows/redeploy-tenants-on-main.yml index 2e216ff4..6fdd803b 100644 --- a/.gitea/workflows/redeploy-tenants-on-main.yml +++ b/.gitea/workflows/redeploy-tenants-on-main.yml @@ -77,13 +77,11 @@ env: GITHUB_SERVER_URL: https://git.moleculesai.app jobs: + # bp-exempt: production redeploy is a side-effect workflow, not a merge gate. redeploy: - # Skip the auto-trigger if publish-workspace-server-image didn't - # actually succeed. workflow_run fires on any completion state; we - # don't want to redeploy against a half-built image. - # NOTE (Gitea port): workflow_dispatch trigger dropped; only the - # workflow_run path remains. - if: ${{ github.event.workflow_run.conclusion == 'success' }} + # Gitea 1.22.6 does not support workflow_run. This workflow is now + # controlled by push/path triggers plus an explicit kill switch. + if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }} runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. @@ -120,15 +118,15 @@ jobs: # the only thing retagging `:latest` today is the manual # promote-latest.yml — last run 2026-04-28). Auto-trigger # from workflow_run uses workflow_run.head_sha; manual - # dispatch with no input falls through to github.sha. + # dispatch with no variable falls through to github.sha. 
env: - INPUT_TAG: ${{ inputs.target_tag }} - HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }} + PROD_MANUAL_REDEPLOY_TARGET_TAG: ${{ vars.PROD_MANUAL_REDEPLOY_TARGET_TAG || secrets.PROD_MANUAL_REDEPLOY_TARGET_TAG || '' }} + HEAD_SHA: ${{ github.sha }} run: | set -euo pipefail - if [ -n "${INPUT_TAG:-}" ]; then - echo "target_tag=$INPUT_TAG" >> "$GITHUB_OUTPUT" - echo "Using operator-pinned tag: $INPUT_TAG" + if [ -n "${PROD_MANUAL_REDEPLOY_TARGET_TAG:-}" ]; then + echo "target_tag=$PROD_MANUAL_REDEPLOY_TARGET_TAG" >> "$GITHUB_OUTPUT" + echo "Using operator-pinned tag from PROD_MANUAL_REDEPLOY_TARGET_TAG." else SHORT="${HEAD_SHA:0:7}" echo "target_tag=staging-$SHORT" >> "$GITHUB_OUTPUT" @@ -144,13 +142,26 @@ jobs: CP_URL: ${{ vars.CP_URL || 'https://api.moleculesai.app' }} CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }} TARGET_TAG: ${{ steps.tag.outputs.target_tag }} - CANARY_SLUG: ${{ inputs.canary_slug || 'hongming' }} - SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }} - BATCH_SIZE: ${{ inputs.batch_size || '3' }} - DRY_RUN: ${{ inputs.dry_run || false }} + CANARY_SLUG: ${{ vars.PROD_REDEPLOY_CANARY_SLUG || secrets.PROD_REDEPLOY_CANARY_SLUG || '' }} + SOAK_SECONDS: ${{ vars.PROD_REDEPLOY_SOAK_SECONDS || secrets.PROD_REDEPLOY_SOAK_SECONDS || '' }} + BATCH_SIZE: ${{ vars.PROD_REDEPLOY_BATCH_SIZE || secrets.PROD_REDEPLOY_BATCH_SIZE || '' }} + DRY_RUN: ${{ vars.PROD_REDEPLOY_DRY_RUN || secrets.PROD_REDEPLOY_DRY_RUN || '' }} + PROD_AUTO_DEPLOY_DISABLED: ${{ vars.PROD_AUTO_DEPLOY_DISABLED || secrets.PROD_AUTO_DEPLOY_DISABLED || '' }} run: | set -euo pipefail + case "${PROD_AUTO_DEPLOY_DISABLED,,}" in + 1|true|yes|on) + echo "::notice::PROD_AUTO_DEPLOY_DISABLED is set; skipping production redeploy." 
+ exit 0 + ;; + esac + + CANARY_SLUG="${CANARY_SLUG:-hongming}" + SOAK_SECONDS="${SOAK_SECONDS:-60}" + BATCH_SIZE="${BATCH_SIZE:-3}" + DRY_RUN="${DRY_RUN:-false}" + if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then echo "::error::CP_ADMIN_API_TOKEN secret not set — skipping redeploy" echo "::notice::Set CP_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy." @@ -172,7 +183,7 @@ jobs: }') echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet" - echo " body: $BODY" + echo " target_tag=$TARGET_TAG canary=$CANARY_SLUG soak_seconds=$SOAK_SECONDS batch_size=$BATCH_SIZE dry_run=$DRY_RUN" HTTP_RESPONSE=$(mktemp) HTTP_CODE_FILE=$(mktemp) -- 2.45.2 From d7e55ccb9f1d00e904c19f6dd161086876b063aa Mon Sep 17 00:00:00 2001 From: Molecule AI Core-BE Date: Wed, 13 May 2026 23:14:32 +0000 Subject: [PATCH 2/5] chore: re-trigger CI for PR #904 SOP checklist [core-be-agent] Co-Authored-By: Claude Opus 4.7 -- 2.45.2 From cbe4055edc95a2d42d44324c8bcdc8d1b0d3df34 Mon Sep 17 00:00:00 2001 From: hongming-codex-laptop Date: Wed, 13 May 2026 16:35:00 -0700 Subject: [PATCH 3/5] docs(ci): align prod redeploy workflow comments --- .gitea/workflows/redeploy-tenants-on-main.yml | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/.gitea/workflows/redeploy-tenants-on-main.yml b/.gitea/workflows/redeploy-tenants-on-main.yml index 6fdd803b..0411e149 100644 --- a/.gitea/workflows/redeploy-tenants-on-main.yml +++ b/.gitea/workflows/redeploy-tenants-on-main.yml @@ -36,17 +36,19 @@ name: redeploy-tenants-on-main # # Runtime ordering: # 1. publish-workspace-server-image completes → new :staging-<short_sha> in ECR. -# 2. This workflow fires via workflow_run, calls redeploy-fleet with -# target_tag=staging-<short_sha>. No CDN propagation wait needed — -# ECR image manifest is consistent immediately after push. +# 2. The merge that updates publish-workspace-server-image.yml triggers +# this push/path-filtered workflow, which calls redeploy-fleet with +# target_tag=staging-<short_sha>. 
No CDN propagation wait needed — ECR image +# manifest is consistent immediately after push. # 3. Calls redeploy-fleet with canary_slug (if set) and a soak # period. Canary proves the image boots; batches follow. # 4. Any failure aborts the rollout and leaves older tenants on the # prior image — safer default than half-and-half state. # -# Rollback path: re-run this workflow with a specific SHA pinned via -# the workflow_dispatch input. That calls redeploy-fleet with -# target_tag=<sha>, re-pulling the older image on every tenant. +# Rollback path: set PROD_MANUAL_REDEPLOY_TARGET_TAG as a repo/org +# variable or secret, run workflow_dispatch, then unset it after the +# rollback. That calls redeploy-fleet with target_tag=<tag>, +# re-pulling the pinned image on every tenant. on: push: @@ -117,7 +119,7 @@ jobs: # dead (staging-verify soft-skips without canary fleet, so # the only thing retagging `:latest` today is the manual # promote-latest.yml — last run 2026-04-28). Auto-trigger - # from workflow_run uses workflow_run.head_sha; manual + # from the main push uses github.sha; manual # dispatch with no variable falls through to github.sha. env: PROD_MANUAL_REDEPLOY_TARGET_TAG: ${{ vars.PROD_MANUAL_REDEPLOY_TARGET_TAG || secrets.PROD_MANUAL_REDEPLOY_TARGET_TAG || '' }} @@ -292,10 +294,10 @@ jobs: if [ "$TARGET_TAG" != "latest" ] \ && [ "$TARGET_TAG" != "$EXPECTED_SHA" ] \ && [ "$TARGET_TAG" != "staging-$EXPECTED_SHORT" ]; then - # workflow_dispatch with a pinned tag that isn't the head + # Manual redeploy with a pinned tag that isn't the head # SHA — operator is rolling back / pinning. Skip the # verification because we don't have the expected SHA in - # this context (would need to crane-inspect the GHCR + # this context (would need to inspect the ECR # manifest, which is a follow-up). Failing-open here is # safe: the operator chose the tag deliberately. 
# -- 2.45.2 From daeed93fe9fd5f354fac710a67e70b8bd4783830 Mon Sep 17 00:00:00 2001 From: hongming-codex-laptop Date: Wed, 13 May 2026 16:46:34 -0700 Subject: [PATCH 4/5] fix(ci): avoid PR pending traps in CI sentinel --- .gitea/workflows/ci.yml | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index cad7a727..2703f0f7 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -380,17 +380,27 @@ jobs: # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true needs: [changes, canvas-build] - # Only fires on direct pushes to main (i.e. after staging→main promotion). - if: needs.changes.outputs.canvas == 'true' && github.event_name == 'push' && github.ref == 'refs/heads/main' + # Keep the job itself always runnable. Gitea 1.22.6 leaves job-level + # event/ref `if:` gates as pending on PRs, which blocks the combined + # status even though this reminder is intentionally non-required. steps: - name: Write deploy reminder to step summary env: COMMIT_SHA: ${{ github.sha }} + CANVAS_CHANGED: ${{ needs.changes.outputs.canvas }} + EVENT_NAME: ${{ github.event_name }} + REF_NAME: ${{ github.ref }} # github.server_url resolves via the workflow-level env override # to the Gitea instance, so the RUN_URL points at the Gitea run # page (not github.com). See feedback_act_runner_github_server_url. RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} run: | + set -euo pipefail + if [ "$CANVAS_CHANGED" != "true" ] || [ "$EVENT_NAME" != "push" ] || [ "$REF_NAME" != "refs/heads/main" ]; then + echo "Canvas deploy reminder not applicable for event=$EVENT_NAME ref=$REF_NAME canvas_changed=$CANVAS_CHANGED." + exit 0 + fi + # Write body to a temp file — avoids backtick escaping in shell. 
cat > /tmp/deploy-reminder.md << 'BODY' ## Canvas build passed — deploy required @@ -535,11 +545,10 @@ jobs: # hourly if this list diverges from status_check_contexts or from # audit-force-merge.yml's REQUIRED_CHECKS env (RFC §4 + §6). # - # Excluded from `needs:`: `canvas-deploy-reminder` — gated by - # `if: ... github.event_name == 'push' && github.ref == 'refs/heads/main'`, - # so on PR events it's legitimately `skipped`. The drift detector - # explicitly excludes `github.event_name`-gated jobs from F1 (see - # `.gitea/scripts/ci-required-drift.py::ci_job_names`). + # Excluded from `needs:`: `canvas-deploy-reminder` — it is an + # operational reminder, not a CI prerequisite. Keep that job runnable + # on PRs with an internal no-op guard; job-level event/ref `if:` gates + # are a Gitea 1.22.6 pending-status trap. # # Phase 3 (RFC #219 §1) safety: underlying build jobs carry # continue-on-error: true so their failures are masked to null (2026-05-12: re-enabled mc#774 interim) @@ -559,7 +568,7 @@ jobs: - canvas-build - shellcheck - python-lint - if: always() + if: ${{ always() }} steps: - name: Assert every required dependency succeeded run: | -- 2.45.2 From 785a4175a495221e86baed9bb3f7197df6692c5d Mon Sep 17 00:00:00 2001 From: hongming-codex-laptop Date: Wed, 13 May 2026 17:11:12 -0700 Subject: [PATCH 5/5] fix(ci): avoid heavy fanout for workflow-only PRs --- .gitea/workflows/ci.yml | 29 +++++++---- tests/test_lint_workflow_yaml.py | 87 ++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 10 deletions(-) diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 2703f0f7..16560e92 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -107,16 +107,25 @@ jobs: echo "scripts=true" >> "$GITHUB_OUTPUT" exit 0 fi - # Both .github/workflows/ci.yml AND .gitea/workflows/ci.yml count - # as "this workflow changed" — either edit should force-run every - # downstream job. 
The Gitea port follows the same shape as the - # GitHub original so behavior matches when triggered on either - # platform. - DIFF=$(git diff --name-only "$BASE" HEAD 2>/dev/null || echo ".gitea/workflows/ci.yml") - echo "platform=$(echo "$DIFF" | grep -qE '^workspace-server/|^\.gitea/workflows/ci\.yml$|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT" - echo "canvas=$(echo "$DIFF" | grep -qE '^canvas/|^\.gitea/workflows/ci\.yml$|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT" - echo "python=$(echo "$DIFF" | grep -qE '^workspace/|^\.gitea/workflows/ci\.yml$|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT" - echo "scripts=$(echo "$DIFF" | grep -qE '^tests/e2e/|^scripts/|^infra/scripts/|^\.gitea/workflows/ci\.yml$|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT" + # Workflow-only edits are covered by the workflow lint family + # and by this workflow's always-present required jobs. Do not fan + # those edits out into Go/Canvas/Python/shellcheck work; the + # downstream jobs still emit their required contexts via no-op + # steps when their surface flag is false. + # + # If the diff itself cannot be trusted, fail open by running every + # surface instead of silently under-testing the PR. + if ! 
DIFF=$(git diff --name-only "$BASE" HEAD 2>/dev/null); then + echo "platform=true" >> "$GITHUB_OUTPUT" + echo "canvas=true" >> "$GITHUB_OUTPUT" + echo "python=true" >> "$GITHUB_OUTPUT" + echo "scripts=true" >> "$GITHUB_OUTPUT" + exit 0 + fi + echo "platform=$(echo "$DIFF" | grep -qE '^workspace-server/' && echo true || echo false)" >> "$GITHUB_OUTPUT" + echo "canvas=$(echo "$DIFF" | grep -qE '^canvas/' && echo true || echo false)" >> "$GITHUB_OUTPUT" + echo "python=$(echo "$DIFF" | grep -qE '^workspace/' && echo true || echo false)" >> "$GITHUB_OUTPUT" + echo "scripts=$(echo "$DIFF" | grep -qE '^tests/e2e/|^scripts/|^infra/scripts/' && echo true || echo false)" >> "$GITHUB_OUTPUT" # Platform (Go) — Go build/vet/test/lint + coverage gates. The always-run # + per-step gating shape preserves the GitHub-side required-check name diff --git a/tests/test_lint_workflow_yaml.py b/tests/test_lint_workflow_yaml.py index 55835235..4cd4b151 100644 --- a/tests/test_lint_workflow_yaml.py +++ b/tests/test_lint_workflow_yaml.py @@ -22,6 +22,7 @@ Cross-links: """ from __future__ import annotations +import re import subprocess import sys import textwrap @@ -542,3 +543,89 @@ def test_rule9_prod_manual_deploy_allows_rollback_control(tmp_path): _write(tmp_path, "ok.yml", PROD_ROLLBACK_OK) r = _run_lint(tmp_path) assert r.returncode == 0, f"stdout={r.stdout}\nstderr={r.stderr}" + + +# --------------------------------------------------------------------------- +# CI change detector fanout — workflow-only PRs keep required contexts without +# running Go/Canvas/Python/shellcheck heavy steps. 
+# --------------------------------------------------------------------------- + +CI_WORKFLOW = REPO_ROOT / ".gitea" / "workflows" / "ci.yml" +CI_SURFACES = ("platform", "canvas", "python", "scripts") + + +def _ci_change_patterns() -> dict[str, re.Pattern[str]]: + text = CI_WORKFLOW.read_text(encoding="utf-8") + patterns: dict[str, re.Pattern[str]] = {} + for surface, pattern in re.findall( + r'echo "(platform|canvas|python|scripts)=.*?grep -qE \'([^\']+)\'', + text, + ): + patterns[surface] = re.compile(pattern) + assert set(patterns) == set(CI_SURFACES) + return patterns + + +def _classify_ci_change(*paths: str) -> dict[str, bool]: + patterns = _ci_change_patterns() + return { + surface: any(pattern.search(path) for path in paths) + for surface, pattern in patterns.items() + } + + +def test_ci_change_detector_workflow_only_edits_do_not_trigger_heavy_surfaces(): + assert _classify_ci_change(".gitea/workflows/ci.yml") == { + "platform": False, + "canvas": False, + "python": False, + "scripts": False, + } + assert _classify_ci_change(".github/workflows/ci.yml") == { + "platform": False, + "canvas": False, + "python": False, + "scripts": False, + } + + +def test_ci_change_detector_narrow_surface_edits_only_trigger_their_surface(): + assert _classify_ci_change("workspace-server/internal/handlers/foo.go") == { + "platform": True, + "canvas": False, + "python": False, + "scripts": False, + } + assert _classify_ci_change("canvas/app/page.tsx") == { + "platform": False, + "canvas": True, + "python": False, + "scripts": False, + } + assert _classify_ci_change("workspace/a2a_mcp_server.py") == { + "platform": False, + "canvas": False, + "python": True, + "scripts": False, + } + assert _classify_ci_change("tests/e2e/test_model_slug.sh") == { + "platform": False, + "canvas": False, + "python": False, + "scripts": True, + } + + +def test_ci_change_detector_docs_and_meta_scripts_do_not_trigger_surfaces(): + assert _classify_ci_change("README.md") == { + "platform": False, + 
"canvas": False, + "python": False, + "scripts": False, + } + assert _classify_ci_change(".gitea/scripts/lint-workflow-yaml.py") == { + "platform": False, + "canvas": False, + "python": False, + "scripts": False, + } -- 2.45.2