diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml
index 1dc6af80c..29fb69434 100644
--- a/.gitea/workflows/ci.yml
+++ b/.gitea/workflows/ci.yml
@@ -25,10 +25,9 @@
 #    sufficient for `actions/checkout` against this same repo.
 #
 # 4. Docs — no docs/scripts reference github.com URLs that need swapping.
-#    The canvas-deploy-reminder step writes a `ghcr.io/...` image
-#    reference into the step summary text — that's documentation prose
-#    pointing at the ECR-mirrored canvas image and stays unchanged for
-#    this port (a separate cleanup if ghcr→ECR sweep is in scope).
+#    The canvas-deploy-status step (core#2226, formerly canvas-deploy-reminder)
+#    writes the canvas ordered-deploy status into the step summary; it points
+#    at the ECR canvas image and the publish workflow, no ghcr.io prose.
 #
 # Cross-links:
 #   - RFC: internal#219 (CI/CD hard-gate hardening)
@@ -389,61 +388,61 @@ jobs:

           # mc#959 root-fix (sre)

-  canvas-deploy-reminder:
-    name: Canvas Deploy Reminder
+  canvas-deploy-status:
+    # core#2226: replaces the old advisory "Canvas Deploy Reminder". The canvas
+    # image now has a real ORDERED auto-deploy (publish-canvas-image.yml:
+    # build → push :staging-<sha> → wait green main CI → promote :latest by
+    # digest), and docker-compose pins via CANVAS_IMAGE_TAG. There is no longer
+    # a manual "go run docker compose pull by hand" step to remind operators
+    # about — so this job just records, on a canvas-touching main push, that the
+    # ordered deploy is handling it (and where to watch), instead of prescribing
+    # a manual action that determinism made obsolete.
+    name: Canvas Deploy Status
     runs-on: docker-host
-    # mc#1982 root-fix: added job-level `if:` so ci-required-drift.py's
-    # ci_job_names() detects this as github.ref-gated and skips it from F1.
-    # The step-level exit 0 handles the "not main push" case; the job-level
-    # `if:` makes the gating explicit so the drift script sees it.
-    # Runs on both main and staging pushes; step exits 0 when not applicable.
+    # Job-level `if:` so ci-required-drift.py's ci_job_names() detects this as
+    # github.ref-gated and skips it from the required-context F1 set (mc#1982).
+    # Step-level exit 0 handles the "not a canvas main push" case.
     if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/staging' }}
     needs: [changes, canvas-build]
     steps:
-      - name: Write deploy reminder to step summary
+      - name: Record canvas ordered-deploy status
         env:
           COMMIT_SHA: ${{ github.sha }}
           CANVAS_CHANGED: ${{ needs.changes.outputs.canvas }}
           EVENT_NAME: ${{ github.event_name }}
           REF_NAME: ${{ github.ref }}
-          # github.server_url resolves via the workflow-level env override
-          # to the Gitea instance, so the RUN_URL points at the Gitea run
-          # page (not github.com). See feedback_act_runner_github_server_url.
-          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          # github.server_url resolves via the workflow-level env override to the
+          # Gitea instance; RUN_URL is the Gitea Actions list for the publish workflow.
+          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions?workflow=publish-canvas-image.yml
         run: |
           set -euo pipefail
           if [ "$CANVAS_CHANGED" != "true" ] || [ "$EVENT_NAME" != "push" ] || [ "$REF_NAME" != "refs/heads/main" ]; then
-            echo "Canvas deploy reminder not applicable for event=$EVENT_NAME ref=$REF_NAME canvas_changed=$CANVAS_CHANGED."
+            echo "Canvas deploy status not applicable for event=$EVENT_NAME ref=$REF_NAME canvas_changed=$CANVAS_CHANGED."
             exit 0
           fi

           # Write body to a temp file — avoids backtick escaping in shell.
-          cat > /tmp/deploy-reminder.md << 'BODY'
-          ## Canvas build passed — deploy required
+          cat > /tmp/deploy-status.md << 'BODY'
+          ## Canvas ordered deploy in progress — no manual action required

-          The `publish-canvas-image` workflow is now building a fresh Docker image
-          (`ghcr.io/molecule-ai/canvas:latest`) in the background.
+          This canvas-touching main push triggers `publish-canvas-image`, which now
+          runs an ORDERED, CI-gated deploy (core#2226) — the same shape as the
+          platform's deploy-production:

-          Once it completes (~3–5 min), apply on the host machine with:
-          ```bash
-          cd
-          git pull origin main
-          docker compose pull canvas && docker compose up -d canvas
-          ```
+          1. Build → push `molecule-ai/canvas:staging-<sha>` + `:staging-latest`.
+          2. Wait for green main CI on this SHA.
+          3. Promote `:latest` to the verified `:staging-<sha>` by digest.

-          If you need to rebuild from local source instead (e.g. testing unreleased
-          changes or a new `NEXT_PUBLIC_*` URL), use:
-          ```bash
-          docker compose build canvas && docker compose up -d canvas
-          ```
+          Tenants/hosts pin via `CANVAS_IMAGE_TAG` (default `latest` = the last
+          CI-green build), so a deploy is reproducible — no hand-run
+          `docker compose pull` needed. Watch the run in the canvas publish workflow.
           BODY
-          printf '\n> Posted automatically by CI · commit `%s` · [build log](%s)\n' \
-            "$COMMIT_SHA" "$RUN_URL" >> /tmp/deploy-reminder.md
+          printf '\n> Posted automatically by CI · commit `%s` · [publish workflow](%s)\n' \
+            "$COMMIT_SHA" "$RUN_URL" >> /tmp/deploy-status.md

-          # Gitea has no commit-comments API; write to GITHUB_STEP_SUMMARY,
-          # which both GitHub Actions and Gitea Actions render as the
-          # workflow run's summary page. (#75 / PR-D)
-          cat /tmp/deploy-reminder.md >> "$GITHUB_STEP_SUMMARY"
+          # Gitea has no commit-comments API; write to GITHUB_STEP_SUMMARY, which
+          # both GitHub and Gitea Actions render as the run's summary page.
+          cat /tmp/deploy-status.md >> "$GITHUB_STEP_SUMMARY"

   # Python Lint & Test — required check, always runs.
   # Runtime Python moved to molecule-ai-workspace-runtime. Keep this context as
diff --git a/.gitea/workflows/publish-canvas-image.yml b/.gitea/workflows/publish-canvas-image.yml
index e40e6b219..b140dead0 100644
--- a/.gitea/workflows/publish-canvas-image.yml
+++ b/.gitea/workflows/publish-canvas-image.yml
@@ -14,10 +14,37 @@ name: publish-canvas-image

 # authenticate to ghcr.io.
 #
-# Builds and pushes the canvas Docker image to ECR whenever a commit lands
-# on main that touches canvas code. Previously canvas changes were visible in
-# CI (npm run build passed) but the live container was never updated —
-# operators had to manually run `docker compose build canvas` each time.
+# Builds, pushes, and (ordered) deploys the standalone canvas Docker image to
+# ECR whenever a commit lands on main that touches canvas code.
+#
+# Ordered deploy (core#2226) — mirrors publish-workspace-server-image.yml so the
+# standalone `molecule-ai/canvas` image is deterministic + verifiable, not a
+# side effect of the platform fleet pulling a mutable `:latest`:
+#
+#   build-and-push: build → push :staging-<sha> + :staging-latest + :sha-<sha>
+#                   (does NOT move :latest — an unpromoted build must never
+#                   become the prod-blessed tag).
+#   promote-canvas: waits for green main CI on this SHA, then re-points
+#                   :latest to the verified :staging-<sha> by digest
+#                   (imagetools create — no rebuild). So `:latest` == the
+#                   current prod-blessed canvas, byte-identical to staging-<sha>.
+#
+# Tag scheme produced (parallels platform-tenant):
+#   :staging-<sha>   — per-commit immutable digest, what docker-compose pins to.
+#   :staging-latest  — most recent BUILD on main (last-writer-wins, NOT gated).
+#   :sha-<sha>       — kept for back-compat with any consumer pinning the old tag.
+#   :latest          — most recent CI-GREEN build. Only moved by promote-canvas.
+#
+# WHY this is the canvas analogue of the platform's deploy-production, not a
+# literal copy: the standalone canvas co-deploys with the platform on the same
+# host via the root docker-compose.yml (`docker compose pull && up -d`). Gating
+# the canvas `:latest` promotion on the SAME green-main-CI signal the platform
+# deploy waits on makes platform + canvas roll together by the same SHA. The
+# canvas has no per-tenant fleet of its own and no /buildinfo endpoint, so there
+# is no fleet-rollout / per-tenant verify step to mirror here — CI-green +
+# digest-pin + immutable :staging-<sha> is the determinism contract. (A future
+# canvas /buildinfo would let this assert the served SHA like the platform does;
+# tracked in core#2226.)
 #
 # Mirror of publish-platform-image.yml, adapted for the Next.js canvas layer.
 # See that workflow for inline notes on macOS Keychain isolation and QEMU.
@@ -30,6 +57,7 @@ on:
       # platform-only / docs-only / MCP-only merges.
       - 'canvas/**'
       - '.gitea/workflows/publish-canvas-image.yml'
+  workflow_dispatch:  # NOTE(review): promote-canvas never runs on manual dispatch (push-gated); the NOTE below still says this trigger was dropped — update it
   # NOTE (Gitea port): the original GitHub workflow had a
   # `workflow_dispatch:` manual trigger for the
   # non-canvas-merge-but-need-fresh-image scenario. Dropped in the
@@ -69,6 +97,10 @@ jobs:
     # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
     # mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
     continue-on-error: true
+    outputs:
+      # Exposed so promote-canvas re-points :latest to the EXACT per-commit tag
+      # this build produced (digest-level), never a re-resolved mutable tag.
+      staging_sha: ${{ steps.tags.outputs.staging_sha }}
     steps:
       - name: Checkout
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -140,6 +172,7 @@ jobs:
         shell: bash
         run: |
           echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
+          echo "staging_sha=staging-${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"

       - name: Resolve build args
         id: build_args
@@ -175,8 +208,14 @@ jobs:
           build-args: |
             NEXT_PUBLIC_PLATFORM_URL=${{ steps.build_args.outputs.platform_url }}
             NEXT_PUBLIC_WS_URL=${{ steps.build_args.outputs.ws_url }}
+          # Ordered deploy (core#2226): the build job pushes the immutable
+          # per-commit tag + the build-tracking staging-latest + the legacy
+          # back-compat :sha-<sha> tag. It does NOT push :latest — :latest is
+          # the prod-blessed tag and is only re-pointed by promote-canvas after
+          # green main CI, so an unpromoted/red build can never become :latest.
           tags: |
-            ${{ env.IMAGE_NAME }}:latest
+            ${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.staging_sha }}
+            ${{ env.IMAGE_NAME }}:staging-latest
             ${{ env.IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }}
           # Gitea artifact-cache reachability is best-effort on the operator
           # runner network. Do not let cache export fail an image that already
@@ -185,3 +224,107 @@ jobs:
             org.opencontainers.image.source=https://git.moleculesai.app/${{ github.repository }}
             org.opencontainers.image.revision=${{ github.sha }}
             org.opencontainers.image.description=Molecule AI canvas (Next.js 15 + React Flow)
+
+  # bp-exempt: post-merge canvas promote side-effect; merge is gated by CI /
+  # all-required and this job waits for green push CI on the SHA before acting.
+  promote-canvas:
+    name: Promote canvas :latest to CI-green build
+    needs: build-and-push
+    # Only on a real main push — workflow_dispatch / non-main never promotes.
+    if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+    # Side-effect deploy only; the image publish above is the durable artifact.
+    # mc#1982: do NOT renew this mask silently — it mirrors deploy-production's
+    # contract (a flaky promote must not red the ship lane), tracked in core#2226.
+    continue-on-error: true
+    runs-on: publish
+    timeout-minutes: 75  # headroom over the 3600s CI wait below (was 60 == the wait itself) for checkout + promote; assumes wait-ci honors CI_STATUS_TIMEOUT_SECONDS
+    env:
+      # Same green-main-CI gate the platform deploy-production waits on, so
+      # platform + canvas advance :latest off the identical signal/SHA.
+      GITEA_HOST: git.moleculesai.app
+      GITEA_TOKEN: ${{ secrets.PROD_AUTO_DEPLOY_CONTROL_TOKEN || secrets.AUTO_SYNC_TOKEN }}
+      CI_STATUS_TIMEOUT_SECONDS: "3600"
+      # Re-uses the platform's disable kill-switch: when prod auto-deploy is
+      # paused, the canvas :latest promote pauses too (correct — an unpromoted
+      # build must not become :latest while the fleet is frozen).
+      PROD_AUTO_DEPLOY_DISABLED: ${{ vars.PROD_AUTO_DEPLOY_DISABLED || secrets.PROD_AUTO_DEPLOY_DISABLED || '' }}
+    steps:
+      # The publish runner's default HOME (/home/hongming) is not writable, so
+      # docker credential saves fail and halt the promote (#2193 on the platform
+      # side). Point HOME + DOCKER_CONFIG at the writable job temp dir.
+      - name: Prepare writable HOME + Docker config
+        run: |
+          set -euo pipefail
+          H="$RUNNER_TEMP/canvas-promote-home"
+          mkdir -p "$H/.docker"
+          echo "HOME=$H" >> "$GITHUB_ENV"
+          echo "DOCKER_CONFIG=$H/.docker" >> "$GITHUB_ENV"
+
+      - name: Checkout
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Resolve promote gate
+        id: gate
+        env:
+          PROD_AUTO_DEPLOY_DISABLED: ${{ env.PROD_AUTO_DEPLOY_DISABLED }}
+        run: |
+          set -euo pipefail
+          if [ -n "${PROD_AUTO_DEPLOY_DISABLED:-}" ]; then
+            case "$(printf '%s' "$PROD_AUTO_DEPLOY_DISABLED" | tr '[:upper:]' '[:lower:]')" in
+              1|true|yes|on|disabled|disable)
+                echo "enabled=false" >> "$GITHUB_OUTPUT"
+                echo "::notice::Canvas :latest promote skipped: PROD_AUTO_DEPLOY_DISABLED=$PROD_AUTO_DEPLOY_DISABLED"
+                {
+                  echo "## Canvas :latest promote skipped"
+                  echo ""
+                  echo "Reason: \`PROD_AUTO_DEPLOY_DISABLED=$PROD_AUTO_DEPLOY_DISABLED\`. The build is published as \`:staging-${GITHUB_SHA::7}\` (not CI-gated); \`:latest\` was left unchanged."
+                } >> "$GITHUB_STEP_SUMMARY"
+                exit 0 ;;
+            esac
+          fi
+          if [ -z "${GITEA_TOKEN:-}" ]; then
+            echo "::error::AUTO_SYNC_TOKEN/PROD_AUTO_DEPLOY_CONTROL_TOKEN is required so the canvas promote can wait for green CI."
+            exit 1
+          fi
+          echo "enabled=true" >> "$GITHUB_OUTPUT"
+
+      - name: Wait for green main CI on this SHA
+        if: ${{ steps.gate.outputs.enabled == 'true' }}
+        run: |
+          set -euo pipefail
+          # Same SSOT wait the platform deploy uses: blocks until the required
+          # push contexts (CI / all-required (push) + Secret scan) go green on
+          # THIS sha, and fails closed if any required context terminally fails.
+          python3 .gitea/scripts/prod-auto-deploy.py wait-ci
+
+      - name: Promote canvas :latest to the CI-green image
+        if: ${{ steps.gate.outputs.enabled == 'true' }}
+        env:
+          IMAGE_NAME: ${{ env.IMAGE_NAME }}
+          STAGING_SHA_TAG: ${{ needs.build-and-push.outputs.staging_sha }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_DEFAULT_REGION: us-east-2
+        run: |
+          set -euo pipefail
+          # Fail-safe: if the build job's output didn't propagate, recompute the
+          # immutable per-commit tag from the SHA so we never promote a guess.
+          SHA_TAG="${STAGING_SHA_TAG:-staging-${GITHUB_SHA::7}}"
+          ECR_REGISTRY="${IMAGE_NAME%%/*}"
+          aws ecr get-login-password --region us-east-2 | \
+            docker login --username AWS --password-stdin "${ECR_REGISTRY}"
+
+          # Digest-level re-tag (no pull/rebuild): :latest becomes byte-identical
+          # to the verified :staging-<sha> for this commit.
+          docker buildx imagetools create \
+            --tag "${IMAGE_NAME}:latest" \
+            "${IMAGE_NAME}:${SHA_TAG}"
+
+          {
+            echo "## Canvas :latest promoted"
+            echo ""
+            echo "Re-pointed \`molecule-ai/canvas:latest\` → \`${SHA_TAG}\` (by digest)."
+            echo ":latest now tracks the CI-green canvas build for commit \`${GITHUB_SHA::7}\`."
+            echo ""
+            echo "Tenants/hosts that \`docker compose pull canvas\` now get the same build the platform deploy rolled for this SHA."
+          } >> "$GITHUB_STEP_SUMMARY"
diff --git a/docker-compose.yml b/docker-compose.yml
index eb80449ea..470fc9bb8 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -159,15 +159,28 @@ services:

   # --- Canvas ---
   canvas:
-    # The publish-canvas-image CI workflow pushes a fresh image to GHCR on
-    # every canvas/** merge to main. To update the running container:
-    #   docker compose pull canvas && docker compose up -d canvas
-    # First-time local setup or testing unreleased changes — build from source:
-    #   docker compose build canvas && docker compose up -d canvas
+    # The publish-canvas-image CI workflow runs an ORDERED deploy (core#2226):
+    #   build → push :staging-<sha> + :staging-latest → (after green main CI)
+    #   re-point :latest to the verified :staging-<sha> by digest. Only :latest
+    #   is CI-gated; :staging-<sha> is immutable but is pushed before CI finishes.
+    #
+    # Reproducible deploy: pin CANVAS_IMAGE_TAG to the immutable per-commit tag
+    # the ordered deploy produced, e.g.
+    #   CANVAS_IMAGE_TAG=staging-<sha> docker compose pull canvas && docker compose up -d canvas
+    # This makes a tenant/host deploy reproducible (resolves the standing
+    # `TODO: pin canvas ECR image digest`). Unset it and the default `latest`
+    # is the prod-blessed tag the ordered deploy keeps pointed at the last
+    # green build — still deterministic vs. the old raw `:latest`.
+    #
+    # To pin by content digest instead of tag (fully immutable):
+    #   aws ecr describe-images --repository-name molecule-ai/canvas \
+    #     --image-tags staging-<sha> --region us-east-2 \
+    #     --query 'imageDetails[0].imageDigest' --output text
+    # then set CANVAS_IMAGE_TAG=staging-<sha>@<digest> (compose passes it through).
+    #
     # Note: ECR images require AWS auth — `aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 153263036946.dkr.ecr.us-east-2.amazonaws.com` before pull.
-    # Digest-pin requires: aws ecr describe-images --repository-name molecule-ai/canvas --image-tags latest --query 'imageDetails[0].imageDigest'
-    # TODO: pin canvas ECR image digest once AWS creds are available in CI.
-    image: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/canvas:latest
+    # Local dev keeps working via the `build:` context below (docker compose build canvas).
+    image: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/canvas:${CANVAS_IMAGE_TAG:-latest}
     build:
       context: ./canvas
       dockerfile: Dockerfile