# molecule-core/.github/workflows/publish-workspace-server-image.yml

name: publish-workspace-server-image

# Builds and pushes Docker images to the operator's ECR registry on
# staging or main pushes (GHCR was the target before 2026-05-06).
# EC2 tenant instances pull the tenant image from that registry.
#
# Branch / tag policy (both branches produce the same tag pair; the
# Compute tags step only derives the short SHA used in :staging-<sha>):
#
#   staging push  → builds image, tags :staging-<sha> + :staging-latest.
#                   staging-CP pins TENANT_IMAGE=:staging-latest, so it
#                   picks up staging-branch code automatically. This is
#                   what makes staging-CP actually test staging-branch
#                   code instead of "yesterday's main" — pre-fix, this
#                   workflow only ran on main, so staging tenants
#                   silently served stale code (#2308 fix RFC #2312
#                   landed on staging but never reached tenants because
#                   staging→main was wedged on path-filter parity bugs).
#
#   main push     → builds image, tags :staging-<sha> + :staging-latest
#                   (same as before). canary-verify.yml retags
#                   :staging-<sha> → :latest after canary tenants
#                   green-light the digest. The :staging-latest retag
#                   on main push is intentional: when main lands AFTER a
#                   staging push, staging-CP gets the post-promote code
#                   (which equals what it had + any merge resolution),
#                   so the canary-on-staging-CP step still runs against
#                   the prod-bound digest.
#
# In the steady state both branches refresh :staging-latest; the
# semantic is "most recent staging-or-main build of tenant code."
# Drift between the two is bounded by the staging→main auto-promote
# cadence and is corrected on the next staging push.

# Triggers: pushes to staging or main that touch anything baked into the
# images (or this workflow itself), plus manual runs from the Actions UI.
on:
  push:
    branches:
      - staging
      - main
    paths:
      - "workspace-server/**"
      - "canvas/**"
      - "manifest.json"
      - "scripts/**"
      - ".github/workflows/publish-workspace-server-image.yml"
  workflow_dispatch:

# One concurrency group per ref: back-to-back pushes to the same branch
# run one after another, so they never race on retagging :staging-latest.
# staging and main land in different groups (github.ref differs) and may
# build in parallel — each writes its own :staging-<sha> tag, and
# last-write-wins on :staging-latest is acceptable across branches
# because post-promote main equals current staging in a healthy flow.
#
# cancel-in-progress is false so a running build is never interrupted;
# a newer push waits its turn instead. That rules out a partially
# pushed image and keeps the canary fleet pin (:staging-<sha>)
# consistent with what was actually tested at canary-verify time.
concurrency:
  group: "publish-workspace-server-image-${{ github.ref }}"
  cancel-in-progress: false

# Least privilege: the job only needs to read the repository for checkout.
# Images are pushed to ECR with AWS credentials (see the "Configure AWS
# credentials for ECR" step), not to GHCR via GITHUB_TOKEN, so the
# `packages: write` grant from the GHCR era is no longer required.
permissions:
  contents: read

# Fully-qualified ECR repositories that the two build steps push to.
env:
  # Platform image (Go API server).
  IMAGE_NAME: "153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform"
  # Tenant image (tenant platform + canvas).
  TENANT_IMAGE_NAME: "153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant"

jobs:
  # Single job: check out the repo, authenticate to ECR, pre-clone the
  # manifest deps, then build and push the platform and tenant images.
  build-and-push:
    runs-on: ubuntu-latest
    steps:
      # Pinned to a full commit SHA; the trailing comment records the
      # release tag that SHA corresponds to.
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      # github-app-auth sibling-checkout removed 2026-05-07 (#157):
      # plugin was dropped + workspace-server/Dockerfile no longer
      # COPYs it.

      # Loads static AWS credentials into the job environment so the
      # following ECR login step can authenticate.
      - name: Configure AWS credentials for ECR
        # GHCR was the pre-suspension target; the molecule-ai org on
        # GitHub got swept 2026-05-06 and ghcr.io/molecule-ai/* is no
        # longer reachable. Post-suspension target is the operator's
        # ECR org (153263036946.dkr.ecr.us-east-2.amazonaws.com/
        # molecule-ai/*), which already hosts platform-tenant +
        # workspace-template-* + runner-base images. AWS creds come
        # from the AWS_ACCESS_KEY_ID/SECRET secrets bound to the
        # molecule-cp IAM user. Closes #161.
        #
        # NOTE(review): this action is referenced by a mutable major tag
        # (@v4) while checkout, setup-buildx and build-push-action in
        # this file are pinned to commit SHAs — consider pinning it the
        # same way.
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          # Must match the region embedded in IMAGE_NAME / TENANT_IMAGE_NAME.
          aws-region: us-east-2

      # Logs the Docker client in to ECR so the build steps below can push.
      # No step in this file reads this step's outputs via the `ecr-login`
      # id; the registry host is hard-coded in the workflow-level env.
      # NOTE(review): referenced by mutable tag (@v2) rather than a commit
      # SHA, unlike the SHA-pinned actions elsewhere in this file.
      - name: Log in to ECR
        id: ecr-login
        uses: aws-actions/amazon-ecr-login@v2

      # Installs a Buildx builder for the docker/build-push-action steps
      # below, which use `type=gha` cache import/export.
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0

      # Publishes the 7-character short commit SHA as the `sha` step
      # output; the build steps use it for the :staging-<sha> image tags.
      - name: Compute tags
        id: tags
        run: |
          short_sha="${GITHUB_SHA:0:7}"
          printf 'sha=%s\n' "$short_sha" >> "$GITHUB_OUTPUT"

      # Pre-clone manifest deps before docker build (Task #173 fix).
      #
      # Why pre-clone: post-2026-05-06, every workspace-template-* repo on
      # Gitea (codex, crewai, deepagents, gemini-cli, langgraph) plus all
      # 7 org-template-* repos are private. The pre-fix Dockerfile.tenant
      # ran `git clone` inside an in-image stage, which had no auth path
      # — every CI build failed with "fatal: could not read Username for
      # https://git.moleculesai.app". For weeks, every workspace-server
      # rebuild required a manual operator-host push. Now we clone in the
      # trusted CI context (where AUTO_SYNC_TOKEN is naturally available)
      # and Dockerfile.tenant just COPYs from .tenant-bundle-deps/.
      #
      # Token shape: AUTO_SYNC_TOKEN is the devops-engineer persona PAT
      # (see /etc/molecule-bootstrap/agent-secrets.env). Per saved memory
      # `feedback_per_agent_gitea_identity_default`, every CI surface uses
      # a per-persona token, never the founder PAT. clone-manifest.sh
      # embeds it as basic-auth (oauth2:<token>) for the duration of the
      # clones, then strips .git directories — the token never enters
      # the resulting image.
      #
      # Idempotent: if a re-run finds populated dirs, clone-manifest.sh
      # skips them; safe to retrigger via path-filter or workflow_dispatch.
      # Clones every manifest.json dependency into .tenant-bundle-deps/
      # using the AUTO_SYNC_TOKEN secret, then verifies that no category
      # came back empty before the image builds run.
      - name: Pre-clone manifest deps
        env:
          MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
        run: |
          set -euo pipefail
          # Fail early with an actionable message when the secret is not
          # configured, instead of letting git fail on an auth prompt.
          if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
            echo "::error::AUTO_SYNC_TOKEN secret is empty — register the devops-engineer persona PAT in repo Actions secrets"
            exit 1
          fi
          mkdir -p .tenant-bundle-deps
          bash scripts/clone-manifest.sh \
            manifest.json \
            .tenant-bundle-deps/workspace-configs-templates \
            .tenant-bundle-deps/org-templates \
            .tenant-bundle-deps/plugins
          # Count the top-level directories each category produced. With
          # pipefail set, a missing target directory makes `find` (and
          # therefore this step) fail.
          ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
          org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
          plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
          echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
          # Coarse sanity check so a silently empty category fails fast
          # instead of producing a half-empty image. Expected counts come
          # from manifest.json (9 ws / 7 org / 21 plugins as of
          # 2026-05-07); exact EXPECTED vs CLONED verification stays in
          # clone-manifest.sh, which is the authoritative fail-fast for
          # partial clones. This guard only catches a wholly empty category.
          if [ "$ws_count" -eq 0 ] || [ "$org_count" -eq 0 ] || [ "$plugins_count" -eq 0 ]; then
            echo "::error::Pre-clone left an empty dependency category (ws=$ws_count org=$org_count plugins=$plugins_count)"
            exit 1
          fi

      # Canary-gated release flow:
      #   - This step always publishes :staging-<sha> + :staging-latest.
      #   - On staging push, staging-CP picks up :staging-latest immediately
      #     (its TENANT_IMAGE pin is :staging-latest) — so staging-branch
      #     code reaches staging tenants without waiting for main.
      #   - On main push, canary-verify.yml runs smoke tests against
      #     canary tenants (which pin :staging-<sha>), and on green retags
      #     :staging-<sha> → :latest. Prod tenants pull :latest.
      #   - On red, :latest stays on the prior good digest — prod is safe.
      #
      # Why :staging-latest is retagged on main push too: when main lands
      # after a staging promote, staging-CP gets the post-promote code so
      # the canary-on-staging-CP step still runs against the prod-bound
      # digest. In a healthy flow the post-promote main code == the
      # current staging code, so this is effectively a no-op except for
      # the canary fleet pin handoff.
      #
      # Pre-fix history: this workflow used to only trigger on main. That
      # meant staging-CP served "yesterday's main" indefinitely whenever
      # staging→main was wedged. The 2026-04-30 dogfooding session
      # surfaced this when RFC #2312 (chat upload HTTP-forward) landed on
      # staging but staging tenants kept failing chat upload because they
      # were running pre-RFC code. Adding the staging trigger above closes
      # that gap. Earlier 2026-04-24 incident: a static :staging-<sha> pin
      # drifted 10 days behind staging — same class of bug, different
      # mechanism.
      # Builds workspace-server/Dockerfile and pushes the platform image
      # to ECR under both :staging-<sha> and :staging-latest.
      - name: Build & push platform image to ECR (staging-<sha> + staging-latest)
        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
        with:
          context: .
          file: ./workspace-server/Dockerfile
          platforms: linux/amd64
          push: true
          tags: |
            ${{ env.IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }}
            ${{ env.IMAGE_NAME }}:staging-latest
          # Dedicated GHA cache scope for this image. Without an explicit
          # scope every `type=gha` export lands in the default `buildkit`
          # scope, so another image built in the same workflow would
          # overwrite this image's cache on every run.
          cache-from: type=gha,scope=platform
          cache-to: type=gha,mode=max,scope=platform
          # GIT_SHA bakes into the Go binary via -ldflags so /buildinfo
          # returns it at runtime — see Dockerfile + buildinfo/buildinfo.go.
          # This is the same value as the OCI revision label below; passing
          # it twice is intentional, the OCI label is for registry tooling
          # while /buildinfo is for the redeploy verification step.
          build-args: |
            GIT_SHA=${{ github.sha }}
          labels: |
            org.opencontainers.image.source=https://github.com/${{ github.repository }}
            org.opencontainers.image.revision=${{ github.sha }}
            org.opencontainers.image.description=Molecule AI platform (Go API server) — pending canary verify

      # Builds workspace-server/Dockerfile.tenant and pushes the tenant
      # image to ECR under both :staging-<sha> and :staging-latest.
      - name: Build & push tenant image to ECR (staging-<sha> + staging-latest)
        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
        with:
          context: .
          file: ./workspace-server/Dockerfile.tenant
          platforms: linux/amd64
          push: true
          tags: |
            ${{ env.TENANT_IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }}
            ${{ env.TENANT_IMAGE_NAME }}:staging-latest
          # Dedicated GHA cache scope for this image. Without an explicit
          # scope every `type=gha` export lands in the default `buildkit`
          # scope, so this build and any other image built in the same
          # workflow would overwrite each other's cache on every run.
          cache-from: type=gha,scope=tenant
          cache-to: type=gha,mode=max,scope=tenant
          # Canvas uses same-origin fetches. The tenant Go platform
          # reverse-proxies /cp/* to the SaaS CP via its CP_UPSTREAM_URL
          # env; the tenant's /canvas/viewport, /approvals/pending,
          # /org/templates etc. live on the tenant platform itself.
          # Both legs share one origin (the tenant subdomain) so
          # PLATFORM_URL="" forces canvas to fetch paths as relative,
          # which land same-origin.
          #
          # Self-hosted / private-label deployments override this at
          # build time with a specific backend (e.g. local dev:
          # NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080).
          build-args: |
            NEXT_PUBLIC_PLATFORM_URL=
            GIT_SHA=${{ github.sha }}
          labels: |
            org.opencontainers.image.source=https://github.com/${{ github.repository }}
            org.opencontainers.image.revision=${{ github.sha }}
            org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify