# molecule-core/.github/workflows/publish-workspace-server-image.yml

name: publish-workspace-server-image

# Builds and pushes Docker images to GHCR on staging or main pushes.
# EC2 tenant instances pull the tenant image from GHCR.
#
# Branch / tag policy (both branches publish the same tag pair; the
# Compute tags step only derives the short SHA used in :staging-<sha>):
#
#   staging push  → builds image, tags :staging-<sha> + :staging-latest.
#                   staging-CP pins TENANT_IMAGE=:staging-latest, so it
#                   picks up staging-branch code automatically. This is
#                   what makes staging-CP actually test staging-branch
#                   code instead of "yesterday's main" — pre-fix, this
#                   workflow only ran on main, so staging tenants
#                   silently served stale code (#2308 fix RFC #2312
#                   landed on staging but never reached tenants because
#                   staging→main was wedged on path-filter parity bugs).
#
#   main push     → builds image, tags :staging-<sha> + :staging-latest
#                   (same as before). canary-verify.yml retags
#                   :staging-<sha> → :latest after canary tenants
#                   green-light the digest. The :staging-latest retag
#                   on main push is intentional: when main lands AFTER a
#                   staging push, staging-CP gets the post-promote code
#                   (which equals what it had + any merge resolution),
#                   so the canary-on-staging-CP step still runs against
#                   the prod-bound digest.
#
# In the steady state both branches refresh :staging-latest; the
# semantic is "most recent staging-or-main build of tenant code."
# Drift between the two is bounded by the staging→main auto-promote
# cadence and is corrected on the next staging push.

on:
  push:
    # Only pushes to these two branches trigger a publish.
    branches:
      - staging
      - main
    # Pushes that touch none of these paths are skipped.
    paths:
      - 'workspace-server/**'
      - 'canvas/**'
      - 'manifest.json'
      - '.github/workflows/publish-workspace-server-image.yml'
  # Also allow manually triggered runs.
  workflow_dispatch:

# Serialize per-branch so two rapid staging pushes don't race the same
# :staging-latest tag retag. Allow staging and main to run in parallel
# (different github.ref → different concurrency group) since they
# produce different :staging-<sha> tags and last-write-wins on
# :staging-latest is acceptable across branches (the post-promote
# main code equals current staging code in a healthy flow).
#
# cancel-in-progress: false → in-flight builds finish; the next push's
# build queues. This avoids a partially-pushed image and keeps the
# canary fleet pin (:staging-<sha>) consistent with what was actually
# tested at canary-verify time.
concurrency:
  # The ref is part of the group name, so runs on the same branch share a
  # group while staging and main land in different groups.
  group: 'publish-workspace-server-image-${{ github.ref }}'
  # A run that has already started is never cancelled by a newer one.
  cancel-in-progress: false

# Scopes granted to GITHUB_TOKEN for this workflow.
permissions:
  # Read-only access to repository contents.
  contents: read
  # Write access to packages; GITHUB_TOKEN is the GHCR login password below.
  packages: write

env:
  # Image repository tagged by the platform build step.
  IMAGE_NAME: 'ghcr.io/molecule-ai/platform'
  # Image repository tagged by the tenant build step.
  TENANT_IMAGE_NAME: 'ghcr.io/molecule-ai/platform-tenant'

jobs:
  # Single job: checks out this repo and the sibling plugin repo, logs in
  # to GHCR, then builds and pushes the platform and tenant images in turn.
  build-and-push:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          # No later step in this job runs an authenticated git command, so
          # don't leave GITHUB_TOKEN (which carries packages: write here)
          # configured for git once the checkout is done.
          persist-credentials: false

      - name: Checkout sibling plugin repo
        # workspace-server/Dockerfile expects
        # ./molecule-ai-plugin-github-app-auth at build-context root because
        # the Go module has a `replace` directive pointing at /plugin inside
        # the image. Pre-repo-split the plugin lived in the monorepo; the
        # 2026-04-18 restructure moved it out but didn't add this clone step
        # — which is why publish was failing after that restructure.
        #
        # Uses a fine-grained PAT (PLUGIN_REPO_PAT) because the plugin repo
        # is private and the default GITHUB_TOKEN is scoped to THIS repo.
        # The PAT needs Contents:Read on Molecule-AI/molecule-ai-plugin-
        # github-app-auth. Falls back to the default token for the (rare)
        # case where an operator made the plugin repo public.
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          repository: Molecule-AI/molecule-ai-plugin-github-app-auth
          path: molecule-ai-plugin-github-app-auth
          token: ${{ secrets.PLUGIN_REPO_PAT || secrets.GITHUB_TOKEN }}
          # The token is only needed for this one clone. The cloned directory
          # sits inside the Docker build context (context: .) and nothing
          # later in the job runs git against it, so don't keep the PAT
          # configured for git after the checkout.
          persist-credentials: false

      # Authenticates against ghcr.io with the workflow's GITHUB_TOKEN so
      # the build steps below can push.
      - name: Log in to GHCR
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3
        with:
          registry: 'ghcr.io'
          username: '${{ github.actor }}'
          password: '${{ secrets.GITHUB_TOKEN }}'

      # Sets up Buildx ahead of the two build-push steps that follow.
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0

      - name: Compute tags
        id: tags
        # Publishes the first 7 characters of the commit SHA as
        # steps.tags.outputs.sha; the build steps use it in :staging-<sha>.
        run: |
          printf 'sha=%s\n' "${GITHUB_SHA:0:7}" >> "$GITHUB_OUTPUT"

      # Canary-gated release flow:
      #   - This step always publishes :staging-<sha> + :staging-latest.
      #   - On staging push, staging-CP picks up :staging-latest immediately
      #     (its TENANT_IMAGE pin is :staging-latest) — so staging-branch
      #     code reaches staging tenants without waiting for main.
      #   - On main push, canary-verify.yml runs smoke tests against
      #     canary tenants (which pin :staging-<sha>), and on green retags
      #     :staging-<sha> → :latest. Prod tenants pull :latest.
      #   - On red, :latest stays on the prior good digest — prod is safe.
      #
      # Why :staging-latest is retagged on main push too: when main lands
      # after a staging promote, staging-CP gets the post-promote code so
      # the canary-on-staging-CP step still runs against the prod-bound
      # digest. In a healthy flow the post-promote main code == the
      # current staging code, so this is effectively a no-op except for
      # the canary fleet pin handoff.
      #
      # Pre-fix history: this workflow used to only trigger on main. That
      # meant staging-CP served "yesterday's main" indefinitely whenever
      # staging→main was wedged. The 2026-04-30 dogfooding session
      # surfaced this when RFC #2312 (chat upload HTTP-forward) landed on
      # staging but staging tenants kept failing chat upload because they
      # were running pre-RFC code. Adding the staging trigger above closes
      # that gap. Earlier 2026-04-24 incident: a static :staging-<sha> pin
      # drifted 10 days behind staging — same class of bug, different
      # mechanism.
      - name: Build & push platform image to GHCR (staging-<sha> + staging-latest)
        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
        with:
          context: .
          file: ./workspace-server/Dockerfile
          platforms: linux/amd64
          push: true
          tags: |
            ${{ env.IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }}
            ${{ env.IMAGE_NAME }}:staging-latest
          # Give this image its own GitHub Actions cache scope. Without an
          # explicit scope every type=gha export in the job writes to the
          # same default scope, so the other image build in this workflow
          # would replace this build's cache (and vice versa).
          cache-from: type=gha,scope=platform
          cache-to: type=gha,mode=max,scope=platform
          # GIT_SHA bakes into the Go binary via -ldflags so /buildinfo
          # returns it at runtime — see Dockerfile + buildinfo/buildinfo.go.
          # This is the same value as the OCI revision label below; passing
          # it twice is intentional, the OCI label is for registry tooling
          # while /buildinfo is for the redeploy verification step.
          build-args: |
            GIT_SHA=${{ github.sha }}
          labels: |
            org.opencontainers.image.source=https://github.com/${{ github.repository }}
            org.opencontainers.image.revision=${{ github.sha }}
            org.opencontainers.image.description=Molecule AI platform (Go API server) — pending canary verify

      - name: Build & push tenant image to GHCR (staging-<sha> + staging-latest)
        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
        with:
          context: .
          file: ./workspace-server/Dockerfile.tenant
          platforms: linux/amd64
          push: true
          tags: |
            ${{ env.TENANT_IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }}
            ${{ env.TENANT_IMAGE_NAME }}:staging-latest
          # Give this image its own GitHub Actions cache scope. Without an
          # explicit scope every type=gha export in the job writes to the
          # same default scope, so the other image build in this workflow
          # would replace this build's cache (and vice versa).
          cache-from: type=gha,scope=tenant
          cache-to: type=gha,mode=max,scope=tenant
          # Canvas uses same-origin fetches. The tenant Go platform
          # reverse-proxies /cp/* to the SaaS CP via its CP_UPSTREAM_URL
          # env; the tenant's /canvas/viewport, /approvals/pending,
          # /org/templates etc. live on the tenant platform itself.
          # Both legs share one origin (the tenant subdomain) so
          # PLATFORM_URL="" forces canvas to fetch paths as relative,
          # which land same-origin.
          #
          # Self-hosted / private-label deployments override this at
          # build time with a specific backend (e.g. local dev:
          # NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080).
          build-args: |
            NEXT_PUBLIC_PLATFORM_URL=
            GIT_SHA=${{ github.sha }}
          labels: |
            org.opencontainers.image.source=https://github.com/${{ github.repository }}
            org.opencontainers.image.revision=${{ github.sha }}
            org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify