# molecule-core/.github/workflows/publish-workspace-server-image.yml

name: publish-workspace-server-image

# Builds and pushes Docker images to ECR (the registry named in
# IMAGE_NAME / TENANT_IMAGE_NAME below) on staging or main pushes.
# EC2 tenant instances pull the tenant image from that registry.
#
# Branch / tag policy (both branches publish the same two tags; the
# Compute tags step only derives the short SHA used in :staging-<sha>):
#
#   staging push  → builds image, tags :staging-<sha> + :staging-latest.
#                   staging-CP pins TENANT_IMAGE=:staging-latest, so it
#                   picks up staging-branch code automatically. This is
#                   what makes staging-CP actually test staging-branch
#                   code instead of "yesterday's main" — pre-fix, this
#                   workflow only ran on main, so staging tenants
#                   silently served stale code (#2308 fix RFC #2312
#                   landed on staging but never reached tenants because
#                   staging→main was wedged on path-filter parity bugs).
#
#   main push     → builds image, tags :staging-<sha> + :staging-latest
#                   (same as before). canary-verify.yml retags
#                   :staging-<sha> → :latest after canary tenants
#                   green-light the digest. The :staging-latest retag
#                   on main push is intentional: when main lands AFTER a
#                   staging push, staging-CP gets the post-promote code
#                   (which equals what it had + any merge resolution),
#                   so the canary-on-staging-CP step still runs against
#                   the prod-bound digest.
#
# In the steady state both branches refresh :staging-latest; the
# semantic is "most recent staging-or-main build of tenant code."
# Drift between the two is bounded by the staging→main auto-promote
# cadence and is corrected on the next staging push.

# Triggers: a push to staging or main that touches any of the listed
# paths, or a manual run via workflow_dispatch.
on:
  push:
    branches:
      - staging
      - main
    paths:
      - "workspace-server/**"
      - "canvas/**"
      - "manifest.json"
      - "scripts/**"
      - ".github/workflows/publish-workspace-server-image.yml"
  workflow_dispatch:

# Serialize per-branch so two rapid staging pushes don't race the same
# :staging-latest tag retag. Allow staging and main to run in parallel
# (different github.ref → different concurrency group) since they
# produce different :staging-<sha> tags and last-write-wins on
# :staging-latest is acceptable across branches (the post-promote
# main code equals current staging code in a healthy flow).
#
# cancel-in-progress: false → in-flight builds finish; the next push's
# build queues. This avoids a partially-pushed image and keeps the
# canary fleet pin (:staging-<sha>) consistent with what was actually
# tested at canary-verify time.
# One concurrency group per ref; a queued run waits for the in-flight
# run instead of cancelling it.
concurrency:
  group: "publish-workspace-server-image-${{ github.ref }}"
  cancel-in-progress: false

# Least privilege: the only GITHUB_TOKEN use in this workflow is the
# repository checkout. Image pushes authenticate to ECR with the AWS
# credentials from secrets, so the `packages` (GHCR) scope is not
# granted.
permissions:
  contents: read

# Fully-qualified ECR repositories (registry host + repo path) for the
# two images this workflow publishes. The build steps derive the
# registry host for `docker login` from the part before the first "/".
env:
  IMAGE_NAME: "153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform"
  TENANT_IMAGE_NAME: "153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant"

jobs:
  # Job visible in this file: checkout, short-SHA tag computation,
  # manifest-dep pre-clone, then build + push of the platform and
  # tenant images.
  build-and-push:
    runs-on: ubuntu-latest
    steps:
      # Action is pinned to a full commit SHA; the trailing comment
      # records the release tag that SHA corresponds to.
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      # github-app-auth sibling-checkout removed 2026-05-07 (#157):
      # plugin was dropped + workspace-server/Dockerfile no longer
      # COPYs it.

      # ECR auth is now inline in each build step below, and buildx is
      # no longer used — images are built with plain `docker build`
      # (Task #173, 2026-05-07).
      #
      # Why moved inline: aws-actions/configure-aws-credentials@v4 +
      # aws-actions/amazon-ecr-login@v2 + docker/setup-buildx-action
      # all left auth state in places that the actual `docker push`
      # couldn't see on Gitea Actions:
      #   - The actions wrote to a step-scoped DOCKER_CONFIG path
      #     that didn't survive into subsequent shell steps.
      #   - Buildx couldn't bridge the runner container ↔
      #     operator-host docker daemon auth gap (401 on the
      #     docker-container driver, "no basic auth credentials"
      #     with the action-driven login).
      #
      # Doing AWS+ECR auth inline (`aws ecr get-login-password |
      # docker login`) in the same shell step as `docker build` +
      # `docker push` is the operator-host manual approach, mapped
      # 1:1 into CI. Auth state is guaranteed to live in the env that
      # `docker push` actually runs from.
      #
      # Post-suspension target is the operator's ECR org
      # (153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/*),
      # which already hosts platform-tenant + workspace-template-* +
      # runner-base images. AWS creds come from the
      # AWS_ACCESS_KEY_ID/SECRET secrets bound to the molecule-cp
      # IAM user. Closes #161.

      # Exposes the first 7 characters of the commit SHA as the step
      # output `sha`; the build steps read it to form :staging-<sha>.
      - name: Compute tags
        id: tags
        run: |
          short_sha="${GITHUB_SHA:0:7}"
          printf 'sha=%s\n' "${short_sha}" >> "${GITHUB_OUTPUT}"

      # Pre-clone manifest deps before docker build (Task #173 fix).
      #
      # Why pre-clone: post-2026-05-06, every workspace-template-* repo on
      # Gitea (codex, crewai, deepagents, gemini-cli, langgraph) plus all
      # 7 org-template-* repos are private. The pre-fix Dockerfile.tenant
      # ran `git clone` inside an in-image stage, which had no auth path
      # — every CI build failed with "fatal: could not read Username for
      # https://git.moleculesai.app". For weeks, every workspace-server
      # rebuild required a manual operator-host push. Now we clone in the
      # trusted CI context (where AUTO_SYNC_TOKEN is naturally available)
      # and Dockerfile.tenant just COPYs from .tenant-bundle-deps/.
      #
      # Token shape: AUTO_SYNC_TOKEN is the devops-engineer persona PAT
      # (see /etc/molecule-bootstrap/agent-secrets.env). Per saved memory
      # `feedback_per_agent_gitea_identity_default`, every CI surface uses
      # a per-persona token, never the founder PAT. clone-manifest.sh
      # embeds it as basic-auth (oauth2:<token>) for the duration of the
      # clones, then strips .git directories — the token never enters
      # the resulting image.
      #
      # Idempotent: if a re-run finds populated dirs, clone-manifest.sh
      # skips them; safe to retrigger via path-filter or workflow_dispatch.
      # Clones the manifest-listed repos into .tenant-bundle-deps/ so the
      # image build can consume them from the build context. Fails the
      # job if the token is missing, if clone-manifest.sh fails, or if
      # any of the three dependency directories ends up empty.
      - name: Pre-clone manifest deps
        env:
          MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
        run: |
          set -euo pipefail
          if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
            echo "::error::AUTO_SYNC_TOKEN secret is empty — register the devops-engineer persona PAT in repo Actions secrets"
            exit 1
          fi
          mkdir -p .tenant-bundle-deps
          bash scripts/clone-manifest.sh \
            manifest.json \
            .tenant-bundle-deps/workspace-configs-templates \
            .tenant-bundle-deps/org-templates \
            .tenant-bundle-deps/plugins
          # Sanity-check counts so a silent partial clone fails fast
          # instead of producing a half-empty image.
          ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
          org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
          plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
          echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
          # Counts are derived from manifest.json (9 ws / 7 org / 21
          # plugins as of 2026-05-07). clone-manifest.sh's own EXPECTED
          # vs CLONED check (line ~95) remains the authoritative
          # per-repo fail-fast; the guard below only catches a category
          # that came back completely empty.
          # NOTE(review): assumes manifest.json never legitimately lists
          # zero entries for a category — relax this if that changes.
          if [ "$ws_count" -eq 0 ] || [ "$org_count" -eq 0 ] || [ "$plugins_count" -eq 0 ]; then
            echo "::error::Pre-clone left an empty dependency directory (ws=$ws_count org=$org_count plugins=$plugins_count) — refusing to build a half-empty image"
            exit 1
          fi

      # Canary-gated release flow:
      #   - This step always publishes :staging-<sha> + :staging-latest.
      #   - On staging push, staging-CP picks up :staging-latest immediately
      #     (its TENANT_IMAGE pin is :staging-latest) — so staging-branch
      #     code reaches staging tenants without waiting for main.
      #   - On main push, canary-verify.yml runs smoke tests against
      #     canary tenants (which pin :staging-<sha>), and on green retags
      #     :staging-<sha> → :latest. Prod tenants pull :latest.
      #   - On red, :latest stays on the prior good digest — prod is safe.
      #
      # Why :staging-latest is retagged on main push too: when main lands
      # after a staging promote, staging-CP gets the post-promote code so
      # the canary-on-staging-CP step still runs against the prod-bound
      # digest. In a healthy flow the post-promote main code == the
      # current staging code, so this is effectively a no-op except for
      # the canary fleet pin handoff.
      #
      # Pre-fix history: this workflow used to only trigger on main. That
      # meant staging-CP served "yesterday's main" indefinitely whenever
      # staging→main was wedged. The 2026-04-30 dogfooding session
      # surfaced this when RFC #2312 (chat upload HTTP-forward) landed on
      # staging but staging tenants kept failing chat upload because they
      # were running pre-RFC code. Adding the staging trigger above closes
      # that gap. Earlier 2026-04-24 incident: a static :staging-<sha> pin
      # drifted 10 days behind staging — same class of bug, different
      # mechanism. ECR repo molecule-ai/platform created 2026-05-07.
      #
      # Build + push platform image with plain `docker` (no buildx).
      # GIT_SHA bakes into the Go binary via -ldflags so /buildinfo
      # returns it at runtime — see Dockerfile + buildinfo/buildinfo.go.
      # The OCI revision label below carries the same value for registry
      # tooling; the duplication is intentional.
      # Logs in to ECR, builds workspace-server/Dockerfile from the repo
      # root, and pushes the result under both the SHA tag and the
      # moving staging-latest tag.
      - name: Build & push platform image to ECR (staging-<sha> + staging-latest)
        env:
          IMAGE_NAME: ${{ env.IMAGE_NAME }}
          TAG_SHA: staging-${{ steps.tags.outputs.sha }}
          TAG_LATEST: staging-latest
          GIT_SHA: ${{ github.sha }}
          REPO: ${{ github.repository }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: us-east-2
        run: |
          set -euo pipefail
          # Registry host is everything in IMAGE_NAME before the first "/".
          registry_host="${IMAGE_NAME%%/*}"
          sha_ref="${IMAGE_NAME}:${TAG_SHA}"
          latest_ref="${IMAGE_NAME}:${TAG_LATEST}"
          # Log in from this same shell step so the docker config that
          # `docker push` reads is the one just written. ECR
          # get-login-password tokens last 12h, plenty for one
          # build+push.
          aws ecr get-login-password --region us-east-2 \
            | docker login --username AWS --password-stdin "${registry_host}"
          # GIT_SHA is passed both as a build arg and as the OCI
          # revision label; the duplication is intentional.
          docker build \
            --file ./workspace-server/Dockerfile \
            --build-arg "GIT_SHA=${GIT_SHA}" \
            --label "org.opencontainers.image.source=https://github.com/${REPO}" \
            --label "org.opencontainers.image.revision=${GIT_SHA}" \
            --label "org.opencontainers.image.description=Molecule AI platform (Go API server) — pending canary verify" \
            --tag "${sha_ref}" \
            --tag "${latest_ref}" \
            .
          # Push the immutable SHA tag first, then the moving tag.
          for ref in "${sha_ref}" "${latest_ref}"; do
            docker push "${ref}"
          done

      # Canvas uses same-origin fetches. The tenant Go platform
      # reverse-proxies /cp/* to the SaaS CP via its CP_UPSTREAM_URL
      # env; the tenant's /canvas/viewport, /approvals/pending,
      # /org/templates etc. live on the tenant platform itself.
      # Both legs share one origin (the tenant subdomain) so
      # PLATFORM_URL="" forces canvas to fetch paths as relative,
      # which land same-origin.
      #
      # Self-hosted / private-label deployments override this at
      # build time with a specific backend (e.g. local dev:
      # NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080).
      # Logs in to ECR, builds workspace-server/Dockerfile.tenant from
      # the repo root with an empty NEXT_PUBLIC_PLATFORM_URL, and pushes
      # the result under both the SHA tag and staging-latest.
      - name: Build & push tenant image to ECR (staging-<sha> + staging-latest)
        env:
          TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }}
          TAG_SHA: staging-${{ steps.tags.outputs.sha }}
          TAG_LATEST: staging-latest
          GIT_SHA: ${{ github.sha }}
          REPO: ${{ github.repository }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: us-east-2
        run: |
          set -euo pipefail
          # Registry host is everything in TENANT_IMAGE_NAME before the
          # first "/".
          registry_host="${TENANT_IMAGE_NAME%%/*}"
          sha_ref="${TENANT_IMAGE_NAME}:${TAG_SHA}"
          latest_ref="${TENANT_IMAGE_NAME}:${TAG_LATEST}"
          # Logging in again is technically redundant after the platform
          # step, but it keeps this step self-contained if steps are
          # reordered or extracted later.
          aws ecr get-login-password --region us-east-2 \
            | docker login --username AWS --password-stdin "${registry_host}"
          docker build \
            --file ./workspace-server/Dockerfile.tenant \
            --build-arg "NEXT_PUBLIC_PLATFORM_URL=" \
            --build-arg "GIT_SHA=${GIT_SHA}" \
            --label "org.opencontainers.image.source=https://github.com/${REPO}" \
            --label "org.opencontainers.image.revision=${GIT_SHA}" \
            --label "org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify" \
            --tag "${sha_ref}" \
            --tag "${latest_ref}" \
            .
          # Push the immutable SHA tag first, then the moving tag.
          for ref in "${sha_ref}" "${latest_ref}"; do
            docker push "${ref}"
          done