# molecule-core/.github/workflows/publish-platform-image.yml

name: publish-platform-image

# Builds the tenant-platform Docker image and pushes it to GHCR and to the
# Fly registry mirror when a commit touching platform-relevant paths lands on
# main (or on manual dispatch), then updates existing tenant machines to the
# freshly pushed image. The private molecule-controlplane provisioner sets
# TENANT_IMAGE=ghcr.io/molecule-ai/platform:<tag> to spawn tenant Fly
# Machines from this image. See molecule-controlplane README for the pairing.

on:
  # Manual trigger: lets a maintainer re-publish a tag after a merge that
  # did not touch any platform path.
  workflow_dispatch:
  push:
    branches:
      - main
    paths:
      # Rebuild only for platform-relevant changes; docs-only, canvas-only
      # and MCP-only PRs skip this workflow and save GHA minutes.
      - "platform/**"
      - ".github/workflows/publish-platform-image.yml"
      # Templates live in standalone repos now, so template changes do not
      # trigger a platform rebuild. Run the manual trigger when a template
      # repo update has to be baked into the image.

permissions:
  # Pushing images to ghcr.io/<repository owner>/* needs package write access.
  packages: write
  # Repository contents are only read.
  contents: read

env:
  # Kept lowercase for consistency: GHCR accepts mixed-case names, but most
  # tooling lowercases them.
  IMAGE_NAME: 'ghcr.io/molecule-ai/platform'
  # Mirror of the image in Fly's registry. Tenant machines provisioned by
  # the private `molecule-controlplane` pull from here, because pulling the
  # private GHCR image from Fly machines would need auth plumbing we don't
  # want to add. Fly auto-authenticates same-org machines against
  # registry.fly.io, so the mirror lets GHCR stay private while tenants
  # still boot.
  FLY_IMAGE_NAME: 'registry.fly.io/molecule-tenant'

jobs:
  build-and-push:
    # Selects the self-hosted Apple-silicon macOS runner by its labels.
    runs-on:
      - self-hosted
      - macos
      - arm64
    steps:
      # Fetch the repository so the Docker build context is available.
      - name: Checkout
        uses: actions/checkout@v4

      - name: Isolate Docker config (skip keychain)
        # The Mac mini self-hosted runner runs as a non-interactive
        # launchd service; docker/login-action's default credential store
        # is the macOS Keychain, which raises
        #   error storing credentials - err: exit status 1, out:
        #   `User interaction is not allowed. (-25308)`
        # without an unlocked desktop session.
        #
        # Point DOCKER_CONFIG at a per-run temp dir. IMPORTANT: writing
        # `{"auths": {}}` alone is NOT enough — Docker on macOS picks up
        # `osxkeychain` as the default credential store even when
        # config.json doesn't declare one, inheriting from Docker
        # Desktop's bundled credsStore binding. We must explicitly set
        # `credsStore` to an empty string AND clear `credHelpers` so the
        # login step writes credentials into the auths map of this
        # disposable config.json rather than reaching for the keychain.
        # (First tried in #273 without the empty-credsStore line; #319
        # + #322 merges showed it still regressed.)
        #
        # Plus diagnostics: print the docker path so a future EACCES on
        # /usr/local/bin/docker surfaces in the log instead of via a
        # cryptic docker-login failure mid-step.
        #
        # The heredoc body and its `JSON` terminator MUST stay indented
        # with the rest of the script: a line indented less than the block
        # scalar ends the scalar and makes the workflow unparseable. YAML
        # strips the common indentation, so bash still sees the terminator
        # at column 0.
        shell: bash
        run: |
          set -euo pipefail
          mkdir -p "${RUNNER_TEMP}/docker-config"
          cat > "${RUNNER_TEMP}/docker-config/config.json" <<'JSON'
          {
            "auths": {},
            "credsStore": "",
            "credHelpers": {}
          }
          JSON
          # Export for all later steps in this job.
          echo "DOCKER_CONFIG=${RUNNER_TEMP}/docker-config" >> "${GITHUB_ENV}"
          echo "=== Runner docker diagnostics ==="
          echo "PATH=$PATH"
          # Diagnostics are best-effort: none of them may fail the step.
          command -v docker || echo "(docker not in PATH — the runner is missing the Docker CLI or it's not symlinked to a visible location)"
          docker --version 2>&1 || true
          ls -la /usr/local/bin/docker /opt/homebrew/bin/docker 2>&1 || true
          echo "=== config.json after setup ==="
          cat "${RUNNER_TEMP}/docker-config/config.json"

      - name: Set up QEMU
        # The self-hosted runner is Apple-silicon, but Fly tenant machines
        # pull linux/amd64. Buildx therefore needs binfmt handlers inside
        # Docker Desktop's VM to emulate amd64 while building.
        uses: docker/setup-qemu-action@v3
        with:
          platforms: 'linux/amd64'

      - name: Set up Docker Buildx
        # Buildx provides the GHA-backed cache-from/cache-to used by the
        # build steps, plus multi-arch builds without wrangling the local
        # docker daemon.
        uses: 'docker/setup-buildx-action@v3'

      - name: Log in to GHCR
        # Authenticates with the workflow's own GITHUB_TOKEN; the
        # `packages: write` permission above is what allows the push.
        uses: docker/login-action@v3
        with:
          password: ${{ secrets.GITHUB_TOKEN }}
          username: ${{ github.actor }}
          registry: ghcr.io

      - name: Log in to Fly registry
        # The username has to be the literal string "x": Fly's registry
        # answers 401 for anything else (verified locally 2026-04-15 —
        # "molecule-ai" fails, "x" succeeds with the same token). The
        # password is the FLY_API_TOKEN.
        # Rotation: FLY_API_TOKEN is stored in two places (the GitHub Actions
        # secret used here and `fly secrets` on molecule-cp) and MUST be
        # updated in both; see docs/runbooks/saas-secrets.md.
        uses: docker/login-action@v3
        with:
          password: ${{ secrets.FLY_API_TOKEN }}
          username: 'x'
          registry: registry.fly.io

      - name: Compute tags
        id: tags
        # Exposes the short (7-character) commit SHA as the `sha` step output.
        # The build steps publish two tags per image: `latest` (floating,
        # always the main tip) and `sha-<short sha>` (immutable,
        # pin-friendly). Control plane can deploy `latest` today and pin to
        # :sha in Phase H hardening.
        #
        # `${GITHUB_SHA::7}` is bash-only substring expansion, so the shell is
        # declared explicitly instead of relying on the runner's default.
        shell: bash
        run: |
          set -euo pipefail
          echo "sha=${GITHUB_SHA::7}" >> "${GITHUB_OUTPUT}"

      - name: Build & push to GHCR
        # Kept as its own step, separate from the Fly mirror push, so an
        # outage of one registry does not block publishing to the other —
        # each registry's failure mode is isolated. Both build steps use the
        # same Dockerfile, context and build args, hence the shared GHA cache.
        # The target is pinned to linux/amd64: Fly tenant machines are amd64
        # while this runner is Apple-silicon (arm64), and QEMU handles the
        # emulation.
        uses: docker/build-push-action@v5
        with:
          push: true
          platforms: linux/amd64
          file: ./platform/Dockerfile
          context: .
          labels: |
            org.opencontainers.image.source=https://github.com/${{ github.repository }}
            org.opencontainers.image.revision=${{ github.sha }}
            org.opencontainers.image.description=Molecule AI tenant platform (one instance per org)
          tags: |
            ${{ env.IMAGE_NAME }}:latest
            ${{ env.IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }}
          cache-to: type=gha,mode=max
          cache-from: type=gha

      - name: Build & push to Fly registry
        # Runs even if the GHCR push failed, so the private control plane's
        # tenant-image mirror lands regardless of any GHCR-side flakiness.
        # `!cancelled()` is used instead of `always()`: it still runs after an
        # earlier step failure, but does not keep building and pushing once
        # the workflow run has been cancelled.
        # Reads the GHA cache but does not write to it.
        if: ${{ !cancelled() }}
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./platform/Dockerfile
          platforms: linux/amd64
          push: true
          tags: |
            ${{ env.FLY_IMAGE_NAME }}:latest
            ${{ env.FLY_IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }}
          cache-from: type=gha
          labels: |
            org.opencontainers.image.source=https://github.com/${{ github.repository }}
            org.opencontainers.image.revision=${{ github.sha }}
            org.opencontainers.image.description=Molecule AI tenant platform (one instance per org)

      - name: Install flyctl
        # NOTE(review): this tracks the mutable `master` branch of the action;
        # consider pinning to a release tag or commit SHA once the available
        # refs are confirmed upstream.
        uses: superfly/flyctl-actions/setup-flyctl@master

      - name: Deploy to Fly tenant machines
        # Points every started or stopped machine of the tenant app at the
        # image tag pushed by this run. An empty machine list is not an
        # error: the step logs a message and exits successfully.
        #
        # `shell: bash` plus `set -euo pipefail` make a failing
        # `flyctl machines list` (bad token, API outage) fail the step.
        # Without them the failure was swallowed, the ID list came back
        # empty, and the step reported "No tenant machines found" and passed.
        #
        # Workflow expressions are passed in through `env:` rather than being
        # interpolated into the script text.
        shell: bash
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
          TENANT_APP: molecule-tenant
          IMAGE_TAG: sha-${{ steps.tags.outputs.sha }}
          IMAGE_REF: ${{ env.FLY_IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }}
        run: |
          set -euo pipefail
          # List first, filter second, so a flyctl failure aborts the step
          # before jq ever runs.
          machines_json="$(flyctl machines list -a "${TENANT_APP}" --json)"
          # `(. // [])` treats a JSON `null` listing the same as an empty one.
          machine_ids="$(jq -r '(. // [])[] | select(.state == "started" or .state == "stopped") | .id' <<<"${machines_json}")"
          if [ -z "${machine_ids}" ]; then
            echo "No tenant machines found — skipping deploy (control plane provisions on demand)"
            exit 0
          fi
          # Word-splitting is intentional: jq prints one machine ID per line.
          for id in ${machine_ids}; do
            echo "Updating machine ${id} to ${IMAGE_TAG}..."
            flyctl machines update "${id}" \
              --image "${IMAGE_REF}" \
              -a "${TENANT_APP}" \
              --yes
          done
          echo "All tenant machines updated to ${IMAGE_TAG}"