diff --git a/.github/workflows/publish-workspace-server-image.yml b/.github/workflows/publish-workspace-server-image.yml
deleted file mode 100644
index 7d981c93..00000000
--- a/.github/workflows/publish-workspace-server-image.yml
+++ /dev/null
@@ -1,291 +0,0 @@
-name: publish-workspace-server-image
-
-# Builds and pushes Docker images to ECR (the registry named by IMAGE_NAME /
-# TENANT_IMAGE_NAME below). EC2 tenant instances pull the tenant image from
-# that registry.
-#
-# NOTE(review): the policy below and the concurrency comment describe a
-# `staging` push trigger, but `on.push.branches` lists only `main`, and the
-# Compute tags step only derives the short SHA (no per-branch logic).
-# Confirm which is intended and reconcile the trigger or these comments.
-#
-# Branch / tag policy (see Compute tags step for the per-branch logic):
-#
-#   staging push  → builds image, tags :staging-<sha> + :staging-latest.
-#                   staging-CP pins TENANT_IMAGE=:staging-latest, so it
-#                   picks up staging-branch code automatically. This is
-#                   what makes staging-CP actually test staging-branch
-#                   code instead of "yesterday's main" — pre-fix, this
-#                   workflow only ran on main, so staging tenants
-#                   silently served stale code (#2308 fix RFC #2312
-#                   landed on staging but never reached tenants because
-#                   staging→main was wedged on path-filter parity bugs).
-#
-#   main push     → builds image, tags :staging-<sha> + :staging-latest
-#                   (same as before). canary-verify.yml retags
-#                   :staging-<sha> → :latest after canary tenants
-#                   green-light the digest. The :staging-latest retag
-#                   on main push is intentional: when main lands AFTER a
-#                   staging push, staging-CP gets the post-promote code
-#                   (which equals what it had + any merge resolution),
-#                   so the canary-on-staging-CP step still runs against
-#                   the prod-bound digest.
-#
-# In the steady state both branches refresh :staging-latest; the
-# semantic is "most recent staging-or-main build of tenant code."
-# Drift between the two is bounded by the staging→main auto-promote
-# cadence and is corrected on the next staging push.
-
-on:
-  push:
-    branches: [main]
-    paths:
-      - 'workspace-server/**'
-      - 'canvas/**'
-      - 'manifest.json'
-      - 'scripts/**'
-      - '.github/workflows/publish-workspace-server-image.yml'
-  workflow_dispatch:
-
-# Serialize per-branch so two rapid staging pushes don't race the same
-# :staging-latest tag retag. Allow staging and main to run in parallel
-# (different github.ref → different concurrency group) since they
-# produce different :staging-<sha> tags and last-write-wins on
-# :staging-latest is acceptable across branches (the post-promote
-# main code equals current staging code in a healthy flow).
-#
-# cancel-in-progress: false → in-flight builds finish; the next push's
-# build queues. This avoids a partially-pushed image and keeps the
-# canary fleet pin (:staging-<sha>) consistent with what was actually
-# tested at canary-verify time.
-concurrency:
-  group: publish-workspace-server-image-${{ github.ref }}
-  cancel-in-progress: false
-
-# Least privilege: images are pushed to ECR with AWS credentials, so the
-# workflow token only needs to read the repository for checkout.
-permissions:
-  contents: read
-
-env:
-  IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform
-  TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant
-
-jobs:
-  build-and-push:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
-
-      # github-app-auth sibling-checkout removed 2026-05-07 (#157):
-      # plugin was dropped + workspace-server/Dockerfile no longer
-      # COPYs it.
-
-      # ECR auth + buildx setup are now inline in each build step
-      # below (Task #173, 2026-05-07).
-      #
-      # Why moved inline: aws-actions/configure-aws-credentials@v4 +
-      # aws-actions/amazon-ecr-login@v2 + docker/setup-buildx-action
-      # all left auth state in places that the actual `docker push`
-      # couldn't see on Gitea Actions:
-      #   - The actions wrote to a step-scoped DOCKER_CONFIG path
-      #     that didn't survive into subsequent shell steps.
-      #   - Buildx couldn't bridge the runner container ↔
-      #     operator-host docker daemon auth gap (401 on the
-      #     docker-container driver, "no basic auth credentials"
-      #     with the action-driven login).
-      #
-      # Doing AWS+ECR auth inline (`aws ecr get-login-password |
-      # docker login`) in the same shell step as `docker build` +
-      # `docker push` is the operator-host manual approach, mapped
-      # 1:1 into CI. Auth state is guaranteed to live in the env that
-      # `docker push` actually runs from.
-      #
-      # Post-suspension target is the operator's ECR org
-      # (153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/*),
-      # which already hosts platform-tenant + workspace-template-* +
-      # runner-base images. AWS creds come from the
-      # AWS_ACCESS_KEY_ID/SECRET secrets bound to the molecule-cp
-      # IAM user. Closes #161.
-
-      - name: Compute tags
-        id: tags
-        run: |
-          echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
-
-      # Health check: verify Docker daemon is accessible before attempting any
-      # build steps. This fails loudly at step 1 when the runner's docker.sock
-      # is inaccessible rather than silently continuing to the build step
-      # where docker build fails deep in ECR auth with a cryptic error.
-      - name: Verify Docker daemon access
-        run: |
-          set -euo pipefail
-          echo "::group::Docker daemon health check"
-          # Capture the output instead of piping straight into `head`: under
-          # `pipefail`, `head` closing the pipe early can kill `docker info`
-          # with SIGPIPE (exit 141) and fail a healthy daemon check.
-          if ! docker_info="$(docker info 2>&1)"; then
-            printf '%s\n' "${docker_info}"
-            echo "::endgroup::"
-            echo "::error::Docker daemon is not accessible at /var/run/docker.sock"
-            echo "::error::Check: (1) daemon running, (2) runner user in docker group, (3) sock perms 660+"
-            exit 1
-          fi
-          printf '%s\n' "${docker_info}" | sed -n '1,5p'
-          echo "Docker daemon OK"
-          echo "::endgroup::"
-
-      # Pre-clone manifest deps before docker build (Task #173 fix).
-      #
-      # Why pre-clone: post-2026-05-06, every workspace-template-* repo on
-      # Gitea (codex, crewai, deepagents, gemini-cli, langgraph) plus all
-      # 7 org-template-* repos are private. The pre-fix Dockerfile.tenant
-      # ran `git clone` inside an in-image stage, which had no auth path
-      # — every CI build failed with "fatal: could not read Username for
-      # https://git.moleculesai.app". For weeks, every workspace-server
-      # rebuild required a manual operator-host push. Now we clone in the
-      # trusted CI context (where AUTO_SYNC_TOKEN is naturally available)
-      # and Dockerfile.tenant just COPYs from .tenant-bundle-deps/.
-      #
-      # Token shape: AUTO_SYNC_TOKEN is the devops-engineer persona PAT
-      # (see /etc/molecule-bootstrap/agent-secrets.env). Per saved memory
-      # `feedback_per_agent_gitea_identity_default`, every CI surface uses
-      # a per-persona token, never the founder PAT. clone-manifest.sh
-      # embeds it as basic-auth (oauth2:<token>) for the duration of the
-      # clones, then strips .git directories — the token never enters
-      # the resulting image.
-      #
-      # Idempotent: if a re-run finds populated dirs, clone-manifest.sh
-      # skips them; safe to retrigger via path-filter or workflow_dispatch.
-      - name: Pre-clone manifest deps
-        env:
-          MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
-        run: |
-          set -euo pipefail
-          if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
-            echo "::error::AUTO_SYNC_TOKEN secret is empty — register the devops-engineer persona PAT in repo Actions secrets"
-            exit 1
-          fi
-          mkdir -p .tenant-bundle-deps
-          bash scripts/clone-manifest.sh \
-            manifest.json \
-            .tenant-bundle-deps/workspace-configs-templates \
-            .tenant-bundle-deps/org-templates \
-            .tenant-bundle-deps/plugins
-          # Log per-category clone counts so a partial clone is visible in the
-          # build log; this step only reports them (see the note below).
-          ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
-          org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
-          plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
-          echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
-          # Counts are derived from manifest.json (9 ws / 7 org / 21
-          # plugins as of 2026-05-07). If manifest.json grows but the
-          # clone step regresses silently, the find above caps at the
-          # actual disk state — but clone-manifest.sh's own EXPECTED vs
-          # CLONED check (line ~95) is the authoritative fail-fast.
-
-      # Canary-gated release flow:
-      #   - This step always publishes :staging-<sha> + :staging-latest.
-      #   - On staging push, staging-CP picks up :staging-latest immediately
-      #     (its TENANT_IMAGE pin is :staging-latest) — so staging-branch
-      #     code reaches staging tenants without waiting for main.
-      #   - On main push, canary-verify.yml runs smoke tests against
-      #     canary tenants (which pin :staging-<sha>), and on green retags
-      #     :staging-<sha> → :latest. Prod tenants pull :latest.
-      #   - On red, :latest stays on the prior good digest — prod is safe.
-      #
-      # Why :staging-latest is retagged on main push too: when main lands
-      # after a staging promote, staging-CP gets the post-promote code so
-      # the canary-on-staging-CP step still runs against the prod-bound
-      # digest. In a healthy flow the post-promote main code == the
-      # current staging code, so this is effectively a no-op except for
-      # the canary fleet pin handoff.
-      #
-      # Pre-fix history: this workflow used to only trigger on main. That
-      # meant staging-CP served "yesterday's main" indefinitely whenever
-      # staging→main was wedged. The 2026-04-30 dogfooding session
-      # surfaced this when RFC #2312 (chat upload HTTP-forward) landed on
-      # staging but staging tenants kept failing chat upload because they
-      # were running pre-RFC code. Adding the staging trigger above closes
-      # that gap. Earlier 2026-04-24 incident: a static :staging-<sha> pin
-      # drifted 10 days behind staging — same class of bug, different
-      # mechanism. ECR repo molecule-ai/platform created 2026-05-07.
-      # Build + push platform image with plain `docker` (no buildx).
-      # GIT_SHA bakes into the Go binary via -ldflags so /buildinfo
-      # returns it at runtime — see Dockerfile + buildinfo/buildinfo.go.
-      # The OCI revision label below carries the same value for registry
-      # tooling; the duplication is intentional.
-      - name: Build & push platform image to ECR (staging-<sha> + staging-latest)
-        env:
-          IMAGE_NAME: ${{ env.IMAGE_NAME }}
-          TAG_SHA: staging-${{ steps.tags.outputs.sha }}
-          TAG_LATEST: staging-latest
-          GIT_SHA: ${{ github.sha }}
-          REPO: ${{ github.repository }}
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          AWS_DEFAULT_REGION: us-east-2
-        run: |
-          set -euo pipefail
-          # ECR auth in-step so config.json is populated in the same
-          # shell env that runs `docker push`. ECR get-login-password
-          # tokens last 12h, plenty for a single-step build+push.
-          ECR_REGISTRY="${IMAGE_NAME%%/*}"
-          aws ecr get-login-password --region us-east-2 | \
-            docker login --username AWS --password-stdin "${ECR_REGISTRY}"
-          docker build \
-            --file ./workspace-server/Dockerfile \
-            --build-arg GIT_SHA="${GIT_SHA}" \
-            --label "org.opencontainers.image.source=https://github.com/${REPO}" \
-            --label "org.opencontainers.image.revision=${GIT_SHA}" \
-            --label "org.opencontainers.image.description=Molecule AI platform (Go API server) — pending canary verify" \
-            --tag "${IMAGE_NAME}:${TAG_SHA}" \
-            --tag "${IMAGE_NAME}:${TAG_LATEST}" \
-            .
-          docker push "${IMAGE_NAME}:${TAG_SHA}"
-          docker push "${IMAGE_NAME}:${TAG_LATEST}"
-
-      # Canvas uses same-origin fetches. The tenant Go platform
-      # reverse-proxies /cp/* to the SaaS CP via its CP_UPSTREAM_URL
-      # env; the tenant's /canvas/viewport, /approvals/pending,
-      # /org/templates etc. live on the tenant platform itself.
-      # Both legs share one origin (the tenant subdomain) so
-      # PLATFORM_URL="" forces canvas to fetch paths as relative,
-      # which land same-origin.
-      #
-      # Self-hosted / private-label deployments override this at
-      # build time with a specific backend (e.g. local dev:
-      # NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080).
-      - name: Build & push tenant image to ECR (staging-<sha> + staging-latest)
-        env:
-          TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }}
-          TAG_SHA: staging-${{ steps.tags.outputs.sha }}
-          TAG_LATEST: staging-latest
-          GIT_SHA: ${{ github.sha }}
-          REPO: ${{ github.repository }}
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          AWS_DEFAULT_REGION: us-east-2
-        run: |
-          set -euo pipefail
-          # Re-login: the platform-image step's docker login wrote to
-          # the same config.json, so this is technically redundant — but
-          # making each push step self-contained keeps the workflow
-          # robust to step reordering / future extraction.
-          ECR_REGISTRY="${TENANT_IMAGE_NAME%%/*}"
-          aws ecr get-login-password --region us-east-2 | \
-            docker login --username AWS --password-stdin "${ECR_REGISTRY}"
-          docker build \
-            --file ./workspace-server/Dockerfile.tenant \
-            --build-arg NEXT_PUBLIC_PLATFORM_URL= \
-            --build-arg GIT_SHA="${GIT_SHA}" \
-            --label "org.opencontainers.image.source=https://github.com/${REPO}" \
-            --label "org.opencontainers.image.revision=${GIT_SHA}" \
-            --label "org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify" \
-            --tag "${TENANT_IMAGE_NAME}:${TAG_SHA}" \
-            --tag "${TENANT_IMAGE_NAME}:${TAG_LATEST}" \
-            .
-          docker push "${TENANT_IMAGE_NAME}:${TAG_SHA}"
-          docker push "${TENANT_IMAGE_NAME}:${TAG_LATEST}"
-