From 43e2d24c5bab3f68bb48b06ab39537d8af94112c Mon Sep 17 00:00:00 2001 From: devops-engineer Date: Thu, 7 May 2026 13:43:50 -0700 Subject: [PATCH] fix(ci): replace buildx with plain docker build+push (followup #173) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI run #946 (post-#43) confirmed `driver: docker` doesn't fix the ECR push 401 either: buildx CLI inside the runner container talks to the operator-host docker daemon (mounted socket), but the daemon doesn't see the runner's ECR auth state, and the runner's buildx CLI doesn't attach the auth header in a way the daemon accepts. Drop buildx + build-push-action entirely. Plain `docker build` + `docker push` from the runner container works because both use the SAME docker socket + the SAME runner-container config.json (populated by `aws ecr get-login-password | docker login` from amazon-ecr-login). Trade-off: lose multi-arch support. We only ship linux/amd64 tenant images today, so this is fine. If multi-arch becomes a requirement later, we can revisit (likely with `docker buildx create --driver=remote` pointing at an external buildkit, but that's substantial infra work; not worth it for a single-arch shop). Closes #173 (fourth piece — and hopefully last; this matches the operator-host manual approach exactly). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../publish-workspace-server-image.yml | 144 +++++++++--------- 1 file changed, 70 insertions(+), 74 deletions(-) diff --git a/.github/workflows/publish-workspace-server-image.yml b/.github/workflows/publish-workspace-server-image.yml index 00405e93..85cdb647 100644 --- a/.github/workflows/publish-workspace-server-image.yml +++ b/.github/workflows/publish-workspace-server-image.yml @@ -94,20 +94,19 @@ jobs: id: ecr-login uses: aws-actions/amazon-ecr-login@v2 - - name: Set up Docker Buildx - # driver: docker — use the host docker daemon directly. 
The - # default `docker-container` driver spawns a buildkit container - # that doesn't share the host's ECR auth (set up by - # amazon-ecr-login above) and silently 401s on push to ECR. With - # driver: docker, buildx delegates to the host daemon which - # already has the ECR creds. Caught on Gitea Actions run #893 - # post-Task-#173 (2026-05-07): the pre-clone fix worked and the - # image built end-to-end, but `failed to push: 401 Unauthorized` - # because the build container couldn't see the host's - # ~/.docker/config.json. - uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0 - with: - driver: docker + # docker/setup-buildx-action removed (Task #173, 2026-05-07). + # Reason: on Gitea Actions, neither buildx driver works for our + # mounted-docker-socket runner topology: + # - docker-container driver: spawns a buildkit container that + # doesn't share the host's ECR auth (401 Unauthorized on push) + # - docker driver: delegates to the operator-host docker daemon, + # which doesn't see the runner container's ECR auth either + # Plain `docker build` + `docker push` from the runner container + # works because both use the same docker socket + the runner's + # config.json (populated by `aws ecr get-login-password | docker + # login` in the amazon-ecr-login step above). Buildx's value here was only multi-arch + # builds, but we only ship linux/amd64 tenant images, so the + # complexity isn't earning anything. - name: Compute tags id: tags @@ -189,65 +188,62 @@ jobs: # that gap. Earlier 2026-04-24 incident: a static :staging- pin # drifted 10 days behind staging — same class of bug, different # mechanism. - - name: Build & push platform image to GHCR (staging- + staging-latest) - uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0 - with: - context: . 
- file: ./workspace-server/Dockerfile - platforms: linux/amd64 - push: true - tags: | - ${{ env.IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }} - ${{ env.IMAGE_NAME }}:staging-latest - # cache-from/cache-to: type=gha removed for Gitea Actions — - # the GHA artifact cache backend is GitHub-specific; on Gitea - # the cache endpoint is unreachable and times out - # ("artifactcache/cache?keys=index-buildkit-... i/o timeout"). - # Driver `docker` (set above) doesn't support the gha cache - # protocol either. Inline cache via type=registry could be - # added back later if rebuild time becomes painful, but - # 37-repo clone + Go/Node builds take <10min cold — fine for - # now, and a noisy failure is worse than a slow success. - # GIT_SHA bakes into the Go binary via -ldflags so /buildinfo - # returns it at runtime — see Dockerfile + buildinfo/buildinfo.go. - # This is the same value as the OCI revision label below; passing - # it twice is intentional, the OCI label is for registry tooling - # while /buildinfo is for the redeploy verification step. - build-args: | - GIT_SHA=${{ github.sha }} - labels: | - org.opencontainers.image.source=https://github.com/${{ github.repository }} - org.opencontainers.image.revision=${{ github.sha }} - org.opencontainers.image.description=Molecule AI platform (Go API server) — pending canary verify + # Build + push platform image with plain `docker` (no buildx). + # GIT_SHA bakes into the Go binary via -ldflags so /buildinfo + # returns it at runtime — see Dockerfile + buildinfo/buildinfo.go. + # The OCI revision label below carries the same value for registry + # tooling; the duplication is intentional. 
+ - name: Build & push platform image to ECR (staging- + staging-latest) + env: + IMAGE_NAME: ${{ env.IMAGE_NAME }} + TAG_SHA: staging-${{ steps.tags.outputs.sha }} + TAG_LATEST: staging-latest + GIT_SHA: ${{ github.sha }} + REPO: ${{ github.repository }} + run: | + set -euo pipefail + docker build \ + --file ./workspace-server/Dockerfile \ + --build-arg GIT_SHA="${GIT_SHA}" \ + --label "org.opencontainers.image.source=https://github.com/${REPO}" \ + --label "org.opencontainers.image.revision=${GIT_SHA}" \ + --label "org.opencontainers.image.description=Molecule AI platform (Go API server) — pending canary verify" \ + --tag "${IMAGE_NAME}:${TAG_SHA}" \ + --tag "${IMAGE_NAME}:${TAG_LATEST}" \ + . + docker push "${IMAGE_NAME}:${TAG_SHA}" + docker push "${IMAGE_NAME}:${TAG_LATEST}" - - name: Build & push tenant image to GHCR (staging- + staging-latest) - uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0 - with: - context: . - file: ./workspace-server/Dockerfile.tenant - platforms: linux/amd64 - push: true - tags: | - ${{ env.TENANT_IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }} - ${{ env.TENANT_IMAGE_NAME }}:staging-latest - # cache-from/cache-to: type=gha removed — see platform image - # build step above for rationale. Same Gitea-Actions limitation. - # Canvas uses same-origin fetches. The tenant Go platform - # reverse-proxies /cp/* to the SaaS CP via its CP_UPSTREAM_URL - # env; the tenant's /canvas/viewport, /approvals/pending, - # /org/templates etc. live on the tenant platform itself. - # Both legs share one origin (the tenant subdomain) so - # PLATFORM_URL="" forces canvas to fetch paths as relative, - # which land same-origin. - # - # Self-hosted / private-label deployments override this at - # build time with a specific backend (e.g. local dev: - # NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080). 
- build-args: | - NEXT_PUBLIC_PLATFORM_URL= - GIT_SHA=${{ github.sha }} - labels: | - org.opencontainers.image.source=https://github.com/${{ github.repository }} - org.opencontainers.image.revision=${{ github.sha }} - org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify + # Canvas uses same-origin fetches. The tenant Go platform + # reverse-proxies /cp/* to the SaaS CP via its CP_UPSTREAM_URL + # env; the tenant's /canvas/viewport, /approvals/pending, + # /org/templates etc. live on the tenant platform itself. + # Both legs share one origin (the tenant subdomain) so + # PLATFORM_URL="" forces canvas to fetch paths as relative, + # which land same-origin. + # + # Self-hosted / private-label deployments override this at + # build time with a specific backend (e.g. local dev: + # NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080). + - name: Build & push tenant image to ECR (staging- + staging-latest) + env: + TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }} + TAG_SHA: staging-${{ steps.tags.outputs.sha }} + TAG_LATEST: staging-latest + GIT_SHA: ${{ github.sha }} + REPO: ${{ github.repository }} + run: | + set -euo pipefail + docker build \ + --file ./workspace-server/Dockerfile.tenant \ + --build-arg NEXT_PUBLIC_PLATFORM_URL= \ + --build-arg GIT_SHA="${GIT_SHA}" \ + --label "org.opencontainers.image.source=https://github.com/${REPO}" \ + --label "org.opencontainers.image.revision=${GIT_SHA}" \ + --label "org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify" \ + --tag "${TENANT_IMAGE_NAME}:${TAG_SHA}" \ + --tag "${TENANT_IMAGE_NAME}:${TAG_LATEST}" \ + . + docker push "${TENANT_IMAGE_NAME}:${TAG_SHA}" + docker push "${TENANT_IMAGE_NAME}:${TAG_LATEST}"