From f0e8d9bb23b26fb55da3819f81a1b122d9b07d20 Mon Sep 17 00:00:00 2001
From: devops-engineer <devops-engineer@molecule.ai>
Date: Thu, 7 May 2026 13:49:12 -0700
Subject: [PATCH] fix(ci): inline aws ecr get-login-password + docker login
 (followup #173)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CI run #987 (post-#45) showed `docker push` from shell still hits
"no basic auth credentials" — `aws-actions/amazon-ecr-login@v2`
writes auth to a step-scoped DOCKER_CONFIG that doesn't carry across
to the next shell step on Gitea Actions.

Fix: drop both `aws-actions/configure-aws-credentials@v4` and
`aws-actions/amazon-ecr-login@v2`. Run `aws ecr get-login-password |
docker login` inline in the same shell step as `docker build` +
`docker push`. AWS creds come from secrets via env vars, the ECR token
is fetched fresh per step (12h validity is plenty), and `docker login`
writes config.json in the same step that pushes — auth state is guaranteed.

This is the operator-host manual approach mapped 1:1 into CI.
runner-base image already has aws-cli + docker (verified locally).

Closes #173 (fifth and final piece; this matches the manual flow
exactly).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../publish-workspace-server-image.yml        | 77 +++++++++++--------
 1 file changed, 45 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/publish-workspace-server-image.yml b/.github/workflows/publish-workspace-server-image.yml
index 85cdb647..728c4fb0 100644
--- a/.github/workflows/publish-workspace-server-image.yml
+++ b/.github/workflows/publish-workspace-server-image.yml
@@ -75,38 +75,32 @@ jobs:
       # plugin was dropped + workspace-server/Dockerfile no longer
       # COPYs it.
 
-      - name: Configure AWS credentials for ECR
-        # GHCR was the pre-suspension target; the molecule-ai org on
-        # GitHub got swept 2026-05-06 and ghcr.io/molecule-ai/* is no
-        # longer reachable. Post-suspension target is the operator's
-        # ECR org (153263036946.dkr.ecr.us-east-2.amazonaws.com/
-        # molecule-ai/*), which already hosts platform-tenant +
-        # workspace-template-* + runner-base images. AWS creds come
-        # from the AWS_ACCESS_KEY_ID/SECRET secrets bound to the
-        # molecule-cp IAM user. Closes #161.
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: us-east-2
-
-      - name: Log in to ECR
-        id: ecr-login
-        uses: aws-actions/amazon-ecr-login@v2
-
-      # docker/setup-buildx-action removed (Task #173, 2026-05-07).
-      # Reason: on Gitea Actions, neither buildx driver works for our
-      # mounted-docker-socket runner topology:
-      #   - docker-container driver: spawns a buildkit container that
-      #     doesn't share the host's ECR auth (401 Unauthorized on push)
-      #   - docker driver: delegates to the operator-host docker daemon,
-      #     which doesn't see the runner container's ECR auth either
-      # Plain `docker build` + `docker push` from the runner container
-      # works because both use the same docker socket + the runner's
-      # config.json (populated by `aws ecr get-login-password | docker
-      # login` in the next step). Buildx's value here was only multi-arch
-      # builds, but we only ship linux/amd64 tenant images, so the
-      # complexity isn't earning anything.
+      # ECR auth is now inline in each build step below; buildx is
+      # not used — plain `docker build` (Task #173, 2026-05-07).
+      #
+      # Why moved inline: aws-actions/configure-aws-credentials@v4 +
+      # aws-actions/amazon-ecr-login@v2 + docker/setup-buildx-action
+      # all left auth state in places that the actual `docker push`
+      # couldn't see on Gitea Actions:
+      #   - The actions wrote to a step-scoped DOCKER_CONFIG path
+      #     that didn't survive into subsequent shell steps.
+      #   - Buildx couldn't bridge the runner container ↔
+      #     operator-host docker daemon auth gap (401 on the
+      #     docker-container driver, "no basic auth credentials"
+      #     with the action-driven login).
+      #
+      # Doing AWS+ECR auth inline (`aws ecr get-login-password |
+      # docker login`) in the same shell step as `docker build` +
+      # `docker push` is the operator-host manual approach, mapped
+      # 1:1 into CI. Auth state is guaranteed to live in the env that
+      # `docker push` actually runs from.
+      #
+      # Post-suspension target is the operator's ECR org
+      # (153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/*),
+      # which already hosts platform-tenant + workspace-template-* +
+      # runner-base images. AWS creds come from the
+      # AWS_ACCESS_KEY_ID/SECRET secrets bound to the molecule-cp
+      # IAM user. Closes #161.
 
       - name: Compute tags
         id: tags
@@ -200,8 +194,17 @@ jobs:
           TAG_LATEST: staging-latest
           GIT_SHA: ${{ github.sha }}
           REPO: ${{ github.repository }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_DEFAULT_REGION: us-east-2
         run: |
           set -euo pipefail
+          # ECR auth in-step so config.json is populated in the same
+          # shell env that runs `docker push`. ECR get-login-password
+          # tokens last 12h, plenty for a single-step build+push.
+          ECR_REGISTRY="${IMAGE_NAME%%/*}"
+          aws ecr get-login-password --region us-east-2 | \
+            docker login --username AWS --password-stdin "${ECR_REGISTRY}"
           docker build \
             --file ./workspace-server/Dockerfile \
             --build-arg GIT_SHA="${GIT_SHA}" \
@@ -232,8 +235,18 @@ jobs:
           TAG_LATEST: staging-latest
           GIT_SHA: ${{ github.sha }}
           REPO: ${{ github.repository }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_DEFAULT_REGION: us-east-2
         run: |
           set -euo pipefail
+          # Re-login: the platform-image step's docker login wrote to
+          # the same config.json, so this is technically redundant — but
+          # making each push step self-contained keeps the workflow
+          # robust to step reordering / future extraction.
+          ECR_REGISTRY="${TENANT_IMAGE_NAME%%/*}"
+          aws ecr get-login-password --region us-east-2 | \
+            docker login --username AWS --password-stdin "${ECR_REGISTRY}"
           docker build \
             --file ./workspace-server/Dockerfile.tenant \
             --build-arg NEXT_PUBLIC_PLATFORM_URL= \