From d23d3a4b3728493daf179efd4208bd9ba9e6a236 Mon Sep 17 00:00:00 2001 From: Molecule AI Infra Lead Date: Mon, 11 May 2026 22:10:19 +0000 Subject: [PATCH] [infra-lead-agent] ci(diagnostic): add runner-state probes to publish-workspace-server-image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Workflow has been red on main post-#572. #572's AUTO_SYNC_TOKEN fix moved the failure from ~9s to ~50s — confirming the manifest-clone step is now passing, but a later step is dying. Strong suspects: `Set up Docker Buildx` (the action-fetch may be hitting the same Issue B class as molecule-app CI) or the buildx+ECR auth flow. Without Gitea Actions REST API logs (internal#273 Fix A still pending), the only way to surface the root cause is to add diagnostic probes in-line. This PR adds two `if: always()` diagnostic steps: 1. **pre-build**: docker version, docker info, buildx presence, `aws sts get-caller-identity`, relevant env (secrets redacted) 2. **post-buildx-setup**: `docker buildx ls`, `docker buildx version`, `docker buildx inspect --bootstrap` Both `if: always()` so they run even if a prior step has failed — captures the state at the moment of failure. The diagnostic carries a retirement TODO: remove once main is reliably green for ≥10 consecutive runs and the failure root is understood. This is workflow-only (qualifies for the §SOP-13 §3 carve-out being formalized: `.gitea/workflows/**`, tier:low, qa N/A, sec N/A, mergeable by any non-author engineer). Author = infra-lead; any non-author engineer can merge. 
Co-Authored-By: Claude Opus 4.7
---
 .../publish-workspace-server-image.yml        | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/.gitea/workflows/publish-workspace-server-image.yml b/.gitea/workflows/publish-workspace-server-image.yml
index 72be551b..7c070ed3 100644
--- a/.gitea/workflows/publish-workspace-server-image.yml
+++ b/.gitea/workflows/publish-workspace-server-image.yml
@@ -111,6 +111,41 @@ jobs:
         run: |
           echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
 
+      # Diagnostic — surface the runner's docker/buildx/AWS state BEFORE the
+      # build steps so a failure here doesn't leave us guessing. Workflow has
+      # been red on main post-#572 (fix landed at the manifest-clone step,
+      # failure moved 9s→50s to a later step; suspect buildx setup or ECR
+      # auth). `if: always()` so this runs even if a prior step exits 1.
+      # TODO: remove once main is reliably green for ≥10 consecutive runs
+      # and the failure root is fully understood (track on internal#327
+      # follow-up). The env dump below masks every AWS credential value.
+      - name: Diagnostic — docker/buildx/AWS state (pre-build)
+        if: always()
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_DEFAULT_REGION: us-east-2
+        run: |
+          set +e  # never fail the diagnostic itself
+          echo "::group::docker version"
+          docker version
+          echo "::endgroup::"
+          echo "::group::docker info (subset)"
+          docker info --format '{{.ServerVersion}} {{.Driver}} {{.CgroupDriver}} runtimes={{.Runtimes}}'
+          echo "::endgroup::"
+          echo "::group::docker buildx version (pre-setup)"
+          docker buildx version || echo ' (buildx not installed — setup-buildx-action will add it)'
+          echo "::endgroup::"
+          echo "::group::aws CLI presence + caller identity"
+          command -v aws && aws --version
+          aws sts get-caller-identity 2>&1 || echo ' (aws sts failed — credentials may be missing/invalid)'
+          echo "::endgroup::"
+          echo "::group::PATH + relevant env"
+          echo "PATH=$PATH"
+          env | grep -E '^(AWS_|DOCKER_|HOME|RUNNER_)' | sed -E 's/^(AWS_(ACCESS_KEY_ID|SECRET_ACCESS_KEY|SESSION_TOKEN))=.*/\1=***/'
+          echo "::endgroup::"
+          echo "Diagnostic completed at $(date -u +%Y-%m-%dT%H:%M:%SZ)"
+
       # Build + push platform image (inline ECR auth — mirrors the operator-host
       # approach; credentials come from GITHUB_SECRET_AWS_ACCESS_KEY_ID /
       # GITHUB_SECRET_AWS_SECRET_ACCESS_KEY in Gitea Actions).
@@ -119,6 +154,20 @@ jobs:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0
 
+      - name: Diagnostic — buildx state (post-setup)
+        if: always()
+        run: |
+          set +e
+          echo "::group::docker buildx ls"
+          docker buildx ls
+          echo "::endgroup::"
+          echo "::group::docker buildx version (post-setup)"
+          docker buildx version
+          echo "::endgroup::"
+          echo "::group::active builder inspect"
+          docker buildx inspect --bootstrap 2>&1 | head -40
+          echo "::endgroup::"
+
       - name: Build & push platform image to ECR (staging- + staging-latest)
         env:
           IMAGE_NAME: ${{ env.IMAGE_NAME }}
-- 
2.45.2