From 3870dd2dceeb9b56d2596b6c833ed7986ad49428 Mon Sep 17 00:00:00 2001
From: "Molecule AI Dev Engineer A (Kimi)"
Date: Tue, 9 Jun 2026 02:37:45 +0000
Subject: [PATCH] fix(ci): hard-code MOLECULE_ENV in local-provision E2E + retry tenant image build

- Moves MOLECULE_ENV=development and SECRETS_ENCRYPTION_KEY to the job-level
  env block in both lifecycle-stub and lifecycle-real so the platform server
  always sees dev mode even if the runner's $GITHUB_ENV propagation is flaky.
  This addresses the 'workspace URL is not publicly routable' SSRF failure on
  main (#2468) where loopback/private IPs were being rejected.
- Adds workspace URL debug print in test_local_provision_lifecycle_e2e.sh so
  future SSRF failures show the actual stored URL immediately.
- Wraps the tenant image build in publish-workspace-server-image.yml with a
  3-attempt retry loop that creates a fresh buildx builder each time. The
  buildkit EOF error (#2468) is often transient under memory pressure on the
  publish runner; a clean builder retry avoids poisoning from a crashed one.

Co-Authored-By: Claude Opus 4.8
---
Reviewer notes (text here is ignored by `git am`; not part of the commit):
- NOTE(review): line breaks and indentation were reconstructed from a
  whitespace-collapsed copy of this patch. Confirm the context lines match the
  target files before applying. Angle-bracketed text (author e-mail, possibly
  "ws-<id>" in the Step 5 comment) appears to have been stripped in that copy.
- NOTE(review): the SECRETS_ENCRYPTION_KEY literal is 35 characters long even
  though its name says "32bytes" - confirm the server accepts that length.
- Retry loop reworked: the final failed attempt now errors out immediately
  instead of printing "retrying" and sleeping 10s first; the builder is removed
  once per attempt; `--use` dropped because the build passes --builder
  explicitly. Hunk header and diffstat were updated for the new line counts;
  the post-image blob id on the `index` line was not recomputed.
- E2E debug print made best-effort so diagnostics cannot fail the test.

 .gitea/workflows/local-provision-e2e.yml      |  8 ++++
 .../publish-workspace-server-image.yml        | 45 +++++++++++++++----
 .../e2e/test_local_provision_lifecycle_e2e.sh |  6 +++
 3 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/.gitea/workflows/local-provision-e2e.yml b/.gitea/workflows/local-provision-e2e.yml
index f0605d2a2..4f81f5bd5 100644
--- a/.gitea/workflows/local-provision-e2e.yml
+++ b/.gitea/workflows/local-provision-e2e.yml
@@ -74,6 +74,10 @@ jobs:
     env:
       PG_CONTAINER: pg-lpe2e-${{ github.run_id }}-${{ github.run_attempt }}
       REDIS_CONTAINER: redis-lpe2e-${{ github.run_id }}-${{ github.run_attempt }}
+      # Hard-code dev mode at the job level so the platform server ALWAYS sees it,
+      # even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
+      MOLECULE_ENV: development
+      SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!!
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
       - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
@@ -254,6 +258,10 @@ jobs:
     env:
       PG_CONTAINER: pg-lpe2e-real-${{ github.run_id }}-${{ github.run_attempt }}
       REDIS_CONTAINER: redis-lpe2e-real-${{ github.run_id }}-${{ github.run_attempt }}
+      # Hard-code dev mode at the job level so the platform server ALWAYS sees it,
+      # even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
+      MOLECULE_ENV: development
+      SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!!
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
       - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
diff --git a/.gitea/workflows/publish-workspace-server-image.yml b/.gitea/workflows/publish-workspace-server-image.yml
index 6b54bea73..79110d611 100644
--- a/.gitea/workflows/publish-workspace-server-image.yml
+++ b/.gitea/workflows/publish-workspace-server-image.yml
@@ -248,16 +248,41 @@ jobs:
             --tag "${STAGING_TENANT_IMAGE_NAME}:${TAG_LATEST}"
           )
 
-          docker buildx build \
-            --file ./workspace-server/Dockerfile.tenant \
-            --build-arg NEXT_PUBLIC_PLATFORM_URL= \
-            --build-arg GIT_SHA="${GIT_SHA}" \
-            --label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \
-            --label "org.opencontainers.image.revision=${GIT_SHA}" \
-            --label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-            --label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
-            "${build_tags[@]}" \
-            --push .
+          # Retry loop: buildkit EOF (internal#2468) is often transient on the
+          # publish runner under memory pressure. Up to 3 attempts with a fresh
+          # builder each time so a crashed buildkit doesn't poison the next try.
+          for attempt in 1 2 3; do
+            echo "::notice::Tenant image build attempt ${attempt}/3 ..."
+            builder="tenant-builder-${GITHUB_RUN_ID}-${attempt}"
+            # No --use: the build below selects the builder explicitly via
+            # --builder, so the runner's current builder is left untouched.
+            docker buildx create --name "${builder}" >/dev/null 2>&1 || true
+            build_ok=0
+            if docker buildx build \
+              --builder "${builder}" \
+              --file ./workspace-server/Dockerfile.tenant \
+              --build-arg NEXT_PUBLIC_PLATFORM_URL= \
+              --build-arg GIT_SHA="${GIT_SHA}" \
+              --label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \
+              --label "org.opencontainers.image.revision=${GIT_SHA}" \
+              --label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+              --label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
+              "${build_tags[@]}" \
+              --push .; then
+              build_ok=1
+            fi
+            docker buildx rm "${builder}" >/dev/null 2>&1 || true
+            if [ "${build_ok}" -eq 1 ]; then
+              echo "::notice::Tenant image build succeeded on attempt ${attempt}"
+              break
+            fi
+            if [ "${attempt}" -eq 3 ]; then
+              echo "::error::Tenant image build failed after 3 attempts"
+              exit 1
+            fi
+            echo "::warning::Tenant image build attempt ${attempt} failed — cleaning builder and retrying"
+            sleep 10
+          done
 
   # bp-exempt: production deploy side-effect; merge is gated by CI / all-required and this job waits for push CI before acting.
   deploy-production:
diff --git a/tests/e2e/test_local_provision_lifecycle_e2e.sh b/tests/e2e/test_local_provision_lifecycle_e2e.sh
index d1f61c540..dc985d4cf 100755
--- a/tests/e2e/test_local_provision_lifecycle_e2e.sh
+++ b/tests/e2e/test_local_provision_lifecycle_e2e.sh
@@ -488,6 +488,12 @@ echo ""
 # Step 5 — proxy reach (ws-:8000 Docker-DNS rewrite, end to end).
 # ----------------------------------------------------------------------------
 echo "--- Step 5: proxy reach (POST /workspaces/$WSID/a2a) ---"
+# Debug: print the workspace URL the platform stored so SSRF failures are
+# actionable (#2468 RCA). Best-effort: diagnostics must never fail the test.
+WS_DEBUG=$(admin_curl "$BASE/workspaces/$WSID" || true)
+WS_URL_DEBUG=$(ws_field "$WS_DEBUG" "url" || true)
+WS_STATUS_DEBUG=$(ws_field "$WS_DEBUG" "status" || true)
+echo " workspace url=$WS_URL_DEBUG status=$WS_STATUS_DEBUG"
 # In minimax mode we send a DETERMINISTIC known-answer prompt and assert the
 # model echoes the answer back — proving a real LLM round-trip, not just
 # reachability. Otherwise a plain "ping".
-- 
2.52.0