Patch notes (text before the first "diff --git" line is ignored by git apply/patch):
  * local-provision-e2e.yml: pin MOLECULE_ENV / SECRETS_ENCRYPTION_KEY at job level.
  * publish-workspace-server-image.yml: retry the tenant image build up to 3 times,
    each time on a dedicated buildx builder.
  * test_local_provision_lifecycle_e2e.sh: print the stored workspace url/status
    before the Step 5 proxy call (diagnostics only, never fatal).
  NOTE(review): context-line indentation was reconstructed from a whitespace-collapsed
  copy of this patch, and the "index" blob ids predate the edits made in review —
  regenerate with `git diff` before applying strictly.

diff --git a/.gitea/workflows/local-provision-e2e.yml b/.gitea/workflows/local-provision-e2e.yml
index f0605d2a2..4f81f5bd5 100644
--- a/.gitea/workflows/local-provision-e2e.yml
+++ b/.gitea/workflows/local-provision-e2e.yml
@@ -74,6 +74,11 @@ jobs:
     env:
       PG_CONTAINER: pg-lpe2e-${{ github.run_id }}-${{ github.run_attempt }}
       REDIS_CONTAINER: redis-lpe2e-${{ github.run_id }}-${{ github.run_attempt }}
+      # Hard-code dev mode at the job level so the platform server ALWAYS sees it,
+      # even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
+      MOLECULE_ENV: development
+      # NOTE(review): this literal is 35 characters, not 32 — confirm the server accepts it.
+      SECRETS_ENCRYPTION_KEY: 'lpe2e-test-encryption-key-32bytes!!'
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
       - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
@@ -254,6 +259,11 @@ jobs:
     env:
       PG_CONTAINER: pg-lpe2e-real-${{ github.run_id }}-${{ github.run_attempt }}
       REDIS_CONTAINER: redis-lpe2e-real-${{ github.run_id }}-${{ github.run_attempt }}
+      # Hard-code dev mode at the job level so the platform server ALWAYS sees it,
+      # even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
+      MOLECULE_ENV: development
+      # NOTE(review): this literal is 35 characters, not 32 — confirm the server accepts it.
+      SECRETS_ENCRYPTION_KEY: 'lpe2e-test-encryption-key-32bytes!!'
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
       - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
diff --git a/.gitea/workflows/publish-workspace-server-image.yml b/.gitea/workflows/publish-workspace-server-image.yml
index 6b54bea73..79110d611 100644
--- a/.gitea/workflows/publish-workspace-server-image.yml
+++ b/.gitea/workflows/publish-workspace-server-image.yml
@@ -248,16 +248,45 @@ jobs:
             --tag "${STAGING_TENANT_IMAGE_NAME}:${TAG_LATEST}"
           )
 
-          docker buildx build \
-            --file ./workspace-server/Dockerfile.tenant \
-            --build-arg NEXT_PUBLIC_PLATFORM_URL= \
-            --build-arg GIT_SHA="${GIT_SHA}" \
-            --label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \
-            --label "org.opencontainers.image.revision=${GIT_SHA}" \
-            --label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-            --label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
-            "${build_tags[@]}" \
-            --push .
+          # Retry loop: buildkit EOF (internal#2468) is often transient on the
+          # publish runner under memory pressure. Up to 3 attempts with a fresh
+          # builder each time so a crashed buildkit doesn't poison the next try.
+          for attempt in 1 2 3; do
+            echo "::notice::Tenant image build attempt ${attempt}/3 ..."
+            builder="tenant-builder-${GITHUB_RUN_ID}-${attempt}"
+            # Remove any stale builder with this name first (e.g. one left behind
+            # by an interrupted earlier execution with the same run id) so the
+            # create below yields a fresh builder instead of reusing a dead one.
+            docker buildx rm "${builder}" >/dev/null 2>&1 || true
+            # No --use: the build selects this builder explicitly via --builder, so
+            # the runner's current-builder selection is left untouched.
+            docker buildx create --name "${builder}" >/dev/null \
+              || echo "::warning::Could not create builder ${builder}; this attempt will likely fail"
+            if docker buildx build \
+              --builder "${builder}" \
+              --file ./workspace-server/Dockerfile.tenant \
+              --build-arg NEXT_PUBLIC_PLATFORM_URL= \
+              --build-arg GIT_SHA="${GIT_SHA}" \
+              --label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \
+              --label "org.opencontainers.image.revision=${GIT_SHA}" \
+              --label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+              --label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
+              "${build_tags[@]}" \
+              --push .; then
+              docker buildx rm "${builder}" >/dev/null 2>&1 || true
+              echo "::notice::Tenant image build succeeded on attempt ${attempt}"
+              break
+            fi
+            # Best-effort cleanup of the (possibly crashed) builder.
+            docker buildx rm "${builder}" >/dev/null 2>&1 || true
+            # Final attempt: fail right away — no "retrying" message, no sleep.
+            if [ "${attempt}" -eq 3 ]; then
+              echo "::error::Tenant image build failed after 3 attempts"
+              exit 1
+            fi
+            echo "::warning::Tenant image build attempt ${attempt} failed — cleaning builder and retrying"
+            sleep 10
+          done
 
   # bp-exempt: production deploy side-effect; merge is gated by CI / all-required and this job waits for push CI before acting.
   deploy-production:
diff --git a/tests/e2e/test_local_provision_lifecycle_e2e.sh b/tests/e2e/test_local_provision_lifecycle_e2e.sh
index d1f61c540..dc985d4cf 100755
--- a/tests/e2e/test_local_provision_lifecycle_e2e.sh
+++ b/tests/e2e/test_local_provision_lifecycle_e2e.sh
@@ -488,6 +488,14 @@ echo ""
 # Step 5 — proxy reach (ws-:8000 Docker-DNS rewrite, end to end).
 # ----------------------------------------------------------------------------
 echo "--- Step 5: proxy reach (POST /workspaces/$WSID/a2a) ---"
+# Debug: print the workspace URL the platform stored so SSRF failures are
+# actionable (#2468 RCA).
+# Diagnostics only: each lookup is best-effort (|| true) so a failed fetch or
+# missing field cannot abort the test ahead of the real Step 5 assertion.
+WS_DEBUG=$(admin_curl "$BASE/workspaces/$WSID" || true)
+WS_URL_DEBUG=$(ws_field "$WS_DEBUG" "url" || true)
+WS_STATUS_DEBUG=$(ws_field "$WS_DEBUG" "status" || true)
+echo " workspace url=$WS_URL_DEBUG status=$WS_STATUS_DEBUG"
 # In minimax mode we send a DETERMINISTIC known-answer prompt and assert the
 # model echoes the answer back — proving a real LLM round-trip, not just
 # reachability. Otherwise a plain "ping".