From 87b971a29257c5ab279e6ce41f718872637c6b3f Mon Sep 17 00:00:00 2001 From: "claude-ceo-assistant (Claude Opus 4.7 on Hongming's MacBook)" Date: Thu, 7 May 2026 17:06:09 -0700 Subject: [PATCH 1/2] fix(ci): close 3 chronic Gitea-Actions workflow flakes (closes #88) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three workflows have been failing on every push to this Gitea repo for GitHub-shaped reasons that don't translate to act_runner. Surfaced while landing #84; bundled per `feedback_gitea_actions_migration_audit_pattern` ("bundle per-repo, not per-finding") instead of three separate PRs. 1) handlers-postgres-integration: localhost → 127.0.0.1 - lib/pq tries to dial localhost → ::1 first; the postgres service container only listens on IPv4 → ECONNREFUSED → all TestIntegration_* fail. Pin IPv4 to make the job deterministic. 2) pr-guards / disable-auto-merge-on-push: Gitea no-op - The previous reusable-workflow caller invoked `gh pr merge --disable-auto`, which calls GitHub's GraphQL API. Gitea returns HTTP 405 on /api/graphql → step always fails. Inline the step so it can detect Gitea (GITEA_ACTIONS=true OR repo url under moleculesai.app) and no-op with a notice. Auto-merge gating is moot on Gitea anyway: there's no `--auto` primitive being touched. Job stays ALWAYS-RUN so branch protection's required check still lands SUCCESS (avoids the SKIPPED-in-set trap from `feedback_branch_protection_check_name_parity`). 3) Harness Replays: cf-proxy nginx.conf via docker `configs:` (not bind) - act_runner runs the workflow inside a runner container; runc in the docker daemon below resolves bind-mount source paths on the OUTER host, not inside the runner. The path `/workspace/.../cf-proxy/nginx.conf` is invisible there → "not a directory" runc error. Switching to compose `configs:` packages the file as content rather than a host bind, sidestepping the DinD path-translation gap. Local validation: - YAML parsed clean for all 3 files. 
- cf-proxy nginx.conf: standalone `docker compose run cf-proxy nginx -T` reproduced the configs: mount end-to-end and dumped the config correctly. The full harness compose still renders via `docker compose config`. Real-CI verification will land on this branch's first push. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../handlers-postgres-integration.yml | 17 ++++-- .github/workflows/pr-guards.yml | 59 ++++++++++++++++--- tests/harness/compose.yml | 25 +++++++- 3 files changed, 85 insertions(+), 16 deletions(-) diff --git a/.github/workflows/handlers-postgres-integration.yml b/.github/workflows/handlers-postgres-integration.yml index 98927ac9..41f00b83 100644 --- a/.github/workflows/handlers-postgres-integration.yml +++ b/.github/workflows/handlers-postgres-integration.yml @@ -97,7 +97,7 @@ jobs: # Wait for postgres to actually accept connections (the # GHA --health-cmd is best-effort but psql can still race). for i in {1..15}; do - if pg_isready -h localhost -p 5432 -U postgres -q; then break; fi + if pg_isready -h 127.0.0.1 -p 5432 -U postgres -q; then break; fi echo "waiting for postgres..."; sleep 2 done @@ -131,7 +131,7 @@ jobs: # not fine once a cross-table atomicity test came in. set +e for migration in $(ls migrations/*.sql 2>/dev/null | grep -v '\.down\.sql$' | sort); do - if psql -h localhost -U postgres -d molecule -v ON_ERROR_STOP=1 \ + if psql -h 127.0.0.1 -U postgres -d molecule -v ON_ERROR_STOP=1 \ -f "$migration" >/dev/null 2>&1; then echo "✓ $(basename "$migration")" else @@ -145,7 +145,7 @@ jobs: # fail if any didn't land — that would be a real regression we # want loud. for tbl in delegations workspaces activity_logs pending_uploads; do - if ! psql -h localhost -U postgres -d molecule -tA \ + if ! 
psql -h 127.0.0.1 -U postgres -d molecule -tA \ -c "SELECT 1 FROM information_schema.tables WHERE table_name = '$tbl'" \ | grep -q 1; then echo "::error::$tbl table missing after migration replay — handler integration tests would be meaningless" @@ -157,7 +157,14 @@ jobs: - if: needs.detect-changes.outputs.handlers == 'true' name: Run integration tests env: - INTEGRATION_DB_URL: postgres://postgres:test@localhost:5432/molecule?sslmode=disable + # 127.0.0.1, NOT localhost. On Gitea / act_runner the runner host + # has IPv6 enabled, so `localhost` resolves to `::1` first, and + # the Postgres service container only listens on IPv4 → lib/pq's + # first dial hits ECONNREFUSED. psql falls back to IPv4 cleanly + # when given `localhost`, so the flake used to hide in the psql + # steps and surface only at test time. Pinning IPv4 makes the + # whole job deterministic. (Issue #88, item 3.) + INTEGRATION_DB_URL: postgres://postgres:test@127.0.0.1:5432/molecule?sslmode=disable run: | go test -tags=integration -timeout 5m -v ./internal/handlers/ -run "^TestIntegration_" @@ -167,5 +174,5 @@ jobs: PGPASSWORD: test run: | echo "::group::delegations table state" - psql -h localhost -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true + psql -h 127.0.0.1 -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true echo "::endgroup::" diff --git a/.github/workflows/pr-guards.yml b/.github/workflows/pr-guards.yml index 151757fe..7dd00c16 100644 --- a/.github/workflows/pr-guards.yml +++ b/.github/workflows/pr-guards.yml @@ -1,14 +1,25 @@ name: pr-guards -# Thin caller that delegates to the molecule-ci reusable guard. Today -# the guard is just "disable auto-merge when a new commit is pushed -# after auto-merge was enabled" — added 2026-04-27 after PR #2174 -# auto-merged with only its first commit because the second commit -# was pushed after the merge queue had locked the PR's SHA. +# PR-time guards. 
Today the only guard is "disable auto-merge when a +# new commit is pushed after auto-merge was enabled" — added 2026-04-27 +# after PR #2174 auto-merged with only its first commit because the +# second commit was pushed after the merge queue had locked the PR's +# SHA. # -# When more PR-time guards land in molecule-ci, add them here as -# additional jobs that share the same pull_request:synchronize -# trigger. +# Why this is inlined (not delegated to molecule-ci's reusable +# workflow): the reusable workflow uses `gh pr merge --disable-auto`, +# which calls GitHub's GraphQL API. Gitea has no GraphQL endpoint and +# returns HTTP 405 on /api/graphql, so the job failed on every Gitea +# PR push since the 2026-05-06 migration. Gitea also has no `--auto` +# merge primitive that this job could be acting on, so the right +# behaviour on Gitea is "no-op + green status" — not a 405. +# +# Inlining (vs. an `if:` on the `uses:` line) keeps the job ALWAYS +# running, which matters for branch protection: required-check names +# need a job that emits SUCCESS terminal state, not SKIPPED. See +# `feedback_branch_protection_check_name_parity` and `feedback_pr_merge_safety_guards`. +# +# Issue #88 item 1. on: pull_request: @@ -19,4 +30,34 @@ permissions: jobs: disable-auto-merge-on-push: - uses: molecule-ai/molecule-ci/.github/workflows/disable-auto-merge-on-push.yml@main + runs-on: ubuntu-latest + steps: + # Detect Gitea Actions. act_runner sets GITEA_ACTIONS=true in the + # step env on every job. Belt-and-suspenders: also check the repo + # url's host, which is independent of any runner-side env config + # (covers a future Gitea host where the env var is forgotten). 
+ - name: Detect runner host + id: host + run: | + if [[ "${GITEA_ACTIONS:-}" == "true" ]] || [[ "${{ github.server_url }}" == *moleculesai.app* ]] || [[ "${{ github.event.repository.html_url }}" == *moleculesai.app* ]]; then + echo "is_gitea=true" >> "$GITHUB_OUTPUT" + echo "::notice::Gitea Actions detected — auto-merge gating is not applicable here (Gitea has no --auto merge primitive). Job will no-op." + else + echo "is_gitea=false" >> "$GITHUB_OUTPUT" + fi + + - name: Disable auto-merge (GitHub only) + if: steps.host.outputs.is_gitea != 'true' + env: + GH_TOKEN: ${{ github.token }} + PR: ${{ github.event.pull_request.number }} + REPO: ${{ github.repository }} + NEW_SHA: ${{ github.event.pull_request.head.sha }}  # PR head commit; on pull_request events github.sha is the refs/pull/N/merge commit, not the pushed one + run: | + set -eu + gh pr merge "$PR" --disable-auto -R "$REPO" || true + gh pr comment "$PR" -R "$REPO" --body "🔒 Auto-merge disabled — new commit (\`${NEW_SHA:0:7}\`) pushed after auto-merge was enabled. The merge queue locks SHAs at entry, so subsequent pushes can race. Verify the new commit and re-enable with \`gh pr merge --auto\`." + + - name: Gitea no-op + if: steps.host.outputs.is_gitea == 'true' + run: echo "Gitea Actions — auto-merge gating not applicable; no-op (job intentionally green so branch protection's required-check name lands SUCCESS)." diff --git a/tests/harness/compose.yml b/tests/harness/compose.yml index e209287d..c9489db9 100644 --- a/tests/harness/compose.yml +++ b/tests/harness/compose.yml @@ -167,6 +167,18 @@ services: # Production shape: same single CF tunnel front-doors every tenant # subdomain — the Host header carries the tenant identity, not the # routing destination. Local cf-proxy mirrors this exactly. + # + # nginx.conf delivery: docker compose `configs:` block (not a bind + # mount) so the file ships as content packaged by compose, not a + # host-path bind that has to be visible to the docker daemon's runc. 
+ # Bind mounts break under Gitea's act_runner DinD because runc + # resolves the source path on the OUTER docker host (the runner's + # host filesystem), not inside the runner container — the path + # `/workspace/.../tests/harness/cf-proxy/nginx.conf` is only visible + # to the runner, not to the daemon below it. The `configs:` form + # uploads the file to the daemon as part of the service definition + # and is bind-mount-equivalent at the container level. See issue #88 + # item 2. cf-proxy: image: nginx:1.27-alpine depends_on: @@ -174,14 +186,23 @@ services: condition: service_healthy tenant-beta: condition: service_healthy - volumes: - - ./cf-proxy/nginx.conf:/etc/nginx/nginx.conf:ro + configs: + - source: cf-proxy-nginx-conf + target: /etc/nginx/nginx.conf + mode: 0444 # Bind to 127.0.0.1 only — hardcoded ADMIN_TOKENs make 0.0.0.0 # exposure unsafe even on a local network. ports: - "127.0.0.1:8080:8080" networks: [harness-net] +configs: + # Defined once at compose level so any future service (e.g. a second + # nginx variant for an external-connect smoke test) can reuse the + # same source file. + cf-proxy-nginx-conf: + file: ./cf-proxy/nginx.conf + networks: harness-net: name: molecule-harness-net From 7eb348536b882e122f893fdc0b0cac90907a1c55 Mon Sep 17 00:00:00 2001 From: "claude-ceo-assistant (Claude Opus 4.7 on Hongming's MacBook)" Date: Thu, 7 May 2026 17:09:08 -0700 Subject: [PATCH 2/2] fix(harness): bake cf-proxy nginx.conf at build time, not via configs: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous configs:-based fix (87b971a2) didn't actually fix the DinD issue — Compose v2 falls back to bind mounts for `configs:` when swarm mode is not active, so the resulting runc invocation still tries to mount /workspace/.../cf-proxy/nginx.conf from the OUTER host filesystem that the act_runner-vs-host-docker socket-mount can't see. Same "not a directory" error returned. 
Switch to a thin Dockerfile (cf-proxy/Dockerfile) that COPYs nginx.conf into nginx:1.27-alpine. The build context is uploaded to the daemon as a tarball, not bind-mounted from the host filesystem, so the path translation gap doesn't apply. Verified locally: `docker build` + `docker run cf-proxy nginx -T` reproduces the baked config end-to-end. Trade-off: ~2-3s build cost on every harness up. Acceptable for the Gitea CI gate; local-dev re-builds the image only when nginx.conf changes (Docker layer cache). Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/harness/cf-proxy/Dockerfile | 14 ++++++++++++ tests/harness/compose.yml | 36 +++++++++++-------------------- 2 files changed, 27 insertions(+), 23 deletions(-) create mode 100644 tests/harness/cf-proxy/Dockerfile diff --git a/tests/harness/cf-proxy/Dockerfile b/tests/harness/cf-proxy/Dockerfile new file mode 100644 index 00000000..d443f243 --- /dev/null +++ b/tests/harness/cf-proxy/Dockerfile @@ -0,0 +1,14 @@ +# cf-proxy harness image — nginx + the harness's tenant-routing config baked +# in at build time. +# +# Why bake (not bind-mount): on Gitea Actions / act_runner, the runner is a +# container talking to the OUTER docker daemon over the host socket; runc +# resolves bind-mount source paths on the outer host filesystem, where the +# repo at `/workspace/.../tests/harness/cf-proxy/nginx.conf` is invisible. +# Compose `configs:` (with `file:`) falls back to bind mounts when swarm is +# not active, so it hits the same gap. A build-time COPY uploads the file +# as part of the docker build context — the daemon receives the tarball +# directly and never bind-mounts. See issue #88 item 2. 
+FROM nginx:1.27-alpine + +COPY nginx.conf /etc/nginx/nginx.conf diff --git a/tests/harness/compose.yml b/tests/harness/compose.yml index c9489db9..afb623ee 100644 --- a/tests/harness/compose.yml +++ b/tests/harness/compose.yml @@ -168,41 +168,31 @@ services: # subdomain — the Host header carries the tenant identity, not the # routing destination. Local cf-proxy mirrors this exactly. # - # nginx.conf delivery: docker compose `configs:` block (not a bind - # mount) so the file ships as content packaged by compose, not a - # host-path bind that has to be visible to the docker daemon's runc. - # Bind mounts break under Gitea's act_runner DinD because runc - # resolves the source path on the OUTER docker host (the runner's - # host filesystem), not inside the runner container — the path - # `/workspace/.../tests/harness/cf-proxy/nginx.conf` is only visible - # to the runner, not to the daemon below it. The `configs:` form - # uploads the file to the daemon as part of the service definition - # and is bind-mount-equivalent at the container level. See issue #88 - # item 2. + # nginx.conf delivery: built into a custom image via cf-proxy/Dockerfile + # (a thin nginx:1.27-alpine + COPY). NOT a bind mount and NOT a + # compose `configs:` block, both of which break under Gitea's + # act_runner: the runner talks to the OUTER docker daemon over the + # host socket, and runc resolves bind sources on the outer host + # filesystem, where `/workspace/.../tests/harness/cf-proxy/nginx.conf` + # is invisible. Compose `configs:` falls back to bind mounts without + # swarm, so it hits the same gap. A build context, by contrast, is + # uploaded to the daemon as a tarball at build time — no bind. See + # issue #88 item 2. 
cf-proxy: - image: nginx:1.27-alpine + build: + context: ./cf-proxy + dockerfile: Dockerfile depends_on: tenant-alpha: condition: service_healthy tenant-beta: condition: service_healthy - configs: - - source: cf-proxy-nginx-conf - target: /etc/nginx/nginx.conf - mode: 0444 # Bind to 127.0.0.1 only — hardcoded ADMIN_TOKENs make 0.0.0.0 # exposure unsafe even on a local network. ports: - "127.0.0.1:8080:8080" networks: [harness-net] -configs: - # Defined once at compose level so any future service (e.g. a second - # nginx variant for an external-connect smoke test) can reuse the - # same source file. - cf-proxy-nginx-conf: - file: ./cf-proxy/nginx.conf - networks: harness-net: name: molecule-harness-net