# forked from molecule-ai/molecule-core
# Class B verification — second consecutive green run to demonstrate the fix isn't flaky.
name: Handlers Postgres Integration

# Real-Postgres integration tests for workspace-server/internal/handlers/.
# Triggered on every PR/push that touches the handlers package.
#
# Why this workflow exists
# ------------------------
# Strict-sqlmock unit tests pin which SQL statements fire — they're fast
# and let us iterate without a DB. But sqlmock CANNOT detect bugs that
# depend on the row state AFTER the SQL runs. The result_preview-lost
# bug shipped to staging in PR #2854 because every unit test was
# satisfied with "an UPDATE statement fired" — none verified the row's
# preview field actually landed. The local-postgres E2E that caught it
# during retrofit self-review took 2 minutes to set up and would have
# caught the bug at PR-time.
#
# Why this workflow does NOT use `services: postgres:` (Class B fix)
# ------------------------------------------------------------------
# Our act_runner config has `container.network: host` (operator host
# /opt/molecule/runners/config.yaml), which act_runner applies to BOTH
# the job container AND every service container. With host-net, two
# concurrent runs of this workflow both try to bind 0.0.0.0:5432 — the
# second postgres FATALs with `could not create any TCP/IP sockets:
# Address in use`, and Docker auto-removes it (act_runner sets
# AutoRemove:true on service containers). By the time the migrations
# step runs `psql`, the postgres container is gone, hence
# `Connection refused` then `failed to remove container: No such
# container` at cleanup time.
#
# Per-job `container.network` override is silently ignored by
# act_runner — `--network and --net in the options will be ignored.`
# appears in the runner log. Documented constraint.
#
# So we sidestep `services:` entirely. The job container still uses
# host-net (inherited from runner config; required for cache server
# discovery on the bridge IP 172.18.0.17:42631). We launch a sibling
# postgres on the existing `molecule-monorepo-net` bridge with a
# UNIQUE name per run — `pg-handlers-${RUN_ID}-${RUN_ATTEMPT}` — and
# read its bridge IP via `docker inspect`. A host-net job container
# can reach a bridge-net container directly via the bridge IP (verified
# manually on operator host 2026-05-08).
#
# Trade-offs vs. the original `services:` shape:
#   + No host-port collision; N parallel runs share the bridge cleanly
#   + `if: always()` cleanup runs even on test-step failure
#   - One more step in the workflow (+~3 lines)
#   - Requires `molecule-monorepo-net` to exist on the operator host
#     (it does; declared in docker-compose.yml + docker-compose.infra.yml)
#
# Class B Hongming-owned CICD red sweep, 2026-05-08.
#
# Cost: ~30s job (postgres pull from cache + go build + 4 tests).
# NOTE: bare `on` is a YAML 1.1 truthy key; GitHub's parser handles it,
# so keep it unquoted to match every other workflow in this repo.
on:
  push:
    branches: [main, staging]
  pull_request:
    branches: [main, staging]
  merge_group:
    types: [checks_requested]
  workflow_dispatch:
# One concurrency group per head commit (PR head SHA when present,
# otherwise the push/merge-group SHA). cancel-in-progress stays false:
# runs for the same commit queue instead of killing each other, which
# matters because this job owns an external postgres container.
concurrency:
  group: handlers-pg-integ-${{ github.event.pull_request.head.sha || github.sha }}
  cancel-in-progress: false
jobs:
  # Emits `handlers == 'true'` when the PR/push touches anything the
  # integration job cares about (handler code, auth helpers, migrations,
  # or this workflow itself).
  detect-changes:
    name: detect-changes
    runs-on: ubuntu-latest
    outputs:
      handlers: ${{ steps.filter.outputs.handlers }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1
        id: filter
        with:
          # The block scalar below is itself YAML consumed by
          # paths-filter: list items must be nested under `handlers:`.
          filters: |
            handlers:
              - 'workspace-server/internal/handlers/**'
              - 'workspace-server/internal/wsauth/**'
              - 'workspace-server/migrations/**'
              - '.github/workflows/handlers-postgres-integration.yml'
# Single-job-with-per-step-if pattern: always runs to satisfy the
|
|
# required-check name on branch protection; real work gates on the
|
|
# paths filter. See ci.yml's Platform (Go) for the same shape.
|
|
integration:
|
|
name: Handlers Postgres Integration
|
|
needs: detect-changes
|
|
runs-on: ubuntu-latest
|
|
env:
|
|
# Unique name per run so concurrent jobs don't collide on the
|
|
# bridge network. ${RUN_ID}-${RUN_ATTEMPT} is unique even across
|
|
# workflow_dispatch reruns of the same run_id.
|
|
PG_NAME: pg-handlers-${{ github.run_id }}-${{ github.run_attempt }}
|
|
# Bridge network already exists on the operator host (declared
|
|
# in docker-compose.yml + docker-compose.infra.yml).
|
|
PG_NETWORK: molecule-monorepo-net
|
|
defaults:
|
|
run:
|
|
working-directory: workspace-server
|
|
steps:
|
|
- if: needs.detect-changes.outputs.handlers != 'true'
|
|
working-directory: .
|
|
run: echo "No handlers/migrations changes — skipping; this job always runs to satisfy the required-check name."
|
|
|
|
- if: needs.detect-changes.outputs.handlers == 'true'
|
|
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
|
|
- if: needs.detect-changes.outputs.handlers == 'true'
|
|
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
|
|
with:
|
|
go-version: 'stable'
|
|
|
|
- if: needs.detect-changes.outputs.handlers == 'true'
|
|
name: Start sibling Postgres on bridge network
|
|
working-directory: .
|
|
run: |
|
|
# Sanity: the bridge network must exist on the operator host.
|
|
# Hard-fail loud if it doesn't — easier to spot than a silent
|
|
# auto-create that diverges from the rest of the stack.
|
|
if ! docker network inspect "${PG_NETWORK}" >/dev/null 2>&1; then
|
|
echo "::error::Bridge network '${PG_NETWORK}' missing on operator host. Re-run docker-compose.infra.yml or check ops handbook."
|
|
exit 1
|
|
fi
|
|
|
|
# If a stale container with the same name exists (rerun on
|
|
# the same run_id), wipe it first.
|
|
docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
|
|
|
|
docker run -d \
|
|
--name "${PG_NAME}" \
|
|
--network "${PG_NETWORK}" \
|
|
--health-cmd "pg_isready -U postgres" \
|
|
--health-interval 5s \
|
|
--health-timeout 5s \
|
|
--health-retries 10 \
|
|
-e POSTGRES_PASSWORD=test \
|
|
-e POSTGRES_DB=molecule \
|
|
postgres:15-alpine >/dev/null
|
|
|
|
# Read back the bridge IP. Always present immediately after
|
|
# `docker run -d` for bridge networks.
|
|
PG_HOST=$(docker inspect "${PG_NAME}" \
|
|
--format "{{(index .NetworkSettings.Networks \"${PG_NETWORK}\").IPAddress}}")
|
|
if [ -z "${PG_HOST}" ]; then
|
|
echo "::error::Could not resolve PG_HOST for ${PG_NAME} on ${PG_NETWORK}"
|
|
docker logs "${PG_NAME}" || true
|
|
exit 1
|
|
fi
|
|
echo "PG_HOST=${PG_HOST}" >> "$GITHUB_ENV"
|
|
echo "INTEGRATION_DB_URL=postgres://postgres:test@${PG_HOST}:5432/molecule?sslmode=disable" >> "$GITHUB_ENV"
|
|
echo "Started ${PG_NAME} at ${PG_HOST}:5432"
|
|
|
|
- if: needs.detect-changes.outputs.handlers == 'true'
|
|
name: Apply migrations to Postgres service
|
|
env:
|
|
PGPASSWORD: test
|
|
run: |
|
|
# Wait for postgres to actually accept connections. Docker's
|
|
# health-cmd handles container-side readiness, but the wire
|
|
# to the bridge IP is best-tested with pg_isready directly.
|
|
for i in {1..15}; do
|
|
if pg_isready -h "${PG_HOST}" -p 5432 -U postgres -q; then break; fi
|
|
echo "waiting for postgres at ${PG_HOST}:5432..."; sleep 2
|
|
done
|
|
|
|
# Apply every .up.sql in lexicographic order with
|
|
# ON_ERROR_STOP=0 — failing migrations are SKIPPED rather than
|
|
# blocking the suite. This handles the current schema state
|
|
# where a few historical migrations (e.g. 017_memories_fts_*)
|
|
# depend on tables that were later renamed/dropped and so
|
|
# cannot replay from scratch. The migrations that DO succeed
|
|
# land their tables, which is sufficient for the integration
|
|
# tests in handlers/.
|
|
#
|
|
# Why not maintain a curated allowlist: every new migration
|
|
# touching a handlers/-tested table would have to update this
|
|
# workflow. With apply-all-or-skip, a future migration that
|
|
# adds a column to delegations runs automatically (its base
|
|
# table 049_delegations.up.sql already succeeded above it in
|
|
# the order). Operators only need to revisit this if the
|
|
# migration chain becomes legitimately replayable end-to-end.
|
|
#
|
|
# Per-migration result is logged so a failed migration that
|
|
# SHOULD have been replayable surfaces in the CI log instead
|
|
# of silently failing.
|
|
# Apply both *.sql (legacy, lives next to its module) and
|
|
# *.up.sql (newer up/down convention) in a single
|
|
# lexicographically-sorted pass. Excluding *.down.sql so the
|
|
# newest-naming-convention pairs don't undo themselves mid-run.
|
|
# Pre-#149-followup this loop only globbed *.up.sql, which
|
|
# silently skipped 001_workspaces.sql + 009_activity_logs.sql
|
|
# — fine while no integration test depended on those tables,
|
|
# not fine once a cross-table atomicity test came in.
|
|
set +e
|
|
for migration in $(ls migrations/*.sql 2>/dev/null | grep -v '\.down\.sql$' | sort); do
|
|
if psql -h "${PG_HOST}" -U postgres -d molecule -v ON_ERROR_STOP=1 \
|
|
-f "$migration" >/dev/null 2>&1; then
|
|
echo "✓ $(basename "$migration")"
|
|
else
|
|
echo "⊘ $(basename "$migration") (skipped — see comment in workflow)"
|
|
fi
|
|
done
|
|
set -e
|
|
|
|
# Sanity: the delegations + workspaces + activity_logs tables
|
|
# MUST exist for the integration tests to be meaningful. Hard-
|
|
# fail if any didn't land — that would be a real regression we
|
|
# want loud.
|
|
for tbl in delegations workspaces activity_logs pending_uploads; do
|
|
if ! psql -h "${PG_HOST}" -U postgres -d molecule -tA \
|
|
-c "SELECT 1 FROM information_schema.tables WHERE table_name = '$tbl'" \
|
|
| grep -q 1; then
|
|
echo "::error::$tbl table missing after migration replay — handler integration tests would be meaningless"
|
|
exit 1
|
|
fi
|
|
echo "✓ $tbl table present"
|
|
done
|
|
|
|
- if: needs.detect-changes.outputs.handlers == 'true'
|
|
name: Run integration tests
|
|
run: |
|
|
# INTEGRATION_DB_URL is exported by the start-postgres step;
|
|
# points at the per-run bridge IP, not 127.0.0.1, so concurrent
|
|
# workflow runs don't fight over a host-net 5432 port.
|
|
go test -tags=integration -timeout 5m -v ./internal/handlers/ -run "^TestIntegration_"
|
|
|
|
- if: failure() && needs.detect-changes.outputs.handlers == 'true'
|
|
name: Diagnostic dump on failure
|
|
env:
|
|
PGPASSWORD: test
|
|
run: |
|
|
echo "::group::postgres container status"
|
|
docker ps -a --filter "name=${PG_NAME}" --format '{{.Status}} {{.Names}}' || true
|
|
docker logs "${PG_NAME}" 2>&1 | tail -50 || true
|
|
echo "::endgroup::"
|
|
echo "::group::delegations table state"
|
|
psql -h "${PG_HOST}" -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true
|
|
echo "::endgroup::"
|
|
|
|
- if: always() && needs.detect-changes.outputs.handlers == 'true'
|
|
name: Stop sibling Postgres
|
|
working-directory: .
|
|
run: |
|
|
# always() so containers don't leak when migrations or tests
|
|
# fail. The cleanup is best-effort: if the container is
|
|
# already gone (e.g. concurrent rerun race), don't fail the job.
|
|
docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
|
|
echo "Cleaned up ${PG_NAME}"
|
|
|