diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 422af4d7..5e12c832 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,7 +9,7 @@ on: jobs: platform-build: name: Platform (Go) - runs-on: ubuntu-latest + runs-on: [self-hosted, macos, arm64] defaults: run: working-directory: platform @@ -43,7 +43,7 @@ jobs: canvas-build: name: Canvas (Next.js) - runs-on: ubuntu-latest + runs-on: [self-hosted, macos, arm64] defaults: run: working-directory: canvas @@ -59,7 +59,7 @@ jobs: mcp-server-build: name: MCP Server (Node.js) - runs-on: ubuntu-latest + runs-on: [self-hosted, macos, arm64] defaults: run: working-directory: mcp-server @@ -75,37 +75,17 @@ jobs: e2e-api: name: E2E API Smoke Test - runs-on: ubuntu-latest - timeout-minutes: 10 - services: - postgres: - # Credentials match .env.example (dev:dev) so local reproduction is - # identical to CI. POSTGRES_DB matches the default there too. - image: postgres:16 - env: - POSTGRES_USER: dev - POSTGRES_PASSWORD: dev - POSTGRES_DB: molecule - ports: - - 5432:5432 - options: >- - --health-cmd "pg_isready -U dev" - --health-interval 10s - --health-timeout 5s - --health-retries 5 - redis: - image: redis:7 - ports: - - 6379:6379 - options: >- - --health-cmd "redis-cli ping" - --health-interval 10s - --health-timeout 5s - --health-retries 5 + runs-on: [self-hosted, macos, arm64] + timeout-minutes: 15 + # `services:` is Linux-only on self-hosted runners — we start postgres + # and redis via `docker run` instead. Ports 15432/16379 avoid collision + # with anything the host may already have on the standard ports. env: - DATABASE_URL: postgres://dev:dev@localhost:5432/molecule?sslmode=disable - REDIS_URL: redis://localhost:6379 + DATABASE_URL: postgres://dev:dev@localhost:15432/molecule?sslmode=disable + REDIS_URL: redis://localhost:16379 PORT: "8080" + PG_CONTAINER: molecule-ci-postgres + REDIS_CONTAINER: molecule-ci-redis steps: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 @@ -113,6 +93,38 @@ jobs: go-version: 'stable' cache: true cache-dependency-path: platform/go.sum + - name: Start Postgres (docker) + run: | + docker rm -f "$PG_CONTAINER" 2>/dev/null || true + docker run -d --name "$PG_CONTAINER" \ + -e POSTGRES_USER=dev \ + -e POSTGRES_PASSWORD=dev \ + -e POSTGRES_DB=molecule \ + -p 15432:5432 \ + postgres:16 + for i in $(seq 1 30); do + if docker exec "$PG_CONTAINER" pg_isready -U dev >/dev/null 2>&1; then + echo "Postgres ready after ${i}s" + exit 0 + fi + sleep 1 + done + echo "::error::Postgres did not become ready in 30s" + docker logs "$PG_CONTAINER" || true + exit 1 + - name: Start Redis (docker) + run: | + docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true + docker run -d --name "$REDIS_CONTAINER" -p 16379:6379 redis:7 + for i in $(seq 1 15); do + if docker exec "$REDIS_CONTAINER" redis-cli ping 2>/dev/null | grep -q PONG; then + echo "Redis ready after ${i}s" + exit 0 + fi + sleep 1 + done + echo "::error::Redis did not become ready in 15s" + exit 1 - name: Build platform working-directory: platform run: go build -o platform-server ./cmd/server @@ -135,17 +147,9 @@ jobs: exit 1 - name: Assert migrations applied # Migrations auto-run at platform boot. Fail fast if they silently - # didn't — catches future migration-author mistakes (e.g. a new - # privileged op Postgres "dev" can't execute) before the E2E run. - # Uses docker exec into the service container's own psql — avoids - # a 10-20s apt-install step in the runner. + # didn't — catches future migration-author mistakes before the E2E run. run: | - pg_container=$(docker ps --filter "ancestor=postgres:16" --format "{{.ID}}" | head -1) - if [ -z "$pg_container" ]; then - echo "::error::Could not find postgres service container" - exit 1 - fi - tables=$(docker exec "$pg_container" psql -U dev -d molecule -tAc "SELECT count(*) FROM information_schema.tables WHERE table_schema='public' AND table_name='workspaces'") + tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc "SELECT count(*) FROM information_schema.tables WHERE table_schema='public' AND table_name='workspaces'") if [ "$tables" != "1" ]; then echo "::error::Migrations did not apply — 'workspaces' table missing" cat platform/platform.log || true @@ -163,22 +167,31 @@ jobs: if [ -f platform/platform.pid ]; then kill "$(cat platform/platform.pid)" 2>/dev/null || true fi + - name: Stop service containers + if: always() + run: | + docker rm -f "$PG_CONTAINER" 2>/dev/null || true + docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true shellcheck: name: Shellcheck (E2E scripts) - runs-on: ubuntu-latest + runs-on: [self-hosted, macos, arm64] steps: - uses: actions/checkout@v4 - name: Run shellcheck on tests/e2e/*.sh - uses: ludeeus/action-shellcheck@master - env: - SHELLCHECK_OPTS: --severity=warning - with: - scandir: tests/e2e + # `ludeeus/action-shellcheck` is a Docker action (Linux-only). We rely + # on shellcheck being pre-installed on the self-hosted runner instead. + run: | + if ! command -v shellcheck >/dev/null 2>&1; then + echo "::error::shellcheck is not installed on the runner" + exit 1 + fi + find tests/e2e -type f -name '*.sh' -print0 \ + | xargs -0 shellcheck --severity=warning canvas-deploy-reminder: name: Canvas Deploy Reminder - runs-on: ubuntu-latest + runs-on: [self-hosted, macos, arm64] needs: canvas-build # Only fires on direct pushes to main (i.e. after a PR merges). # PRs get canvas-build CI but no reminder — no deployment happens on PRs. @@ -216,19 +229,24 @@ jobs: python-lint: name: Python Lint & Test - runs-on: ubuntu-latest + runs-on: [self-hosted, macos, arm64] defaults: run: working-directory: workspace-template steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: '3.11' - cache: pip - cache-dependency-path: workspace-template/requirements.txt - - run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov - - run: python -m pytest --tb=short -q --cov=. --cov-report=term-missing + # setup-python@v5 cannot write to /Users/runner (GitHub-hosted path) on + # the self-hosted macOS arm64 runner (user: hongming-claw) and also hits + # EACCES on /usr/local/bin due to macOS SIP. Skip it — Homebrew installs + # Python 3.11 at /opt/homebrew/opt/python@3.11 which is already on PATH. + - name: Verify Python 3.11 (Homebrew) + run: | + export PATH="/opt/homebrew/opt/python@3.11/bin:/opt/homebrew/bin:$PATH" + python3.11 --version + echo "/opt/homebrew/opt/python@3.11/bin" >> "$GITHUB_PATH" + echo "/opt/homebrew/bin" >> "$GITHUB_PATH" + - run: pip3.11 install -r requirements.txt pytest pytest-asyncio pytest-cov + - run: python3.11 -m pytest --tb=short -q --cov=. --cov-report=term-missing # Lint first-party plugins. The validator checks each plugin # against the format it declares — currently agentskills.io for all @@ -236,10 +254,10 @@ jobs: # under a sibling adapter (MCP, DeepAgents sub-agent, etc.). - name: Install molecule-plugin SDK working-directory: sdk/python - run: pip install -e . + run: pip3.11 install -e . - name: Lint first-party plugins working-directory: ${{ github.workspace }} - run: python -m molecule_plugin validate plugins/molecule-dev plugins/superpowers plugins/ecc + run: python3.11 -m molecule_plugin validate plugins/molecule-dev plugins/superpowers plugins/ecc - name: Run SDK tests working-directory: sdk/python - run: python -m pytest --tb=short -q + run: python3.11 -m pytest --tb=short -q diff --git a/.github/workflows/publish-platform-image.yml b/.github/workflows/publish-platform-image.yml index 03479723..eed94c3e 100644 --- a/.github/workflows/publish-platform-image.yml +++ b/.github/workflows/publish-platform-image.yml @@ -32,11 +32,19 @@ env: jobs: build-and-push: - runs-on: ubuntu-latest + runs-on: [self-hosted, macos, arm64] steps: - name: Checkout uses: actions/checkout@v4 + - name: Set up QEMU + # Required on the Apple-silicon self-hosted runner — Fly tenant machines + # pull linux/amd64, and buildx needs binfmt handlers in Docker Desktop's + # VM to emulate amd64 during the build. + uses: docker/setup-qemu-action@v3 + with: + platforms: linux/amd64 + - name: Set up Docker Buildx # Buildx enables cache-from/cache-to via GHA cache and multi-arch # builds without local docker daemon wrangling. @@ -75,10 +83,13 @@ jobs: # GHCR (or vice versa) — each registry's failure mode is isolated. # GHA cache is shared because both steps re-use the same Dockerfile # context + build args. + # Explicit linux/amd64 target: the runner is Apple-silicon (arm64), + # but Fly tenant machines are amd64. QEMU handles the emulation. uses: docker/build-push-action@v5 with: context: ./platform file: ./platform/Dockerfile + platforms: linux/amd64 push: true tags: | ${{ env.IMAGE_NAME }}:latest @@ -99,6 +110,7 @@ jobs: with: context: ./platform file: ./platform/Dockerfile + platforms: linux/amd64 push: true tags: | ${{ env.FLY_IMAGE_NAME }}:latest diff --git a/org-templates/molecule-dev/org.yaml b/org-templates/molecule-dev/org.yaml index 1f8883d7..55366f6d 100644 --- a/org-templates/molecule-dev/org.yaml +++ b/org-templates/molecule-dev/org.yaml @@ -554,10 +554,10 @@ workspaces: 5. Use commit_memory to save security patterns and concerns 6. Wait for tasks from Dev Lead. schedules: - - name: Hourly security audit - cron_expr: "1,11,21,31,41,51 * * * *" + - name: Security audit (every 12h) + cron_expr: "7 6,18 * * *" prompt: | - Recurring hourly security audit. Be thorough on recently changed code. + Recurring security audit. Be thorough and incremental. 1. SETUP: cd /workspace/repo && git pull 2>/dev/null || true diff --git a/platform/internal/middleware/ratelimit_test.go b/platform/internal/middleware/ratelimit_test.go index 4f17ce56..43655915 100644 --- a/platform/internal/middleware/ratelimit_test.go +++ b/platform/internal/middleware/ratelimit_test.go @@ -45,6 +45,107 @@ func TestRateLimit_HeadersPresentOnAllowedRequest(t *testing.T) { } } +// TestRateLimit_XFF_BypassDocumented shows that WITHOUT SetTrustedProxies(nil) +// a spoofed X-Forwarded-For header can rotate an attacker's effective IP and +// bypass per-IP rate limiting (documents the issue #179 vulnerability). +func TestRateLimit_XFF_BypassDocumented(t *testing.T) { + gin.SetMode(gin.TestMode) + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + rl := NewRateLimiter(2, 5*time.Second, ctx) + + r := gin.New() + // Intentionally NOT calling r.SetTrustedProxies(nil) — replicates the + // pre-fix behaviour where Gin trusts all proxies by default. + r.Use(rl.Middleware()) + r.GET("/x", func(c *gin.Context) { c.String(http.StatusOK, "ok") }) + + // Exhaust both tokens for the real IP 10.0.0.1. + for i := 0; i < 2; i++ { + req := httptest.NewRequest(http.MethodGet, "/x", nil) + req.RemoteAddr = "10.0.0.1:1234" + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusOK { + t.Fatalf("setup request %d: want 200, got %d", i+1, w.Code) + } + } + // Third request without XFF must be rate-limited. + { + req := httptest.NewRequest(http.MethodGet, "/x", nil) + req.RemoteAddr = "10.0.0.1:1234" + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusTooManyRequests { + t.Fatalf("3rd request (no XFF): want 429, got %d", w.Code) + } + } + // With default proxy trust, spoofing X-Forwarded-For rotates the effective + // IP → new bucket → bypass succeeds (returns 200). + { + req := httptest.NewRequest(http.MethodGet, "/x", nil) + req.RemoteAddr = "10.0.0.1:1234" + req.Header.Set("X-Forwarded-For", "20.0.0.1") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusOK { + t.Skipf("bypass no longer works without trusted-proxy config (Gin version changed?): got %d", w.Code) + } + } +} + +// TestRateLimit_XFF_NoBypassWithTrustedProxiesNil is the regression test for +// issue #179: after r.SetTrustedProxies(nil) is added to router.Setup(), a +// spoofed X-Forwarded-For header is ignored and the real RemoteAddr is used, +// so the bypass no longer works. +func TestRateLimit_XFF_NoBypassWithTrustedProxiesNil(t *testing.T) { + gin.SetMode(gin.TestMode) + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + rl := NewRateLimiter(2, 5*time.Second, ctx) + + r := gin.New() + // Fix for issue #179 — mirror what router.Setup() now does. + if err := r.SetTrustedProxies(nil); err != nil { + t.Fatalf("SetTrustedProxies: %v", err) + } + r.Use(rl.Middleware()) + r.GET("/x", func(c *gin.Context) { c.String(http.StatusOK, "ok") }) + + // Exhaust both tokens for RemoteAddr 10.0.0.2. + for i := 0; i < 2; i++ { + req := httptest.NewRequest(http.MethodGet, "/x", nil) + req.RemoteAddr = "10.0.0.2:9999" + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusOK { + t.Fatalf("setup request %d: want 200, got %d", i+1, w.Code) + } + } + // Third plain request must be rate-limited. + { + req := httptest.NewRequest(http.MethodGet, "/x", nil) + req.RemoteAddr = "10.0.0.2:9999" + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusTooManyRequests { + t.Fatalf("3rd plain request: want 429, got %d", w.Code) + } + } + // Spoofed XFF must NOT rotate the bucket — still 429 because + // SetTrustedProxies(nil) forces c.ClientIP() to return RemoteAddr. + { + req := httptest.NewRequest(http.MethodGet, "/x", nil) + req.RemoteAddr = "10.0.0.2:9999" + req.Header.Set("X-Forwarded-For", "99.99.99.99") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + if w.Code != http.StatusTooManyRequests { + t.Errorf("XFF bypass still works after fix: want 429, got %d — SetTrustedProxies(nil) not effective", w.Code) + } + } +} + // TestRateLimit_RetryAfterOn429 — throttled responses must carry Retry-After // per RFC 6585, so curl/fetch clients back off the exact required window. func TestRateLimit_RetryAfterOn429(t *testing.T) { diff --git a/platform/internal/middleware/wsauth_middleware_test.go b/platform/internal/middleware/wsauth_middleware_test.go index 5d677170..5fbb861f 100644 --- a/platform/internal/middleware/wsauth_middleware_test.go +++ b/platform/internal/middleware/wsauth_middleware_test.go @@ -473,6 +473,89 @@ func TestAdminAuth_InvalidBearer_Returns401(t *testing.T) { } } +// ──────────────────────────────────────────────────────────────────────────── +// Issue #170 regression — unauthenticated DELETE /workspaces/:id/secrets/:key +// +// Before this fix, the route was registered as: +// r.DELETE("/workspaces/:id/secrets/:key", sech.Delete) +// on the bare Gin router — no auth at all. Any caller could delete a secret +// AND trigger a forced workspace restart (the handler calls go restartFunc(id) +// on every successful delete). CWE-306. +// +// The fix: move the route inside the wsAuth group so it matches all other +// /workspaces/:id/secrets mutations (POST + PUT are already auth-gated). +// ──────────────────────────────────────────────────────────────────────────── + +// TestWorkspaceAuth_Issue170_SecretDelete_NoBearer_Returns401 is the primary +// regression test: when the workspace has live tokens, a DELETE /secrets/:key +// without a bearer token must be rejected with 401. +func TestWorkspaceAuth_Issue170_SecretDelete_NoBearer_Returns401(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer mockDB.Close() + + // HasAnyLiveToken returns 1 — workspace is token-enrolled. + mock.ExpectQuery(hasLiveTokenQuery). + WithArgs("ws-secret-owner"). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) + + r := gin.New() + // Mirror the fix: DELETE /secrets/:key is inside the wsAuth group. + r.DELETE("/workspaces/:id/secrets/:key", + WorkspaceAuth(mockDB), + func(c *gin.Context) { c.JSON(http.StatusOK, gin.H{"status": "deleted"}) }) + + w := httptest.NewRecorder() + // #170 attack: no Authorization header. + req, _ := http.NewRequest(http.MethodDelete, + "/workspaces/ws-secret-owner/secrets/OPENAI_API_KEY", nil) + r.ServeHTTP(w, req) + + if w.Code != http.StatusUnauthorized { + t.Errorf("#170 secret delete no-bearer: expected 401, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// TestWorkspaceAuth_Issue170_SecretDelete_FailOpen_NoTokens verifies the +// fail-open contract is preserved: a workspace with NO tokens (bootstrap / +// rolling-upgrade path) lets the DELETE through so legacy workspaces aren't +// bricked. +func TestWorkspaceAuth_Issue170_SecretDelete_FailOpen_NoTokens(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer mockDB.Close() + + // HasAnyLiveToken returns 0 — no tokens on file (pre-Phase-30 workspace). + mock.ExpectQuery(hasLiveTokenQuery). + WithArgs("ws-legacy"). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + + r := gin.New() + r.DELETE("/workspaces/:id/secrets/:key", + WorkspaceAuth(mockDB), + func(c *gin.Context) { c.JSON(http.StatusOK, gin.H{"status": "deleted"}) }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodDelete, + "/workspaces/ws-legacy/secrets/SOME_KEY", nil) + r.ServeHTTP(w, req) + + // Fail-open: no tokens → must pass through (200). + if w.Code != http.StatusOK { + t.Errorf("#170 fail-open: expected 200, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + // ──────────────────────────────────────────────────────────────────────────── // Issue #120 regression — unauthenticated PATCH /workspaces/:id // diff --git a/platform/internal/router/router.go b/platform/internal/router/router.go index 58ce2fd2..b1e482fc 100644 --- a/platform/internal/router/router.go +++ b/platform/internal/router/router.go @@ -25,6 +25,14 @@ import ( func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provisioner, platformURL, configsDir string, wh *handlers.WorkspaceHandler, channelMgr *channels.Manager) *gin.Engine { r := gin.Default() + // Issue #179 — trust no reverse-proxy headers. Without this call Gin's + // default is to trust ALL X-Forwarded-For values, which lets any caller + // spoof their IP and bypass per-IP rate limiting. With nil, c.ClientIP() + // always returns the real TCP RemoteAddr. + if err := r.SetTrustedProxies(nil); err != nil { + panic("router: SetTrustedProxies: " + err.Error()) + } + // CORS origins — configurable via CORS_ORIGINS env var (comma-separated) corsOrigins := []string{"http://localhost:3000", "http://localhost:3001"} if v := os.Getenv("CORS_ORIGINS"); v != "" {