Merge branch 'main' into fix/issue-179-trusted-proxies

2026-04-15 10:54:21 -07:00 · 2026-04-15 10:54:21 -07:00 · fa465e5db1
commit fa465e5db1
parent 1ad98be17b aa419477b7
4 changed files with 144 additions and 58 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -9,7 +9,7 @@ on:
 jobs:
  platform-build:
    name: Platform (Go)
-    runs-on: ubuntu-latest
+    runs-on: [self-hosted, macos, arm64]
    defaults:
      run:
        working-directory: platform
@ -43,7 +43,7 @@ jobs:

  canvas-build:
    name: Canvas (Next.js)
-    runs-on: ubuntu-latest
+    runs-on: [self-hosted, macos, arm64]
    defaults:
      run:
        working-directory: canvas
@ -59,7 +59,7 @@ jobs:

  mcp-server-build:
    name: MCP Server (Node.js)
-    runs-on: ubuntu-latest
+    runs-on: [self-hosted, macos, arm64]
    defaults:
      run:
        working-directory: mcp-server
@ -75,37 +75,17 @@ jobs:

  e2e-api:
    name: E2E API Smoke Test
-    runs-on: ubuntu-latest
-    timeout-minutes: 10
-    services:
-      postgres:
-        # Credentials match .env.example (dev:dev) so local reproduction is
-        # identical to CI. POSTGRES_DB matches the default there too.
-        image: postgres:16
-        env:
-          POSTGRES_USER: dev
-          POSTGRES_PASSWORD: dev
-          POSTGRES_DB: molecule
-        ports:
-          - 5432:5432
-        options: >-
-          --health-cmd "pg_isready -U dev"
-          --health-interval 10s
-          --health-timeout 5s
-          --health-retries 5
-      redis:
-        image: redis:7
-        ports:
-          - 6379:6379
-        options: >-
-          --health-cmd "redis-cli ping"
-          --health-interval 10s
-          --health-timeout 5s
-          --health-retries 5
+    runs-on: [self-hosted, macos, arm64]
+    timeout-minutes: 15
+    # `services:` is Linux-only on self-hosted runners — we start postgres
+    # and redis via `docker run` instead. Ports 15432/16379 avoid collision
+    # with anything the host may already have on the standard ports.
    env:
-      DATABASE_URL: postgres://dev:dev@localhost:5432/molecule?sslmode=disable
-      REDIS_URL: redis://localhost:6379
+      DATABASE_URL: postgres://dev:dev@localhost:15432/molecule?sslmode=disable
+      REDIS_URL: redis://localhost:16379
      PORT: "8080"
+      PG_CONTAINER: molecule-ci-postgres
+      REDIS_CONTAINER: molecule-ci-redis
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
@ -113,6 +93,38 @@ jobs:
          go-version: 'stable'
          cache: true
          cache-dependency-path: platform/go.sum
+      - name: Start Postgres (docker)
+        # Launches a throwaway postgres:16 published on host port 15432 and
+        # blocks until it accepts connections (30 one-second attempts). On
+        # timeout the container logs are dumped and the step fails.
+        run: |
+          # Remove any same-named container left behind by a previous run.
+          docker rm -f "$PG_CONTAINER" 2>/dev/null || true
+          docker run -d --name "$PG_CONTAINER" \
+            -e POSTGRES_USER=dev \
+            -e POSTGRES_PASSWORD=dev \
+            -e POSTGRES_DB=molecule \
+            -p 15432:5432 \
+            postgres:16
+          for i in $(seq 1 30); do
+            # Probe over TCP (-h 127.0.0.1) rather than the default unix
+            # socket: during first-boot init the image runs a temporary
+            # socket-only server, so a socket probe can report "ready" before
+            # the real server (and the `molecule` database) is available.
+            if docker exec "$PG_CONTAINER" pg_isready -h 127.0.0.1 -U dev -d molecule >/dev/null 2>&1; then
+              echo "Postgres ready after ${i}s"
+              exit 0
+            fi
+            sleep 1
+          done
+          echo "::error::Postgres did not become ready in 30s"
+          docker logs "$PG_CONTAINER" || true
+          exit 1
+      - name: Start Redis (docker)
+        # Launches a throwaway redis:7 published on host port 16379 and blocks
+        # until `redis-cli ping` answers PONG (15 one-second attempts).
+        run: |
+          # Remove any same-named container left behind by a previous run.
+          docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
+          docker run -d --name "$REDIS_CONTAINER" -p 16379:6379 redis:7
+          for i in $(seq 1 15); do
+            if docker exec "$REDIS_CONTAINER" redis-cli ping 2>/dev/null | grep -q PONG; then
+              echo "Redis ready after ${i}s"
+              exit 0
+            fi
+            sleep 1
+          done
+          echo "::error::Redis did not become ready in 15s"
+          # Dump container output so a startup failure is diagnosable from the
+          # job log, matching the Postgres step's timeout path.
+          docker logs "$REDIS_CONTAINER" || true
+          exit 1
      - name: Build platform
        working-directory: platform
        run: go build -o platform-server ./cmd/server
@ -135,17 +147,9 @@ jobs:
          exit 1
      - name: Assert migrations applied
        # Migrations auto-run at platform boot. Fail fast if they silently
-        # didn't — catches future migration-author mistakes (e.g. a new
-        # privileged op Postgres "dev" can't execute) before the E2E run.
-        # Uses docker exec into the service container's own psql — avoids
-        # a 10-20s apt-install step in the runner.
+        # didn't — catches future migration-author mistakes before the E2E run.
        run: |
-          pg_container=$(docker ps --filter "ancestor=postgres:16" --format "{{.ID}}" | head -1)
-          if [ -z "$pg_container" ]; then
-            echo "::error::Could not find postgres service container"
-            exit 1
-          fi
-          tables=$(docker exec "$pg_container" psql -U dev -d molecule -tAc "SELECT count(*) FROM information_schema.tables WHERE table_schema='public' AND table_name='workspaces'")
+          tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc "SELECT count(*) FROM information_schema.tables WHERE table_schema='public' AND table_name='workspaces'")
          if [ "$tables" != "1" ]; then
            echo "::error::Migrations did not apply — 'workspaces' table missing"
            cat platform/platform.log || true
@ -163,22 +167,31 @@ jobs:
          if [ -f platform/platform.pid ]; then
            kill "$(cat platform/platform.pid)" 2>/dev/null || true
          fi
+      - name: Stop service containers
+        # always(): force-remove the Postgres and Redis containers even when an
+        # earlier step failed. `|| true` keeps a missing container from
+        # failing the cleanup step itself.
+        if: always()
+        run: |
+          docker rm -f "$PG_CONTAINER" 2>/dev/null || true
+          docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true

  shellcheck:
    name: Shellcheck (E2E scripts)
-    runs-on: ubuntu-latest
+    runs-on: [self-hosted, macos, arm64]
    steps:
      - uses: actions/checkout@v4
      - name: Run shellcheck on tests/e2e/*.sh
-        uses: ludeeus/action-shellcheck@master
-        env:
-          SHELLCHECK_OPTS: --severity=warning
-        with:
-          scandir: tests/e2e
+        # `ludeeus/action-shellcheck` is a Docker action (Linux-only). We rely
+        # on shellcheck being pre-installed on the self-hosted runner instead.
+        run: |
+          # Unless a `shell:` default is set elsewhere, this script runs under
+          # `bash -e` without pipefail, so a failing `find` (e.g. tests/e2e
+          # missing or renamed) would be masked by xargs' exit status and the
+          # job would pass having linted nothing. Make the pipeline fail too.
+          set -o pipefail
+          if ! command -v shellcheck >/dev/null 2>&1; then
+            echo "::error::shellcheck is not installed on the runner"
+            exit 1
+          fi
+          # NUL-delimited hand-off keeps paths with spaces/newlines intact.
+          find tests/e2e -type f -name '*.sh' -print0 \
+            | xargs -0 shellcheck --severity=warning

  canvas-deploy-reminder:
    name: Canvas Deploy Reminder
-    runs-on: ubuntu-latest
+    runs-on: [self-hosted, macos, arm64]
    needs: canvas-build
    # Only fires on direct pushes to main (i.e. after a PR merges).
    # PRs get canvas-build CI but no reminder — no deployment happens on PRs.
@ -216,7 +229,12 @@ jobs:

  python-lint:
    name: Python Lint & Test
-    runs-on: ubuntu-latest
+    runs-on: [self-hosted, macos, arm64]
+    env:
+      # setup-python@v5 defaults to /Users/runner/hostedtoolcache which does
+      # not exist on the self-hosted runner (user is hongming-claw). Point it
+      # to the runner user's writable directory so Python 3.11 can be cached.
+      AGENT_TOOLSDIRECTORY: /Users/hongming-claw/hostedtoolcache
    defaults:
      run:
        working-directory: workspace-template
--- a/.github/workflows/publish-platform-image.yml
+++ b/.github/workflows/publish-platform-image.yml
@ -32,11 +32,19 @@ env:

 jobs:
  build-and-push:
-    runs-on: ubuntu-latest
+    runs-on: [self-hosted, macos, arm64]
    steps:
      - name: Checkout
        uses: actions/checkout@v4

+      - name: Set up QEMU
+        # Required on the Apple-silicon self-hosted runner — Fly tenant machines
+        # pull linux/amd64, and buildx needs binfmt handlers in Docker Desktop's
+        # VM to emulate amd64 during the build.
+        uses: docker/setup-qemu-action@v3
+        with:
+          platforms: linux/amd64
+
      - name: Set up Docker Buildx
        # Buildx enables cache-from/cache-to via GHA cache and multi-arch
        # builds without local docker daemon wrangling.
@ -75,10 +83,13 @@ jobs:
        # GHCR (or vice versa) — each registry's failure mode is isolated.
        # GHA cache is shared because both steps re-use the same Dockerfile
        # context + build args.
+        # Explicit linux/amd64 target: the runner is Apple-silicon (arm64),
+        # but Fly tenant machines are amd64. QEMU handles the emulation.
        uses: docker/build-push-action@v5
        with:
          context: ./platform
          file: ./platform/Dockerfile
+          platforms: linux/amd64
          push: true
          tags: |
            ${{ env.IMAGE_NAME }}:latest
@ -99,6 +110,7 @@ jobs:
        with:
          context: ./platform
          file: ./platform/Dockerfile
+          platforms: linux/amd64
          push: true
          tags: |
            ${{ env.FLY_IMAGE_NAME }}:latest
--- a/org-templates/molecule-dev/documentation-specialist/system-prompt.md
+++ b/org-templates/molecule-dev/documentation-specialist/system-prompt.md
@ -0,0 +1,56 @@
+# Documentation Specialist
+
+**LANGUAGE RULE: Always respond in the same language the user uses.**
+
+You are the Documentation Specialist for Molecule AI. You own end-to-end documentation across three repos and are the single source of truth for terminology consistency across all public surfaces.
+
+## Your Three Repos
+
+| Repo | Visibility | Your Role |
+|---|---|---|
+| `Molecule-AI/molecule-monorepo` | **Public** | Internal architecture docs, READMEs, API references, `docs/` directory |
+| `Molecule-AI/docs` | **Public** | Customer-facing docs site (Fumadocs + Next.js 15, deployed to doc.moleculesai.app) |
+| `Molecule-AI/molecule-controlplane` | **⚠️ PRIVATE** | Internal README, PLAN.md, and `docs/saas/` section in the monorepo only |
+
+## ⚠️ Privacy Rule — Never Violate
+
+`molecule-controlplane` is a **private** repo. Its source code, file paths, internal endpoints, schema details, infra config, billing/auth implementation details — **none of that** goes into the public docs site or public monorepo README. Public docs describe the SaaS **product** (signup, billing, tenant lifecycle, multi-tenant isolation guarantees) but never the provisioner's internals. When in doubt: don't publish.
+
+## How You Work
+
+1. **Watch PRs landing on all three repos.** Any PR that touches a public API, template, plugin, channel, or user-facing concept needs a paired docs PR within one cron tick.
+2. **Backfill stubs.** The docs site has stub pages marked "Coming soon" — work through them systematically.
+3. **Hold the line on terminology.** Every concept has exactly one canonical name across all three repos. Flag and fix inconsistencies.
+4. **Keep controlplane docs internal.** Controlplane changes get documented in `controlplane/README.md`, `controlplane/PLAN.md`, and the gated `docs/saas/` section — never in public surfaces.
+
+## Definition of Done
+
+- Every public surface has accurate, current, example-rich documentation
+- Every merged PR that touches a public surface has a paired docs PR open within one cron tick
+- Every stub page eventually gets backfilled
+- Controlplane internal docs stay current with recent changes
+- Nothing private leaks to public surfaces
+
+## Workflow
+
+1. **Receive task from PM** — docs gap, new feature to document, PR to pair, stub to backfill
+2. **Pull latest** from all three repos before starting
+3. **Write or update** the relevant docs files
+4. **Open a PR** on the appropriate repo (monorepo, docs site, or — for internal-only docs — controlplane)
+5. **Reference issues** — if your PR closes a docs gap issue, include `Closes #N` in the PR body
+6. **Never commit to `main`** — always a feature branch + PR
+
+## Memory
+
+Use `commit_memory` to track:
+- Stub pages on the docs site that need backfilling (with priority)
+- Recent platform PRs that have no docs PR yet
+- Recent controlplane PRs whose internal README needs updating
+- Terminology decisions (canonical names for concepts)
+
+## Hard Rules
+
+- **Never leak controlplane internals to public docs** — this is the top constraint
+- **Always branch + PR** — never commit directly to main on any repo
+- **Pair PRs within one cron tick** — don't let merged platform PRs go undocumented
+- **One canonical name per concept** — enforce consistency, file PRs to fix deviations
--- a/tests/e2e/test_api.sh
+++ b/tests/e2e/test_api.sh
@ -123,11 +123,11 @@ check "PATCH /workspaces/:id (name)" '"status":"updated"' "$R"
 R=$(curl -s "$BASE/workspaces/$ECHO_ID")
 check "Name updated" '"name":"Echo Agent v2"' "$R"

-# Test 17: Events
-R=$(curl -s "$BASE/events")
+# Test 17: Events (#165 / PR #167 — now admin-gated, bearer required)
+R=$(curl -s "$BASE/events" -H "Authorization: Bearer $ECHO_TOKEN")
 check "GET /events (has events)" 'WORKSPACE_ONLINE' "$R"

-R=$(curl -s "$BASE/events/$ECHO_ID")
+R=$(curl -s "$BASE/events/$ECHO_ID" -H "Authorization: Bearer $ECHO_TOKEN")
 check "GET /events/:id (has events for echo)" 'WORKSPACE_ONLINE' "$R"

 # Test 18: Update card
@ -253,8 +253,8 @@ check "List after delete (count=1)" "1" "$COUNT"
 echo ""
 echo "--- Bundle Round-Trip Test ---"

-# Export the summarizer workspace
-BUNDLE=$(curl -s "$BASE/bundles/export/$SUM_ID")
+# Export the summarizer workspace (#165 / PR #167 — admin-gated)
+BUNDLE=$(curl -s "$BASE/bundles/export/$SUM_ID" -H "Authorization: Bearer $SUM_TOKEN")
 check "GET /bundles/export/:id" '"name":"Summarizer Agent"' "$BUNDLE"

 # Capture original config for comparison
@ -321,8 +321,8 @@ check "Register re-imported workspace" '"status":"registered"' "$R"
 # revoked when SUM_ID was deleted above — use this one for cleanup instead.
 NEW_TOKEN=$(echo "$R" | e2e_extract_token)

-# Re-export and verify agent_card survives the round-trip
-REBUNDLE=$(curl -s "$BASE/bundles/export/$NEW_ID")
+# Re-export and verify agent_card survives the round-trip (#165 / PR #167 — admin-gated)
+REBUNDLE=$(curl -s "$BASE/bundles/export/$NEW_ID" -H "Authorization: Bearer $NEW_TOKEN")
 check "Re-exported bundle has agent_card" '"agent_card"' "$REBUNDLE"

 # Clean up — use the token just issued to the re-imported workspace