From 6bcafd643ec499c2b37c838364bb06d1b53399ee Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 14 Apr 2026 17:05:36 -0700 Subject: [PATCH 1/2] feat(ci): mirror platform image to registry.fly.io/molecule-tenant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keeps ghcr.io/molecule-ai/platform private (per CEO direction — open- source when full SaaS ships) while still letting the private control plane's Fly provisioner boot tenant machines: Fly auto-authenticates same-org machines against registry.fly.io, no per-tenant pull credentials to wire. Workflow now logs into both GHCR (using built-in GITHUB_TOKEN) and Fly registry (using FLY_API_TOKEN secret) and pushes the same image to four tags total: - ghcr.io/molecule-ai/platform:latest - ghcr.io/molecule-ai/platform:sha- - registry.fly.io/molecule-tenant:latest - registry.fly.io/molecule-tenant:sha- Secret added via `gh secret set FLY_API_TOKEN` on the public repo. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/publish-platform-image.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/publish-platform-image.yml b/.github/workflows/publish-platform-image.yml index 814647a5..dc22b2e7 100644 --- a/.github/workflows/publish-platform-image.yml +++ b/.github/workflows/publish-platform-image.yml @@ -23,6 +23,12 @@ permissions: env: # GHCR accepts mixed-case, but most tooling lowercases — keep us consistent. IMAGE_NAME: ghcr.io/molecule-ai/platform + # Fly registry mirror — tenant machines provisioned by the private + # `molecule-controlplane` pull from here (private GHCR image can't be + # pulled by Fly machines without auth plumbing we don't want to add). + # Fly auto-authenticates same-org machines against registry.fly.io, so + # mirroring keeps GHCR private while tenants still boot. 
+ FLY_IMAGE_NAME: registry.fly.io/molecule-tenant jobs: build-and-push: @@ -43,6 +49,15 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Log in to Fly registry + # Fly's registry accepts any username as long as the password is a + # valid FLY_API_TOKEN. Must be added to repo Secrets first. + uses: docker/login-action@v3 + with: + registry: registry.fly.io + username: x + password: ${{ secrets.FLY_API_TOKEN }} + - name: Compute tags id: tags # Emit two tags per build: `latest` (floating, always the main tip) @@ -60,6 +75,8 @@ jobs: tags: | ${{ env.IMAGE_NAME }}:latest ${{ env.IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }} + ${{ env.FLY_IMAGE_NAME }}:latest + ${{ env.FLY_IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }} cache-from: type=gha cache-to: type=gha,mode=max labels: | From 73dbca4e38df567533bae148396f6fb066b28a55 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 14 Apr 2026 17:09:11 -0700 Subject: [PATCH 2/2] review: split push steps, runbook for secret rotation, username clarity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses PR #82 code review: 🟡×3 + 🔵×5. - Fly registry login username: 'x' → 'molecule-ai' + explanatory comment. - Build & push split into two steps (GHCR / Fly registry) so a single- registry outage can't fail the other. Second step uses 'if: always()' to ensure Fly mirror runs even if GHCR push flakes. - docs/runbooks/saas-secrets.md: full secret map + rotation procedures for every SaaS credential, with danger-case callouts. Documents the coupled FLY_API_TOKEN (lives in GHA secret AND fly secrets — must be rotated in both). - CLAUDE.md: new 'SaaS ops' section linking to the runbook. 
--- .github/workflows/publish-platform-image.yml | 37 +++++-- CLAUDE.md | 9 ++ docs/runbooks/saas-secrets.md | 102 +++++++++++++++++++ 3 files changed, 142 insertions(+), 6 deletions(-) create mode 100644 docs/runbooks/saas-secrets.md diff --git a/.github/workflows/publish-platform-image.yml b/.github/workflows/publish-platform-image.yml index dc22b2e7..0c217f3a 100644 --- a/.github/workflows/publish-platform-image.yml +++ b/.github/workflows/publish-platform-image.yml @@ -50,12 +50,16 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} - name: Log in to Fly registry - # Fly's registry accepts any username as long as the password is a - # valid FLY_API_TOKEN. Must be added to repo Secrets first. + # Fly's registry is entirely token-auth: username is ignored, password + # must be a valid FLY_API_TOKEN. We pass "molecule-ai" as a human- + # readable placeholder so this step is obvious to future readers. + # Rotation: see docs/runbooks/saas-secrets.md — FLY_API_TOKEN lives in + # two places (GitHub Actions secret here + `fly secrets` on molecule-cp) + # and MUST be updated in both on rotation. uses: docker/login-action@v3 with: registry: registry.fly.io - username: x + username: molecule-ai password: ${{ secrets.FLY_API_TOKEN }} - name: Compute tags @@ -66,7 +70,11 @@ jobs: run: | echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT" - - name: Build & push + - name: Build & push to GHCR + # Split from the Fly mirror so a registry.fly.io outage doesn't block + # GHCR (or vice versa) — each registry's failure mode is isolated. + # GHA cache is shared because both steps re-use the same Dockerfile + # context + build args. 
uses: docker/build-push-action@v5 with: context: ./platform @@ -75,11 +83,28 @@ jobs: tags: | ${{ env.IMAGE_NAME }}:latest ${{ env.IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }} - ${{ env.FLY_IMAGE_NAME }}:latest - ${{ env.FLY_IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }} cache-from: type=gha cache-to: type=gha,mode=max labels: | org.opencontainers.image.source=https://github.com/${{ github.repository }} org.opencontainers.image.revision=${{ github.sha }} org.opencontainers.image.description=Molecule AI tenant platform (one instance per org) + + - name: Build & push to Fly registry + # Continues even if GHCR push failed — `if: always()` ensures the + # private control plane's tenant-image mirror lands regardless of + # any GHCR-side flakiness. + if: always() + uses: docker/build-push-action@v5 + with: + context: ./platform + file: ./platform/Dockerfile + push: true + tags: | + ${{ env.FLY_IMAGE_NAME }}:latest + ${{ env.FLY_IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }} + cache-from: type=gha + labels: | + org.opencontainers.image.source=https://github.com/${{ github.repository }} + org.opencontainers.image.revision=${{ github.sha }} + org.opencontainers.image.description=Molecule AI tenant platform (one instance per org) diff --git a/CLAUDE.md b/CLAUDE.md index 7fbefd9e..e14c0ee7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -14,6 +14,15 @@ overlap / differentiation / terminology-collision notes. Cross-referenced from `PLAN.md` and `README.md`; it's the canonical starting point for "what else is out there." +## SaaS ops + +When rotating SaaS credentials (Fly / Neon / Upstash / envelope key), read +**`docs/runbooks/saas-secrets.md`** first. It documents which secrets live +in multiple places (e.g. `FLY_API_TOKEN` in both GitHub Actions and `fly +secrets` on `molecule-cp`), the correct rotation order, and danger cases — +notably `SECRETS_ENCRYPTION_KEY`, which cannot be rotated without a data +migration until Phase H lands KMS envelope encryption. 
+ ## Agent operating rules (auto-loaded — read first) The following are project-level rules that override default behavior. They diff --git a/docs/runbooks/saas-secrets.md b/docs/runbooks/saas-secrets.md new file mode 100644 index 00000000..16fd3ad9 --- /dev/null +++ b/docs/runbooks/saas-secrets.md @@ -0,0 +1,102 @@ +# SaaS secret rotation — runbook + +Where each secret lives, why, and the **full rotation procedure** so a partial +update doesn't silently break production. + +## Secret map + +| Secret | Location(s) | Purpose | +|---|---|---| +| `FLY_API_TOKEN` | **(a)** `molecule-monorepo` GitHub Actions secret (push image to `registry.fly.io/molecule-tenant`) + **(b)** `fly secrets` on `molecule-cp` app (control plane creates + deletes tenant Fly Machines) | Any Fly Machines API call | +| `NEON_API_KEY` | `fly secrets` on `molecule-cp` | Create + delete tenant Neon branches | +| `DATABASE_URL` | `fly secrets` on `molecule-cp` | Control-plane Postgres connection (Neon `cool-sea-89357706`) | +| `TENANT_REDIS_URL` | `fly secrets` on `molecule-cp` | Injected into every tenant container as `REDIS_URL` | +| `SECRETS_ENCRYPTION_KEY` | `fly secrets` on `molecule-cp` | AES-256 key wrapping tenant DB/Redis URLs in `org_instances` (provisioner + tenant use this) | +| `GITHUB_TOKEN` | Built-in GitHub Actions token | GHCR push; rotated automatically | + +## Coupled secrets — MUST rotate together + +`FLY_API_TOKEN` is the one secret duplicated across systems. Rotating **only +one** will cause **silent** breakage: + +- Rotating **only (a) GHA** → image publish workflow fails, but no alert; control plane keeps provisioning from the stale `latest` tag. +- Rotating **only (b) Fly secrets** → control plane's Fly API calls start erroring (`401`), tenant provisioning fails, but image publishes keep succeeding so everything *looks* fine on the build side. + +## Rotation procedure — FLY_API_TOKEN + +1. 
Generate new token: + ``` + flyctl tokens create deploy --name molecule-cp-rotation-$(date +%Y%m%d) + ``` +2. Update **both** locations (order matters — Fly secrets first, then GHA): + ``` + # (b) Fly secrets — triggers zero-downtime redeploy + flyctl secrets set --app molecule-cp FLY_API_TOKEN='FlyV1 fm2_...' + + # (a) GitHub Actions secret — next workflow run uses new token + echo 'FlyV1 fm2_...' | gh secret set FLY_API_TOKEN --repo Molecule-AI/molecule-monorepo + ``` +3. Verify: + ``` + # Control plane can reach Fly API: + curl https://molecule-cp.fly.dev/health + # Trigger image publish (dispatches workflow, pushes to both registries): + gh workflow run publish-platform-image.yml --repo Molecule-AI/molecule-monorepo + gh run list --repo Molecule-AI/molecule-monorepo --workflow publish-platform-image --limit 1 + ``` +4. Revoke the old token: + ``` + flyctl tokens list + flyctl tokens revoke <old-token-id> + ``` + +## Rotation procedure — NEON_API_KEY + +1. Create replacement key in Neon console → Account Settings → API Keys. +2. Update Fly secrets: + ``` + flyctl secrets set --app molecule-cp NEON_API_KEY='napi_...' + ``` +3. Trigger a test provision (dry run — create + delete): + ``` + curl -X POST https://molecule-cp.fly.dev/cp/orgs \ + -H 'Content-Type: application/json' \ + -d '{"slug":"keytest-'$(date +%s)'","name":"Rotation test"}' + # Wait 60s, inspect logs: + flyctl logs --app molecule-cp --no-tail | tail -30 + # Clean up the test org via DELETE once live + ``` +4. Revoke old key in Neon console. + +## Rotation procedure — SECRETS_ENCRYPTION_KEY + +**DANGEROUS**: rotating this key will invalidate every encrypted row in +`org_instances.database_url_encrypted` + `redis_url_encrypted`. Every tenant +becomes unreachable until re-provisioned. + +Mitigation: we intentionally defer real KMS + key-rotation to Phase H. Until +then, **do not rotate this key unless compromised.** If compromise, procedure is: + +1. Generate new key: `openssl rand -hex 32` +2. 
Set new key on `molecule-cp`. +3. For every row in `org_instances`: re-provision the tenant (creates fresh + Neon branch + Fly machine). The old encrypted URLs are un-decryptable but + irrelevant — we mint fresh ones. +4. Migration to rotate encrypted columns in-place (decrypt-with-old → encrypt- + with-new) is Phase H work and requires envelope encryption with KMS. + +## Rotation procedure — DATABASE_URL (control plane) + +The Neon `molecule-cp` project has a stable primary endpoint. Rotate only if: +- Neon forces a migration +- The connection-URI password is leaked + +Procedure: regenerate URI via Neon API → `flyctl secrets set DATABASE_URL=...`. +Zero-downtime (Fly applies secret via rolling restart). + +## Emergency contacts + +- **Fly**: billing dashboard at fly.io → Support +- **Neon**: console.neon.tech → Support +- **Upstash**: upstash.com → Support +- **GHCR**: github.com/orgs/Molecule-AI (org admins)