From 54b49ffd1be883d4a71e92cd5b10f2c6cb67b19f Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Wed, 15 Apr 2026 11:52:01 -0700 Subject: [PATCH] fix(code-review): idle loop hardening + idle_prompt docs + admin-auth runbook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses items 4, 5, 7 from the self-review of the batch merge. PR A (#228) covered items 1, 2, 3, 6 on the Go side. ## workspace-template/main.py — idle loop hardening - Replace asyncio.get_event_loop() with asyncio.get_running_loop() — the former is deprecated in 3.12+ and emits a DeprecationWarning on every idle fire. - Replace hardcoded urlopen timeout=600 with IDLE_FIRE_TIMEOUT_SECONDS clamped to max(60, min(300, idle_interval_seconds)). Long cadence workspaces no longer hold dangling requests open for 10 minutes; the cap adapts automatically when the interval is short. - Type the exception handling: split HTTPError (has .code) from URLError (connection-level) from the generic catch-all. Log status + error class separately so operators can grep for specific failure modes instead of a bare "post failed". - Fire-and-forget no longer loses exceptions. run_in_executor Future now has an add_done_callback that logs the outcome, so a panic in _post_sync surfaces as "Idle loop: post failed — status=None err=..." instead of Python's default "Task exception was never retrieved" warning burried in stderr. ## org-templates/molecule-dev/org.yaml — discoverability Added idle_prompt + idle_interval_seconds to the defaults: block with explanatory comments. Without this, users had to read main.py to discover the feature. ## docs/runbooks/admin-auth.md — new Documents the three middleware variants (AdminAuth strict, CanvasOrBearer soft, WorkspaceAuth per-id), the exact contract of each, and the three-question test for adding a new route to CanvasOrBearer. Also flags the session-cookie follow-up as Phase H. Referenced PRs: #138, #164, #165, #166, #167, #168, #190, #194, #203, #228. No code deltas in platform/ beyond the Python + YAML + docs changes. Full pytest suite unchanged except the pre-existing test_hermes_smoke flake that fails in full-suite but passes in isolation (test isolation bug, not introduced by this PR). Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/runbooks/admin-auth.md | 72 +++++++++++++++++++++++++++++ org-templates/molecule-dev/org.yaml | 9 ++++ workspace-template/main.py | 55 +++++++++++++++++----- 3 files changed, 125 insertions(+), 11 deletions(-) create mode 100644 docs/runbooks/admin-auth.md diff --git a/docs/runbooks/admin-auth.md b/docs/runbooks/admin-auth.md new file mode 100644 index 00000000..df3aa032 --- /dev/null +++ b/docs/runbooks/admin-auth.md @@ -0,0 +1,72 @@ +# Admin auth middleware reference + +Two Gin middleware variants gate admin-style routes on the platform. Pick the +right one — they have different security contracts. + +## `middleware.AdminAuth(db.DB)` — strict bearer-only + +Required for any route where a forged request could: + +- Leak prompts or memory (`GET /bundles/export/:id`, `GET /events*`) +- Create or mutate workspaces (`POST /workspaces`, `DELETE /workspaces/:id`, `POST /bundles/import`, `POST /templates/import`, `POST /org/import`) +- Leak operational intelligence (`GET /admin/liveness`) +- Touch approvals, secrets, or schedules at the cross-workspace level + +**Contract:** + +1. Reads `Authorization: Bearer ` and validates against `workspace_auth_tokens` via `wsauth.ValidateAnyToken` +2. **No fallback.** Missing or invalid bearer → 401 +3. Lazy-bootstrap fail-open: if `HasAnyLiveTokenGlobal` returns 0 (fresh install / rolling upgrade), the route is open. First token issued to any workspace activates enforcement for every route. + +**DO NOT use Origin header or session-cookie fallbacks here.** That reopens every route to curl-based spoofing — CORS is a browser-only defence, not a server-side auth signal. + +## `middleware.CanvasOrBearer(db.DB)` — softer, canvas-friendly + +**Only** for cosmetic routes where a forged request has zero data / security impact. + +Currently used on: + +| Route | Why soft is OK | +|-------|----------------| +| `PUT /canvas/viewport` | Viewport corruption resets on the next browser refresh. No data exposure, no resource creation. | + +**Contract:** + +1. Reads `Authorization: Bearer ` first. If present but **invalid**, returns 401 — **no fall-through** to the Origin path. (This was a CanvasOrBearer bug fixed during code review; preserved as the invariant.) +2. Empty bearer → check `Origin` header against `CORS_ORIGINS` env var. Exact-match only. Empty Origin does not pass. +3. Lazy-bootstrap fail-open identical to `AdminAuth`. + +**The Origin check is NOT a strict auth boundary.** Any non-browser client (curl, an attacker tool) can forge the `Origin` header. CORS protects the browser from reading the response, not the server from receiving the request. Apply `CanvasOrBearer` only to routes where a curl attacker with knowledge of the canvas origin could do nothing harmful. + +### When to add a new route to `CanvasOrBearer` + +Ask these three questions. **All three** must be yes or the route belongs behind strict `AdminAuth`: + +1. Can a browser at `https://.moleculesai.app` need this route without a bearer token? (If not, just use `AdminAuth` — browsers can send bearers via the session-cookie auth flow once that lands.) +2. If a non-browser attacker forged `Origin: https://.moleculesai.app`, would the worst-case outcome be purely cosmetic — recoverable with a browser refresh and no data exposure? +3. Is there no tenant isolation concern (cross-org data leak) on this route? + +If yes/yes/yes → `CanvasOrBearer` is acceptable. Document the rationale in the PR that adds it, and add the route to the table above in the same PR. + +## Relationship to `WorkspaceAuth` + +`WorkspaceAuth` is the `/workspaces/:id/*` sub-route middleware. Different contract entirely: it binds a bearer token to a specific workspace ID so workspace A's token can't hit workspace B's sub-routes. Used for all `/workspaces/:id/*` paths except the A2A proxy (which has its own `CanCommunicate` access-control layer). + +AdminAuth accepts **any** valid workspace bearer (it's a global gate). WorkspaceAuth accepts only the bearer for the **specific** `:id` in the URL path. + +## Known gap (Phase H follow-up) + +`CanvasOrBearer` is a tactical fix for the #168 canvas-regression problem. The proper long-term path is **session-cookie-accepting AdminAuth**: extend `AdminAuth` to validate the `mcp_session` cookie via `auth.Provider.VerifySession` (WorkOS in prod, DisabledProvider in dev). That would give the full list of admin routes browser compatibility without an Origin-based workaround. Tracked as a Phase H item once the SaaS control plane is the primary deployment surface. + +## Related PRs and issues + +- #138 — first canvas regression (PATCH /workspaces/:id), fixed with field-level authz in the handler (`WorkspaceHandler.Update`) +- #164 — CRITICAL anonymous workspace creation via unauthenticated `POST /bundles/import` +- #165 — HIGH topology disclosure via unauthenticated `GET /events` and `GET /bundles/export/:id` +- #166 — MEDIUM viewport corruption / liveness leak +- #167 — first auth-gate batch, strict `AdminAuth` on 5 routes +- #168 — canvas regression from the strict gating +- #190 — HIGH unauthenticated `POST /templates/import` +- #194 — rejected Origin-fallback approach (would have reopened #164) +- #203 — the `CanvasOrBearer` middleware, route-split approach, only on `PUT /canvas/viewport` +- #228 — code-review follow-up: CanvasOrBearer invalid-bearer fall-through fix diff --git a/org-templates/molecule-dev/org.yaml b/org-templates/molecule-dev/org.yaml index 55366f6d..2cbb5522 100644 --- a/org-templates/molecule-dev/org.yaml +++ b/org-templates/molecule-dev/org.yaml @@ -67,6 +67,15 @@ defaults: # workspace_dir: not set by default — each agent gets an isolated Docker volume # Set per-workspace to bind-mount a host directory as /workspace + # Idle-loop reflection pattern (#205). When idle_prompt is non-empty, the + # workspace self-sends this prompt every idle_interval_seconds while its + # heartbeat.active_tasks == 0. Pattern from Hermes/Letta. Cost collapses to + # event-driven (no LLM call unless there's actually nothing to do). Off by + # default to avoid surprising token burn — set per-workspace to enable. + # Keep idle prompts local (no A2A sends): same rule as initial_prompt. + idle_prompt: "" + idle_interval_seconds: 600 # 10 min — ignored when idle_prompt is empty + # initial_prompt runs once on first boot (not on restart). # ${GITHUB_REPO} is a container env var from .env secrets. # IMPORTANT: Do NOT send A2A messages in initial_prompt — other agents may not diff --git a/workspace-template/main.py b/workspace-template/main.py index 23782e9d..98aa3725 100644 --- a/workspace-template/main.py +++ b/workspace-template/main.py @@ -388,14 +388,21 @@ async def main(): # pragma: no cover # per-workspace to enable. idle_loop_task = None if config.idle_prompt: + # Idle-fire HTTP timeout. Kept tight relative to the fire cadence so a + # hung platform doesn't accumulate dangling requests — a fire that + # takes longer than the idle interval itself is almost certainly stuck. + IDLE_FIRE_TIMEOUT_SECONDS = max(60, min(300, config.idle_interval_seconds)) + # Initial settle delay — never longer than 60s so cold-start races + # don't stall the first fire, and never shorter than the configured + # interval (short intervals shouldn't fire instantly on boot either). + IDLE_INITIAL_SETTLE_SECONDS = min(config.idle_interval_seconds, 60) + async def _run_idle_loop(): """Self-sends config.idle_prompt periodically when the workspace is idle.""" - # Wait for server + initial prompt to settle before the first idle check. - # Short wait (min of 60s or interval) so cold-start races don't fire instantly. - await asyncio.sleep(min(config.idle_interval_seconds, 60)) + await asyncio.sleep(IDLE_INITIAL_SETTLE_SECONDS) import json as _json - import urllib.request + from urllib import request as _urlreq, error as _urlerr while True: try: @@ -424,20 +431,46 @@ async def main(): # pragma: no cover }).encode() def _post_sync(): + # Returns (status_code, error_type) so the caller logs the + # actual outcome instead of a bare "post failed" line. try: - req = urllib.request.Request( + req = _urlreq.Request( f"{platform_url}/workspaces/{workspace_id}/a2a", data=payload, headers={"Content-Type": "application/json"}, ) - with urllib.request.urlopen(req, timeout=600) as resp: + with _urlreq.urlopen(req, timeout=IDLE_FIRE_TIMEOUT_SECONDS) as resp: resp.read() - except Exception as e: - print(f"Idle loop: post failed — {e}", flush=True) + return resp.status, None + except _urlerr.HTTPError as e: + return e.code, type(e).__name__ + except _urlerr.URLError as e: + return None, f"URLError: {e.reason}" + except Exception as e: # pragma: no cover — catch-all safety net + return None, type(e).__name__ - print(f"Idle loop: firing (active_tasks=0, interval={config.idle_interval_seconds}s)", flush=True) - loop_ref = asyncio.get_event_loop() - loop_ref.run_in_executor(None, _post_sync) + print( + f"Idle loop: firing (active_tasks=0, interval={config.idle_interval_seconds}s, " + f"timeout={IDLE_FIRE_TIMEOUT_SECONDS}s)", + flush=True, + ) + loop_ref = asyncio.get_running_loop() + + def _log_result(future): + try: + status, err = future.result() + if err: + print( + f"Idle loop: post failed — status={status} err={err}", + flush=True, + ) + else: + print(f"Idle loop: post ok status={status}", flush=True) + except Exception as e: # pragma: no cover + print(f"Idle loop: executor callback crashed — {e}", flush=True) + + fut = loop_ref.run_in_executor(None, _post_sync) + fut.add_done_callback(_log_result) idle_loop_task = asyncio.create_task(_run_idle_loop())