diff --git a/docs/runbooks/admin-auth.md b/docs/runbooks/admin-auth.md new file mode 100644 index 00000000..df3aa032 --- /dev/null +++ b/docs/runbooks/admin-auth.md @@ -0,0 +1,72 @@ +# Admin auth middleware reference + +Two Gin middleware variants gate admin-style routes on the platform. Pick the +right one — they have different security contracts. + +## `middleware.AdminAuth(db.DB)` — strict bearer-only + +Required for any route where a forged request could: + +- Leak prompts or memory (`GET /bundles/export/:id`, `GET /events*`) +- Create or mutate workspaces (`POST /workspaces`, `DELETE /workspaces/:id`, `POST /bundles/import`, `POST /templates/import`, `POST /org/import`) +- Leak operational intelligence (`GET /admin/liveness`) +- Touch approvals, secrets, or schedules at the cross-workspace level + +**Contract:** + +1. Reads `Authorization: Bearer ` and validates against `workspace_auth_tokens` via `wsauth.ValidateAnyToken` +2. **No fallback.** Missing or invalid bearer → 401 +3. Lazy-bootstrap fail-open: if `HasAnyLiveTokenGlobal` returns 0 (fresh install / rolling upgrade), the route is open. First token issued to any workspace activates enforcement for every route. + +**DO NOT use Origin header or session-cookie fallbacks here.** That reopens every route to curl-based spoofing — CORS is a browser-only defence, not a server-side auth signal. + +## `middleware.CanvasOrBearer(db.DB)` — softer, canvas-friendly + +**Only** for cosmetic routes where a forged request has zero data / security impact. + +Currently used on: + +| Route | Why soft is OK | +|-------|----------------| +| `PUT /canvas/viewport` | Viewport corruption resets on the next browser refresh. No data exposure, no resource creation. | + +**Contract:** + +1. Reads `Authorization: Bearer ` first. If present but **invalid**, returns 401 — **no fall-through** to the Origin path. (This was a CanvasOrBearer bug fixed during code review; preserved as the invariant.) +2. Empty bearer → check `Origin` header against `CORS_ORIGINS` env var. Exact-match only. Empty Origin does not pass. +3. Lazy-bootstrap fail-open identical to `AdminAuth`. + +**The Origin check is NOT a strict auth boundary.** Any non-browser client (curl, an attacker tool) can forge the `Origin` header. CORS protects the browser from reading the response, not the server from receiving the request. Apply `CanvasOrBearer` only to routes where a curl attacker with knowledge of the canvas origin could do nothing harmful. + +### When to add a new route to `CanvasOrBearer` + +Ask these three questions. **All three** must be yes or the route belongs behind strict `AdminAuth`: + +1. Can a browser at `https://.moleculesai.app` need this route without a bearer token? (If not, just use `AdminAuth` — browsers can send bearers via the session-cookie auth flow once that lands.) +2. If a non-browser attacker forged `Origin: https://.moleculesai.app`, would the worst-case outcome be purely cosmetic — recoverable with a browser refresh and no data exposure? +3. Is there no tenant isolation concern (cross-org data leak) on this route? + +If yes/yes/yes → `CanvasOrBearer` is acceptable. Document the rationale in the PR that adds it, and add the route to the table above in the same PR. + +## Relationship to `WorkspaceAuth` + +`WorkspaceAuth` is the `/workspaces/:id/*` sub-route middleware. Different contract entirely: it binds a bearer token to a specific workspace ID so workspace A's token can't hit workspace B's sub-routes. Used for all `/workspaces/:id/*` paths except the A2A proxy (which has its own `CanCommunicate` access-control layer). + +AdminAuth accepts **any** valid workspace bearer (it's a global gate). WorkspaceAuth accepts only the bearer for the **specific** `:id` in the URL path. + +## Known gap (Phase H follow-up) + +`CanvasOrBearer` is a tactical fix for the #168 canvas-regression problem. The proper long-term path is **session-cookie-accepting AdminAuth**: extend `AdminAuth` to validate the `mcp_session` cookie via `auth.Provider.VerifySession` (WorkOS in prod, DisabledProvider in dev). That would give the full list of admin routes browser compatibility without an Origin-based workaround. Tracked as a Phase H item once the SaaS control plane is the primary deployment surface. + +## Related PRs and issues + +- #138 — first canvas regression (PATCH /workspaces/:id), fixed with field-level authz in the handler (`WorkspaceHandler.Update`) +- #164 — CRITICAL anonymous workspace creation via unauthenticated `POST /bundles/import` +- #165 — HIGH topology disclosure via unauthenticated `GET /events` and `GET /bundles/export/:id` +- #166 — MEDIUM viewport corruption / liveness leak +- #167 — first auth-gate batch, strict `AdminAuth` on 5 routes +- #168 — canvas regression from the strict gating +- #190 — HIGH unauthenticated `POST /templates/import` +- #194 — rejected Origin-fallback approach (would have reopened #164) +- #203 — the `CanvasOrBearer` middleware, route-split approach, only on `PUT /canvas/viewport` +- #228 — code-review follow-up: CanvasOrBearer invalid-bearer fall-through fix diff --git a/org-templates/molecule-dev/org.yaml b/org-templates/molecule-dev/org.yaml index 6dd2579e..540930c8 100644 --- a/org-templates/molecule-dev/org.yaml +++ b/org-templates/molecule-dev/org.yaml @@ -67,6 +67,15 @@ defaults: # workspace_dir: not set by default — each agent gets an isolated Docker volume # Set per-workspace to bind-mount a host directory as /workspace + # Idle-loop reflection pattern (#205). When idle_prompt is non-empty, the + # workspace self-sends this prompt every idle_interval_seconds while its + # heartbeat.active_tasks == 0. Pattern from Hermes/Letta. Cost collapses to + # event-driven (no LLM call unless there's actually nothing to do). Off by + # default to avoid surprising token burn — set per-workspace to enable. + # Keep idle prompts local (no A2A sends): same rule as initial_prompt. + idle_prompt: "" + idle_interval_seconds: 600 # 10 min — ignored when idle_prompt is empty + # initial_prompt runs once on first boot (not on restart). # ${GITHUB_REPO} is a container env var from .env secrets. # IMPORTANT: Do NOT send A2A messages in initial_prompt — other agents may not diff --git a/platform/internal/handlers/activity.go b/platform/internal/handlers/activity.go index b92538d8..4699a3a9 100644 --- a/platform/internal/handlers/activity.go +++ b/platform/internal/handlers/activity.go @@ -338,6 +338,11 @@ func (h *ActivityHandler) Report(c *gin.Context) { // Empty source_id falls through to the default-to-self branch below. sourceID := body.SourceID if sourceID != "" && sourceID != workspaceID { + // Log the spoof attempt as a security event so an auditor cron can + // surface repeat probing. Keep the log line stable (greppable) and + // avoid echoing attacker-supplied data verbatim beyond the UUIDs. + log.Printf("security: source_id spoof attempt — authed_workspace=%s body_source_id=%s remote=%s", + workspaceID, sourceID, c.ClientIP()) c.JSON(http.StatusForbidden, gin.H{"error": "source_id must match authenticated workspace"}) return } diff --git a/platform/internal/handlers/handlers_test.go b/platform/internal/handlers/handlers_test.go index d8f738d8..20897baf 100644 --- a/platform/internal/handlers/handlers_test.go +++ b/platform/internal/handlers/handlers_test.go @@ -1081,3 +1081,53 @@ func TestSharedContext_NoSharedFiles(t *testing.T) { t.Errorf("unmet sqlmock expectations: %v", err) } } + +// TestActivityHandler_Report_SourceIDSpoofRejected verifies the #209 spoof +// guard: a workspace authenticated for :id cannot inject activity rows with +// source_id pointing at a different workspace. Bearer-auth middleware would +// already cover the obvious case; this is the belt-and-suspenders body check. +func TestActivityHandler_Report_SourceIDSpoofRejected(t *testing.T) { + setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewActivityHandler(broadcaster) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-alice"}} + // alice's workspace authenticated — but body claims source_id=ws-bob. + body := `{"activity_type":"agent_log","summary":"fake log","source_id":"ws-bob"}` + c.Request = httptest.NewRequest("POST", "/workspaces/ws-alice/activity", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Report(c) + + if w.Code != http.StatusForbidden { + t.Errorf("spoof: got %d, want 403 (%s)", w.Code, w.Body.String()) + } +} + +// TestActivityHandler_Report_MatchingSourceIDAccepted — the non-spoof path: +// body.source_id explicitly matches workspaceID, still accepted. +func TestActivityHandler_Report_MatchingSourceIDAccepted(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewActivityHandler(broadcaster) + + mock.ExpectExec("INSERT INTO activity_logs"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-alice"}} + body := `{"activity_type":"agent_log","summary":"self log","source_id":"ws-alice"}` + c.Request = httptest.NewRequest("POST", "/workspaces/ws-alice/activity", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Report(c) + + if w.Code != http.StatusOK { + t.Errorf("matching source_id: got %d, want 200 (%s)", w.Code, w.Body.String()) + } +} diff --git a/platform/internal/handlers/schedules_test.go b/platform/internal/handlers/schedules_test.go index ad8a62ac..a3d307f6 100644 --- a/platform/internal/handlers/schedules_test.go +++ b/platform/internal/handlers/schedules_test.go @@ -124,3 +124,50 @@ func TestList_IncludesSourceColumn(t *testing.T) { t.Fatalf("unmet expectations: %v", err) } } + +// TestHistory_IncludesErrorDetail — #152 problem B coverage. The history +// endpoint must surface error_detail from activity_logs so clients know +// why a cron run failed (not just that it failed). Writes a fake cron_run +// row via sqlmock with a non-empty error_detail and asserts it reaches +// the JSON response. +func TestHistory_IncludesErrorDetail(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + handler := NewScheduleHandler() + + workspaceID := "550e8400-e29b-41d4-a716-446655440000" + scheduleID := "11111111-1111-1111-1111-111111111111" + now := time.Now() + + cols := []string{"created_at", "duration_ms", "status", "error_detail", "request_body"} + mock.ExpectQuery("SELECT created_at, duration_ms, status"). + WithArgs(workspaceID, scheduleID). + WillReturnRows(sqlmock.NewRows(cols). + AddRow(now, 4200, "error", "HTTP 500 — workspace agent OOM", `{"schedule_id":"`+scheduleID+`"}`). + AddRow(now, 1500, "ok", "", `{"schedule_id":"`+scheduleID+`"}`)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{ + {Key: "id", Value: workspaceID}, + {Key: "scheduleId", Value: scheduleID}, + } + c.Request = httptest.NewRequest("GET", + "/workspaces/"+workspaceID+"/schedules/"+scheduleID+"/history", nil) + + handler.History(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + body := w.Body.String() + if !strings.Contains(body, `"error_detail":"HTTP 500 — workspace agent OOM"`) { + t.Errorf("history response missing populated error_detail: %s", body) + } + if !strings.Contains(body, `"error_detail":""`) { + t.Errorf("history response missing empty error_detail on ok row: %s", body) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("sqlmock: %v", err) + } +} diff --git a/platform/internal/middleware/wsauth_middleware.go b/platform/internal/middleware/wsauth_middleware.go index 9eee64f7..0b357756 100644 --- a/platform/internal/middleware/wsauth_middleware.go +++ b/platform/internal/middleware/wsauth_middleware.go @@ -119,12 +119,17 @@ func CanvasOrBearer(database *sql.DB) gin.HandlerFunc { return } - // Path 1: valid bearer. + // Path 1: bearer present → bearer MUST validate. Do not fall through + // to Origin on an invalid bearer — an attacker with a revoked / + // expired token + a matching Origin would otherwise bypass auth. + // Empty bearer → skip to Origin path (canvas never sends one). if tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization")); tok != "" { - if err := wsauth.ValidateAnyToken(ctx, database, tok); err == nil { - c.Next() + if err := wsauth.ValidateAnyToken(ctx, database, tok); err != nil { + c.AbortWithStatusJSON(http.StatusUnauthorized, gin.H{"error": "invalid admin auth token"}) return } + c.Next() + return } // Path 2: canvas origin match. Read CORS_ORIGINS at request time so diff --git a/platform/internal/scheduler/scheduler.go b/platform/internal/scheduler/scheduler.go index 43285f47..8839fe0e 100644 --- a/platform/internal/scheduler/scheduler.go +++ b/platform/internal/scheduler/scheduler.go @@ -233,12 +233,8 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) { `SELECT COALESCE(active_tasks, 0) FROM workspaces WHERE id = $1`, sched.WorkspaceID, ).Scan(&activeTasks); err == nil && activeTasks > 0 { - wsID := sched.WorkspaceID - if len(wsID) > 12 { - wsID = wsID[:12] - } log.Printf("Scheduler: skipping '%s' on busy workspace %s (active_tasks=%d)", - sched.Name, wsID, activeTasks) + sched.Name, short(sched.WorkspaceID, 12), activeTasks) s.recordSkipped(ctx, sched, activeTasks) return } @@ -246,11 +242,7 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) { fireCtx, cancel := context.WithTimeout(ctx, fireTimeout) defer cancel() - idPrefix := sched.ID - if len(idPrefix) > 8 { - idPrefix = idPrefix[:8] - } - msgID := fmt.Sprintf("cron-%s-%s", idPrefix, uuid.New().String()[:8]) + msgID := fmt.Sprintf("cron-%s-%s", short(sched.ID, 8), uuid.New().String()[:8]) a2aBody, _ := json.Marshal(map[string]interface{}{ "method": "message/send", @@ -263,7 +255,7 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) { }, }) - log.Printf("Scheduler: firing '%s' → workspace %s", sched.Name, sched.WorkspaceID[:12]) + log.Printf("Scheduler: firing '%s' → workspace %s", sched.Name, short(sched.WorkspaceID, 12)) // Empty callerID = canvas-style request (bypasses access control, source_id=NULL in activity log). // "system:scheduler" was invalid — source_id column is UUID and rejects non-UUID strings. @@ -386,6 +378,16 @@ func truncate(s string, maxLen int) string { return s[:maxLen-3] + "..." } +// short returns up to n leading characters of s without panicking when s is +// shorter than n. Used to safely display UUID prefixes in log lines where +// the full ID would be noisy but the full-length bounds check is repetitive. +func short(s string, n int) string { + if len(s) <= n { + return s + } + return s[:n] +} + // ComputeNextRun parses a cron expression and returns the next fire time // after the given time, in the specified timezone. func ComputeNextRun(cronExpr, tz string, after time.Time) (time.Time, error) { diff --git a/platform/internal/scheduler/scheduler_test.go b/platform/internal/scheduler/scheduler_test.go index 47f3fa2e..b3e58e9a 100644 --- a/platform/internal/scheduler/scheduler_test.go +++ b/platform/internal/scheduler/scheduler_test.go @@ -178,3 +178,90 @@ func TestPanicRecovery(t *testing.T) { t.Errorf("unmet DB expectations: %v", err) } } + +// ── TestShort_helper ────────────────────────────────────────────────────────── +// Regression guard for the short() helper that replaced unsafe [:N] slices +// after code review. Panicked when IDs were shorter than the slice bound. + +func TestShort_helper(t *testing.T) { + cases := []struct { + in string + n int + want string + }{ + {"abcdef1234567890", 8, "abcdef12"}, + {"abc", 8, "abc"}, // shorter than n — no panic, no truncation + {"", 8, ""}, + {"12345678", 8, "12345678"}, // exactly n + } + for _, tc := range cases { + if got := short(tc.in, tc.n); got != tc.want { + t.Errorf("short(%q, %d) = %q, want %q", tc.in, tc.n, got, tc.want) + } + } +} + +// ── TestRecordSkipped_writesSkippedStatus ──────────────────────────────────── +// #115 coverage gap: the recordSkipped path wasn't tested at all when it +// first landed. Exercises the UPDATE workspace_schedules + INSERT into +// activity_logs via sqlmock. Broadcaster is nil so we don't need to stub +// RecordAndBroadcast (the nil-check in recordSkipped handles that). + +func TestRecordSkipped_writesSkippedStatus(t *testing.T) { + mock := setupTestDB(t) + s := New(nil, nil) + + sched := scheduleRow{ + ID: "11111111-1111-1111-1111-111111111111", + WorkspaceID: "22222222-2222-2222-2222-222222222222", + Name: "Hourly security audit", + CronExpr: "17 * * * *", + Timezone: "UTC", + Prompt: "audit", + } + + // Expect the schedule-row UPDATE with last_status='skipped' and the + // cron_run activity_logs INSERT with status='skipped' + error_detail + // carrying the active_tasks reason. + mock.ExpectExec(`UPDATE workspace_schedules`). + WithArgs(sched.ID, sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec(`INSERT INTO activity_logs`). + WithArgs(sched.WorkspaceID, sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + + s.recordSkipped(context.Background(), sched, 3) + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// ── TestRecordSkipped_shortWorkspaceIDNoPanic ───────────────────────────────── +// Guards against the short() regression: recordSkipped must not panic if +// WorkspaceID is unexpectedly shorter than the 12-char prefix used in logs. + +func TestRecordSkipped_shortWorkspaceIDNoPanic(t *testing.T) { + mock := setupTestDB(t) + s := New(nil, nil) + + // 4-char workspace id — shorter than any substring bound in the code. + sched := scheduleRow{ + ID: "11111111-1111-1111-1111-111111111111", + WorkspaceID: "ws-x", + Name: "test", + CronExpr: "0 * * * *", + Timezone: "UTC", + } + mock.ExpectExec(`UPDATE workspace_schedules`). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec(`INSERT INTO activity_logs`). + WillReturnResult(sqlmock.NewResult(0, 1)) + + defer func() { + if r := recover(); r != nil { + t.Errorf("recordSkipped panicked on short WorkspaceID: %v", r) + } + }() + s.recordSkipped(context.Background(), sched, 1) +} diff --git a/workspace-template/main.py b/workspace-template/main.py index 23782e9d..98aa3725 100644 --- a/workspace-template/main.py +++ b/workspace-template/main.py @@ -388,14 +388,21 @@ async def main(): # pragma: no cover # per-workspace to enable. idle_loop_task = None if config.idle_prompt: + # Idle-fire HTTP timeout. Kept tight relative to the fire cadence so a + # hung platform doesn't accumulate dangling requests — a fire that + # takes longer than the idle interval itself is almost certainly stuck. + IDLE_FIRE_TIMEOUT_SECONDS = max(60, min(300, config.idle_interval_seconds)) + # Initial settle delay — never longer than 60s so cold-start races + # don't stall the first fire, and never shorter than the configured + # interval (short intervals shouldn't fire instantly on boot either). + IDLE_INITIAL_SETTLE_SECONDS = min(config.idle_interval_seconds, 60) + async def _run_idle_loop(): """Self-sends config.idle_prompt periodically when the workspace is idle.""" - # Wait for server + initial prompt to settle before the first idle check. - # Short wait (min of 60s or interval) so cold-start races don't fire instantly. - await asyncio.sleep(min(config.idle_interval_seconds, 60)) + await asyncio.sleep(IDLE_INITIAL_SETTLE_SECONDS) import json as _json - import urllib.request + from urllib import request as _urlreq, error as _urlerr while True: try: @@ -424,20 +431,46 @@ async def main(): # pragma: no cover }).encode() def _post_sync(): + # Returns (status_code, error_type) so the caller logs the + # actual outcome instead of a bare "post failed" line. try: - req = urllib.request.Request( + req = _urlreq.Request( f"{platform_url}/workspaces/{workspace_id}/a2a", data=payload, headers={"Content-Type": "application/json"}, ) - with urllib.request.urlopen(req, timeout=600) as resp: + with _urlreq.urlopen(req, timeout=IDLE_FIRE_TIMEOUT_SECONDS) as resp: resp.read() - except Exception as e: - print(f"Idle loop: post failed — {e}", flush=True) + return resp.status, None + except _urlerr.HTTPError as e: + return e.code, type(e).__name__ + except _urlerr.URLError as e: + return None, f"URLError: {e.reason}" + except Exception as e: # pragma: no cover — catch-all safety net + return None, type(e).__name__ - print(f"Idle loop: firing (active_tasks=0, interval={config.idle_interval_seconds}s)", flush=True) - loop_ref = asyncio.get_event_loop() - loop_ref.run_in_executor(None, _post_sync) + print( + f"Idle loop: firing (active_tasks=0, interval={config.idle_interval_seconds}s, " + f"timeout={IDLE_FIRE_TIMEOUT_SECONDS}s)", + flush=True, + ) + loop_ref = asyncio.get_running_loop() + + def _log_result(future): + try: + status, err = future.result() + if err: + print( + f"Idle loop: post failed — status={status} err={err}", + flush=True, + ) + else: + print(f"Idle loop: post ok status={status}", flush=True) + except Exception as e: # pragma: no cover + print(f"Idle loop: executor callback crashed — {e}", flush=True) + + fut = loop_ref.run_in_executor(None, _post_sync) + fut.add_done_callback(_log_result) idle_loop_task = asyncio.create_task(_run_idle_loop())