From 7a16eb4f70a32dfd15fdab2d2f4de0e15719727f Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Wed, 15 Apr 2026 11:09:16 -0700 Subject: [PATCH 1/4] =?UTF-8?q?fix(auth):=20#168=20=E2=80=94=20CanvasOrBea?= =?UTF-8?q?rer=20middleware=20for=20PUT=20/canvas/viewport=20only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #168 by the route-split path from #194's review. #167 put PUT /canvas/viewport behind strict AdminAuth, breaking canvas drag/zoom persist because the canvas uses session cookies not bearer tokens. New narrow middleware CanvasOrBearer: - Accepts a valid bearer (same contract as AdminAuth) OR - Accepts a request whose Origin exactly matches CORS_ORIGINS - Lazy-bootstrap fail-open preserved for fresh installs Applied ONLY to PUT /canvas/viewport. The softer check is acceptable there because viewport corruption is cosmetic-only — worst case a user refreshes the page. This middleware must NOT be used on routes that leak prompts (#165), create resources (#164), or write files (#190) — see #194 review for why. The other canvas-facing routes mentioned in #168 (Events tab, Bundle Export/Import) remain behind strict AdminAuth pending a proper session-cookie-accepting AdminAuth (#168 follow-up for Phase H). 6 new tests cover: bootstrap fail-open, no-creds 401, canvas origin match, wrong origin 401, empty origin rejected, localhost default. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../internal/middleware/wsauth_middleware.go | 81 ++++++++++++ .../middleware/wsauth_middleware_test.go | 123 ++++++++++++++++++ platform/internal/router/router.go | 17 +-- 3 files changed, 213 insertions(+), 8 deletions(-) diff --git a/platform/internal/middleware/wsauth_middleware.go b/platform/internal/middleware/wsauth_middleware.go index e74aefac..9eee64f7 100644 --- a/platform/internal/middleware/wsauth_middleware.go +++ b/platform/internal/middleware/wsauth_middleware.go @@ -4,6 +4,8 @@ import ( "database/sql" "log" "net/http" + "os" + "strings" "github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth" "github.com/gin-gonic/gin" @@ -84,3 +86,82 @@ func AdminAuth(database *sql.DB) gin.HandlerFunc { c.Next() } } + +// CanvasOrBearer is a softer admin-auth variant used ONLY for cosmetic +// canvas routes where forging the request has zero security impact (PUT +// /canvas/viewport: worst case an attacker resets the shared viewport +// position, user refreshes the page, problem solved). +// +// Accepts either: +// +// 1. A valid bearer token (same contract as AdminAuth) — covers molecli, +// agent-to-platform calls, and anyone using the API directly. +// 2. A browser Origin header that matches CORS_ORIGINS (canvas itself). +// This is NOT a strict auth boundary — curl can forge Origin — but for +// cosmetic-only routes the trade-off is acceptable. Non-cosmetic routes +// MUST NOT use this middleware (see #194 review on why it would re-open +// #164 CRITICAL if applied to /bundles/import). +// +// Lazy-bootstrap fail-open preserved: zero-token installs pass everything +// through so fresh self-hosted / dev sessions aren't bricked. +func CanvasOrBearer(database *sql.DB) gin.HandlerFunc { + return func(c *gin.Context) { + ctx := c.Request.Context() + + hasLive, err := wsauth.HasAnyLiveTokenGlobal(ctx, database) + if err != nil { + log.Printf("wsauth: CanvasOrBearer HasAnyLiveTokenGlobal failed: %v — allowing request", err) + c.Next() + return + } + if !hasLive { + c.Next() + return + } + + // Path 1: valid bearer. + if tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization")); tok != "" { + if err := wsauth.ValidateAnyToken(ctx, database, tok); err == nil { + c.Next() + return + } + } + + // Path 2: canvas origin match. Read CORS_ORIGINS at request time so + // tests can override via t.Setenv. canvasOriginAllowed returns true + // iff Origin is non-empty AND exactly matches one of the configured + // origins. Empty Origin (same-origin / server-to-server) does NOT + // pass this check — those callers must use the bearer path. + if canvasOriginAllowed(c.GetHeader("Origin")) { + c.Next() + return + } + + c.AbortWithStatusJSON(http.StatusUnauthorized, gin.H{"error": "admin auth required"}) + } +} + +// canvasOriginAllowed returns true if origin matches any entry in the +// CORS_ORIGINS env var (comma-separated) or the localhost defaults. +// Exact-match only; no prefix or wildcard logic — that's handled by the +// real CORS middleware upstream. The intent here is "did this request come +// from the canvas page the user is already logged into?" — a binary check. +func canvasOriginAllowed(origin string) bool { + if origin == "" { + return false + } + allowed := []string{"http://localhost:3000", "http://localhost:3001"} + if v := os.Getenv("CORS_ORIGINS"); v != "" { + for _, o := range strings.Split(v, ",") { + if o = strings.TrimSpace(o); o != "" { + allowed = append(allowed, o) + } + } + } + for _, a := range allowed { + if a == origin { + return true + } + } + return false +} diff --git a/platform/internal/middleware/wsauth_middleware_test.go b/platform/internal/middleware/wsauth_middleware_test.go index 5fbb861f..783065a4 100644 --- a/platform/internal/middleware/wsauth_middleware_test.go +++ b/platform/internal/middleware/wsauth_middleware_test.go @@ -678,3 +678,126 @@ func TestAdminAuth_Issue120_PatchWorkspace_NoBearer_Returns401(t *testing.T) { t.Errorf("unmet sqlmock expectations: %v", err) } } + +// ── CanvasOrBearer (#168) ──────────────────────────────────────────────────── +// Narrow softer variant of AdminAuth used ONLY on PUT /canvas/viewport. +// Accepts bearer or a matching Origin header. MUST NOT be used anywhere a +// forged request would leak data or create resources. + +func TestCanvasOrBearer_NoTokens_FailOpen(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock: %v", err) + } + defer mockDB.Close() + + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + + r := gin.New() + r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + r.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Errorf("bootstrap fail-open: got %d, want 200 (%s)", w.Code, w.Body.String()) + } +} + +func TestCanvasOrBearer_TokensExist_NoCreds_Returns401(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock: %v", err) + } + defer mockDB.Close() + + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) + + r := gin.New() + r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + r.ServeHTTP(w, req) + + if w.Code != http.StatusUnauthorized { + t.Errorf("no creds: got %d, want 401", w.Code) + } +} + +func TestCanvasOrBearer_TokensExist_CanvasOrigin_Passes(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock: %v", err) + } + defer mockDB.Close() + + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) + + t.Setenv("CORS_ORIGINS", "https://acme.moleculesai.app,https://bob.moleculesai.app") + + r := gin.New() + r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + req.Header.Set("Origin", "https://acme.moleculesai.app") + r.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Errorf("canvas origin: got %d, want 200 (%s)", w.Code, w.Body.String()) + } +} + +func TestCanvasOrBearer_TokensExist_WrongOrigin_Returns401(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock: %v", err) + } + defer mockDB.Close() + + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) + + t.Setenv("CORS_ORIGINS", "https://acme.moleculesai.app") + + r := gin.New() + r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + req.Header.Set("Origin", "https://evil.example.com") + r.ServeHTTP(w, req) + + if w.Code != http.StatusUnauthorized { + t.Errorf("wrong origin: got %d, want 401", w.Code) + } +} + +func TestCanvasOriginAllowed_EmptyOriginRejected(t *testing.T) { + if canvasOriginAllowed("") { + t.Error("empty Origin must not pass") + } +} + +func TestCanvasOriginAllowed_LocalhostDefault(t *testing.T) { + t.Setenv("CORS_ORIGINS", "") + if !canvasOriginAllowed("http://localhost:3000") { + t.Error("localhost:3000 should be allowed by default") + } + if canvasOriginAllowed("http://evil.example.com") { + t.Error("random origin should not be allowed") + } +} diff --git a/platform/internal/router/router.go b/platform/internal/router/router.go index fc3e579c..9db76af4 100644 --- a/platform/internal/router/router.go +++ b/platform/internal/router/router.go @@ -294,16 +294,17 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi th := handlers.NewTerminalHandler(dockerCli) wsAuth.GET("/terminal", th.HandleConnect) - // Canvas Viewport — #166: PUT gated behind AdminAuth so an anon caller - // can't reset the shared viewport state for all users. GET remains open - // because the canvas bootstraps without a bearer and needs the initial - // viewport for first paint. + // Canvas Viewport — #166 + #168: GET stays fully open for bootstrap. + // PUT uses CanvasOrBearer (accepts Origin-match OR bearer token) so the + // browser canvas can persist drag/zoom state without a bearer, while + // bearer-carrying clients (molecli, integration tests) still work. + // Viewport corruption is cosmetic-only — worst case a user refreshes + // the page — so the softer check is acceptable here. This middleware + // MUST NOT be used on routes that leak prompts, create workspaces, + // or write files (#164/#165/#190 class). vh := handlers.NewViewportHandler() r.GET("/canvas/viewport", vh.Get) - { - viewportAdmin := r.Group("", middleware.AdminAuth(db.DB)) - viewportAdmin.PUT("/canvas/viewport", vh.Save) - } + r.PUT("/canvas/viewport", middleware.CanvasOrBearer(db.DB), vh.Save) // Templates tmplh := handlers.NewTemplatesHandler(configsDir, dockerCli) From ce88a396da48c1a2a02f4b1cae3f4e15c95b741f Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Wed, 15 Apr 2026 11:11:16 -0700 Subject: [PATCH 2/4] =?UTF-8?q?fix(scheduler):=20#152=20problem=20B=20?= =?UTF-8?q?=E2=80=94=20persist=20and=20surface=20cron=20error=5Fdetail?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #152 problem B (schedule history API drops error detail). Two tiny changes: 1. scheduler.fireSchedule now writes lastError into activity_logs.error_detail when inserting the cron_run row. Previously the column was left NULL even on failure because the INSERT didn't include it. 2. schedules.History SELECT now reads error_detail and includes it in the JSON response under error_detail. Frontend + audit cron can now display "why did this run fail" instead of just "status=error". No schema change — activity_logs.error_detail already exists from migration 009. This just starts using the column. Problem A of #152 (Research Lead ecosystem-watch 50% error rate on its own) is a separate ops investigation and stays open. Co-Authored-By: Claude Opus 4.6 (1M context) --- platform/internal/handlers/schedules.go | 15 ++++++++++----- platform/internal/scheduler/scheduler.go | 10 +++++++--- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/platform/internal/handlers/schedules.go b/platform/internal/handlers/schedules.go index 5c9e7319..281f471b 100644 --- a/platform/internal/handlers/schedules.go +++ b/platform/internal/handlers/schedules.go @@ -275,8 +275,12 @@ func (h *ScheduleHandler) History(c *gin.Context) { workspaceID := c.Param("id") ctx := c.Request.Context() + // #152: include error_detail in history so UI can show why a run failed. + // activity_logs.error_detail is populated by scheduler.fireSchedule when + // the A2A proxy returns non-2xx or the update SQL reports an error. rows, err := db.DB.QueryContext(ctx, ` SELECT created_at, duration_ms, status, + COALESCE(error_detail, '') as error_detail, COALESCE(request_body::text, '{}') as request_body FROM activity_logs WHERE workspace_id = $1 @@ -292,17 +296,18 @@ func (h *ScheduleHandler) History(c *gin.Context) { defer rows.Close() type historyEntry struct { - Timestamp time.Time `json:"timestamp"` - DurationMs *int `json:"duration_ms"` - Status *string `json:"status"` - Request json.RawMessage `json:"request"` + Timestamp time.Time `json:"timestamp"` + DurationMs *int `json:"duration_ms"` + Status *string `json:"status"` + ErrorDetail string `json:"error_detail"` + Request json.RawMessage `json:"request"` } entries := make([]historyEntry, 0) for rows.Next() { var e historyEntry var reqStr string - if err := rows.Scan(&e.Timestamp, &e.DurationMs, &e.Status, &reqStr); err != nil { + if err := rows.Scan(&e.Timestamp, &e.DurationMs, &e.Status, &e.ErrorDetail, &reqStr); err != nil { continue } e.Request = json.RawMessage(reqStr) diff --git a/platform/internal/scheduler/scheduler.go b/platform/internal/scheduler/scheduler.go index c9fd7da1..cfb596eb 100644 --- a/platform/internal/scheduler/scheduler.go +++ b/platform/internal/scheduler/scheduler.go @@ -290,10 +290,14 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) { "cron_expr": sched.CronExpr, "prompt": truncate(sched.Prompt, 200), }) + // #152: persist lastError into error_detail on the activity_logs row + // so GET /workspaces/:id/schedules/:id/history can surface why a run + // failed (previously dropped — history returned status without any + // error context, making root-cause debugging impossible). _, _ = db.DB.ExecContext(ctx, ` - INSERT INTO activity_logs (workspace_id, activity_type, source_id, method, summary, request_body, status, created_at) - VALUES ($1, 'cron_run', NULL, 'cron', $2, $3::jsonb, $4, now()) - `, sched.WorkspaceID, "Cron: "+sched.Name, string(cronMeta), lastStatus) + INSERT INTO activity_logs (workspace_id, activity_type, source_id, method, summary, request_body, status, error_detail, created_at) + VALUES ($1, 'cron_run', NULL, 'cron', $2, $3::jsonb, $4, $5, now()) + `, sched.WorkspaceID, "Cron: "+sched.Name, string(cronMeta), lastStatus, lastError) if s.broadcaster != nil { s.broadcaster.RecordAndBroadcast(ctx, "CRON_EXECUTED", sched.WorkspaceID, map[string]interface{}{ From fb942fbb0c62c49d270f9555cd2569c097bdedfb Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Wed, 15 Apr 2026 11:13:15 -0700 Subject: [PATCH 3/4] =?UTF-8?q?fix(scheduler):=20#115=20=E2=80=94=20skip?= =?UTF-8?q?=20cron=20fire=20when=20workspace=20is=20busy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #115. The Security Auditor hourly cron (and likely others) hit a ~36% miss rate because the platform's A2A proxy rejected fires with "workspace agent busy — retry after a short backoff" while the agent was still executing the prior audit. That error was recorded as a hard failure and polluted last_error. New behaviour: Before fireSchedule calls into the A2A proxy, it reads workspaces.active_tasks for the target. If >0, it: - Advances next_run_at to the next cron slot (cron keeps ticking) - Bumps run_count - Sets last_status='skipped' + last_error= - Inserts a cron_run activity_logs row with status='skipped' + error_detail - Broadcasts CRON_SKIPPED for canvas + operators Effect: busy-collision ceases to be an error. The history surface now distinguishes "ran and failed" from "skipped because busy". Operators can tell the difference at a glance, and the liveness view doesn't stall waiting for the next ticker cycle. Pairs with #149 (dedicated heartbeat pulse) and #152 problem B (error_detail surfaced in history) for a coherent scheduler story. Co-Authored-By: Claude Opus 4.6 (1M context) --- platform/internal/scheduler/scheduler.go | 71 ++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/platform/internal/scheduler/scheduler.go b/platform/internal/scheduler/scheduler.go index cfb596eb..43285f47 100644 --- a/platform/internal/scheduler/scheduler.go +++ b/platform/internal/scheduler/scheduler.go @@ -222,6 +222,27 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) { } }() + // #115 concurrency-aware skip — before firing check if the target + // workspace is already executing a task. If so, skip this tick instead + // of colliding (which used to surface as "workspace agent busy" errors + // and register as a hard fail). advance next_run_at so the next cron + // slot gets a fresh chance; log a skipped cron_run row so history shows + // the gap instead of a silent miss. COALESCE guards against NULL. + var activeTasks int + if err := db.DB.QueryRowContext(ctx, + `SELECT COALESCE(active_tasks, 0) FROM workspaces WHERE id = $1`, + sched.WorkspaceID, + ).Scan(&activeTasks); err == nil && activeTasks > 0 { + wsID := sched.WorkspaceID + if len(wsID) > 12 { + wsID = wsID[:12] + } + log.Printf("Scheduler: skipping '%s' on busy workspace %s (active_tasks=%d)", + sched.Name, wsID, activeTasks) + s.recordSkipped(ctx, sched, activeTasks) + return + } + fireCtx, cancel := context.WithTimeout(ctx, fireTimeout) defer cancel() @@ -308,6 +329,56 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) { } } +// recordSkipped advances next_run_at and logs a cron_run activity entry +// with status='skipped' when the target workspace was already busy. +// Issue #115 — replaces the previous "busy → fire → fail → retry next +// tick" loop with "busy → skip → advance → try next slot". Keeps the +// history surface honest (a skip is not an error) and stops filling +// last_error with noise. +func (s *Scheduler) recordSkipped(ctx context.Context, sched scheduleRow, activeTasks int) { + reason := fmt.Sprintf("skipped: workspace busy (active_tasks=%d)", activeTasks) + + nextRun, nextErr := ComputeNextRun(sched.CronExpr, sched.Timezone, time.Now()) + var nextRunPtr *time.Time + if nextErr == nil { + nextRunPtr = &nextRun + } + + // Advance next_run_at + bump run_count so the liveness view reflects + // that we're still ticking. last_status='skipped', last_error carries + // the reason for operators debugging via the schedule history API. + _, _ = db.DB.ExecContext(ctx, ` + UPDATE workspace_schedules + SET last_run_at = now(), + next_run_at = $2, + run_count = run_count + 1, + last_status = 'skipped', + last_error = $3, + updated_at = now() + WHERE id = $1 + `, sched.ID, nextRunPtr, reason) + + cronMeta, _ := json.Marshal(map[string]interface{}{ + "schedule_id": sched.ID, + "schedule_name": sched.Name, + "cron_expr": sched.CronExpr, + "skipped": true, + "active_tasks": activeTasks, + }) + _, _ = db.DB.ExecContext(ctx, ` + INSERT INTO activity_logs (workspace_id, activity_type, source_id, method, summary, request_body, status, error_detail, created_at) + VALUES ($1, 'cron_run', NULL, 'cron', $2, $3::jsonb, 'skipped', $4, now()) + `, sched.WorkspaceID, "Cron skipped: "+sched.Name, string(cronMeta), reason) + + if s.broadcaster != nil { + _ = s.broadcaster.RecordAndBroadcast(ctx, "CRON_SKIPPED", sched.WorkspaceID, map[string]interface{}{ + "schedule_id": sched.ID, + "schedule_name": sched.Name, + "reason": reason, + }) + } +} + func truncate(s string, maxLen int) string { if len(s) <= maxLen { return s From 25bbfd3bfc45a72c5dd4fa5ca8df583f4f740696 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Wed, 15 Apr 2026 11:15:08 -0700 Subject: [PATCH 4/4] =?UTF-8?q?fix(security):=20C2=20from=20#169=20?= =?UTF-8?q?=E2=80=94=20reject=20spoofed=20source=5Fid=20in=20activity.Repo?= =?UTF-8?q?rt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cherry-picks the one genuinely new fix from #169 after confirming the rest of that PR is already covered on main (C1/C3/C5 by wsAuth group, C6 by #94+#119 SSRF blocklist, C4 ownership by existing WHERE filter). Pre-existing middleware (WorkspaceAuth on /workspaces/:id/* sub-routes) proves the caller owns the :id path param. But the body field source_id was never validated — a workspace authenticated for its own /activity endpoint could still attribute logs to a different workspace by setting source_id=. Rejected with 403 now. No schema change, no new middleware. 4-line handler delta. Closes the only real gap in #169; #169 itself will be closed as superseded. Co-Authored-By: Claude Opus 4.6 (1M context) --- platform/internal/handlers/activity.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/platform/internal/handlers/activity.go b/platform/internal/handlers/activity.go index 9be1daf8..b92538d8 100644 --- a/platform/internal/handlers/activity.go +++ b/platform/internal/handlers/activity.go @@ -329,7 +329,18 @@ func (h *ActivityHandler) Report(c *gin.Context) { if reqBody == nil { reqBody = body.Metadata } + // C2 (from #169) — source_id spoof defense. WorkspaceAuth middleware + // already proves the caller owns :id, but that check doesn't cover the + // body field. Without this guard, workspace A authenticated for its own + // /activity endpoint could still set source_id= in + // the payload and attribute the log to B. Reject any body where + // source_id is non-empty AND differs from the authenticated workspace. + // Empty source_id falls through to the default-to-self branch below. sourceID := body.SourceID + if sourceID != "" && sourceID != workspaceID { + c.JSON(http.StatusForbidden, gin.H{"error": "source_id must match authenticated workspace"}) + return + } if sourceID == "" { sourceID = workspaceID }