From 87ae691e6723b43995ea86ad8f65bf4ade26c7fd Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 02:39:22 -0700 Subject: [PATCH 1/2] Distinguish poll-mode workspace from transient empty-URL on chat upload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit External-runtime workspaces that register in poll mode have no callback URL by design — the platform never dispatches to them, so chat upload (HTTP-forward by design) can't proceed. Returning 503 + "workspace url not registered yet" was misleading: the "yet" implied transient state, but the URL would never arrive. Caught externally on 2026-05-04: user uploading an image to an external "mac laptop" runtime workspace saw the 503 and assumed they should retry. The workspace's poll mode meant retrying would never help. Fix: include delivery_mode in the workspace lookup. When URL is empty: - poll mode → 422 + "re-register in push mode with a public URL" (Unprocessable Entity — this request can't succeed against this workspace's configuration; no retry will help) - push mode → 503 + "not registered yet" (genuine transient state — retry after next heartbeat is correct) Test: TestChatUpload_PollModeEmptyURL pins the new 422 path; existing TestChatUpload_NoURL strengthened to assert the "not registered yet" substring stays on the push branch (it would have silently passed if the new 422 path had clobbered both branches). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/chat_files.go | 29 ++++++++- .../internal/handlers/chat_files_test.go | 62 +++++++++++++++++-- 2 files changed, 83 insertions(+), 8 deletions(-) diff --git a/workspace-server/internal/handlers/chat_files.go b/workspace-server/internal/handlers/chat_files.go index c438bae7..6c576ecc 100644 --- a/workspace-server/internal/handlers/chat_files.go +++ b/workspace-server/internal/handlers/chat_files.go @@ -30,6 +30,7 @@ package handlers import ( "context" + "database/sql" "fmt" "io" "log" @@ -102,14 +103,38 @@ const chatUploadDir = "/workspace/.molecule/chat-uploads" // of bug as the original SaaS provision drift fixed in #2366; this // extraction prevents that class on the consumer side. func resolveWorkspaceForwardCreds(c *gin.Context, ctx context.Context, workspaceID, op string) (wsURL, secret string, ok bool) { + var deliveryMode sql.NullString if err := db.DB.QueryRowContext(ctx, - `SELECT COALESCE(url, '') FROM workspaces WHERE id = $1`, workspaceID, - ).Scan(&wsURL); err != nil { + `SELECT COALESCE(url, ''), delivery_mode FROM workspaces WHERE id = $1`, workspaceID, + ).Scan(&wsURL, &deliveryMode); err != nil { log.Printf("chat_files %s: workspace lookup failed for %s: %v", op, workspaceID, err) c.JSON(http.StatusNotFound, gin.H{"error": "workspace not found"}) return "", "", false } if wsURL == "" { + // Distinguish the two empty-URL classes so the user sees an + // actionable error rather than a misleading "not registered yet" + // (which implies waiting will help): + // + // poll-mode → URL is structurally absent. The platform never + // dispatches to a poll-mode workspace, so chat upload (which + // is HTTP-forward by design) cannot proceed. Returning 503 + // here would loop the canvas client forever. 422 signals + // "this request can't succeed against THIS workspace's + // configuration" — the only fix is to re-register the + // workspace in push mode with a publicly-reachable URL. + // + // push-mode → URL just isn't on the row yet (workspace + // restart in progress, or first /registry/register hasn't + // landed). 503 + "not registered yet" is correct — retry + // after the next heartbeat (~30s) will likely succeed. + if deliveryMode.Valid && deliveryMode.String == "poll" { + c.JSON(http.StatusUnprocessableEntity, gin.H{ + "error": "workspace is in poll mode — chat " + op + " requires push mode", + "detail": "Poll-mode workspaces have no callback URL the platform can dispatch to. Re-register the workspace with a publicly-reachable URL in push mode (e.g. via ngrok / Cloudflare tunnel) to enable chat file " + op + ".", + }) + return "", "", false + } c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace url not registered yet"}) return "", "", false } diff --git a/workspace-server/internal/handlers/chat_files_test.go b/workspace-server/internal/handlers/chat_files_test.go index 4556e77a..fc110ca5 100644 --- a/workspace-server/internal/handlers/chat_files_test.go +++ b/workspace-server/internal/handlers/chat_files_test.go @@ -58,16 +58,28 @@ func uploadFixture(t *testing.T) (*bytes.Buffer, string) { return &buf, mw.FormDataContentType() } -// expectURL stubs the SELECT that resolves the workspace's url. +// expectURL stubs the SELECT that resolves the workspace's url + +// delivery_mode. Defaults delivery_mode to "push" — most tests don't +// care about the mode and just want a URL to forward to. Use +// expectURLAndMode when the test needs a specific mode (e.g. the +// poll-mode 422 path). func expectURL(mock sqlmock.Sqlmock, workspaceID, url string) { - mock.ExpectQuery(`SELECT COALESCE\(url, ''\) FROM workspaces WHERE id = \$1`). + expectURLAndMode(mock, workspaceID, url, "push") +} + +// expectURLAndMode is the explicit form for tests that need to +// exercise the delivery_mode branch (e.g. poll-mode workspaces get +// a 422 instead of a 503 when URL is empty — the platform can't +// dispatch to a poll-mode workspace at all). +func expectURLAndMode(mock sqlmock.Sqlmock, workspaceID, url, mode string) { + mock.ExpectQuery(`SELECT COALESCE\(url, ''\), delivery_mode FROM workspaces WHERE id = \$1`). WithArgs(workspaceID). - WillReturnRows(sqlmock.NewRows([]string{"url"}).AddRow(url)) + WillReturnRows(sqlmock.NewRows([]string{"url", "delivery_mode"}).AddRow(url, mode)) } // expectURLMissing stubs the SELECT to return sql.ErrNoRows. func expectURLMissing(mock sqlmock.Sqlmock, workspaceID string) { - mock.ExpectQuery(`SELECT COALESCE\(url, ''\) FROM workspaces WHERE id = \$1`). + mock.ExpectQuery(`SELECT COALESCE\(url, ''\), delivery_mode FROM workspaces WHERE id = \$1`). WithArgs(workspaceID). WillReturnError(sql.ErrNoRows) } @@ -201,7 +213,9 @@ func TestChatUpload_NoURL(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) - // Workspace registered but URL hasn't been reported yet (mid-boot). + // Workspace registered (push-mode default) but URL hasn't been reported + // yet (mid-boot). 503 + "not registered yet" is the right surface — the + // canvas client can retry after the next heartbeat picks up the URL. wsID := "00000000-0000-0000-0000-000000000042" expectURL(mock, wsID, "") @@ -211,7 +225,43 @@ func TestChatUpload_NoURL(t *testing.T) { h.Upload(c) if w.Code != http.StatusServiceUnavailable { - t.Errorf("expected 503 when workspace url empty, got %d: %s", w.Code, w.Body.String()) + t.Errorf("expected 503 when workspace url empty (push mode), got %d: %s", w.Code, w.Body.String()) + } + if !strings.Contains(w.Body.String(), "not registered yet") { + t.Errorf("expected transient-state error message, got: %s", w.Body.String()) + } +} + +// TestChatUpload_PollModeEmptyURL pins the 422 distinguisher: a +// poll-mode workspace has no URL by design, so chat upload (which is +// HTTP-forward to the workspace) cannot succeed by retrying. Returning +// 503 here would loop the canvas client forever; 422 + an actionable +// message ("re-register in push mode") tells the user what to do. +// +// Caught externally on 2026-05-04 — user reported "Upload failed: 503 +// {error: workspace url not registered yet}" on an external runtime +// workspace (mac laptop, no public URL). The "yet" implied transient, +// but the workspace's poll-mode meant the URL would never arrive. +func TestChatUpload_PollModeEmptyURL(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + wsID := "00000000-0000-0000-0000-000000000099" + expectURLAndMode(mock, wsID, "", "poll") + + h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil)) + body, ct := uploadFixture(t) + c, w := makeUploadRequest(t, wsID, body, ct) + h.Upload(c) + + if w.Code != http.StatusUnprocessableEntity { + t.Fatalf("expected 422 for poll-mode upload, got %d: %s", w.Code, w.Body.String()) + } + if !strings.Contains(w.Body.String(), "poll mode") { + t.Errorf("expected error to mention poll mode, got: %s", w.Body.String()) + } + if !strings.Contains(w.Body.String(), "push mode") { + t.Errorf("expected error to suggest re-registering in push mode, got: %s", w.Body.String()) } } From bcea8ac822b260b2a2724353f413ad8e7747d372 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 02:41:52 -0700 Subject: [PATCH 2/2] Broaden empty-URL 422 to cover NULL delivery_mode (production reality) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live-probed user's tenant: three of three external-runtime workspaces register with delivery_mode = NULL, not "poll". The earlier narrow poll-only check fell through to the misleading 503 for the actually- observed shape. Invariant we want: URL empty + not-exactly-"push" → no dispatch path will ever exist → 422. Only push-mode with empty URL is genuinely transient (mid-boot, restart in progress) → 503. Added TestChatUpload_NullModeEmptyURL using the user's actual workspace ID. Existing TestChatUpload_NoURL switched to explicit "push" mode (was relying on default — unsafe given the new branching). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/chat_files.go | 29 +++++---- .../internal/handlers/chat_files_test.go | 60 +++++++++++++++---- 2 files changed, 65 insertions(+), 24 deletions(-) diff --git a/workspace-server/internal/handlers/chat_files.go b/workspace-server/internal/handlers/chat_files.go index 6c576ecc..b58d2d28 100644 --- a/workspace-server/internal/handlers/chat_files.go +++ b/workspace-server/internal/handlers/chat_files.go @@ -116,22 +116,29 @@ func resolveWorkspaceForwardCreds(c *gin.Context, ctx context.Context, workspace // actionable error rather than a misleading "not registered yet" // (which implies waiting will help): // - // poll-mode → URL is structurally absent. The platform never - // dispatches to a poll-mode workspace, so chat upload (which - // is HTTP-forward by design) cannot proceed. Returning 503 - // here would loop the canvas client forever. 422 signals - // "this request can't succeed against THIS workspace's - // configuration" — the only fix is to re-register the - // workspace in push mode with a publicly-reachable URL. - // // push-mode → URL just isn't on the row yet (workspace // restart in progress, or first /registry/register hasn't // landed). 503 + "not registered yet" is correct — retry // after the next heartbeat (~30s) will likely succeed. - if deliveryMode.Valid && deliveryMode.String == "poll" { + // + // anything else (poll-mode, NULL, empty string) → URL is + // structurally absent. The platform never dispatches to a + // non-push workspace, so chat upload (which is HTTP-forward + // by design) cannot proceed by waiting. Returning 503 here + // would loop the canvas client forever. 422 signals "this + // request can't succeed against THIS workspace's + // configuration" — the only fix is to re-register the + // workspace with a publicly-reachable URL. + // + // Live-observed 2026-05-04: external runtime workspaces (e.g. + // molecule-sdk-python on a mac laptop) register with + // delivery_mode=NULL. The narrow "poll" check missed them; the + // invariant we actually want is "URL empty + not-push = no + // dispatch path, ever". + if !deliveryMode.Valid || deliveryMode.String != "push" { c.JSON(http.StatusUnprocessableEntity, gin.H{ - "error": "workspace is in poll mode — chat " + op + " requires push mode", - "detail": "Poll-mode workspaces have no callback URL the platform can dispatch to. Re-register the workspace with a publicly-reachable URL in push mode (e.g. via ngrok / Cloudflare tunnel) to enable chat file " + op + ".", + "error": "workspace has no callback URL — chat " + op + " requires push-mode + public URL", + "detail": "This workspace registered without a publicly-reachable URL (delivery_mode is not 'push'). The platform cannot dispatch chat uploads to it. Re-register the workspace with a public URL in push mode (e.g. via ngrok / Cloudflare tunnel) to enable chat file " + op + ".", }) return "", "", false } diff --git a/workspace-server/internal/handlers/chat_files_test.go b/workspace-server/internal/handlers/chat_files_test.go index fc110ca5..e7829f45 100644 --- a/workspace-server/internal/handlers/chat_files_test.go +++ b/workspace-server/internal/handlers/chat_files_test.go @@ -70,13 +70,23 @@ func expectURL(mock sqlmock.Sqlmock, workspaceID, url string) { // expectURLAndMode is the explicit form for tests that need to // exercise the delivery_mode branch (e.g. poll-mode workspaces get // a 422 instead of a 503 when URL is empty — the platform can't -// dispatch to a poll-mode workspace at all). +// dispatch to a non-push workspace at all). func expectURLAndMode(mock sqlmock.Sqlmock, workspaceID, url, mode string) { mock.ExpectQuery(`SELECT COALESCE\(url, ''\), delivery_mode FROM workspaces WHERE id = \$1`). WithArgs(workspaceID). WillReturnRows(sqlmock.NewRows([]string{"url", "delivery_mode"}).AddRow(url, mode)) } +// expectURLNullMode is the production-observed shape: external runtime +// workspaces (molecule-sdk-python on user infra) register with +// delivery_mode = NULL, not "poll". Caught 2026-05-04 — the narrow +// "poll" check missed three of three real workspaces in user reports. +func expectURLNullMode(mock sqlmock.Sqlmock, workspaceID, url string) { + mock.ExpectQuery(`SELECT COALESCE\(url, ''\), delivery_mode FROM workspaces WHERE id = \$1`). + WithArgs(workspaceID). + WillReturnRows(sqlmock.NewRows([]string{"url", "delivery_mode"}).AddRow(url, nil)) +} + // expectURLMissing stubs the SELECT to return sql.ErrNoRows. func expectURLMissing(mock sqlmock.Sqlmock, workspaceID string) { mock.ExpectQuery(`SELECT COALESCE\(url, ''\), delivery_mode FROM workspaces WHERE id = \$1`). @@ -213,11 +223,13 @@ func TestChatUpload_NoURL(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) - // Workspace registered (push-mode default) but URL hasn't been reported + // Workspace registered (push-mode) but URL hasn't been reported // yet (mid-boot). 503 + "not registered yet" is the right surface — the // canvas client can retry after the next heartbeat picks up the URL. + // Push mode is the only branch that produces 503; everything else + // (poll, NULL, empty) gets 422 because no amount of waiting helps. wsID := "00000000-0000-0000-0000-000000000042" - expectURL(mock, wsID, "") + expectURLAndMode(mock, wsID, "", "push") h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil)) body, ct := uploadFixture(t) @@ -236,12 +248,7 @@ func TestChatUpload_NoURL(t *testing.T) { // poll-mode workspace has no URL by design, so chat upload (which is // HTTP-forward to the workspace) cannot succeed by retrying. Returning // 503 here would loop the canvas client forever; 422 + an actionable -// message ("re-register in push mode") tells the user what to do. -// -// Caught externally on 2026-05-04 — user reported "Upload failed: 503 -// {error: workspace url not registered yet}" on an external runtime -// workspace (mac laptop, no public URL). The "yet" implied transient, -// but the workspace's poll-mode meant the URL would never arrive. +// message tells the user what to do. func TestChatUpload_PollModeEmptyURL(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) @@ -257,11 +264,38 @@ func TestChatUpload_PollModeEmptyURL(t *testing.T) { if w.Code != http.StatusUnprocessableEntity { t.Fatalf("expected 422 for poll-mode upload, got %d: %s", w.Code, w.Body.String()) } - if !strings.Contains(w.Body.String(), "poll mode") { - t.Errorf("expected error to mention poll mode, got: %s", w.Body.String()) + if !strings.Contains(w.Body.String(), "push") { + t.Errorf("expected error to suggest push mode, got: %s", w.Body.String()) } - if !strings.Contains(w.Body.String(), "push mode") { - t.Errorf("expected error to suggest re-registering in push mode, got: %s", w.Body.String()) +} + +// TestChatUpload_NullModeEmptyURL — production-observed 2026-05-04: +// external-runtime workspaces (molecule-sdk-python on user infra) +// register with delivery_mode = NULL, not "poll". The earlier narrow +// poll-only check fell through to the misleading 503. The fix is the +// inverse-of-push test: anything not exactly "push" with empty URL +// can't dispatch and gets the actionable 422. +// +// Three of three external workspaces in the user's tenant had this +// shape (home hermes / runner mac mini / mac laptop, all +// runtime=external + url='' + delivery_mode=NULL). +func TestChatUpload_NullModeEmptyURL(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + wsID := "30ba7f0b-b303-4a20-aefe-3a4a675b8aa4" // user's "mac laptop" + expectURLNullMode(mock, wsID, "") + + h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil)) + body, ct := uploadFixture(t) + c, w := makeUploadRequest(t, wsID, body, ct) + h.Upload(c) + + if w.Code != http.StatusUnprocessableEntity { + t.Fatalf("expected 422 for null-delivery-mode upload, got %d: %s", w.Code, w.Body.String()) + } + if !strings.Contains(w.Body.String(), "callback URL") { + t.Errorf("expected error to mention callback URL, got: %s", w.Body.String()) } }