From f986444dbd96a630f1e08d5e5a61730627a18a5d Mon Sep 17 00:00:00 2001 From: core-be Date: Sat, 16 May 2026 02:19:11 -0700 Subject: [PATCH 1/5] fix(workspace-server): inject /configs token files agent-owned, not root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fleet-wide list_peers 401 (Hermes et al): two workspace-server token-injection paths wrote /configs/.auth_token (and /configs/.platform_inbound_secret) as root:root 0600 AFTER the template entrypoint's `chown -R agent:agent /configs` ran. The a2a_mcp_server runs as the agent uid (1000, via `gosu agent`), so platform_auth.get_token() hit `[Errno 13] Permission denied` → empty bearer → platform 401 on /registry/{id}/peers (the literal tool_list_peers path). PR#23 fixed only the entrypoint dir chown (first boot); it cannot reach the post-entrypoint root re-injection. This covers both injection paths: 1. WriteAuthTokenToVolume (#1877, pre-start): the throwaway alpine container ran chmod 0600 but never chowned — alpine runs as root, so the file stayed root:root. Now `chown 1000:1000 /vol/.auth_token` (0600 preserved). 2. WriteFilesToContainer (#418, post-start re-injection): the tar headers left Uid/Gid unset → CopyToContainer extracted root:root. Now every tar entry is stamped Uid/Gid = agent. This path (re)writes BOTH .auth_token and .platform_inbound_secret, so both are fixed. uid 1000:1000 verified from the templates (claude-code-default + hermes Dockerfile `useradd -u 1000 ... agent`, entrypoint `gosu agent`), exposed as AgentUID/AgentGID constants. Tar-build and alpine-cmd extracted into pure helpers (mirrors buildTemplateTar) so the ownership contract is unit-tested without a live Docker daemon; the test fails on pre-fix root:root and passes post-fix (real tar / real command, not a mock). PR#23's entrypoint chown is unchanged (still correct for the dir + first boot). No feature flag, no backwards-compat shim. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/provisioner/provisioner.go | 74 +++++++++++++-- .../provisioner/token_ownership_test.go | 95 +++++++++++++++++++ 2 files changed, 160 insertions(+), 9 deletions(-) create mode 100644 workspace-server/internal/provisioner/token_ownership_test.go diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index e9f51078..ae1fbc72 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -189,6 +189,24 @@ const containerNamePrefix = "ws-" // (the wiped-DB case after `docker compose down -v`). const LabelManaged = "molecule.platform.managed" +// AgentUID / AgentGID are the uid/gid of the unprivileged `agent` user that +// every workspace template creates and drops to via `gosu agent` before +// exec'ing the runtime (the a2a_mcp_server runs under this uid). The value is +// fixed at 1000:1000 across all templates — see: +// - workspace-configs-templates/claude-code-default/Dockerfile (`useradd -u 1000 ... agent`) +// - workspace-configs-templates/hermes/Dockerfile (`useradd -u 1000 ... 
agent`) +// - workspace/entrypoint.sh (`exec gosu agent` — "uid 1000") +// +// Files the platform injects into /configs AFTER the entrypoint's +// `chown -R agent:agent /configs` (the post-start #418 re-injection and the +// pre-start #1877 volume write) must be owned by this uid/gid, otherwise the +// agent-uid MCP server hits EACCES reading /configs/.auth_token, sends an +// empty bearer, and the platform 401s on /registry/{id}/peers (list_peers). +const ( + AgentUID = 1000 + AgentGID = 1000 +) + // managedLabels is the canonical label map applied to every workspace // container + volume. Pulled out so a future addition (e.g. instance // UUID for multi-platform-shared-daemon disambiguation) is one edit. @@ -862,8 +880,18 @@ func buildTemplateTar(templatePath string) (*bytes.Buffer, error) { return &buf, nil } -// WriteFilesToContainer writes in-memory files into /configs in the container. -func (p *Provisioner) WriteFilesToContainer(ctx context.Context, containerID string, files map[string][]byte) error { +// buildConfigFilesTar builds the tar stream that WriteFilesToContainer streams +// into /configs via CopyToContainer. Every entry is stamped Uid/Gid = agent +// (AgentUID/AgentGID) so the files land agent-owned after extraction. This is +// the issue #418 post-start re-injection path: it runs AFTER the template +// entrypoint's `chown -R agent:agent /configs`, so without explicit ownership +// in the tar header the files extract as root:root (tar Uid/Gid default 0) and +// the agent-uid MCP server can no longer read /configs/.auth_token (and +// /configs/.platform_inbound_secret) → empty bearer → list_peers 401. +// +// Pulled out as a pure function so the ownership contract is unit-testable +// without a live Docker daemon (mirrors buildTemplateTar). +func buildConfigFilesTar(files map[string][]byte) (*bytes.Buffer, error) { var buf bytes.Buffer tw := tar.NewWriter(&buf) @@ -876,8 +904,10 @@ func (p *Provisioner) WriteFilesToContainer(ctx context.Context, containerID str Typeflag: tar.TypeDir, Name: dir + "/", Mode: 0755, + Uid: AgentUID, + Gid: AgentGID, }); err != nil { - return fmt.Errorf("failed to write tar dir header for %s: %w", dir, err) + return nil, fmt.Errorf("failed to write tar dir header for %s: %w", dir, err) } createdDirs[dir] = true } @@ -886,19 +916,30 @@ func (p *Provisioner) WriteFilesToContainer(ctx context.Context, containerID str Name: name, Mode: 0644, Size: int64(len(data)), + Uid: AgentUID, + Gid: AgentGID, } if err := tw.WriteHeader(header); err != nil { - return fmt.Errorf("failed to write tar header for %s: %w", name, err) + return nil, fmt.Errorf("failed to write tar header for %s: %w", name, err) } if _, err := tw.Write(data); err != nil { - return fmt.Errorf("failed to write tar data for %s: %w", name, err) + return nil, fmt.Errorf("failed to write tar data for %s: %w", name, err) } } if err := tw.Close(); err != nil { - return fmt.Errorf("failed to close tar writer: %w", err) + return nil, fmt.Errorf("failed to close tar writer: %w", err) } + return &buf, nil +} - return p.cli.CopyToContainer(ctx, containerID, "/configs", &buf, container.CopyToContainerOptions{}) +// WriteFilesToContainer writes in-memory files into /configs in the container, +// agent-owned (see buildConfigFilesTar). 
+func (p *Provisioner) WriteFilesToContainer(ctx context.Context, containerID string, files map[string][]byte) error { + buf, err := buildConfigFilesTar(files) + if err != nil { + return err + } + return p.cli.CopyToContainer(ctx, containerID, "/configs", buf, container.CopyToContainerOptions{}) } // CopyToContainer exposes CopyToContainer from the Docker client for use by other packages. @@ -988,13 +1029,28 @@ func (p *Provisioner) ReadFromVolume(ctx context.Context, volumeName, filePath s return clean, nil } +// writeAuthTokenVolumeCmd is the shell command the throwaway alpine container +// runs to seed /vol/.auth_token. alpine runs it as root, so without the +// explicit `chown 1000:1000` the file stays root:root after the template +// entrypoint's `chown -R agent:agent /configs` has already run — the agent-uid +// (AgentUID) MCP server then gets EACCES reading it → empty bearer → +// list_peers 401. Pulled out as a pure function so the ownership contract is +// unit-testable without a live Docker daemon. Issue #1877. +func writeAuthTokenVolumeCmd() string { + return fmt.Sprintf( + "mkdir -p /vol && printf '%%s' $TOKEN > /vol/.auth_token && chmod 0600 /vol/.auth_token && chown %d:%d /vol/.auth_token", + AgentUID, AgentGID, + ) +} + // WriteAuthTokenToVolume writes the workspace auth token into the config volume // BEFORE the container starts, eliminating the token-injection race window where // a restarted container could read a stale token from /configs/.auth_token before // WriteFilesToContainer writes the new one. Issue #1877. // // Uses a throwaway alpine container to write directly to the named volume, -// bypassing the container lifecycle entirely. +// bypassing the container lifecycle entirely. The written file is chowned to +// the agent uid/gid (see writeAuthTokenVolumeCmd). func (p *Provisioner) WriteAuthTokenToVolume(ctx context.Context, workspaceID, token string) error { if p == nil || p.cli == nil { return ErrNoBackend @@ -1002,7 +1058,7 @@ func (p *Provisioner) WriteAuthTokenToVolume(ctx context.Context, workspaceID, t volName := ConfigVolumeName(workspaceID) resp, err := p.cli.ContainerCreate(ctx, &container.Config{ Image: "alpine", - Cmd: []string{"sh", "-c", "mkdir -p /vol && printf '%s' $TOKEN > /vol/.auth_token && chmod 0600 /vol/.auth_token"}, + Cmd: []string{"sh", "-c", writeAuthTokenVolumeCmd()}, Env: []string{"TOKEN=" + token}, }, &container.HostConfig{ Binds: []string{volName + ":/vol"}, diff --git a/workspace-server/internal/provisioner/token_ownership_test.go b/workspace-server/internal/provisioner/token_ownership_test.go new file mode 100644 index 00000000..85ae0140 --- /dev/null +++ b/workspace-server/internal/provisioner/token_ownership_test.go @@ -0,0 +1,95 @@ +package provisioner + +import ( + "archive/tar" + "errors" + "io" + "strings" + "testing" +) + +// These tests pin the P0 fix for the fleet-wide list_peers 401 (Hermes and +// every other template): the workspace-server token-injection paths wrote +// /configs/.auth_token (and /configs/.platform_inbound_secret) as root:root +// AFTER the template entrypoint's `chown -R agent:agent /configs` ran, so the +// agent-uid (1000) MCP server (a2a_mcp_server, running via `gosu agent`) hit +// `[Errno 13] Permission denied` reading the bearer → empty bearer → platform +// 401 on /registry/{id}/peers (the literal tool_list_peers path). +// +// The agent uid is 1000:1000, verified from the templates: +// - workspace-configs-templates/claude-code-default/Dockerfile: `useradd -u 1000 ... 
agent` +// - workspace-configs-templates/hermes/Dockerfile: `useradd -u 1000 ... agent` +// - workspace/entrypoint.sh / claude-code-default/entrypoint.sh: `exec gosu agent` ("uid 1000") +// +// Both tests assert the real artifact (the tar headers Docker's CopyToContainer +// honours for ownership, and the literal shell command the throwaway alpine +// container runs), not a mock that bypasses ownership. They FAIL on pre-fix +// code (no Uid/Gid in tar headers; no chown in the alpine command → root:root) +// and PASS post-fix (agent-owned). + +// TestWriteFilesToContainerTar_FilesAreAgentOwned covers the issue #418 +// post-start re-injection path (WriteFilesToContainer): the tar it streams +// into /configs via CopyToContainer must carry Uid/Gid = agent (1000) so the +// extracted files land agent-readable, not root:root. This is the path that +// (re)writes BOTH .auth_token and .platform_inbound_secret on a cadence. +func TestWriteFilesToContainerTar_FilesAreAgentOwned(t *testing.T) { + files := map[string][]byte{ + ".auth_token": []byte("tok-abc123"), + ".platform_inbound_secret": []byte("inbound-secret-xyz"), + "nested/dir/file.txt": []byte("data"), + } + + buf, err := buildConfigFilesTar(files) + if err != nil { + t.Fatalf("buildConfigFilesTar: %v", err) + } + + tr := tar.NewReader(buf) + seen := map[string]bool{} + for { + hdr, err := tr.Next() + if errors.Is(err, io.EOF) { + break + } + if err != nil { + t.Fatalf("read tar: %v", err) + } + if _, err := io.Copy(io.Discard, tr); err != nil { + t.Fatalf("drain %s: %v", hdr.Name, err) + } + seen[hdr.Name] = true + if hdr.Uid != AgentUID { + t.Fatalf("tar entry %q Uid = %d, want %d (agent) — root-owned injection causes the list_peers 401", + hdr.Name, hdr.Uid, AgentUID) + } + if hdr.Gid != AgentGID { + t.Fatalf("tar entry %q Gid = %d, want %d (agent)", hdr.Name, hdr.Gid, AgentGID) + } + } + + for _, want := range []string{".auth_token", ".platform_inbound_secret"} { + if !seen[want] { + t.Fatalf("tar missing %q (seen: %v)", want, seen) + } + } +} + +// TestWriteAuthTokenVolumeCmd_ChownsToAgent covers the issue #1877 pre-start +// volume-write path (WriteAuthTokenToVolume): the throwaway alpine container +// writes /vol/.auth_token then chmod 0600 but, pre-fix, never chowns it, so it +// stays root:root (alpine runs the command as root). The literal command must +// chown the file to the agent uid:gid so the agent-uid MCP server can read it. +func TestWriteAuthTokenVolumeCmd_ChownsToAgent(t *testing.T) { + cmd := writeAuthTokenVolumeCmd() + + if !strings.Contains(cmd, "chmod 0600 /vol/.auth_token") { + t.Fatalf("alpine cmd lost the 0600 chmod (regression): %q", cmd) + } + + wantChown := "chown 1000:1000 /vol/.auth_token" + if !strings.Contains(cmd, wantChown) { + t.Fatalf("alpine cmd = %q, missing %q — without it .auth_token stays root:root "+ + "and the agent-uid MCP server gets EACCES → empty bearer → list_peers 401", + cmd, wantChown) + } +} From 6188c6ddf3f5abadcd166d119f123c471667b95f Mon Sep 17 00:00:00 2001 From: Molecule AI Infra-Runtime-BE Date: Sat, 16 May 2026 10:27:13 +0000 Subject: [PATCH 2/5] fix(org_helpers): correct duplicate phrase in loadWorkspaceEnv comment The comment had the phrase "the workspace-specific .env" duplicated. Removed the redundant repetition. 
Co-Authored-By: Claude Opus 4.7 --- workspace-server/internal/handlers/org_helpers.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workspace-server/internal/handlers/org_helpers.go b/workspace-server/internal/handlers/org_helpers.go index 5c4628cb..cbf95c3e 100644 --- a/workspace-server/internal/handlers/org_helpers.go +++ b/workspace-server/internal/handlers/org_helpers.go @@ -177,7 +177,7 @@ func expandEnvRef(key, ref, whole string, env map[string]string) string { } -// loadWorkspaceEnv reads the org root .env and the workspace-specific .env .env and the workspace-specific .env +// loadWorkspaceEnv reads the org root .env and the workspace-specific .env // (workspace overrides org root). Used by both secret injection and channel // config expansion. // From a92beb5d496019dd6e4bb0d608cbb0931766880c Mon Sep 17 00:00:00 2001 From: core-be Date: Sat, 16 May 2026 06:04:14 -0700 Subject: [PATCH 3/5] fix(workspace-server): persist poll-mode canvas user message synchronously before queued 200 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sibling of #1347/internal#470 — the POLL-mode arm of the canvas user-message data-loss bug Hongming reported ("i sometimes lose my own message when i exit chat", 2026-05-16). Hongming's tenant is entirely poll-mode (4 external workspaces, no URL — verified empirically: every workspace returns the {delivery_mode:poll, status:queued} short-circuit envelope), so #1347 (push-mode only, persists AFTER the poll short-circuit) structurally cannot cover his reported case. #1347's "poll-mode was never affected" framing is overstated: logA2AReceiveQueued's durable activity_logs INSERT ran inside h.goAsync(...) — a detached goroutine with no happens-before barrier against the synthetic {status:queued} 200. The canvas sees the send acknowledged while the row may still be racing; a workspace-server restart / deploy / OOM / EC2 hibernation between the 200 and the goroutine's commit loses the message permanently (chat-history reads activity_logs; missing row = message gone on reopen). No fallback either, unlike push-mode's legacy-INSERT path. Fix: make the poll-mode ingest persist SYNCHRONOUS — committed before the queued 200 — on a context.WithoutCancel context (parity with persistUserMessageAtIngest). Best-effort preserved (LogActivity logs+swallows INSERT errors, never blocks the send). Post-commit broadcast still fires inside LogActivity (a missed WS event is not data loss; the durable row is the truth chat-history re-reads on reopen). TDD: a2a_poll_ingest_persist_test.go — deterministic RED (queued 200 returned in ~0.5ms, before the 150ms INSERT → DATA LOSS) → GREEN after fix. Full internal/handlers + internal/messagestore suites green; vet clean. 
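
In shorthand, the shape of the change inside logA2AReceiveQueued (a
simplified sketch of the a2a_proxy_helpers.go hunk below; the
ActivityParams literal is abbreviated to `params`):

    // pre-fix: the durable INSERT races the queued 200 in a detached goroutine
    h.goAsync(func() {
        logCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second)
        defer cancel()
        LogActivity(logCtx, h.broadcaster, params) // may commit after the 200, or never if the process dies first
    })

    // post-fix: the INSERT is committed before the caller returns {status:"queued"}
    insCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second)
    defer cancel()
    LogActivity(insCtx, h.broadcaster, params) // synchronous; errors still logged and swallowed
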
Refs: molecule-ai/internal#471 (tracking), molecule-ai/internal#470 (push-mode sibling, PR #1347) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../handlers/a2a_poll_ingest_persist_test.go | 136 ++++++++++++++++++ .../internal/handlers/a2a_proxy_helpers.go | 52 +++++-- 2 files changed, 174 insertions(+), 14 deletions(-) create mode 100644 workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go diff --git a/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go b/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go new file mode 100644 index 00000000..f16d100b --- /dev/null +++ b/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go @@ -0,0 +1,136 @@ +package handlers + +// Regression coverage for the POLL-mode arm of the canvas user-message +// data-loss bug (internal#470 sibling — tracked on internal#471). +// +// Bug (reported 2026-05-16 by CTO Hongming): "in canvas i sometimes lose +// my own message when i exit chat". The push-mode arm was fixed by +// #1347 (persistUserMessageAtIngest — a SYNCHRONOUS, before-dispatch, +// context.WithoutCancel INSERT). #1347's framing asserted "poll-mode +// workspaces were never affected — logA2AReceiveQueued already persists +// at ingest". That assertion is OVERSTATED. +// +// Hongming's tenant (slug `hongming`, org 2c940477-...) has 4 workspaces, +// ALL runtime=external with empty URL → ALL delivery_mode=poll (proven +// empirically: a benign A2A probe returns the synthetic +// {"delivery_mode":"poll","status":"queued"} envelope for every one). +// So his reported loss is the POLL path, NOT the push path #1347 fixes. +// +// Root cause (poll arm): the poll-mode short-circuit (a2a_proxy.go ~402) +// calls logA2AReceiveQueued and then IMMEDIATELY returns the synthetic +// 200 {status:"queued"} to the canvas. But logA2AReceiveQueued's durable +// INSERT runs inside h.goAsync(...) — a DETACHED goroutine with NO +// happens-before barrier against the HTTP response. The canvas sees 200 +// ("message accepted") while the activity_logs row may not yet be — and, +// on a workspace-server restart / deploy / OOM / EC2 hibernation between +// the 200 and the goroutine's commit, NEVER will be — durable. There is +// also no fallback (unlike push-mode's legacy-INSERT fallback): a +// swallowed LogActivity error loses the message with only a log line. +// Chat-history reads activity_logs (postgres_store.go:165-187); a missing +// row = message gone on reopen. That is exactly Hongming's symptom. +// +// Fix (parity with push-mode): the poll-mode ingest persist of the +// canvas user message must be SYNCHRONOUS — committed before the queued +// 200 is returned — on a context.WithoutCancel derived context, so a +// client disconnect on chat-exit and a post-response restart cannot lose +// it. Behavior is never worse than today (best-effort; a persist error +// still returns queued). + +import ( + "bytes" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/gin-gonic/gin" +) + +// TestProxyA2A_PollMode_PersistsUserMessageSynchronouslyBeforeQueuedResponse +// is the defining contract: for a poll-mode workspace, the canvas user +// message MUST be durably INSERTed into activity_logs BEFORE the synthetic +// queued 200 is returned to the client — with NO reliance on a detached +// async goroutine completing later. 
+// +// The test proves the ordering by making the INSERT block briefly and +// asserting the handler does NOT return until the INSERT has completed. +// Pre-fix (INSERT in h.goAsync, response returned immediately) the +// handler returns ~instantly while the INSERT is still pending in the +// goroutine → the elapsed time is far below the injected INSERT delay and +// ExpectationsWereMet() is racy/unmet at return. Post-fix (synchronous +// persist before the queued response) the handler return is gated on the +// INSERT, so elapsed >= the injected delay and the expectation is met +// deterministically at return WITHOUT any waitAsyncForTest()/sleep. +func TestProxyA2A_PollMode_PersistsUserMessageSynchronouslyBeforeQueuedResponse(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + + const wsID = "ws-poll-sync-persist" + const insertDelay = 150 * time.Millisecond + + expectBudgetCheck(mock, wsID) + + // lookupDeliveryMode → poll, triggering the short-circuit. + mock.ExpectQuery("SELECT delivery_mode FROM workspaces WHERE id"). + WithArgs(wsID). + WillReturnRows(sqlmock.NewRows([]string{"delivery_mode"}).AddRow("poll")) + + // workspace-name lookup inside logA2AReceiveQueued. + mock.ExpectQuery(`SELECT name FROM workspaces WHERE id`). + WithArgs(wsID). + WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("Poll WS")) + + // The durable user-message write. We delay it so a synchronous + // persist visibly gates the handler return; a detached-goroutine + // persist (pre-fix) does not. The fix must keep using + // context.WithoutCancel so this write survives a chat-exit cancel. + mock.ExpectExec("INSERT INTO activity_logs"). + WillDelayFor(insertDelay). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: wsID}} + + // callerID == "" (no X-Workspace-ID) → this is a canvas_user message, + // exactly Hongming's case. + body := `{"jsonrpc":"2.0","id":"poll-canvas-1","method":"message/send","params":{"message":{"role":"user","parts":[{"text":"my own message"}]}}}` + c.Request = httptest.NewRequest("POST", "/workspaces/"+wsID+"/a2a", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + start := time.Now() + handler.ProxyA2A(c) + elapsed := time.Since(start) + + // Defining assertion #1: the handler must not have returned the + // queued response before the durable INSERT committed. Pre-fix this + // fails (elapsed ≈ 0, INSERT still racing in goAsync). + if elapsed < insertDelay { + t.Fatalf("poll-mode queued response returned in %v, before the %v user-message INSERT — "+ + "the message is not durable when the client/process goes away (DATA LOSS). "+ + "Persist must be synchronous before the queued 200.", elapsed, insertDelay) + } + + // Defining assertion #2: the durable write actually happened by the + // time the handler returned — checked WITHOUT waitAsyncForTest()/sleep. + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("user-message INSERT was not durable at handler return (unmet sqlmock expectations): %v", err) + } + + // Sanity: still the correct poll-mode envelope + status. 
+ if w.Code != http.StatusOK { + t.Fatalf("expected 200 (queued), got %d: %s", w.Code, w.Body.String()) + } + var resp map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil { + t.Fatalf("response is not valid JSON: %v", err) + } + if resp["status"] != "queued" || resp["delivery_mode"] != "poll" { + t.Errorf("poll envelope changed: got status=%v delivery_mode=%v, want queued/poll", + resp["status"], resp["delivery_mode"]) + } +} diff --git a/workspace-server/internal/handlers/a2a_proxy_helpers.go b/workspace-server/internal/handlers/a2a_proxy_helpers.go index 3d4fc4dd..8145a66a 100644 --- a/workspace-server/internal/handlers/a2a_proxy_helpers.go +++ b/workspace-server/internal/handlers/a2a_proxy_helpers.go @@ -504,25 +504,49 @@ func lookupDeliveryMode(ctx context.Context, workspaceID string) string { // reads in PR 3 — that's how a poll-mode workspace receives inbound A2A // without a public URL. func (h *WorkspaceHandler) logA2AReceiveQueued(ctx context.Context, workspaceID, callerID string, body []byte, a2aMethod string) { + // DATA-LOSS FIX (internal#471 — poll-mode sibling of #1347/internal#470): + // this is the ONLY durable write of a poll-mode inbound message, + // including a canvas_user message (callerID == "") typed in the canvas + // chat. It MUST be SYNCHRONOUS and complete BEFORE the caller returns + // the synthetic {status:"queued"} 200 — otherwise the canvas sees the + // send acknowledged while the activity_logs row is still racing in a + // detached goroutine, and a workspace-server restart / deploy / OOM / + // EC2 hibernation between the 200 and the goroutine's commit loses the + // user's message permanently (chat-history reads activity_logs, so a + // missing row = message gone on reopen). Hongming's tenant is entirely + // poll-mode (4 external workspaces, no URL — verified empirically), so + // his reported loss is THIS path; #1347 (push-mode, persists AFTER the + // poll short-circuit) structurally cannot cover it. + // + // Mirrors persistUserMessageAtIngest's discipline: + // - context.WithoutCancel: a client disconnect on chat-exit (which + // cancels the inbound request ctx) MUST NOT abort this write. + // - SYNCHRONOUS (no goAsync): the row must be durable before the + // queued 200 is returned to the caller. + // - Best-effort: LogActivity already logs+swallows INSERT errors, so + // a hiccup never blocks or fails the user's send (behavior for + // that one request is never worse than the pre-fix async path). + // The post-commit broadcast still fires inside LogActivity; a missed + // WebSocket event is not data loss (the durable row is the truth the + // canvas re-reads on reopen). 
+	insCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second)
+	defer cancel()
+
 	var wsName string
-	db.DB.QueryRowContext(ctx, `SELECT name FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsName)
+	db.DB.QueryRowContext(insCtx, `SELECT name FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsName)
 	if wsName == "" {
 		wsName = workspaceID
 	}
 	summary := a2aMethod + " → " + wsName + " (queued for poll)"
 
-	h.goAsync(func() {
-		logCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second)
-		defer cancel()
-		LogActivity(logCtx, h.broadcaster, ActivityParams{
-			WorkspaceID:  workspaceID,
-			ActivityType: "a2a_receive",
-			SourceID:     nilIfEmpty(callerID),
-			TargetID:     &workspaceID,
-			Method:       &a2aMethod,
-			Summary:      &summary,
-			RequestBody:  json.RawMessage(body),
-			Status:       "ok",
-		})
+	LogActivity(insCtx, h.broadcaster, ActivityParams{
+		WorkspaceID:  workspaceID,
+		ActivityType: "a2a_receive",
+		SourceID:     nilIfEmpty(callerID),
+		TargetID:     &workspaceID,
+		Method:       &a2aMethod,
+		Summary:      &summary,
+		RequestBody:  json.RawMessage(body),
+		Status:       "ok",
 	})
 }

From 1d29e9ea247d3a7b952467ac02c86cdac244830c Mon Sep 17 00:00:00 2001
From: Molecule AI Core-BE
Date: Sat, 16 May 2026 14:47:07 +0000
Subject: [PATCH 4/5] fix(handlers): prevent poll-mode sync-persist test from hanging CI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

sqlmock.ExpectationsWereMet() hangs indefinitely when the expected INSERT
mock never fires. If the production code ever regresses to goAsync (pre-fix
shape), the handler returns before the INSERT fires, the mock never fires,
and ExpectationsWereMet() blocks for the full test or suite timeout —
wedging the CI run with no diagnostic.

Fix: check expectations in a goroutine with a 2s hard timeout. When the
mock has fired (synchronous production code), ExpectationsWereMet() returns
in <1ms and the select fires the `case err := <-expectDone` arm. When the
mock has NOT fired (async regression), the 2s timeout fires and the test
fails with a clear message instead of hanging.

Also reduce insertDelay from 150ms → 50ms. 50ms is ~50× the normal INSERT
latency and sufficient to prove synchronous blocking; the larger value was
adding unnecessary suite-level wall-clock under -race detection, where mock
delays are amplified by the instrumenter's goroutine overhead.

Co-Authored-By: Claude Opus 4.7
---
 .../handlers/a2a_poll_ingest_persist_test.go | 32 ++++++++++++++++---
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go b/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go
index f16d100b..06dae2b1 100644
--- a/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go
+++ b/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go
@@ -35,6 +35,15 @@ package handlers
 // client disconnect on chat-exit and a post-response restart cannot lose
 // it. Behavior is never worse than today (best-effort; a persist error
 // still returns queued).
+//
+// TEST DESIGN NOTE: sqlmock.ExpectationsWereMet() hangs indefinitely if
+// the expected query never fires. We use a goroutine+select+time.After
+// pattern so the test FAILS fast (not hangs) when the production code
+// regresses to async (the INSERT never fires before handler returns),
+// while still returning promptly when all expectations are met. The
+// insertDelay is kept small (50ms) to minimise suite-level timing
+// impact under -race detection, where mock delays are amplified by
+// the instrumenter's goroutine overhead.
 
 import (
 	"bytes"
@@ -70,7 +79,10 @@ func TestProxyA2A_PollMode_PersistsUserMessageSynchronouslyBeforeQueuedResponse(
 	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
 
 	const wsID = "ws-poll-sync-persist"
-	const insertDelay = 150 * time.Millisecond
+	// Keep delay small: -race detection amplifies mock delays significantly.
+	// A 50ms delay is sufficient to prove synchronous blocking (~50× the
+	// normal INSERT latency) without bloating the full ./... suite runtime.
+	const insertDelay = 50 * time.Millisecond
 
 	expectBudgetCheck(mock, wsID)
 
@@ -116,9 +128,21 @@
 	}
 
 	// Defining assertion #2: the durable write actually happened by the
-	// time the handler returned — checked WITHOUT waitAsyncForTest()/sleep.
-	if err := mock.ExpectationsWereMet(); err != nil {
-		t.Fatalf("user-message INSERT was not durable at handler return (unmet sqlmock expectations): %v", err)
+	// time the handler returned. ExpectationsWereMet() hangs indefinitely if
+	// the mock never fires (e.g. production code regressed to async),
+	// so we check it in a goroutine with a hard 2s timeout — fails fast
+	// (no CI hang) on regression while returning promptly on success.
+	expectDone := make(chan error, 1)
+	go func() { expectDone <- mock.ExpectationsWereMet() }()
+	select {
+	case err := <-expectDone:
+		if err != nil {
+			t.Fatalf("user-message INSERT was not durable at handler return (unmet sqlmock expectations): %v", err)
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatalf("ExpectationsWereMet() hung for >2s — INSERT mock never fired. " +
+			"Likely cause: production code regressed logA2AReceiveQueued to goAsync " +
+			"(INSERT fires after handler returns, not before).")
 	}
 
 	// Sanity: still the correct poll-mode envelope + status.

From 16957b7c156bde7b62c5e5ce5c1082e34dedcb5b Mon Sep 17 00:00:00 2001
From: infra-sre
Date: Sat, 16 May 2026 11:49:10 -0700
Subject: [PATCH 5/5] infra(ci): route publish/deploy ship jobs to dedicated `publish` lane (internal#462)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Urgent prod-deploy publish builds currently FIFO-compete with ordinary PR
required-CI on the shared 20-runner pool. The production image build for
PR#1350 (the CTO-reported canvas-message-loss fix) sat ~25min behind the
PR-CI backlog after merge, directly delaying a user-facing fix.

internal#462 comment 32299 + the already-merged operator-config publish-lane
scaffolding (config.publish.yaml + publish-lane-ensure.sh, internal#394/#399)
define a reserved `publish`/`release` sub-pool (molecule-runner-publish-*,
OUTSIDE the managed 1..20 range so it is never auto-drained / recycled /
drift-flagged).
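
Targeting the lane from a workflow is a one-line change per ship job, e.g.
(illustrative sketch; build-and-push is the real job name from
publish-workspace-server-image.yml, everything else is elided):

    jobs:
      build-and-push:
        # before: shared pool, FIFO behind PR required-CI
        # runs-on: ubuntu-latest
        # after: reserved publish lane (molecule-runner-publish-*)
        runs-on: publish
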
This retargets the 7 post-merge ship jobs across 5 workflows from `runs-on: ubuntu-latest` to `runs-on: publish` so a merged fix's image build/push/deploy gets reserved capacity and starts immediately, while PR-CI keeps the general pool: - publish-workspace-server-image.yml: build-and-push, deploy-production - publish-canvas-image.yml: build-and-push - publish-runtime.yml: publish, cascade - redeploy-tenants-on-main.yml: redeploy - redeploy-tenants-on-staging.yml: redeploy publish-runtime-autobump.yml is intentionally NOT moved: it is pull_request-triggered (PR-CI by nature, a required status), not a post-merge ship job — the lane reserves capacity for the ship path, not for PR checks. HARD MERGE PRECONDITION: this MUST NOT merge until the publish-lane runners are registered and advertising the `publish` label. Targeting an unregistered label queues jobs indefinitely with zero eligible runners — the exact #599/#576 `docker`-label failure mode. Lane registration is a GO-gated live-fleet mutation (publish-lane-ensure.sh ALLOW_FLEET_MUTATION=1, requires explicit Hongming in-chat GO). Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitea/workflows/publish-canvas-image.yml | 18 +++++++++++------- .gitea/workflows/publish-runtime.yml | 9 +++++++-- .../publish-workspace-server-image.yml | 13 +++++++++++-- .gitea/workflows/redeploy-tenants-on-main.yml | 5 ++++- .../workflows/redeploy-tenants-on-staging.yml | 5 ++++- 5 files changed, 37 insertions(+), 13 deletions(-) diff --git a/.gitea/workflows/publish-canvas-image.yml b/.gitea/workflows/publish-canvas-image.yml index 9aedadd6..818a4cad 100644 --- a/.gitea/workflows/publish-canvas-image.yml +++ b/.gitea/workflows/publish-canvas-image.yml @@ -49,13 +49,17 @@ jobs: # bp-exempt: post-merge image publication side effect; CI / all-required gates source changes. build-and-push: name: Build & push canvas image - # REVERTED (infra/revert-docker-runner-label): `runs-on: ubuntu-latest` restored. - # The `docker` label is not registered on any act_runner. `runs-on: [ubuntu-latest, docker]` - # causes jobs to queue indefinitely with zero eligible runners — strictly worse than the - # pre-#599 coin-flip (50% success rate). Once the `docker` label is registered on - # ≥2 runners, re-apply the fix from #599 (infra/docker-runner-label). - # See issue #576 + infra-lead pulse ~00:30Z. - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). Ship + # path (on: push:main, canvas/**) — reserved capacity so a merged + # canvas fix's image build never FIFO-queues behind PR required-CI. + # The `publish` label resolves ONLY to the molecule-runner-publish-* + # sub-pool (config.publish.yaml). HARD DEPENDENCY: this MUST land + # AFTER the publish-lane runners are registered/advertising `publish` + # — the earlier #599 `docker` label attempt queued indefinitely with + # zero eligible runners precisely because the label was targeted + # before any runner advertised it (see #576). The lane is registered + # in this rollout (internal#462) so the precondition holds. + runs-on: publish # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. 
continue-on-error: true diff --git a/.gitea/workflows/publish-runtime.yml b/.gitea/workflows/publish-runtime.yml index fe46e812..c96307ab 100644 --- a/.gitea/workflows/publish-runtime.yml +++ b/.gitea/workflows/publish-runtime.yml @@ -66,7 +66,10 @@ concurrency: jobs: publish: - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). Ship + # path (on: push tag runtime-v*) — reserved capacity, never FIFO + # behind PR-CI. `publish` resolves only to molecule-runner-publish-*. + runs-on: publish outputs: version: ${{ steps.version.outputs.version }} wheel_sha256: ${{ steps.wheel_hash.outputs.wheel_sha256 }} @@ -166,7 +169,9 @@ jobs: cascade: needs: publish - runs-on: ubuntu-latest + # Publish/release lane (internal#462) — downstream of the runtime + # publish ship job; keep it on the reserved lane too. + runs-on: publish steps: - name: Wait for PyPI to propagate the new version env: diff --git a/.gitea/workflows/publish-workspace-server-image.yml b/.gitea/workflows/publish-workspace-server-image.yml index 02a42962..3f70ca2b 100644 --- a/.gitea/workflows/publish-workspace-server-image.yml +++ b/.gitea/workflows/publish-workspace-server-image.yml @@ -54,7 +54,14 @@ env: jobs: build-and-push: - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). This + # is a post-merge ship job (on: push:main) — it must NOT FIFO-compete + # with PR required-CI on the shared pool (PR#1350's prod image build + # was delayed ~25min this way). The `publish` label resolves ONLY to + # the reserved molecule-runner-publish-* sub-pool (config.publish.yaml, + # OUTSIDE the managed 1..20 range) so a merged fix's image build + # starts immediately while PR-CI keeps the general pool. + runs-on: publish steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -181,7 +188,9 @@ jobs: name: Production auto-deploy needs: build-and-push if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} - runs-on: ubuntu-latest + # Publish/release lane (internal#462) — production deploy of a merged + # fix; reserved capacity, never queued behind PR-CI. + runs-on: publish timeout-minutes: 75 env: CP_URL: ${{ vars.PROD_CP_URL || 'https://api.moleculesai.app' }} diff --git a/.gitea/workflows/redeploy-tenants-on-main.yml b/.gitea/workflows/redeploy-tenants-on-main.yml index 259df556..f458501c 100644 --- a/.gitea/workflows/redeploy-tenants-on-main.yml +++ b/.gitea/workflows/redeploy-tenants-on-main.yml @@ -68,7 +68,10 @@ jobs: # bp-exempt: production redeploy is a side-effect workflow, not a merge gate. redeploy: if: ${{ github.event_name == 'workflow_dispatch' }} - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). + # Production tenant redeploy — a deploy action, reserved capacity so + # it never queues behind PR-CI. `publish` -> molecule-runner-publish-*. + runs-on: publish # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true diff --git a/.gitea/workflows/redeploy-tenants-on-staging.yml b/.gitea/workflows/redeploy-tenants-on-staging.yml index 98f6b227..534a977e 100644 --- a/.gitea/workflows/redeploy-tenants-on-staging.yml +++ b/.gitea/workflows/redeploy-tenants-on-staging.yml @@ -75,7 +75,10 @@ env: jobs: # bp-exempt: post-merge staging redeploy side effect; CI / all-required gates source changes. 
redeploy: - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). + # Post-merge staging redeploy — a deploy action, reserved capacity. + # `publish` -> molecule-runner-publish-* sub-pool. + runs-on: publish # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true