From f986444dbd96a630f1e08d5e5a61730627a18a5d Mon Sep 17 00:00:00 2001 From: core-be Date: Sat, 16 May 2026 02:19:11 -0700 Subject: [PATCH 1/5] fix(workspace-server): inject /configs token files agent-owned, not root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fleet-wide list_peers 401 (Hermes et al): two workspace-server token-injection paths wrote /configs/.auth_token (and /configs/.platform_inbound_secret) as root:root 0600 AFTER the template entrypoint's `chown -R agent:agent /configs` ran. The a2a_mcp_server runs as the agent uid (1000, via `gosu agent`), so platform_auth.get_token() hit `[Errno 13] Permission denied` → empty bearer → platform 401 on /registry/{id}/peers (the literal tool_list_peers path). PR#23 fixed only the entrypoint dir chown (first boot); it cannot reach the post-entrypoint root re-injection. This covers both injection paths: 1. WriteAuthTokenToVolume (#1877, pre-start): the throwaway alpine container ran chmod 0600 but never chowned — alpine runs as root, so the file stayed root:root. Now `chown 1000:1000 /vol/.auth_token` (0600 preserved). 2. WriteFilesToContainer (#418, post-start re-injection): the tar headers left Uid/Gid unset → CopyToContainer extracted root:root. Now every tar entry is stamped Uid/Gid = agent. This path (re)writes BOTH .auth_token and .platform_inbound_secret, so both are fixed. uid 1000:1000 verified from the templates (claude-code-default + hermes Dockerfile `useradd -u 1000 ... agent`, entrypoint `gosu agent`), exposed as AgentUID/AgentGID constants. Tar-build and alpine-cmd extracted into pure helpers (mirrors buildTemplateTar) so the ownership contract is unit-tested without a live Docker daemon; the test fails on pre-fix root:root and passes post-fix (real tar / real command, not a mock). PR#23's entrypoint chown is unchanged (still correct for the dir + first boot). No feature flag, no backwards-compat shim. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/provisioner/provisioner.go | 74 +++++++++++++-- .../provisioner/token_ownership_test.go | 95 +++++++++++++++++++ 2 files changed, 160 insertions(+), 9 deletions(-) create mode 100644 workspace-server/internal/provisioner/token_ownership_test.go diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index e9f51078..ae1fbc72 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -189,6 +189,24 @@ const containerNamePrefix = "ws-" // (the wiped-DB case after `docker compose down -v`). const LabelManaged = "molecule.platform.managed" +// AgentUID / AgentGID are the uid/gid of the unprivileged `agent` user that +// every workspace template creates and drops to via `gosu agent` before +// exec'ing the runtime (the a2a_mcp_server runs under this uid). The value is +// fixed at 1000:1000 across all templates — see: +// - workspace-configs-templates/claude-code-default/Dockerfile (`useradd -u 1000 ... agent`) +// - workspace-configs-templates/hermes/Dockerfile (`useradd -u 1000 ... 
agent`) +// - workspace/entrypoint.sh (`exec gosu agent` — "uid 1000") +// +// Files the platform injects into /configs AFTER the entrypoint's +// `chown -R agent:agent /configs` (the post-start #418 re-injection and the +// pre-start #1877 volume write) must be owned by this uid/gid, otherwise the +// agent-uid MCP server hits EACCES reading /configs/.auth_token, sends an +// empty bearer, and the platform 401s on /registry/{id}/peers (list_peers). +const ( + AgentUID = 1000 + AgentGID = 1000 +) + // managedLabels is the canonical label map applied to every workspace // container + volume. Pulled out so a future addition (e.g. instance // UUID for multi-platform-shared-daemon disambiguation) is one edit. @@ -862,8 +880,18 @@ func buildTemplateTar(templatePath string) (*bytes.Buffer, error) { return &buf, nil } -// WriteFilesToContainer writes in-memory files into /configs in the container. -func (p *Provisioner) WriteFilesToContainer(ctx context.Context, containerID string, files map[string][]byte) error { +// buildConfigFilesTar builds the tar stream that WriteFilesToContainer streams +// into /configs via CopyToContainer. Every entry is stamped Uid/Gid = agent +// (AgentUID/AgentGID) so the files land agent-owned after extraction. This is +// the issue #418 post-start re-injection path: it runs AFTER the template +// entrypoint's `chown -R agent:agent /configs`, so without explicit ownership +// in the tar header the files extract as root:root (tar Uid/Gid default 0) and +// the agent-uid MCP server can no longer read /configs/.auth_token (and +// /configs/.platform_inbound_secret) → empty bearer → list_peers 401. +// +// Pulled out as a pure function so the ownership contract is unit-testable +// without a live Docker daemon (mirrors buildTemplateTar). +func buildConfigFilesTar(files map[string][]byte) (*bytes.Buffer, error) { var buf bytes.Buffer tw := tar.NewWriter(&buf) @@ -876,8 +904,10 @@ func (p *Provisioner) WriteFilesToContainer(ctx context.Context, containerID str Typeflag: tar.TypeDir, Name: dir + "/", Mode: 0755, + Uid: AgentUID, + Gid: AgentGID, }); err != nil { - return fmt.Errorf("failed to write tar dir header for %s: %w", dir, err) + return nil, fmt.Errorf("failed to write tar dir header for %s: %w", dir, err) } createdDirs[dir] = true } @@ -886,19 +916,30 @@ func (p *Provisioner) WriteFilesToContainer(ctx context.Context, containerID str Name: name, Mode: 0644, Size: int64(len(data)), + Uid: AgentUID, + Gid: AgentGID, } if err := tw.WriteHeader(header); err != nil { - return fmt.Errorf("failed to write tar header for %s: %w", name, err) + return nil, fmt.Errorf("failed to write tar header for %s: %w", name, err) } if _, err := tw.Write(data); err != nil { - return fmt.Errorf("failed to write tar data for %s: %w", name, err) + return nil, fmt.Errorf("failed to write tar data for %s: %w", name, err) } } if err := tw.Close(); err != nil { - return fmt.Errorf("failed to close tar writer: %w", err) + return nil, fmt.Errorf("failed to close tar writer: %w", err) } + return &buf, nil +} - return p.cli.CopyToContainer(ctx, containerID, "/configs", &buf, container.CopyToContainerOptions{}) +// WriteFilesToContainer writes in-memory files into /configs in the container, +// agent-owned (see buildConfigFilesTar). 
+func (p *Provisioner) WriteFilesToContainer(ctx context.Context, containerID string, files map[string][]byte) error { + buf, err := buildConfigFilesTar(files) + if err != nil { + return err + } + return p.cli.CopyToContainer(ctx, containerID, "/configs", buf, container.CopyToContainerOptions{}) } // CopyToContainer exposes CopyToContainer from the Docker client for use by other packages. @@ -988,13 +1029,28 @@ func (p *Provisioner) ReadFromVolume(ctx context.Context, volumeName, filePath s return clean, nil } +// writeAuthTokenVolumeCmd is the shell command the throwaway alpine container +// runs to seed /vol/.auth_token. alpine runs it as root, so without the +// explicit `chown 1000:1000` the file stays root:root after the template +// entrypoint's `chown -R agent:agent /configs` has already run — the agent-uid +// (AgentUID) MCP server then gets EACCES reading it → empty bearer → +// list_peers 401. Pulled out as a pure function so the ownership contract is +// unit-testable without a live Docker daemon. Issue #1877. +func writeAuthTokenVolumeCmd() string { + return fmt.Sprintf( + "mkdir -p /vol && printf '%%s' $TOKEN > /vol/.auth_token && chmod 0600 /vol/.auth_token && chown %d:%d /vol/.auth_token", + AgentUID, AgentGID, + ) +} + // WriteAuthTokenToVolume writes the workspace auth token into the config volume // BEFORE the container starts, eliminating the token-injection race window where // a restarted container could read a stale token from /configs/.auth_token before // WriteFilesToContainer writes the new one. Issue #1877. // // Uses a throwaway alpine container to write directly to the named volume, -// bypassing the container lifecycle entirely. +// bypassing the container lifecycle entirely. The written file is chowned to +// the agent uid/gid (see writeAuthTokenVolumeCmd). func (p *Provisioner) WriteAuthTokenToVolume(ctx context.Context, workspaceID, token string) error { if p == nil || p.cli == nil { return ErrNoBackend @@ -1002,7 +1058,7 @@ func (p *Provisioner) WriteAuthTokenToVolume(ctx context.Context, workspaceID, t volName := ConfigVolumeName(workspaceID) resp, err := p.cli.ContainerCreate(ctx, &container.Config{ Image: "alpine", - Cmd: []string{"sh", "-c", "mkdir -p /vol && printf '%s' $TOKEN > /vol/.auth_token && chmod 0600 /vol/.auth_token"}, + Cmd: []string{"sh", "-c", writeAuthTokenVolumeCmd()}, Env: []string{"TOKEN=" + token}, }, &container.HostConfig{ Binds: []string{volName + ":/vol"}, diff --git a/workspace-server/internal/provisioner/token_ownership_test.go b/workspace-server/internal/provisioner/token_ownership_test.go new file mode 100644 index 00000000..85ae0140 --- /dev/null +++ b/workspace-server/internal/provisioner/token_ownership_test.go @@ -0,0 +1,95 @@ +package provisioner + +import ( + "archive/tar" + "errors" + "io" + "strings" + "testing" +) + +// These tests pin the P0 fix for the fleet-wide list_peers 401 (Hermes and +// every other template): the workspace-server token-injection paths wrote +// /configs/.auth_token (and /configs/.platform_inbound_secret) as root:root +// AFTER the template entrypoint's `chown -R agent:agent /configs` ran, so the +// agent-uid (1000) MCP server (a2a_mcp_server, running via `gosu agent`) hit +// `[Errno 13] Permission denied` reading the bearer → empty bearer → platform +// 401 on /registry/{id}/peers (the literal tool_list_peers path). +// +// The agent uid is 1000:1000, verified from the templates: +// - workspace-configs-templates/claude-code-default/Dockerfile: `useradd -u 1000 ... 
agent` +// - workspace-configs-templates/hermes/Dockerfile: `useradd -u 1000 ... agent` +// - workspace/entrypoint.sh / claude-code-default/entrypoint.sh: `exec gosu agent` ("uid 1000") +// +// Both tests assert the real artifact (the tar headers Docker's CopyToContainer +// honours for ownership, and the literal shell command the throwaway alpine +// container runs), not a mock that bypasses ownership. They FAIL on pre-fix +// code (no Uid/Gid in tar headers; no chown in the alpine command → root:root) +// and PASS post-fix (agent-owned). + +// TestWriteFilesToContainerTar_FilesAreAgentOwned covers the issue #418 +// post-start re-injection path (WriteFilesToContainer): the tar it streams +// into /configs via CopyToContainer must carry Uid/Gid = agent (1000) so the +// extracted files land agent-readable, not root:root. This is the path that +// (re)writes BOTH .auth_token and .platform_inbound_secret on a cadence. +func TestWriteFilesToContainerTar_FilesAreAgentOwned(t *testing.T) { + files := map[string][]byte{ + ".auth_token": []byte("tok-abc123"), + ".platform_inbound_secret": []byte("inbound-secret-xyz"), + "nested/dir/file.txt": []byte("data"), + } + + buf, err := buildConfigFilesTar(files) + if err != nil { + t.Fatalf("buildConfigFilesTar: %v", err) + } + + tr := tar.NewReader(buf) + seen := map[string]bool{} + for { + hdr, err := tr.Next() + if errors.Is(err, io.EOF) { + break + } + if err != nil { + t.Fatalf("read tar: %v", err) + } + if _, err := io.Copy(io.Discard, tr); err != nil { + t.Fatalf("drain %s: %v", hdr.Name, err) + } + seen[hdr.Name] = true + if hdr.Uid != AgentUID { + t.Fatalf("tar entry %q Uid = %d, want %d (agent) — root-owned injection causes the list_peers 401", + hdr.Name, hdr.Uid, AgentUID) + } + if hdr.Gid != AgentGID { + t.Fatalf("tar entry %q Gid = %d, want %d (agent)", hdr.Name, hdr.Gid, AgentGID) + } + } + + for _, want := range []string{".auth_token", ".platform_inbound_secret"} { + if !seen[want] { + t.Fatalf("tar missing %q (seen: %v)", want, seen) + } + } +} + +// TestWriteAuthTokenVolumeCmd_ChownsToAgent covers the issue #1877 pre-start +// volume-write path (WriteAuthTokenToVolume): the throwaway alpine container +// writes /vol/.auth_token then chmod 0600 but, pre-fix, never chowns it, so it +// stays root:root (alpine runs the command as root). The literal command must +// chown the file to the agent uid:gid so the agent-uid MCP server can read it. +func TestWriteAuthTokenVolumeCmd_ChownsToAgent(t *testing.T) { + cmd := writeAuthTokenVolumeCmd() + + if !strings.Contains(cmd, "chmod 0600 /vol/.auth_token") { + t.Fatalf("alpine cmd lost the 0600 chmod (regression): %q", cmd) + } + + wantChown := "chown 1000:1000 /vol/.auth_token" + if !strings.Contains(cmd, wantChown) { + t.Fatalf("alpine cmd = %q, missing %q — without it .auth_token stays root:root "+ + "and the agent-uid MCP server gets EACCES → empty bearer → list_peers 401", + cmd, wantChown) + } +} From 6188c6ddf3f5abadcd166d119f123c471667b95f Mon Sep 17 00:00:00 2001 From: Molecule AI Infra-Runtime-BE Date: Sat, 16 May 2026 10:27:13 +0000 Subject: [PATCH 2/5] fix(org_helpers): correct duplicate phrase in loadWorkspaceEnv comment The comment had the phrase "the workspace-specific .env" duplicated. Removed the redundant repetition. 
Co-Authored-By: Claude Opus 4.7 --- workspace-server/internal/handlers/org_helpers.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workspace-server/internal/handlers/org_helpers.go b/workspace-server/internal/handlers/org_helpers.go index 5c4628cb..cbf95c3e 100644 --- a/workspace-server/internal/handlers/org_helpers.go +++ b/workspace-server/internal/handlers/org_helpers.go @@ -177,7 +177,7 @@ func expandEnvRef(key, ref, whole string, env map[string]string) string { } -// loadWorkspaceEnv reads the org root .env and the workspace-specific .env .env and the workspace-specific .env +// loadWorkspaceEnv reads the org root .env and the workspace-specific .env // (workspace overrides org root). Used by both secret injection and channel // config expansion. // From a92beb5d496019dd6e4bb0d608cbb0931766880c Mon Sep 17 00:00:00 2001 From: core-be Date: Sat, 16 May 2026 06:04:14 -0700 Subject: [PATCH 3/5] fix(workspace-server): persist poll-mode canvas user message synchronously before queued 200 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sibling of #1347/internal#470 — the POLL-mode arm of the canvas user-message data-loss bug Hongming reported ("i sometimes lose my own message when i exit chat", 2026-05-16). Hongming's tenant is entirely poll-mode (4 external workspaces, no URL — verified empirically: every workspace returns the {delivery_mode:poll, status:queued} short-circuit envelope), so #1347 (push-mode only, persists AFTER the poll short-circuit) structurally cannot cover his reported case. #1347's "poll-mode was never affected" framing is overstated: logA2AReceiveQueued's durable activity_logs INSERT ran inside h.goAsync(...) — a detached goroutine with no happens-before barrier against the synthetic {status:queued} 200. The canvas sees the send acknowledged while the row may still be racing; a workspace-server restart / deploy / OOM / EC2 hibernation between the 200 and the goroutine's commit loses the message permanently (chat-history reads activity_logs; missing row = message gone on reopen). No fallback either, unlike push-mode's legacy-INSERT path. Fix: make the poll-mode ingest persist SYNCHRONOUS — committed before the queued 200 — on a context.WithoutCancel context (parity with persistUserMessageAtIngest). Best-effort preserved (LogActivity logs+swallows INSERT errors, never blocks the send). Post-commit broadcast still fires inside LogActivity (a missed WS event is not data loss; the durable row is the truth chat-history re-reads on reopen). TDD: a2a_poll_ingest_persist_test.go — deterministic RED (queued 200 returned in ~0.5ms, before the 150ms INSERT → DATA LOSS) → GREEN after fix. Full internal/handlers + internal/messagestore suites green; vet clean. 
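
In shorthand, the shape of the change inside logA2AReceiveQueued (a
simplified sketch of the a2a_proxy_helpers.go hunk below; the
ActivityParams literal is abbreviated to `params`):

    // pre-fix: the durable INSERT races the queued 200 in a detached goroutine
    h.goAsync(func() {
        logCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second)
        defer cancel()
        LogActivity(logCtx, h.broadcaster, params) // may commit after the 200, or never if the process dies first
    })

    // post-fix: the INSERT is committed before the caller returns {status:"queued"}
    insCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second)
    defer cancel()
    LogActivity(insCtx, h.broadcaster, params) // synchronous; errors still logged and swallowed
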
Refs: molecule-ai/internal#471 (tracking), molecule-ai/internal#470 (push-mode sibling, PR #1347) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../handlers/a2a_poll_ingest_persist_test.go | 136 ++++++++++++++++++ .../internal/handlers/a2a_proxy_helpers.go | 52 +++++-- 2 files changed, 174 insertions(+), 14 deletions(-) create mode 100644 workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go diff --git a/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go b/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go new file mode 100644 index 00000000..f16d100b --- /dev/null +++ b/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go @@ -0,0 +1,136 @@ +package handlers + +// Regression coverage for the POLL-mode arm of the canvas user-message +// data-loss bug (internal#470 sibling — tracked on internal#471). +// +// Bug (reported 2026-05-16 by CTO Hongming): "in canvas i sometimes lose +// my own message when i exit chat". The push-mode arm was fixed by +// #1347 (persistUserMessageAtIngest — a SYNCHRONOUS, before-dispatch, +// context.WithoutCancel INSERT). #1347's framing asserted "poll-mode +// workspaces were never affected — logA2AReceiveQueued already persists +// at ingest". That assertion is OVERSTATED. +// +// Hongming's tenant (slug `hongming`, org 2c940477-...) has 4 workspaces, +// ALL runtime=external with empty URL → ALL delivery_mode=poll (proven +// empirically: a benign A2A probe returns the synthetic +// {"delivery_mode":"poll","status":"queued"} envelope for every one). +// So his reported loss is the POLL path, NOT the push path #1347 fixes. +// +// Root cause (poll arm): the poll-mode short-circuit (a2a_proxy.go ~402) +// calls logA2AReceiveQueued and then IMMEDIATELY returns the synthetic +// 200 {status:"queued"} to the canvas. But logA2AReceiveQueued's durable +// INSERT runs inside h.goAsync(...) — a DETACHED goroutine with NO +// happens-before barrier against the HTTP response. The canvas sees 200 +// ("message accepted") while the activity_logs row may not yet be — and, +// on a workspace-server restart / deploy / OOM / EC2 hibernation between +// the 200 and the goroutine's commit, NEVER will be — durable. There is +// also no fallback (unlike push-mode's legacy-INSERT fallback): a +// swallowed LogActivity error loses the message with only a log line. +// Chat-history reads activity_logs (postgres_store.go:165-187); a missing +// row = message gone on reopen. That is exactly Hongming's symptom. +// +// Fix (parity with push-mode): the poll-mode ingest persist of the +// canvas user message must be SYNCHRONOUS — committed before the queued +// 200 is returned — on a context.WithoutCancel derived context, so a +// client disconnect on chat-exit and a post-response restart cannot lose +// it. Behavior is never worse than today (best-effort; a persist error +// still returns queued). + +import ( + "bytes" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/gin-gonic/gin" +) + +// TestProxyA2A_PollMode_PersistsUserMessageSynchronouslyBeforeQueuedResponse +// is the defining contract: for a poll-mode workspace, the canvas user +// message MUST be durably INSERTed into activity_logs BEFORE the synthetic +// queued 200 is returned to the client — with NO reliance on a detached +// async goroutine completing later. 
+// +// The test proves the ordering by making the INSERT block briefly and +// asserting the handler does NOT return until the INSERT has completed. +// Pre-fix (INSERT in h.goAsync, response returned immediately) the +// handler returns ~instantly while the INSERT is still pending in the +// goroutine → the elapsed time is far below the injected INSERT delay and +// ExpectationsWereMet() is racy/unmet at return. Post-fix (synchronous +// persist before the queued response) the handler return is gated on the +// INSERT, so elapsed >= the injected delay and the expectation is met +// deterministically at return WITHOUT any waitAsyncForTest()/sleep. +func TestProxyA2A_PollMode_PersistsUserMessageSynchronouslyBeforeQueuedResponse(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + + const wsID = "ws-poll-sync-persist" + const insertDelay = 150 * time.Millisecond + + expectBudgetCheck(mock, wsID) + + // lookupDeliveryMode → poll, triggering the short-circuit. + mock.ExpectQuery("SELECT delivery_mode FROM workspaces WHERE id"). + WithArgs(wsID). + WillReturnRows(sqlmock.NewRows([]string{"delivery_mode"}).AddRow("poll")) + + // workspace-name lookup inside logA2AReceiveQueued. + mock.ExpectQuery(`SELECT name FROM workspaces WHERE id`). + WithArgs(wsID). + WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("Poll WS")) + + // The durable user-message write. We delay it so a synchronous + // persist visibly gates the handler return; a detached-goroutine + // persist (pre-fix) does not. The fix must keep using + // context.WithoutCancel so this write survives a chat-exit cancel. + mock.ExpectExec("INSERT INTO activity_logs"). + WillDelayFor(insertDelay). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: wsID}} + + // callerID == "" (no X-Workspace-ID) → this is a canvas_user message, + // exactly Hongming's case. + body := `{"jsonrpc":"2.0","id":"poll-canvas-1","method":"message/send","params":{"message":{"role":"user","parts":[{"text":"my own message"}]}}}` + c.Request = httptest.NewRequest("POST", "/workspaces/"+wsID+"/a2a", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + start := time.Now() + handler.ProxyA2A(c) + elapsed := time.Since(start) + + // Defining assertion #1: the handler must not have returned the + // queued response before the durable INSERT committed. Pre-fix this + // fails (elapsed ≈ 0, INSERT still racing in goAsync). + if elapsed < insertDelay { + t.Fatalf("poll-mode queued response returned in %v, before the %v user-message INSERT — "+ + "the message is not durable when the client/process goes away (DATA LOSS). "+ + "Persist must be synchronous before the queued 200.", elapsed, insertDelay) + } + + // Defining assertion #2: the durable write actually happened by the + // time the handler returned — checked WITHOUT waitAsyncForTest()/sleep. + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("user-message INSERT was not durable at handler return (unmet sqlmock expectations): %v", err) + } + + // Sanity: still the correct poll-mode envelope + status. 
+ if w.Code != http.StatusOK { + t.Fatalf("expected 200 (queued), got %d: %s", w.Code, w.Body.String()) + } + var resp map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil { + t.Fatalf("response is not valid JSON: %v", err) + } + if resp["status"] != "queued" || resp["delivery_mode"] != "poll" { + t.Errorf("poll envelope changed: got status=%v delivery_mode=%v, want queued/poll", + resp["status"], resp["delivery_mode"]) + } +} diff --git a/workspace-server/internal/handlers/a2a_proxy_helpers.go b/workspace-server/internal/handlers/a2a_proxy_helpers.go index 3d4fc4dd..8145a66a 100644 --- a/workspace-server/internal/handlers/a2a_proxy_helpers.go +++ b/workspace-server/internal/handlers/a2a_proxy_helpers.go @@ -504,25 +504,49 @@ func lookupDeliveryMode(ctx context.Context, workspaceID string) string { // reads in PR 3 — that's how a poll-mode workspace receives inbound A2A // without a public URL. func (h *WorkspaceHandler) logA2AReceiveQueued(ctx context.Context, workspaceID, callerID string, body []byte, a2aMethod string) { + // DATA-LOSS FIX (internal#471 — poll-mode sibling of #1347/internal#470): + // this is the ONLY durable write of a poll-mode inbound message, + // including a canvas_user message (callerID == "") typed in the canvas + // chat. It MUST be SYNCHRONOUS and complete BEFORE the caller returns + // the synthetic {status:"queued"} 200 — otherwise the canvas sees the + // send acknowledged while the activity_logs row is still racing in a + // detached goroutine, and a workspace-server restart / deploy / OOM / + // EC2 hibernation between the 200 and the goroutine's commit loses the + // user's message permanently (chat-history reads activity_logs, so a + // missing row = message gone on reopen). Hongming's tenant is entirely + // poll-mode (4 external workspaces, no URL — verified empirically), so + // his reported loss is THIS path; #1347 (push-mode, persists AFTER the + // poll short-circuit) structurally cannot cover it. + // + // Mirrors persistUserMessageAtIngest's discipline: + // - context.WithoutCancel: a client disconnect on chat-exit (which + // cancels the inbound request ctx) MUST NOT abort this write. + // - SYNCHRONOUS (no goAsync): the row must be durable before the + // queued 200 is returned to the caller. + // - Best-effort: LogActivity already logs+swallows INSERT errors, so + // a hiccup never blocks or fails the user's send (behavior for + // that one request is never worse than the pre-fix async path). + // The post-commit broadcast still fires inside LogActivity; a missed + // WebSocket event is not data loss (the durable row is the truth the + // canvas re-reads on reopen). 
+	insCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second)
+	defer cancel()
+
 	var wsName string
-	db.DB.QueryRowContext(ctx, `SELECT name FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsName)
+	db.DB.QueryRowContext(insCtx, `SELECT name FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsName)
 	if wsName == "" {
 		wsName = workspaceID
 	}
 	summary := a2aMethod + " → " + wsName + " (queued for poll)"
 
-	h.goAsync(func() {
-		logCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second)
-		defer cancel()
-		LogActivity(logCtx, h.broadcaster, ActivityParams{
-			WorkspaceID:  workspaceID,
-			ActivityType: "a2a_receive",
-			SourceID:     nilIfEmpty(callerID),
-			TargetID:     &workspaceID,
-			Method:       &a2aMethod,
-			Summary:      &summary,
-			RequestBody:  json.RawMessage(body),
-			Status:       "ok",
-		})
+	LogActivity(insCtx, h.broadcaster, ActivityParams{
+		WorkspaceID:  workspaceID,
+		ActivityType: "a2a_receive",
+		SourceID:     nilIfEmpty(callerID),
+		TargetID:     &workspaceID,
+		Method:       &a2aMethod,
+		Summary:      &summary,
+		RequestBody:  json.RawMessage(body),
+		Status:       "ok",
 	})
 }

From 1d29e9ea247d3a7b952467ac02c86cdac244830c Mon Sep 17 00:00:00 2001
From: Molecule AI Core-BE
Date: Sat, 16 May 2026 14:47:07 +0000
Subject: [PATCH 4/5] fix(handlers): prevent poll-mode sync-persist test from hanging CI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

sqlmock.ExpectationsWereMet() hangs indefinitely when the expected INSERT
mock never fires. If the production code ever regresses to goAsync (pre-fix
shape), the handler returns before the INSERT fires, the mock never fires,
and ExpectationsWereMet() blocks for the full test or suite timeout —
wedging the CI run with no diagnostic.

Fix: check expectations in a goroutine with a 2s hard timeout. When the
mock has fired (synchronous production code), ExpectationsWereMet() returns
in <1ms and the select fires the `case err := <-expectDone` arm. When the
mock has NOT fired (async regression), the 2s timeout fires and the test
fails with a clear message instead of hanging.

Also reduce insertDelay from 150ms → 50ms. 50ms is ~50× the normal INSERT
latency and sufficient to prove synchronous blocking; the larger value was
adding unnecessary suite-level wall-clock under -race detection, where mock
delays are amplified by the instrumenter's goroutine overhead.

Co-Authored-By: Claude Opus 4.7
---
 .../handlers/a2a_poll_ingest_persist_test.go | 32 ++++++++++++++++---
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go b/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go
index f16d100b..06dae2b1 100644
--- a/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go
+++ b/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go
@@ -35,6 +35,15 @@ package handlers
 // client disconnect on chat-exit and a post-response restart cannot lose
 // it. Behavior is never worse than today (best-effort; a persist error
 // still returns queued).
+//
+// TEST DESIGN NOTE: sqlmock.ExpectationsWereMet() hangs indefinitely if
+// the expected query never fires. We use a goroutine+select+time.After
+// pattern so the test FAILS fast (not hangs) when the production code
+// regresses to async (the INSERT never fires before handler returns),
+// while still returning promptly when all expectations are met. The
+// insertDelay is kept small (50ms) to minimise suite-level timing
+// impact under -race detection, where mock delays are amplified by
+// the instrumenter's goroutine overhead.
 
 import (
 	"bytes"
@@ -70,7 +79,10 @@ func TestProxyA2A_PollMode_PersistsUserMessageSynchronouslyBeforeQueuedResponse(
 	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
 
 	const wsID = "ws-poll-sync-persist"
-	const insertDelay = 150 * time.Millisecond
+	// Keep delay small: -race detection amplifies mock delays significantly.
+	// A 50ms delay is sufficient to prove synchronous blocking (~50× the
+	// normal INSERT latency) without bloating the full ./... suite runtime.
+	const insertDelay = 50 * time.Millisecond
 
 	expectBudgetCheck(mock, wsID)
 
@@ -116,9 +128,21 @@
 	}
 
 	// Defining assertion #2: the durable write actually happened by the
-	// time the handler returned — checked WITHOUT waitAsyncForTest()/sleep.
-	if err := mock.ExpectationsWereMet(); err != nil {
-		t.Fatalf("user-message INSERT was not durable at handler return (unmet sqlmock expectations): %v", err)
+	// time the handler returned. ExpectationsWereMet() hangs indefinitely if
+	// the mock never fires (e.g. production code regressed to async),
+	// so we check it in a goroutine with a hard 2s timeout — fails fast
+	// (no CI hang) on regression while returning promptly on success.
+	expectDone := make(chan error, 1)
+	go func() { expectDone <- mock.ExpectationsWereMet() }()
+	select {
+	case err := <-expectDone:
+		if err != nil {
+			t.Fatalf("user-message INSERT was not durable at handler return (unmet sqlmock expectations): %v", err)
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatalf("ExpectationsWereMet() hung for >2s — INSERT mock never fired. " +
+			"Likely cause: production code regressed logA2AReceiveQueued to goAsync " +
+			"(INSERT fires after handler returns, not before).")
 	}
 
 	// Sanity: still the correct poll-mode envelope + status.

From 16957b7c156bde7b62c5e5ce5c1082e34dedcb5b Mon Sep 17 00:00:00 2001
From: infra-sre
Date: Sat, 16 May 2026 11:49:10 -0700
Subject: [PATCH 5/5] infra(ci): route publish/deploy ship jobs to dedicated `publish` lane (internal#462)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Urgent prod-deploy publish builds currently FIFO-compete with ordinary PR
required-CI on the shared 20-runner pool. The production image build for
PR#1350 (the CTO-reported canvas-message-loss fix) sat ~25min behind the
PR-CI backlog after merge, directly delaying a user-facing fix.

internal#462 comment 32299 + the already-merged operator-config publish-lane
scaffolding (config.publish.yaml + publish-lane-ensure.sh, internal#394/#399)
define a reserved `publish`/`release` sub-pool (molecule-runner-publish-*,
OUTSIDE the managed 1..20 range so it is never auto-drained / recycled /
drift-flagged).
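
Targeting the lane from a workflow is a one-line change per ship job, e.g.
(illustrative sketch; build-and-push is the real job name from
publish-workspace-server-image.yml, everything else is elided):

    jobs:
      build-and-push:
        # before: shared pool, FIFO behind PR required-CI
        # runs-on: ubuntu-latest
        # after: reserved publish lane (molecule-runner-publish-*)
        runs-on: publish
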
This retargets the 7 post-merge ship jobs across 5 workflows from `runs-on: ubuntu-latest` to `runs-on: publish` so a merged fix's image build/push/deploy gets reserved capacity and starts immediately, while PR-CI keeps the general pool: - publish-workspace-server-image.yml: build-and-push, deploy-production - publish-canvas-image.yml: build-and-push - publish-runtime.yml: publish, cascade - redeploy-tenants-on-main.yml: redeploy - redeploy-tenants-on-staging.yml: redeploy publish-runtime-autobump.yml is intentionally NOT moved: it is pull_request-triggered (PR-CI by nature, a required status), not a post-merge ship job — the lane reserves capacity for the ship path, not for PR checks. HARD MERGE PRECONDITION: this MUST NOT merge until the publish-lane runners are registered and advertising the `publish` label. Targeting an unregistered label queues jobs indefinitely with zero eligible runners — the exact #599/#576 `docker`-label failure mode. Lane registration is a GO-gated live-fleet mutation (publish-lane-ensure.sh ALLOW_FLEET_MUTATION=1, requires explicit Hongming in-chat GO). Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitea/workflows/publish-canvas-image.yml | 18 +++++++++++------- .gitea/workflows/publish-runtime.yml | 9 +++++++-- .../publish-workspace-server-image.yml | 13 +++++++++++-- .gitea/workflows/redeploy-tenants-on-main.yml | 5 ++++- .../workflows/redeploy-tenants-on-staging.yml | 5 ++++- 5 files changed, 37 insertions(+), 13 deletions(-) diff --git a/.gitea/workflows/publish-canvas-image.yml b/.gitea/workflows/publish-canvas-image.yml index 9aedadd6..818a4cad 100644 --- a/.gitea/workflows/publish-canvas-image.yml +++ b/.gitea/workflows/publish-canvas-image.yml @@ -49,13 +49,17 @@ jobs: # bp-exempt: post-merge image publication side effect; CI / all-required gates source changes. build-and-push: name: Build & push canvas image - # REVERTED (infra/revert-docker-runner-label): `runs-on: ubuntu-latest` restored. - # The `docker` label is not registered on any act_runner. `runs-on: [ubuntu-latest, docker]` - # causes jobs to queue indefinitely with zero eligible runners — strictly worse than the - # pre-#599 coin-flip (50% success rate). Once the `docker` label is registered on - # ≥2 runners, re-apply the fix from #599 (infra/docker-runner-label). - # See issue #576 + infra-lead pulse ~00:30Z. - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). Ship + # path (on: push:main, canvas/**) — reserved capacity so a merged + # canvas fix's image build never FIFO-queues behind PR required-CI. + # The `publish` label resolves ONLY to the molecule-runner-publish-* + # sub-pool (config.publish.yaml). HARD DEPENDENCY: this MUST land + # AFTER the publish-lane runners are registered/advertising `publish` + # — the earlier #599 `docker` label attempt queued indefinitely with + # zero eligible runners precisely because the label was targeted + # before any runner advertised it (see #576). The lane is registered + # in this rollout (internal#462) so the precondition holds. + runs-on: publish # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. 
continue-on-error: true diff --git a/.gitea/workflows/publish-runtime.yml b/.gitea/workflows/publish-runtime.yml index fe46e812..c96307ab 100644 --- a/.gitea/workflows/publish-runtime.yml +++ b/.gitea/workflows/publish-runtime.yml @@ -66,7 +66,10 @@ concurrency: jobs: publish: - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). Ship + # path (on: push tag runtime-v*) — reserved capacity, never FIFO + # behind PR-CI. `publish` resolves only to molecule-runner-publish-*. + runs-on: publish outputs: version: ${{ steps.version.outputs.version }} wheel_sha256: ${{ steps.wheel_hash.outputs.wheel_sha256 }} @@ -166,7 +169,9 @@ jobs: cascade: needs: publish - runs-on: ubuntu-latest + # Publish/release lane (internal#462) — downstream of the runtime + # publish ship job; keep it on the reserved lane too. + runs-on: publish steps: - name: Wait for PyPI to propagate the new version env: diff --git a/.gitea/workflows/publish-workspace-server-image.yml b/.gitea/workflows/publish-workspace-server-image.yml index 02a42962..3f70ca2b 100644 --- a/.gitea/workflows/publish-workspace-server-image.yml +++ b/.gitea/workflows/publish-workspace-server-image.yml @@ -54,7 +54,14 @@ env: jobs: build-and-push: - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). This + # is a post-merge ship job (on: push:main) — it must NOT FIFO-compete + # with PR required-CI on the shared pool (PR#1350's prod image build + # was delayed ~25min this way). The `publish` label resolves ONLY to + # the reserved molecule-runner-publish-* sub-pool (config.publish.yaml, + # OUTSIDE the managed 1..20 range) so a merged fix's image build + # starts immediately while PR-CI keeps the general pool. + runs-on: publish steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -181,7 +188,9 @@ jobs: name: Production auto-deploy needs: build-and-push if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} - runs-on: ubuntu-latest + # Publish/release lane (internal#462) — production deploy of a merged + # fix; reserved capacity, never queued behind PR-CI. + runs-on: publish timeout-minutes: 75 env: CP_URL: ${{ vars.PROD_CP_URL || 'https://api.moleculesai.app' }} diff --git a/.gitea/workflows/redeploy-tenants-on-main.yml b/.gitea/workflows/redeploy-tenants-on-main.yml index 259df556..f458501c 100644 --- a/.gitea/workflows/redeploy-tenants-on-main.yml +++ b/.gitea/workflows/redeploy-tenants-on-main.yml @@ -68,7 +68,10 @@ jobs: # bp-exempt: production redeploy is a side-effect workflow, not a merge gate. redeploy: if: ${{ github.event_name == 'workflow_dispatch' }} - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). + # Production tenant redeploy — a deploy action, reserved capacity so + # it never queues behind PR-CI. `publish` -> molecule-runner-publish-*. + runs-on: publish # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true diff --git a/.gitea/workflows/redeploy-tenants-on-staging.yml b/.gitea/workflows/redeploy-tenants-on-staging.yml index 98f6b227..534a977e 100644 --- a/.gitea/workflows/redeploy-tenants-on-staging.yml +++ b/.gitea/workflows/redeploy-tenants-on-staging.yml @@ -75,7 +75,10 @@ env: jobs: # bp-exempt: post-merge staging redeploy side effect; CI / all-required gates source changes. 
redeploy: - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). + # Post-merge staging redeploy — a deploy action, reserved capacity. + # `publish` -> molecule-runner-publish-* sub-pool. + runs-on: publish # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true