diff --git a/.gitea/workflows/publish-canvas-image.yml b/.gitea/workflows/publish-canvas-image.yml index 9aedadd6..818a4cad 100644 --- a/.gitea/workflows/publish-canvas-image.yml +++ b/.gitea/workflows/publish-canvas-image.yml @@ -49,13 +49,17 @@ jobs: # bp-exempt: post-merge image publication side effect; CI / all-required gates source changes. build-and-push: name: Build & push canvas image - # REVERTED (infra/revert-docker-runner-label): `runs-on: ubuntu-latest` restored. - # The `docker` label is not registered on any act_runner. `runs-on: [ubuntu-latest, docker]` - # causes jobs to queue indefinitely with zero eligible runners — strictly worse than the - # pre-#599 coin-flip (50% success rate). Once the `docker` label is registered on - # ≥2 runners, re-apply the fix from #599 (infra/docker-runner-label). - # See issue #576 + infra-lead pulse ~00:30Z. - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). Ship + # path (on: push:main, canvas/**) — reserved capacity so a merged + # canvas fix's image build never FIFO-queues behind PR required-CI. + # The `publish` label resolves ONLY to the molecule-runner-publish-* + # sub-pool (config.publish.yaml). HARD DEPENDENCY: this MUST land + # AFTER the publish-lane runners are registered/advertising `publish` + # — the earlier #599 `docker` label attempt queued indefinitely with + # zero eligible runners precisely because the label was targeted + # before any runner advertised it (see #576). The lane is registered + # in this rollout (internal#462) so the precondition holds. + runs-on: publish # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. 
continue-on-error: true diff --git a/.gitea/workflows/publish-runtime.yml b/.gitea/workflows/publish-runtime.yml index fe46e812..c96307ab 100644 --- a/.gitea/workflows/publish-runtime.yml +++ b/.gitea/workflows/publish-runtime.yml @@ -66,7 +66,10 @@ concurrency: jobs: publish: - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). Ship + # path (on: push tag runtime-v*) — reserved capacity, never FIFO + # behind PR-CI. `publish` resolves only to molecule-runner-publish-*. + runs-on: publish outputs: version: ${{ steps.version.outputs.version }} wheel_sha256: ${{ steps.wheel_hash.outputs.wheel_sha256 }} @@ -166,7 +169,9 @@ jobs: cascade: needs: publish - runs-on: ubuntu-latest + # Publish/release lane (internal#462) — downstream of the runtime + # publish ship job; keep it on the reserved lane too. + runs-on: publish steps: - name: Wait for PyPI to propagate the new version env: diff --git a/.gitea/workflows/publish-workspace-server-image.yml b/.gitea/workflows/publish-workspace-server-image.yml index 02a42962..3f70ca2b 100644 --- a/.gitea/workflows/publish-workspace-server-image.yml +++ b/.gitea/workflows/publish-workspace-server-image.yml @@ -54,7 +54,14 @@ env: jobs: build-and-push: - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). This + # is a post-merge ship job (on: push:main) — it must NOT FIFO-compete + # with PR required-CI on the shared pool (PR#1350's prod image build + # was delayed ~25min this way). The `publish` label resolves ONLY to + # the reserved molecule-runner-publish-* sub-pool (config.publish.yaml, + # OUTSIDE the managed 1..20 range) so a merged fix's image build + # starts immediately while PR-CI keeps the general pool. 
+ runs-on: publish steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -181,7 +188,9 @@ jobs: name: Production auto-deploy needs: build-and-push if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} - runs-on: ubuntu-latest + # Publish/release lane (internal#462) — production deploy of a merged + # fix; reserved capacity, never queued behind PR-CI. + runs-on: publish timeout-minutes: 75 env: CP_URL: ${{ vars.PROD_CP_URL || 'https://api.moleculesai.app' }} diff --git a/.gitea/workflows/redeploy-tenants-on-main.yml b/.gitea/workflows/redeploy-tenants-on-main.yml index 259df556..f458501c 100644 --- a/.gitea/workflows/redeploy-tenants-on-main.yml +++ b/.gitea/workflows/redeploy-tenants-on-main.yml @@ -68,7 +68,10 @@ jobs: # bp-exempt: production redeploy is a side-effect workflow, not a merge gate. redeploy: if: ${{ github.event_name == 'workflow_dispatch' }} - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). + # Production tenant redeploy — a deploy action, reserved capacity so + # it never queues behind PR-CI. `publish` -> molecule-runner-publish-*. + runs-on: publish # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true diff --git a/.gitea/workflows/redeploy-tenants-on-staging.yml b/.gitea/workflows/redeploy-tenants-on-staging.yml index 98f6b227..534a977e 100644 --- a/.gitea/workflows/redeploy-tenants-on-staging.yml +++ b/.gitea/workflows/redeploy-tenants-on-staging.yml @@ -75,7 +75,10 @@ env: jobs: # bp-exempt: post-merge staging redeploy side effect; CI / all-required gates source changes. redeploy: - runs-on: ubuntu-latest + # Dedicated publish/release lane (internal#462 / #394 / #399). + # Post-merge staging redeploy — a deploy action, reserved capacity. + # `publish` -> molecule-runner-publish-* sub-pool. 
+ runs-on: publish # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true diff --git a/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go b/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go new file mode 100644 index 00000000..06dae2b1 --- /dev/null +++ b/workspace-server/internal/handlers/a2a_poll_ingest_persist_test.go @@ -0,0 +1,160 @@ +package handlers + +// Regression coverage for the POLL-mode arm of the canvas user-message +// data-loss bug (internal#470 sibling — tracked on internal#471). +// +// Bug (reported 2026-05-16 by CTO Hongming): "in canvas i sometimes lose +// my own message when i exit chat". The push-mode arm was fixed by +// #1347 (persistUserMessageAtIngest — a SYNCHRONOUS, before-dispatch, +// context.WithoutCancel INSERT). #1347's framing asserted "poll-mode +// workspaces were never affected — logA2AReceiveQueued already persists +// at ingest". That assertion is OVERSTATED. +// +// Hongming's tenant (slug `hongming`, org 2c940477-...) has 4 workspaces, +// ALL runtime=external with empty URL → ALL delivery_mode=poll (proven +// empirically: a benign A2A probe returns the synthetic +// {"delivery_mode":"poll","status":"queued"} envelope for every one). +// So his reported loss is the POLL path, NOT the push path #1347 fixes. +// +// Root cause (poll arm): the poll-mode short-circuit (a2a_proxy.go ~402) +// calls logA2AReceiveQueued and then IMMEDIATELY returns the synthetic +// 200 {status:"queued"} to the canvas. But logA2AReceiveQueued's durable +// INSERT runs inside h.goAsync(...) — a DETACHED goroutine with NO +// happens-before barrier against the HTTP response. 
The canvas sees 200 +// ("message accepted") while the activity_logs row may not yet be — and, +// on a workspace-server restart / deploy / OOM / EC2 hibernation between +// the 200 and the goroutine's commit, NEVER will be — durable. There is +// also no fallback (unlike push-mode's legacy-INSERT fallback): a +// swallowed LogActivity error loses the message with only a log line. +// Chat-history reads activity_logs (postgres_store.go:165-187); a missing +// row = message gone on reopen. That is exactly Hongming's symptom. +// +// Fix (parity with push-mode): the poll-mode ingest persist of the +// canvas user message must be SYNCHRONOUS — committed before the queued +// 200 is returned — on a context.WithoutCancel derived context, so a +// client disconnect on chat-exit and a post-response restart cannot lose +// it. Behavior is never worse than today (best-effort; a persist error +// still returns queued). +// +// TEST DESIGN NOTE: sqlmock.ExpectationsWereMet() hangs indefinitely if +// the expected query never fires. We use a select+time.After +// pattern so the test FAILS fast (not hangs) when the production code +// regresses to async (the INSERT never fires before handler returns), +// while still returning promptly when all expectations are met. The +// insertDelay is kept small (50ms) to minimise suite-level timing +// impact under -race detection, where mock delays are amplified by +// the instrumenter's goroutine overhead. + +import ( + "bytes" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/gin-gonic/gin" +) + +// TestProxyA2A_PollMode_PersistsUserMessageSynchronouslyBeforeQueuedResponse +// is the defining contract: for a poll-mode workspace, the canvas user +// message MUST be durably INSERTed into activity_logs BEFORE the synthetic +// queued 200 is returned to the client — with NO reliance on a detached +// async goroutine completing later. 
+// +// The test proves the ordering by making the INSERT block briefly and +// asserting the handler does NOT return until the INSERT has completed. +// Pre-fix (INSERT in h.goAsync, response returned immediately) the +// handler returns ~instantly while the INSERT is still pending in the +// goroutine → the elapsed time is far below the injected INSERT delay and +// ExpectationsWereMet() is racy/unmet at return. Post-fix (synchronous +// persist before the queued response) the handler return is gated on the +// INSERT, so elapsed >= the injected delay and the expectation is met +// deterministically at return WITHOUT any waitAsyncForTest()/sleep. +func TestProxyA2A_PollMode_PersistsUserMessageSynchronouslyBeforeQueuedResponse(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + + const wsID = "ws-poll-sync-persist" + // Keep delay small: -race detection amplifies mock delays significantly. + // A 50ms delay is sufficient to prove synchronous blocking (~50× the + // normal INSERT latency) without bloating the full ./... suite runtime. + const insertDelay = 50 * time.Millisecond + + expectBudgetCheck(mock, wsID) + + // lookupDeliveryMode → poll, triggering the short-circuit. + mock.ExpectQuery("SELECT delivery_mode FROM workspaces WHERE id"). + WithArgs(wsID). + WillReturnRows(sqlmock.NewRows([]string{"delivery_mode"}).AddRow("poll")) + + // workspace-name lookup inside logA2AReceiveQueued. + mock.ExpectQuery(`SELECT name FROM workspaces WHERE id`). + WithArgs(wsID). + WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("Poll WS")) + + // The durable user-message write. We delay it so a synchronous + // persist visibly gates the handler return; a detached-goroutine + // persist (pre-fix) does not. The fix must keep using + // context.WithoutCancel so this write survives a chat-exit cancel. 
+ mock.ExpectExec("INSERT INTO activity_logs"). + WillDelayFor(insertDelay). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: wsID}} + + // callerID == "" (no X-Workspace-ID) → this is a canvas_user message, + // exactly Hongming's case. + body := `{"jsonrpc":"2.0","id":"poll-canvas-1","method":"message/send","params":{"message":{"role":"user","parts":[{"text":"my own message"}]}}}` + c.Request = httptest.NewRequest("POST", "/workspaces/"+wsID+"/a2a", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + start := time.Now() + handler.ProxyA2A(c) + elapsed := time.Since(start) + + // Defining assertion #1: the handler must not have returned the + // queued response before the durable INSERT committed. Pre-fix this + // fails (elapsed ≈ 0, INSERT still racing in goAsync). + if elapsed < insertDelay { + t.Fatalf("poll-mode queued response returned in %v, before the %v user-message INSERT — "+ + "the message is not durable when the client/process goes away (DATA LOSS). "+ + "Persist must be synchronous before the queued 200.", elapsed, insertDelay) + } + + // Defining assertion #2: the durable write actually happened by the + // time the handler returned. ExpectationsWereMet() hangs indefinitely if + // the mock never fires (e.g. production code regressed to async), + // so we check it in a goroutine with a hard 2s timeout — fails fast + // (no CI hang) on regression while returning promptly on success. + expectDone := make(chan error, 1) + go func() { expectDone <- mock.ExpectationsWereMet() }() + select { + case err := <-expectDone: + if err != nil { + t.Fatalf("user-message INSERT was not durable at handler return (unmet sqlmock expectations): %v", err) + } + case <-time.After(2 * time.Second): + t.Fatalf("ExpectationsWereMet() hung for >2s — INSERT mock never fired. 
" + + "Likely cause: production code regressed logA2AReceiveQueued to goAsync " + + "(INSERT fires after handler returns, not before).") + } + + // Sanity: still the correct poll-mode envelope + status. + if w.Code != http.StatusOK { + t.Fatalf("expected 200 (queued), got %d: %s", w.Code, w.Body.String()) + } + var resp map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil { + t.Fatalf("response is not valid JSON: %v", err) + } + if resp["status"] != "queued" || resp["delivery_mode"] != "poll" { + t.Errorf("poll envelope changed: got status=%v delivery_mode=%v, want queued/poll", + resp["status"], resp["delivery_mode"]) + } +} diff --git a/workspace-server/internal/handlers/a2a_proxy_helpers.go b/workspace-server/internal/handlers/a2a_proxy_helpers.go index 3d4fc4dd..8145a66a 100644 --- a/workspace-server/internal/handlers/a2a_proxy_helpers.go +++ b/workspace-server/internal/handlers/a2a_proxy_helpers.go @@ -504,25 +504,49 @@ func lookupDeliveryMode(ctx context.Context, workspaceID string) string { // reads in PR 3 — that's how a poll-mode workspace receives inbound A2A // without a public URL. func (h *WorkspaceHandler) logA2AReceiveQueued(ctx context.Context, workspaceID, callerID string, body []byte, a2aMethod string) { + // DATA-LOSS FIX (internal#471 — poll-mode sibling of #1347/internal#470): + // this is the ONLY durable write of a poll-mode inbound message, + // including a canvas_user message (callerID == "") typed in the canvas + // chat. It MUST be SYNCHRONOUS and complete BEFORE the caller returns + // the synthetic {status:"queued"} 200 — otherwise the canvas sees the + // send acknowledged while the activity_logs row is still racing in a + // detached goroutine, and a workspace-server restart / deploy / OOM / + // EC2 hibernation between the 200 and the goroutine's commit loses the + // user's message permanently (chat-history reads activity_logs, so a + // missing row = message gone on reopen). 
Hongming's tenant is entirely + // poll-mode (4 external workspaces, no URL — verified empirically), so + // his reported loss is THIS path; #1347 (push-mode, persists AFTER the + // poll short-circuit) structurally cannot cover it. + // + // Mirrors persistUserMessageAtIngest's discipline: + // - context.WithoutCancel: a client disconnect on chat-exit (which + // cancels the inbound request ctx) MUST NOT abort this write. + // - SYNCHRONOUS (no goAsync): the row must be durable before the + // queued 200 is returned to the caller. + // - Best-effort: LogActivity already logs+swallows INSERT errors, so + // a hiccup never blocks or fails the user's send (behavior for + // that one request is never worse than the pre-fix async path). + // The post-commit broadcast still fires inside LogActivity; a missed + // WebSocket event is not data loss (the durable row is the truth the + // canvas re-reads on reopen). + insCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second) + defer cancel() + var wsName string - db.DB.QueryRowContext(ctx, `SELECT name FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsName) + db.DB.QueryRowContext(insCtx, `SELECT name FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsName) if wsName == "" { wsName = workspaceID } summary := a2aMethod + " → " + wsName + " (queued for poll)" - h.goAsync(func() { - logCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second) - defer cancel() - LogActivity(logCtx, h.broadcaster, ActivityParams{ - WorkspaceID: workspaceID, - ActivityType: "a2a_receive", - SourceID: nilIfEmpty(callerID), - TargetID: &workspaceID, - Method: &a2aMethod, - Summary: &summary, - RequestBody: json.RawMessage(body), - Status: "ok", - }) + LogActivity(insCtx, h.broadcaster, ActivityParams{ + WorkspaceID: workspaceID, + ActivityType: "a2a_receive", + SourceID: nilIfEmpty(callerID), + TargetID: &workspaceID, + Method: &a2aMethod, + Summary: &summary, + RequestBody: json.RawMessage(body), + 
Status: "ok", }) } diff --git a/workspace-server/internal/handlers/org_helpers.go b/workspace-server/internal/handlers/org_helpers.go index 604ef551..a7fdd52c 100644 --- a/workspace-server/internal/handlers/org_helpers.go +++ b/workspace-server/internal/handlers/org_helpers.go @@ -177,7 +177,7 @@ func isEnvIdentPart(c byte) bool { return isEnvIdentStart(c) || (c >= '0' && c <= '9') } -// loadWorkspaceEnv reads the org root .env and the workspace-specific .env .env and the workspace-specific .env +// loadWorkspaceEnv reads the org root .env and the workspace-specific .env // (workspace overrides org root). Used by both secret injection and channel // config expansion. // diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index e9f51078..ae1fbc72 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -189,6 +189,24 @@ const containerNamePrefix = "ws-" // (the wiped-DB case after `docker compose down -v`). const LabelManaged = "molecule.platform.managed" +// AgentUID / AgentGID are the uid/gid of the unprivileged `agent` user that +// every workspace template creates and drops to via `gosu agent` before +// exec'ing the runtime (the a2a_mcp_server runs under this uid). The value is +// fixed at 1000:1000 across all templates — see: +// - workspace-configs-templates/claude-code-default/Dockerfile (`useradd -u 1000 ... agent`) +// - workspace-configs-templates/hermes/Dockerfile (`useradd -u 1000 ... 
agent`) +// - workspace/entrypoint.sh (`exec gosu agent` — "uid 1000") +// +// Files the platform injects into /configs AFTER the entrypoint's +// `chown -R agent:agent /configs` (the post-start #418 re-injection and the +// pre-start #1877 volume write) must be owned by this uid/gid, otherwise the +// agent-uid MCP server hits EACCES reading /configs/.auth_token, sends an +// empty bearer, and the platform 401s on /registry/{id}/peers (list_peers). +const ( + AgentUID = 1000 + AgentGID = 1000 +) + // managedLabels is the canonical label map applied to every workspace // container + volume. Pulled out so a future addition (e.g. instance // UUID for multi-platform-shared-daemon disambiguation) is one edit. @@ -862,8 +880,18 @@ func buildTemplateTar(templatePath string) (*bytes.Buffer, error) { return &buf, nil } -// WriteFilesToContainer writes in-memory files into /configs in the container. -func (p *Provisioner) WriteFilesToContainer(ctx context.Context, containerID string, files map[string][]byte) error { +// buildConfigFilesTar builds the tar stream that WriteFilesToContainer streams +// into /configs via CopyToContainer. Every entry is stamped Uid/Gid = agent +// (AgentUID/AgentGID) so the files land agent-owned after extraction. This is +// the issue #418 post-start re-injection path: it runs AFTER the template +// entrypoint's `chown -R agent:agent /configs`, so without explicit ownership +// in the tar header the files extract as root:root (tar Uid/Gid default 0) and +// the agent-uid MCP server can no longer read /configs/.auth_token (and +// /configs/.platform_inbound_secret) → empty bearer → list_peers 401. +// +// Pulled out as a pure function so the ownership contract is unit-testable +// without a live Docker daemon (mirrors buildTemplateTar). 
+func buildConfigFilesTar(files map[string][]byte) (*bytes.Buffer, error) { var buf bytes.Buffer tw := tar.NewWriter(&buf) @@ -876,8 +904,10 @@ func (p *Provisioner) WriteFilesToContainer(ctx context.Context, containerID str Typeflag: tar.TypeDir, Name: dir + "/", Mode: 0755, + Uid: AgentUID, + Gid: AgentGID, }); err != nil { - return fmt.Errorf("failed to write tar dir header for %s: %w", dir, err) + return nil, fmt.Errorf("failed to write tar dir header for %s: %w", dir, err) } createdDirs[dir] = true } @@ -886,19 +916,30 @@ func (p *Provisioner) WriteFilesToContainer(ctx context.Context, containerID str Name: name, Mode: 0644, Size: int64(len(data)), + Uid: AgentUID, + Gid: AgentGID, } if err := tw.WriteHeader(header); err != nil { - return fmt.Errorf("failed to write tar header for %s: %w", name, err) + return nil, fmt.Errorf("failed to write tar header for %s: %w", name, err) } if _, err := tw.Write(data); err != nil { - return fmt.Errorf("failed to write tar data for %s: %w", name, err) + return nil, fmt.Errorf("failed to write tar data for %s: %w", name, err) } } if err := tw.Close(); err != nil { - return fmt.Errorf("failed to close tar writer: %w", err) + return nil, fmt.Errorf("failed to close tar writer: %w", err) } + return &buf, nil +} - return p.cli.CopyToContainer(ctx, containerID, "/configs", &buf, container.CopyToContainerOptions{}) +// WriteFilesToContainer writes in-memory files into /configs in the container, +// agent-owned (see buildConfigFilesTar). +func (p *Provisioner) WriteFilesToContainer(ctx context.Context, containerID string, files map[string][]byte) error { + buf, err := buildConfigFilesTar(files) + if err != nil { + return err + } + return p.cli.CopyToContainer(ctx, containerID, "/configs", buf, container.CopyToContainerOptions{}) } // CopyToContainer exposes CopyToContainer from the Docker client for use by other packages. 
@@ -988,13 +1029,28 @@ func (p *Provisioner) ReadFromVolume(ctx context.Context, volumeName, filePath s return clean, nil } +// writeAuthTokenVolumeCmd is the shell command the throwaway alpine container +// runs to seed /vol/.auth_token. alpine runs it as root, so without the +// explicit `chown 1000:1000` the file stays root:root after the template +// entrypoint's `chown -R agent:agent /configs` has already run — the agent-uid +// (AgentUID) MCP server then gets EACCES reading it → empty bearer → +// list_peers 401. Pulled out as a pure function so the ownership contract is +// unit-testable without a live Docker daemon. Issue #1877. +func writeAuthTokenVolumeCmd() string { + return fmt.Sprintf( + "mkdir -p /vol && printf '%%s' $TOKEN > /vol/.auth_token && chmod 0600 /vol/.auth_token && chown %d:%d /vol/.auth_token", + AgentUID, AgentGID, + ) +} + // WriteAuthTokenToVolume writes the workspace auth token into the config volume // BEFORE the container starts, eliminating the token-injection race window where // a restarted container could read a stale token from /configs/.auth_token before // WriteFilesToContainer writes the new one. Issue #1877. // // Uses a throwaway alpine container to write directly to the named volume, -// bypassing the container lifecycle entirely. +// bypassing the container lifecycle entirely. The written file is chowned to +// the agent uid/gid (see writeAuthTokenVolumeCmd). 
func (p *Provisioner) WriteAuthTokenToVolume(ctx context.Context, workspaceID, token string) error { if p == nil || p.cli == nil { return ErrNoBackend @@ -1002,7 +1058,7 @@ func (p *Provisioner) WriteAuthTokenToVolume(ctx context.Context, workspaceID, t volName := ConfigVolumeName(workspaceID) resp, err := p.cli.ContainerCreate(ctx, &container.Config{ Image: "alpine", - Cmd: []string{"sh", "-c", "mkdir -p /vol && printf '%s' $TOKEN > /vol/.auth_token && chmod 0600 /vol/.auth_token"}, + Cmd: []string{"sh", "-c", writeAuthTokenVolumeCmd()}, Env: []string{"TOKEN=" + token}, }, &container.HostConfig{ Binds: []string{volName + ":/vol"}, diff --git a/workspace-server/internal/provisioner/token_ownership_test.go b/workspace-server/internal/provisioner/token_ownership_test.go new file mode 100644 index 00000000..85ae0140 --- /dev/null +++ b/workspace-server/internal/provisioner/token_ownership_test.go @@ -0,0 +1,95 @@ +package provisioner + +import ( + "archive/tar" + "errors" + "io" + "strings" + "testing" +) + +// These tests pin the P0 fix for the fleet-wide list_peers 401 (Hermes and +// every other template): the workspace-server token-injection paths wrote +// /configs/.auth_token (and /configs/.platform_inbound_secret) as root:root +// AFTER the template entrypoint's `chown -R agent:agent /configs` ran, so the +// agent-uid (1000) MCP server (a2a_mcp_server, running via `gosu agent`) hit +// `[Errno 13] Permission denied` reading the bearer → empty bearer → platform +// 401 on /registry/{id}/peers (the literal tool_list_peers path). +// +// The agent uid is 1000:1000, verified from the templates: +// - workspace-configs-templates/claude-code-default/Dockerfile: `useradd -u 1000 ... agent` +// - workspace-configs-templates/hermes/Dockerfile: `useradd -u 1000 ... 
agent` +// - workspace/entrypoint.sh / claude-code-default/entrypoint.sh: `exec gosu agent` ("uid 1000") +// +// Both tests assert the real artifact (the tar headers Docker's CopyToContainer +// honours for ownership, and the literal shell command the throwaway alpine +// container runs), not a mock that bypasses ownership. They FAIL on pre-fix +// code (no Uid/Gid in tar headers; no chown in the alpine command → root:root) +// and PASS post-fix (agent-owned). + +// TestWriteFilesToContainerTar_FilesAreAgentOwned covers the issue #418 +// post-start re-injection path (WriteFilesToContainer): the tar it streams +// into /configs via CopyToContainer must carry Uid/Gid = agent (1000) so the +// extracted files land agent-readable, not root:root. This is the path that +// (re)writes BOTH .auth_token and .platform_inbound_secret on a cadence. +func TestWriteFilesToContainerTar_FilesAreAgentOwned(t *testing.T) { + files := map[string][]byte{ + ".auth_token": []byte("tok-abc123"), + ".platform_inbound_secret": []byte("inbound-secret-xyz"), + "nested/dir/file.txt": []byte("data"), + } + + buf, err := buildConfigFilesTar(files) + if err != nil { + t.Fatalf("buildConfigFilesTar: %v", err) + } + + tr := tar.NewReader(buf) + seen := map[string]bool{} + for { + hdr, err := tr.Next() + if errors.Is(err, io.EOF) { + break + } + if err != nil { + t.Fatalf("read tar: %v", err) + } + if _, err := io.Copy(io.Discard, tr); err != nil { + t.Fatalf("drain %s: %v", hdr.Name, err) + } + seen[hdr.Name] = true + if hdr.Uid != AgentUID { + t.Fatalf("tar entry %q Uid = %d, want %d (agent) — root-owned injection causes the list_peers 401", + hdr.Name, hdr.Uid, AgentUID) + } + if hdr.Gid != AgentGID { + t.Fatalf("tar entry %q Gid = %d, want %d (agent)", hdr.Name, hdr.Gid, AgentGID) + } + } + + for _, want := range []string{".auth_token", ".platform_inbound_secret"} { + if !seen[want] { + t.Fatalf("tar missing %q (seen: %v)", want, seen) + } + } +} + +// TestWriteAuthTokenVolumeCmd_ChownsToAgent 
covers the issue #1877 pre-start +// volume-write path (WriteAuthTokenToVolume): the throwaway alpine container +// writes /vol/.auth_token then chmod 0600 but, pre-fix, never chowns it, so it +// stays root:root (alpine runs the command as root). The literal command must +// chown the file to the agent uid:gid so the agent-uid MCP server can read it. +func TestWriteAuthTokenVolumeCmd_ChownsToAgent(t *testing.T) { + cmd := writeAuthTokenVolumeCmd() + + if !strings.Contains(cmd, "chmod 0600 /vol/.auth_token") { + t.Fatalf("alpine cmd lost the 0600 chmod (regression): %q", cmd) + } + + wantChown := "chown 1000:1000 /vol/.auth_token" + if !strings.Contains(cmd, wantChown) { + t.Fatalf("alpine cmd = %q, missing %q — without it .auth_token stays root:root "+ + "and the agent-uid MCP server gets EACCES → empty bearer → list_peers 401", + cmd, wantChown) + } +}