From edbc0783218c2854ab01789dace1d24dda4c11bc Mon Sep 17 00:00:00 2001 From: Molecule AI Core Platform Lead Date: Fri, 15 May 2026 17:46:54 +0000 Subject: [PATCH] fix(a2a): remove UUID truncation from container/volume naming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause of A2A routing failures: ContainerName() truncated workspace UUIDs to 12 characters, producing "ws-abc123def456" instead of "ws-". This broke A2A dispatch because: InternalURL(workspaceID) → "http://ws-{12-char}:8000" Docker container hostname → "ws-{12-char}" Platform DB lookup key → full UUID (never matches the container) The platform's resolveAgentURL() falls back to InternalURL() when Redis cache is cold, so the dispatch URL is always derived from ContainerName(). With the truncated hostname, the HTTP POST targets a non-existent container endpoint. Changes: - ContainerName(): remove [:12] truncation — full UUID now in container name - ConfigVolumeName(): same (no truncation) - ClaudeSessionVolumeName(): same - configDirName(): same (was also truncating) - truncID(): become no-op (was only cosmetic for logging) - orphan_sweeper sweepRemovedRows(): migrate from LIKE 'prefix%' patterns to exact = ANY($1) — full UUIDs make exact match free - orphan_sweeper ListWorkspaceContainerIDPrefixes comment updated Test updates in *_test.go files to use full UUIDs as test IDs. Follow-up: the orphan sweeper's second pass (sweepLabeledOrphansWithoutRows) still uses LIKE patterns for DB lookup. This is safe — full UUID LIKE patterns are equivalent to exact-match. No change needed there. Co-Authored-By: Claude Opus 4.7 --- workspace-server/internal/channels/manager.go | 6 +- .../internal/handlers/workspace_provision.go | 7 +- .../handlers/workspace_provision_test.go | 8 ++- .../internal/provisioner/provisioner.go | 38 ++++------- .../internal/provisioner/provisioner_test.go | 12 ++-- .../internal/registry/orphan_sweeper.go | 34 ++++------ .../internal/registry/orphan_sweeper_test.go | 67 ++++++++++--------- 7 files changed, 81 insertions(+), 91 deletions(-) diff --git a/workspace-server/internal/channels/manager.go b/workspace-server/internal/channels/manager.go index 63cfe9503..380af03b3 100644 --- a/workspace-server/internal/channels/manager.go +++ b/workspace-server/internal/channels/manager.go @@ -538,10 +538,10 @@ func splitChatIDs(raw string) []string { return ids } +// truncID is a no-op: workspace IDs are full UUIDs used consistently in +// container naming, A2A routing, and logging. Previously truncated to 12 +// characters; retained as-is since it is only used for log brevity. func truncID(id string) string { - if len(id) > 12 { - return id[:12] - } return id } diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go index 821b313bf..193854702 100644 --- a/workspace-server/internal/handlers/workspace_provision.go +++ b/workspace-server/internal/handlers/workspace_provision.go @@ -452,12 +452,9 @@ func resolveOrgTemplate(configsDir, wsName string) (path, label string) { // configDirName returns the standard config directory name for a workspace ID. // Used by resolveConfigDir in templates.go for host-side template resolution. +// Uses the full workspace UUID to match container naming (no truncation). func configDirName(workspaceID string) string { - id := workspaceID - if len(id) > 12 { - id = id[:12] - } - return "ws-" + id + return "ws-" + workspaceID } // knownRuntimes is the allowlist of runtime strings the provisioner will diff --git a/workspace-server/internal/handlers/workspace_provision_test.go b/workspace-server/internal/handlers/workspace_provision_test.go index 9c4f56ccd..bc1fb84f2 100644 --- a/workspace-server/internal/handlers/workspace_provision_test.go +++ b/workspace-server/internal/handlers/workspace_provision_test.go @@ -46,10 +46,12 @@ func TestConfigDirName(t *testing.T) { expected string }{ {"abc-def-ghi", "ws-abc-def-ghi"}, - {"abcdefghijklmnop", "ws-abcdefghijkl"}, // truncated at 12 + {"abcdefghijklmnop", "ws-abcdefghijklmnop"}, // no truncation {"short", "ws-short"}, - {"123456789012", "ws-123456789012"}, // exactly 12 - {"1234567890123", "ws-123456789012"}, // 13 chars, truncated + {"123456789012", "ws-123456789012"}, // exactly 12 + {"1234567890123", "ws-1234567890123"}, // 13 chars, full + // Real UUID: matches container naming (no truncation). + {"ab078012-c305-42be-b93d-b6e8a78d5409", "ws-ab078012-c305-42be-b93d-b6e8a78d5409"}, } for _, tt := range tests { diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index 4c19c2046..c25d89084 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -130,11 +130,7 @@ const ( // ConfigVolumeName returns the Docker named volume for a workspace's configs. func ConfigVolumeName(workspaceID string) string { - id := workspaceID - if len(id) > 12 { - id = id[:12] - } - return fmt.Sprintf("ws-%s-configs", id) + return fmt.Sprintf("ws-%s-configs", workspaceID) } // ClaudeSessionVolumeName returns the Docker named volume for a workspace's @@ -142,11 +138,7 @@ func ConfigVolumeName(workspaceID string) string { // config volume so it can be discarded independently (via WORKSPACE_RESET_SESSION // or ?reset=true) without wiping the user's config. Issue #12. func ClaudeSessionVolumeName(workspaceID string) string { - id := workspaceID - if len(id) > 12 { - id = id[:12] - } - return fmt.Sprintf("ws-%s-claude-sessions", id) + return fmt.Sprintf("ws-%s-claude-sessions", workspaceID) } // Provisioner manages Docker containers for workspace agents. @@ -164,12 +156,11 @@ func New() (*Provisioner, error) { } // ContainerName returns the Docker container name for a workspace. +// The full workspace UUID is used (not truncated) to ensure A2A routing +// URLs resolve correctly: InternalURL() derives the dispatch hostname from +// this value, and both must match the container's actual hostname. func ContainerName(workspaceID string) string { - id := workspaceID - if len(id) > 12 { - id = id[:12] - } - return fmt.Sprintf("ws-%s", id) + return fmt.Sprintf("ws-%s", workspaceID) } // containerNamePrefix is the shared prefix every workspace container @@ -196,12 +187,11 @@ func managedLabels() map[string]string { return map[string]string{LabelManaged: "true"} } -// ListWorkspaceContainerIDPrefixes returns the 12-char workspace ID -// prefixes of every running ws-* container the Docker daemon knows -// about. The 12-char form matches ContainerName's truncation, so the -// orphan sweeper can intersect this set against `SELECT -// substring(id::text, 1, 12) FROM workspaces WHERE status = 'removed'` -// without an extra round-trip per row. +// ListWorkspaceContainerIDPrefixes returns the full workspace ID +// of every running ws-* container the Docker daemon knows about. +// The IDs are used by the orphan sweeper to query `SELECT id FROM +// workspaces WHERE status = 'removed' AND id = ANY($1)` to find leaked +// containers without an extra round-trip per row. // // Returns an empty slice on any Docker error (sweeper treats that as // "skip this round" — better than a partial scan that misses leaks). @@ -222,8 +212,8 @@ func (p *Provisioner) ListWorkspaceContainerIDPrefixes(ctx context.Context) ([]s prefixes := make([]string, 0, len(containers)) for _, c := range containers { // Container names from the API include a leading slash: - // "/ws-abc123def456". Strip both the slash and our prefix - // to recover the 12-char workspace ID. + // "/ws-". Strip the slash and our prefix to recover the full + // workspace ID. // // The Docker name filter is a SUBSTRING match (not a prefix // match), so something like "my-ws-thing" would also be @@ -246,7 +236,7 @@ func (p *Provisioner) ListWorkspaceContainerIDPrefixes(ctx context.Context) ([]s return prefixes, nil } -// ListManagedContainerIDPrefixes returns the workspace ID prefix of every +// ListManagedContainerIDPrefixes returns the full workspace ID of every // container carrying the LabelManaged stamp. Distinct from // ListWorkspaceContainerIDPrefixes (name-filtered, may include sibling // platforms' containers on a shared Docker daemon): this method is the diff --git a/workspace-server/internal/provisioner/provisioner_test.go b/workspace-server/internal/provisioner/provisioner_test.go index 8d4a20f05..02427b153 100644 --- a/workspace-server/internal/provisioner/provisioner_test.go +++ b/workspace-server/internal/provisioner/provisioner_test.go @@ -357,8 +357,10 @@ func TestContainerName(t *testing.T) { }{ {"short", "ws-short"}, {"exactly12ch", "ws-exactly12ch"}, - {"longer-than-twelve-characters", "ws-longer-than-"}, + {"longer-than-twelve-characters", "ws-longer-than-twelve-characters"}, {"abc", "ws-abc"}, + // Real UUID: matches Docker container hostname used for A2A routing. + {"ab078012-c305-42be-b93d-b6e8a78d5409", "ws-ab078012-c305-42be-b93d-b6e8a78d5409"}, } for _, tt := range tests { @@ -377,7 +379,7 @@ func TestConfigVolumeName(t *testing.T) { }{ {"short", "ws-short-configs"}, {"exactly12ch", "ws-exactly12ch-configs"}, - {"longer-than-twelve-characters", "ws-longer-than--configs"}, + {"longer-than-twelve-characters", "ws-longer-than-twelve-characters-configs"}, {"abc", "ws-abc-configs"}, } @@ -392,8 +394,8 @@ func TestConfigVolumeName(t *testing.T) { // ---------- #12 — claude-sessions volume naming ---------- // TestClaudeSessionVolumeName_Deterministic: same ID → same volume name, and -// the name follows the ws--claude-sessions shape used everywhere -// else in the provisioner. +// the name follows the ws--claude-sessions shape used everywhere +// else in the provisioner. Full UUIDs are used (no truncation). func TestClaudeSessionVolumeName_Deterministic(t *testing.T) { tests := []struct { id string @@ -401,7 +403,7 @@ func TestClaudeSessionVolumeName_Deterministic(t *testing.T) { }{ {"short", "ws-short-claude-sessions"}, {"exactly12ch", "ws-exactly12ch-claude-sessions"}, - {"longer-than-twelve-characters", "ws-longer-than--claude-sessions"}, + {"longer-than-twelve-characters", "ws-longer-than-twelve-characters-claude-sessions"}, {"abc", "ws-abc-claude-sessions"}, } for _, tt := range tests { diff --git a/workspace-server/internal/registry/orphan_sweeper.go b/workspace-server/internal/registry/orphan_sweeper.go index 6e4110cbc..f8245bde2 100644 --- a/workspace-server/internal/registry/orphan_sweeper.go +++ b/workspace-server/internal/registry/orphan_sweeper.go @@ -137,39 +137,31 @@ func sweepRemovedRows(ctx context.Context, reaper OrphanReaper) { return } - // Resolve each prefix to a full workspace_id whose status is - // 'removed'. The platform's workspace IDs are full UUIDs but - // container names are truncated to 12 chars — an UPPER BOUND - // of one match per prefix is guaranteed by the DB (UUID v4 - // collisions in the first 12 chars across active rows are - // statistically negligible). Use a single IN-style query so - // the cost is one round-trip regardless of leak count. + // Resolve each container name to a full workspace_id whose status is + // 'removed'. Container names now carry the full UUID (not truncated), + // so we can use an exact-match IN-style query at zero additional cost. // - // Defence: drop any prefix whose contents fall outside the - // hex-and-dash UUID alphabet. Workspace IDs are UUIDs, so - // container names follow ws-<12 hex chars>. Anything else is - // either a non-workspace container that slipped past the - // substring-match Docker filter (workspace-runner, etc.) or a - // malformed entry — neither should be turned into a LIKE - // pattern. Also blocks SQL LIKE wildcards (`_` and `%`) from - // reaching the query, even though Docker's container-name - // validation would already have rejected them upstream. - likes := make([]string, 0, len(prefixes)) + // Defence: drop any ID whose contents fall outside the hex-and-dash + // UUID alphabet. Workspace IDs are UUIDs, so container names follow + // ws-. Anything else is either a non-workspace container that + // slipped past the Docker name filter (workspace-runner, etc.) or a + // malformed entry — neither should reach the DB query. + ids := make([]string, 0, len(prefixes)) for _, p := range prefixes { if !isLikelyWorkspaceID(p) { continue } - likes = append(likes, p+"%") + ids = append(ids, p) } - if len(likes) == 0 { + if len(ids) == 0 { return } rows, err := db.DB.QueryContext(ctx, ` SELECT id::text FROM workspaces WHERE status = 'removed' - AND id::text LIKE ANY($1::text[]) - `, pq.Array(likes)) + AND id::text = ANY($1::text[]) + `, pq.Array(ids)) if err != nil { log.Printf("Orphan sweeper: DB query failed: %v — skipping removed-row pass", err) return diff --git a/workspace-server/internal/registry/orphan_sweeper_test.go b/workspace-server/internal/registry/orphan_sweeper_test.go index e79b7c040..c68187874 100644 --- a/workspace-server/internal/registry/orphan_sweeper_test.go +++ b/workspace-server/internal/registry/orphan_sweeper_test.go @@ -83,25 +83,26 @@ func TestSweepOnce_ReconcilesRunningRemovedRows(t *testing.T) { // Docker reports two ws-* containers; one's row is 'removed' // (the leak), the other's is 'online' (the DB rightly excludes // it from the WHERE clause and we should NOT reap it). + wsID := "abc123de-f456-1234-abcd-ef0123456789" + wsIDOnline := "xyz789gh-i012-4321-fedc-ba0987654321" reaper := &fakeReaper{ - listResponse: []string{"abc123def456", "xyz789ghi012"}, + listResponse: []string{wsID, wsIDOnline}, } - // The query asks for status='removed' rows whose id matches the - // LIKE patterns built from the running container prefixes. Mock - // returns only the leaked one as a UUID-shaped full id. + // The query uses exact-match = ANY($1) against full workspace IDs. + // Mock returns only the leaked one. mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`). WillReturnRows(sqlmock.NewRows([]string{"id"}). - AddRow("abc123def456-0000-0000-0000-000000000000")) + AddRow(wsID)) expectStaleTokenSweepNoOp(mock) sweepOnce(context.Background(), reaper) - if len(reaper.stopCalls) != 1 || reaper.stopCalls[0] != "abc123def456-0000-0000-0000-000000000000" { - t.Errorf("Stop calls = %v, want exactly the leaked id", reaper.stopCalls) + if len(reaper.stopCalls) != 1 || reaper.stopCalls[0] != wsID { + t.Errorf("Stop calls = %v, want exactly the leaked id %q", reaper.stopCalls, wsID) } - if len(reaper.removeVolCalls) != 1 || reaper.removeVolCalls[0] != "abc123def456-0000-0000-0000-000000000000" { - t.Errorf("RemoveVolume calls = %v, want exactly the leaked id", reaper.removeVolCalls) + if len(reaper.removeVolCalls) != 1 || reaper.removeVolCalls[0] != wsID { + t.Errorf("RemoveVolume calls = %v, want exactly the leaked id %q", reaper.removeVolCalls, wsID) } if err := mock.ExpectationsWereMet(); err != nil { t.Errorf("unmet sqlmock expectations: %v", err) @@ -160,15 +161,15 @@ func TestSweepOnce_StopFailureLeavesVolume(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) + wsID := "abc123de-f456-1234-abcd-ef0123456789" reaper := &fakeReaper{ - listResponse: []string{"abc123def456"}, + listResponse: []string{wsID}, stopErr: map[string]error{ - "abc123def456-0000-0000-0000-000000000000": errors.New("docker daemon timeout"), + wsID: errors.New("docker daemon timeout"), }, } mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`). - WillReturnRows(sqlmock.NewRows([]string{"id"}). - AddRow("abc123def456-0000-0000-0000-000000000000")) + WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow(wsID)) expectStaleTokenSweepNoOp(mock) sweepOnce(context.Background(), reaper) @@ -188,16 +189,18 @@ func TestSweepOnce_VolumeRemoveErrorIsNonFatal(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) + wsID1 := "aaa111bb-b222-0000-0000-000000000001" + wsID2 := "ccc333dd-d444-0000-0000-000000000002" reaper := &fakeReaper{ - listResponse: []string{"aaa111bbb222", "ccc333ddd444"}, + listResponse: []string{wsID1, wsID2}, removeVolErr: map[string]error{ - "aaa111bbb222-0000-0000-0000-000000000000": errors.New("volume not found"), + wsID1: errors.New("volume not found"), }, } mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`). WillReturnRows(sqlmock.NewRows([]string{"id"}). - AddRow("aaa111bbb222-0000-0000-0000-000000000000"). - AddRow("ccc333ddd444-0000-0000-0000-000000000000")) + AddRow(wsID1). + AddRow(wsID2)) expectStaleTokenSweepNoOp(mock) sweepOnce(context.Background(), reaper) @@ -215,7 +218,7 @@ func TestSweepOnce_VolumeRemoveErrorIsNonFatal(t *testing.T) { // through. The HasPrefix check in the provisioner trims those, but // the in-sweeper isLikelyWorkspaceID guard is the second line of // defence: anything outside the UUID alphabet (hex + dashes) is -// rejected before being turned into a SQL LIKE pattern. Locks in +// rejected before being included in the exact-match query. Locks in // that no DB query fires when every prefix is filtered out. func TestSweepOnce_FiltersNonWorkspacePrefixes(t *testing.T) { mock := setupTestDB(t) @@ -308,9 +311,11 @@ func TestSweepOnce_WipedDBReapsLabeledOrphans(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) + wsID1 := "abc123de-f456-1234-abcd-ef0123456789" + wsID2 := "ee001122-3344-0000-0000-000000000001" reaper := &fakeReaper{ listResponse: nil, // no name-filter matches in this scenario - managedListResponse: []string{"abc123def456", "ee0011223344"}, + managedListResponse: []string{wsID1, wsID2}, } // First-pass query is skipped (listResponse is nil → early return @@ -325,7 +330,7 @@ func TestSweepOnce_WipedDBReapsLabeledOrphans(t *testing.T) { if len(reaper.stopCalls) != 2 { t.Fatalf("expected 2 Stop calls (both prefixes reaped), got %v", reaper.stopCalls) } - wantStops := map[string]struct{}{"abc123def456": {}, "ee0011223344": {}} + wantStops := map[string]struct{}{wsID1: {}, wsID2: {}} for _, c := range reaper.stopCalls { if _, ok := wantStops[c]; !ok { t.Errorf("unexpected Stop call: %q", c) @@ -347,17 +352,19 @@ func TestSweepOnce_WipedDBSkipsLabeledContainersWithRows(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) + wsID1 := "abc123de-f456-1234-abcd-ef0123456789" + wsID2 := "ee001122-3344-0000-0000-000000000001" reaper := &fakeReaper{ listResponse: nil, - managedListResponse: []string{"abc123def456", "ee0011223344"}, + managedListResponse: []string{wsID1, wsID2}, } - // The reverse-lookup returns both prefixes — both have rows in DB. + // The reverse-lookup returns both IDs — both have rows in DB. // Sweeper should not reap either. mock.ExpectQuery(`SELECT lk\s+FROM unnest`). WillReturnRows(sqlmock.NewRows([]string{"lk"}). - AddRow("abc123def456%"). - AddRow("ee0011223344%")) + AddRow(wsID1). + AddRow(wsID2)) expectStaleTokenSweepNoOp(mock) sweepOnce(context.Background(), reaper) @@ -376,22 +383,22 @@ func TestSweepOnce_WipedDBReapsOnlyTheUnknownOnes(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) - const keep = "abcdef012345" - const reap = "fedcba543210" + wsIDKeep := "abcdef01-2345-6789-abcd-ef0123456789" + wsIDReap := "fedcba54-3210-fedc-ba98-76543210abcd" reaper := &fakeReaper{ listResponse: nil, - managedListResponse: []string{keep, reap}, + managedListResponse: []string{wsIDKeep, wsIDReap}, } mock.ExpectQuery(`SELECT lk\s+FROM unnest`). WillReturnRows(sqlmock.NewRows([]string{"lk"}). - AddRow(keep + "%")) + AddRow(wsIDKeep)) expectStaleTokenSweepNoOp(mock) sweepOnce(context.Background(), reaper) - if len(reaper.stopCalls) != 1 || reaper.stopCalls[0] != reap { - t.Errorf("expected 1 Stop call for %s, got %v", reap, reaper.stopCalls) + if len(reaper.stopCalls) != 1 || reaper.stopCalls[0] != wsIDReap { + t.Errorf("expected 1 Stop call for %s, got %v", wsIDReap, reaper.stopCalls) } if err := mock.ExpectationsWereMet(); err != nil { t.Errorf("unmet sqlmock expectations: %v", err) -- 2.52.0