Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 3805779bc9 | |||
| 7ef6bb88c8 | |||
| 35270f3c37 | |||
| e0411e73f7 | |||
| 989912daf0 | |||
| ae9734f46c | |||
| 209fd2c9ae |
+21
-13
@@ -145,10 +145,11 @@ jobs:
|
||||
# the diagnostic step with its own continue-on-error: true (line 203).
|
||||
# Flip confirmed by CI / Platform (Go) status = success on main HEAD 363905d3.
|
||||
continue-on-error: false
|
||||
# Job-level ceiling. The go test step below runs with a per-step 10m timeout;
|
||||
# this cap catches any step that leaks past that. Set well above 10m so
|
||||
# the per-step timeout is the active constraint.
|
||||
timeout-minutes: 15
|
||||
# Job-level ceiling. The go test step below runs with a per-step 70m timeout;
|
||||
# this cap catches any step that leaks past that. Set well above 70m so
|
||||
# the per-step timeout is the active constraint. Raised to 75m
|
||||
# to account for golangci-lint ~17m + test suite ~20-30m on cold runner (mc#1099).
|
||||
timeout-minutes: 75
|
||||
defaults:
|
||||
run:
|
||||
working-directory: workspace-server
|
||||
@@ -174,14 +175,20 @@ jobs:
|
||||
run: go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.12.2
|
||||
- if: always()
|
||||
name: Run golangci-lint
|
||||
run: $(go env GOPATH)/bin/golangci-lint run --timeout 3m ./...
|
||||
# mc#1099: --no-config bypasses .golangci.yaml ceiling; --timeout 30m
|
||||
# is the active constraint. Cold runner: fetch-depth:0 clone (5-10m) + Go
|
||||
# toolchain (5-10m) + mod download (2-5m) + build + vet + install lint
|
||||
# (5m) = ~15-20m before linting even starts. 30m gives headroom.
|
||||
run: $(go env GOPATH)/bin/golangci-lint run --no-config --timeout 30m ./...
|
||||
- if: always()
|
||||
name: Diagnostic — per-package verbose 60s
|
||||
name: Diagnostic — per-package verbose 600s
|
||||
# mc#1099: step-level ceiling above the 600s Go timeout for cold-runner headroom.
|
||||
timeout-minutes: 20
|
||||
run: |
|
||||
set +e
|
||||
go test -race -v -timeout 60s ./internal/handlers/... 2>&1 | tee /tmp/test-handlers.log
|
||||
go test -race -v -timeout 600s ./internal/handlers/... 2>&1 | tee /tmp/test-handlers.log
|
||||
handlers_exit=$?
|
||||
go test -race -v -timeout 60s ./internal/pendinguploads/... 2>&1 | tee /tmp/test-pu.log
|
||||
go test -race -v -timeout 600s ./internal/pendinguploads/... 2>&1 | tee /tmp/test-pu.log
|
||||
pu_exit=$?
|
||||
echo "::group::handlers exit=$handlers_exit (last 100 lines)"
|
||||
tail -100 /tmp/test-handlers.log
|
||||
@@ -193,11 +200,12 @@ jobs:
|
||||
continue-on-error: true
|
||||
- if: always()
|
||||
name: Run tests with race detection and coverage
|
||||
# Explicit timeout: cold runner cache causes OOM kills at ~4m39s on the
|
||||
# full ./... suite with race detection + coverage. A 10m per-step timeout
|
||||
# lets the suite complete on cold cache (~5-7m) while failing cleanly
|
||||
# instead of OOM-killing. The job-level timeout (15m) is a backstop.
|
||||
run: go test -race -timeout 10m -coverprofile=coverage.out ./...
|
||||
# mc#1099: cold runner (~5-20m) + race detector (3-5x overhead) can push
|
||||
# the suite past 10m. Per-step ceiling must exceed Go-level timeout so
|
||||
# Go's timeout fires first (clean interrupt) rather than the step ceiling
|
||||
# (SIGKILL). Job-level ceiling (75m) is the outer backstop.
|
||||
timeout-minutes: 70
|
||||
run: go test -race -timeout 60m -coverprofile=coverage.out ./...
|
||||
|
||||
- if: always()
|
||||
name: Per-file coverage report
|
||||
|
||||
@@ -104,6 +104,9 @@ func (h *ChannelHandler) List(c *gin.Context) {
|
||||
}
|
||||
result = append(result, entry)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
log.Printf("Channels list rows.Err workspace=%s: %v", workspaceID, err)
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, result)
|
||||
}
|
||||
@@ -149,15 +152,6 @@ func (h *ChannelHandler) Create(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
// #319: encrypt sensitive fields (bot_token, webhook_secret) before
|
||||
// persisting so a DB read/backup leak can't recover the credentials.
|
||||
// Validation above ran against plaintext; storage is ciphertext.
|
||||
if err := channels.EncryptSensitiveFields(body.Config); err != nil {
|
||||
log.Printf("Channels: encrypt config failed for workspace %s: %v", workspaceID, err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "encrypt failed"})
|
||||
return
|
||||
}
|
||||
|
||||
configJSON, _ := json.Marshal(body.Config)
|
||||
allowedJSON, _ := json.Marshal(body.AllowedUsers)
|
||||
enabled := true
|
||||
@@ -514,6 +508,9 @@ func (h *ChannelHandler) Webhook(c *gin.Context) {
|
||||
candidates = append(candidates, row)
|
||||
}
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
log.Printf("Channels webhook rows.Err channel_type=%s: %v", channelType, err)
|
||||
}
|
||||
|
||||
if targetSlug != "" {
|
||||
// [slug] routing — match against config username (lowercased)
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"crypto/rand"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
@@ -1013,6 +1014,54 @@ func TestChannelHandler_Webhook_Discord_InvalidSig_Returns401(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestChannelHandler_List_RowsErr_LogsError verifies that when the row iterator
|
||||
// returns an error after the last row (mid-stream DB error), rows.Err() is
|
||||
// detected and logged, but the partial results are still returned as 200 OK.
|
||||
// This is the fix for the missing rows.Err() check in List().
|
||||
func TestChannelHandler_List_RowsErr_LogsError(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
handler := NewChannelHandler(newTestChannelManager())
|
||||
|
||||
// Return one valid row, then mark row 0 as having a scan error.
|
||||
// RowError(n, err) causes Scan() to fail on row n, and sets rows.Err()
|
||||
// to the error. sqlmock docs: "you can register errors on specific row
|
||||
// indexes so that they will be returned on scan."
|
||||
rows := sqlmock.NewRows([]string{
|
||||
"id", "workspace_id", "channel_type", "channel_config", "enabled",
|
||||
"allowed_users", "last_message_at", "message_count", "created_at", "updated_at",
|
||||
}).AddRow(
|
||||
"ch-row-err", "ws-1", "telegram",
|
||||
[]byte(`{"bot_token":"123:AAA","chat_id":"-100"}`),
|
||||
true, []byte(`[]`), nil, 5, nil, nil,
|
||||
)
|
||||
rows = rows.RowError(0, errors.New("connection lost"))
|
||||
|
||||
mock.ExpectQuery("SELECT .* FROM workspace_channels WHERE workspace_id").
|
||||
WithArgs("ws-1").
|
||||
WillReturnRows(rows)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Request, _ = http.NewRequest("GET", "/workspaces/ws-1/channels", nil)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
|
||||
|
||||
handler.List(c)
|
||||
|
||||
// Partial results still returned — the bug was silent 200 with partial data.
|
||||
if w.Code != 200 {
|
||||
t.Errorf("expected 200 (partial results on rows.Err), got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
// The rows.Err() is logged, not surfaced to the client (non-fatal).
|
||||
var result []map[string]interface{}
|
||||
json.Unmarshal(w.Body.Bytes(), &result)
|
||||
if len(result) == 0 {
|
||||
t.Error("expected at least partial results despite rows.Err")
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations not met: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestChannelHandler_Webhook_Discord_ValidSig_PingAccepted verifies that a
|
||||
// correctly signed Discord PING (type=1) passes the signature gate and the
|
||||
// handler returns 200 (PING returns nil msg → "ignored" status).
|
||||
|
||||
@@ -130,11 +130,7 @@ const (
|
||||
|
||||
// ConfigVolumeName returns the Docker named volume for a workspace's configs.
|
||||
func ConfigVolumeName(workspaceID string) string {
|
||||
id := workspaceID
|
||||
if len(id) > 12 {
|
||||
id = id[:12]
|
||||
}
|
||||
return fmt.Sprintf("ws-%s-configs", id)
|
||||
return fmt.Sprintf("ws-%s-configs", workspaceID)
|
||||
}
|
||||
|
||||
// ClaudeSessionVolumeName returns the Docker named volume for a workspace's
|
||||
@@ -142,11 +138,7 @@ func ConfigVolumeName(workspaceID string) string {
|
||||
// config volume so it can be discarded independently (via WORKSPACE_RESET_SESSION
|
||||
// or ?reset=true) without wiping the user's config. Issue #12.
|
||||
func ClaudeSessionVolumeName(workspaceID string) string {
|
||||
id := workspaceID
|
||||
if len(id) > 12 {
|
||||
id = id[:12]
|
||||
}
|
||||
return fmt.Sprintf("ws-%s-claude-sessions", id)
|
||||
return fmt.Sprintf("ws-%s-claude-sessions", workspaceID)
|
||||
}
|
||||
|
||||
// Provisioner manages Docker containers for workspace agents.
|
||||
@@ -164,12 +156,12 @@ func New() (*Provisioner, error) {
|
||||
}
|
||||
|
||||
// ContainerName returns the Docker container name for a workspace.
|
||||
// Docker limits container names to 128 chars; a UUID (36 chars) with the
|
||||
// "ws-" prefix is 39 chars — well within that limit. Previously truncated
|
||||
// to 12 chars (KI-010) which caused container-name collisions when two
|
||||
// workspaces shared the same first 12 UUID characters.
|
||||
func ContainerName(workspaceID string) string {
|
||||
id := workspaceID
|
||||
if len(id) > 12 {
|
||||
id = id[:12]
|
||||
}
|
||||
return fmt.Sprintf("ws-%s", id)
|
||||
return fmt.Sprintf("ws-%s", workspaceID)
|
||||
}
|
||||
|
||||
// containerNamePrefix is the shared prefix every workspace container
|
||||
@@ -196,12 +188,10 @@ func managedLabels() map[string]string {
|
||||
return map[string]string{LabelManaged: "true"}
|
||||
}
|
||||
|
||||
// ListWorkspaceContainerIDPrefixes returns the 12-char workspace ID
|
||||
// prefixes of every running ws-* container the Docker daemon knows
|
||||
// about. The 12-char form matches ContainerName's truncation, so the
|
||||
// orphan sweeper can intersect this set against `SELECT
|
||||
// substring(id::text, 1, 12) FROM workspaces WHERE status = 'removed'`
|
||||
// without an extra round-trip per row.
|
||||
// ListWorkspaceContainerIDPrefixes returns the workspace ID portion (after
|
||||
// the "ws-" prefix) of every running ws-* container Docker knows about.
|
||||
// The orphan sweeper uses this with exact matching (KI-010 fix: no
|
||||
// truncation to 12 chars — ContainerName uses the full workspace ID now).
|
||||
//
|
||||
// Returns an empty slice on any Docker error (sweeper treats that as
|
||||
// "skip this round" — better than a partial scan that misses leaks).
|
||||
@@ -246,11 +236,11 @@ func (p *Provisioner) ListWorkspaceContainerIDPrefixes(ctx context.Context) ([]s
|
||||
return prefixes, nil
|
||||
}
|
||||
|
||||
// ListManagedContainerIDPrefixes returns the workspace ID prefix of every
|
||||
// container carrying the LabelManaged stamp. Distinct from
|
||||
// ListWorkspaceContainerIDPrefixes (name-filtered, may include sibling
|
||||
// platforms' containers on a shared Docker daemon): this method is the
|
||||
// "things definitely provisioned by a Molecule platform process" set.
|
||||
// ListManagedContainerIDPrefixes returns the workspace ID portion (after
|
||||
// the "ws-" prefix) of every container carrying the LabelManaged stamp.
|
||||
// Distinct from ListWorkspaceContainerIDPrefixes (name-filtered, may include
|
||||
// sibling platforms' containers on a shared Docker daemon): this method is
|
||||
// the "things definitely provisioned by a Molecule platform process" set.
|
||||
//
|
||||
// The orphan sweeper uses this for its second pass — reaping containers
|
||||
// whose workspace row no longer exists at all (the wiped-DB case after
|
||||
|
||||
@@ -357,8 +357,11 @@ func TestContainerName(t *testing.T) {
|
||||
}{
|
||||
{"short", "ws-short"},
|
||||
{"exactly12ch", "ws-exactly12ch"},
|
||||
{"longer-than-twelve-characters", "ws-longer-than-"},
|
||||
// No truncation: full ID is used (KI-010 fix)
|
||||
{"longer-than-twelve-characters", "ws-longer-than-twelve-characters"},
|
||||
{"abc", "ws-abc"},
|
||||
// Full UUID: 36 chars + "ws-" = 39 chars, well under Docker's 128 limit
|
||||
{"550e8400-e29b-41d4-a716-446655440000", "ws-550e8400-e29b-41d4-a716-446655440000"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
@@ -377,8 +380,10 @@ func TestConfigVolumeName(t *testing.T) {
|
||||
}{
|
||||
{"short", "ws-short-configs"},
|
||||
{"exactly12ch", "ws-exactly12ch-configs"},
|
||||
{"longer-than-twelve-characters", "ws-longer-than--configs"},
|
||||
// No truncation (KI-010 fix)
|
||||
{"longer-than-twelve-characters", "ws-longer-than-twelve-characters-configs"},
|
||||
{"abc", "ws-abc-configs"},
|
||||
{"550e8400-e29b-41d4-a716-446655440000", "ws-550e8400-e29b-41d4-a716-446655440000-configs"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
@@ -392,8 +397,8 @@ func TestConfigVolumeName(t *testing.T) {
|
||||
// ---------- #12 — claude-sessions volume naming ----------
|
||||
|
||||
// TestClaudeSessionVolumeName_Deterministic: same ID → same volume name, and
|
||||
// the name follows the ws-<id[:12]>-claude-sessions shape used everywhere
|
||||
// else in the provisioner.
|
||||
// the name follows the ws-<full-id>-claude-sessions shape (KI-010 fix:
|
||||
// no truncation to 12 chars).
|
||||
func TestClaudeSessionVolumeName_Deterministic(t *testing.T) {
|
||||
tests := []struct {
|
||||
id string
|
||||
@@ -401,8 +406,10 @@ func TestClaudeSessionVolumeName_Deterministic(t *testing.T) {
|
||||
}{
|
||||
{"short", "ws-short-claude-sessions"},
|
||||
{"exactly12ch", "ws-exactly12ch-claude-sessions"},
|
||||
{"longer-than-twelve-characters", "ws-longer-than--claude-sessions"},
|
||||
// No truncation (KI-010 fix)
|
||||
{"longer-than-twelve-characters", "ws-longer-than-twelve-characters-claude-sessions"},
|
||||
{"abc", "ws-abc-claude-sessions"},
|
||||
{"550e8400-e29b-41d4-a716-446655440000", "ws-550e8400-e29b-41d4-a716-446655440000-claude-sessions"},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
got := ClaudeSessionVolumeName(tt.id)
|
||||
|
||||
@@ -45,12 +45,11 @@ type OrphanReaper interface {
|
||||
RemoveVolume(ctx context.Context, workspaceID string) error
|
||||
}
|
||||
|
||||
// isLikelyWorkspaceID accepts strings shaped like a UUID prefix —
|
||||
// hex chars and `-` only. Workspace IDs are full UUIDs and the
|
||||
// container-name truncation keeps the hex prefix intact, so any
|
||||
// container name that doesn't match this is by definition not one
|
||||
// of ours and should be skipped. Also doubles as a SQL LIKE
|
||||
// wildcard guard (rejects `_` and `%`).
|
||||
// isLikelyWorkspaceID accepts strings shaped like a full UUID —
|
||||
// hex chars and `-` only. This filters out non-workspace Docker containers
|
||||
// (e.g. workspace-runner, platform-server) that slipped past the Docker name
|
||||
// filter because the filter uses substring matching, not prefix matching.
|
||||
// Also guards against SQL LIKE wildcards (`_` and `%`) in unusual container names.
|
||||
func isLikelyWorkspaceID(s string) bool {
|
||||
if s == "" {
|
||||
return false
|
||||
@@ -128,48 +127,36 @@ func sweepOnce(parent context.Context, reaper OrphanReaper) {
|
||||
// Conservative — only acts on rows the platform explicitly marked
|
||||
// for cleanup. Runs every cycle.
|
||||
func sweepRemovedRows(ctx context.Context, reaper OrphanReaper) {
|
||||
prefixes, err := reaper.ListWorkspaceContainerIDPrefixes(ctx)
|
||||
ids, err := reaper.ListWorkspaceContainerIDPrefixes(ctx)
|
||||
if err != nil {
|
||||
log.Printf("Orphan sweeper: ListWorkspaceContainerIDPrefixes failed: %v — skipping removed-row pass", err)
|
||||
return
|
||||
}
|
||||
if len(prefixes) == 0 {
|
||||
if len(ids) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Resolve each prefix to a full workspace_id whose status is
|
||||
// 'removed'. The platform's workspace IDs are full UUIDs but
|
||||
// container names are truncated to 12 chars — an UPPER BOUND
|
||||
// of one match per prefix is guaranteed by the DB (UUID v4
|
||||
// collisions in the first 12 chars across active rows are
|
||||
// statistically negligible). Use a single IN-style query so
|
||||
// the cost is one round-trip regardless of leak count.
|
||||
//
|
||||
// Defence: drop any prefix whose contents fall outside the
|
||||
// hex-and-dash UUID alphabet. Workspace IDs are UUIDs, so
|
||||
// container names follow ws-<12 hex chars>. Anything else is
|
||||
// either a non-workspace container that slipped past the
|
||||
// substring-match Docker filter (workspace-runner, etc.) or a
|
||||
// malformed entry — neither should be turned into a LIKE
|
||||
// pattern. Also blocks SQL LIKE wildcards (`_` and `%`) from
|
||||
// reaching the query, even though Docker's container-name
|
||||
// validation would already have rejected them upstream.
|
||||
likes := make([]string, 0, len(prefixes))
|
||||
for _, p := range prefixes {
|
||||
if !isLikelyWorkspaceID(p) {
|
||||
// Resolve each container ID to a workspace_id whose status is
|
||||
// 'removed'. ContainerName uses the full workspace ID (no truncation
|
||||
// since KI-010 fix), so we use exact matching (=) rather than LIKE.
|
||||
// isLikelyWorkspaceID filters non-workspace Docker containers that
|
||||
// slipped through the substring name filter.
|
||||
candidates := make([]string, 0, len(ids))
|
||||
for _, id := range ids {
|
||||
if !isLikelyWorkspaceID(id) {
|
||||
continue
|
||||
}
|
||||
likes = append(likes, p+"%")
|
||||
candidates = append(candidates, id)
|
||||
}
|
||||
if len(likes) == 0 {
|
||||
if len(candidates) == 0 {
|
||||
return
|
||||
}
|
||||
rows, err := db.DB.QueryContext(ctx, `
|
||||
SELECT id::text
|
||||
FROM workspaces
|
||||
WHERE status = 'removed'
|
||||
AND id::text LIKE ANY($1::text[])
|
||||
`, pq.Array(likes))
|
||||
AND id::text = ANY($1::text[])
|
||||
`, pq.Array(candidates))
|
||||
if err != nil {
|
||||
log.Printf("Orphan sweeper: DB query failed: %v — skipping removed-row pass", err)
|
||||
return
|
||||
@@ -224,48 +211,44 @@ func sweepRemovedRows(ctx context.Context, reaper OrphanReaper) {
|
||||
// stamped only by the provisioner: a sibling stack's containers won't
|
||||
// carry it, so this pass leaves them alone.
|
||||
func sweepLabeledOrphansWithoutRows(ctx context.Context, reaper OrphanReaper) {
|
||||
managedPrefixes, err := reaper.ListManagedContainerIDPrefixes(ctx)
|
||||
ids, err := reaper.ListManagedContainerIDPrefixes(ctx)
|
||||
if err != nil {
|
||||
log.Printf("Orphan sweeper: ListManagedContainerIDPrefixes failed: %v — skipping wiped-DB pass", err)
|
||||
return
|
||||
}
|
||||
if len(managedPrefixes) == 0 {
|
||||
if len(ids) == 0 {
|
||||
return
|
||||
}
|
||||
managedLikes := make([]string, 0, len(managedPrefixes))
|
||||
keep := make([]string, 0, len(managedPrefixes))
|
||||
for _, p := range managedPrefixes {
|
||||
if !isLikelyWorkspaceID(p) {
|
||||
candidates := make([]string, 0, len(ids))
|
||||
for _, id := range ids {
|
||||
if !isLikelyWorkspaceID(id) {
|
||||
continue
|
||||
}
|
||||
managedLikes = append(managedLikes, p+"%")
|
||||
keep = append(keep, p) // index-aligned with managedLikes
|
||||
candidates = append(candidates, id) // index-aligned
|
||||
}
|
||||
if len(managedLikes) == 0 {
|
||||
if len(candidates) == 0 {
|
||||
return
|
||||
}
|
||||
// Find prefixes that match SOME workspace row (any status). Anything
|
||||
// in managedLikes NOT in this returned set is the wiped-DB orphan
|
||||
// Find IDs that match SOME workspace row (any status). Anything
|
||||
// in candidates NOT in this returned set is the wiped-DB orphan
|
||||
// set — labeled, no row, ours to reap.
|
||||
knownRows, err := db.DB.QueryContext(ctx, `
|
||||
SELECT lk
|
||||
FROM unnest($1::text[]) AS lk
|
||||
WHERE EXISTS (
|
||||
SELECT 1 FROM workspaces WHERE id::text LIKE lk
|
||||
)
|
||||
`, pq.Array(managedLikes))
|
||||
SELECT id::text
|
||||
FROM workspaces
|
||||
WHERE id::text = ANY($1::text[])
|
||||
`, pq.Array(candidates))
|
||||
if err != nil {
|
||||
log.Printf("Orphan sweeper: wiped-DB reverse-lookup failed: %v — skipping wiped-DB pass", err)
|
||||
return
|
||||
}
|
||||
known := make(map[string]struct{}, len(managedLikes))
|
||||
known := make(map[string]struct{}, len(candidates))
|
||||
for knownRows.Next() {
|
||||
var lk string
|
||||
if scanErr := knownRows.Scan(&lk); scanErr != nil {
|
||||
var id string
|
||||
if scanErr := knownRows.Scan(&id); scanErr != nil {
|
||||
log.Printf("Orphan sweeper: wiped-DB row scan failed: %v", scanErr)
|
||||
continue
|
||||
}
|
||||
known[lk] = struct{}{}
|
||||
known[id] = struct{}{}
|
||||
}
|
||||
if cerr := knownRows.Close(); cerr != nil {
|
||||
log.Printf("Orphan sweeper: wiped-DB rows close failed: %v", cerr)
|
||||
@@ -275,21 +258,20 @@ func sweepLabeledOrphansWithoutRows(ctx context.Context, reaper OrphanReaper) {
|
||||
return
|
||||
}
|
||||
|
||||
for i, lk := range managedLikes {
|
||||
if _, ok := known[lk]; ok {
|
||||
for _, id := range candidates {
|
||||
if _, ok := known[id]; ok {
|
||||
continue
|
||||
}
|
||||
// Reap by container-name prefix. ContainerName/Stop/RemoveVolume
|
||||
// truncate to 12 chars internally, so passing the prefix as the
|
||||
// "workspace ID" resolves to the same `ws-<prefix>` container.
|
||||
prefix := keep[i]
|
||||
log.Printf("Orphan sweeper: reaping untracked managed container ws-%s (no DB row — wiped-DB orphan)", prefix)
|
||||
if stopErr := reaper.Stop(ctx, prefix); stopErr != nil {
|
||||
log.Printf("Orphan sweeper: Stop failed for managed orphan ws-%s: %v — retry next cycle", prefix, stopErr)
|
||||
// id is a full workspace ID (container name = ws-<id>).
|
||||
// Stop/RemoveVolume use ContainerName internally, which now
|
||||
// uses the full ID, so passing id resolves correctly.
|
||||
log.Printf("Orphan sweeper: reaping untracked managed container ws-%s (no DB row — wiped-DB orphan)", id)
|
||||
if stopErr := reaper.Stop(ctx, id); stopErr != nil {
|
||||
log.Printf("Orphan sweeper: Stop failed for managed orphan ws-%s: %v — retry next cycle", id, stopErr)
|
||||
continue
|
||||
}
|
||||
if rmErr := reaper.RemoveVolume(ctx, prefix); rmErr != nil {
|
||||
log.Printf("Orphan sweeper: RemoveVolume warning for managed orphan ws-%s: %v", prefix, rmErr)
|
||||
if rmErr := reaper.RemoveVolume(ctx, id); rmErr != nil {
|
||||
log.Printf("Orphan sweeper: RemoveVolume warning for managed orphan ws-%s: %v", id, rmErr)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -377,37 +359,38 @@ func sweepStaleTokensWithoutContainer(ctx context.Context, reaper OrphanReaper)
|
||||
return
|
||||
}
|
||||
|
||||
prefixes, err := reaper.ListWorkspaceContainerIDPrefixes(ctx)
|
||||
ids, err := reaper.ListWorkspaceContainerIDPrefixes(ctx)
|
||||
if err != nil {
|
||||
log.Printf("Orphan sweeper: ListWorkspaceContainerIDPrefixes failed: %v — skipping stale-token pass", err)
|
||||
return
|
||||
}
|
||||
|
||||
// Same hex-and-dash filter as the other passes — anything that
|
||||
// can't be a workspace UUID prefix doesn't belong in a SQL LIKE
|
||||
// pattern.
|
||||
// can't be a full workspace UUID is not one of ours and must be
|
||||
// skipped before inserting into the SQL query.
|
||||
//
|
||||
// NOTE: an empty `likes` array is intentionally NOT a short-circuit.
|
||||
// NOTE: an empty `ids` array is intentionally NOT a short-circuit.
|
||||
// "No workspace containers" is the load-bearing case for this pass
|
||||
// (operator nuked everything). The `cardinality($1) = 0` clause in
|
||||
// the SELECT below treats empty likes as "no LIKE filter" → every
|
||||
// the SELECT below treats empty ids as "no filter" → every
|
||||
// stale-token workspace becomes a candidate. The first two passes'
|
||||
// early-return-on-empty-prefixes pattern would defeat this entire
|
||||
// early-return-on-empty-ids pattern would defeat this entire
|
||||
// pass's purpose.
|
||||
likes := make([]string, 0, len(prefixes))
|
||||
for _, p := range prefixes {
|
||||
if !isLikelyWorkspaceID(p) {
|
||||
candidates := make([]string, 0, len(ids))
|
||||
for _, id := range ids {
|
||||
if !isLikelyWorkspaceID(id) {
|
||||
continue
|
||||
}
|
||||
likes = append(likes, p+"%")
|
||||
candidates = append(candidates, id)
|
||||
}
|
||||
|
||||
// Find workspaces with live tokens whose most-recent activity is
|
||||
// past the grace window AND whose ID does NOT match any live
|
||||
// container prefix. When `likes` is empty (no workspace containers
|
||||
// running at all), every stale-activity workspace is a candidate —
|
||||
// expressed via the `cardinality($1) = 0` short-circuit so the
|
||||
// query has a single shape regardless of container count.
|
||||
// past the grace window AND whose workspace ID is NOT in the set
|
||||
// of live container workspace IDs. When `candidates` is empty
|
||||
// (no workspace containers running at all), every stale-activity
|
||||
// workspace is a candidate — expressed via the
|
||||
// `cardinality($1) = 0` short-circuit so the query has a
|
||||
// single shape regardless of container count.
|
||||
//
|
||||
// make_interval(secs => $2) avoids the time.Duration.String() →
|
||||
// `"5m0s"` mismatch with Postgres interval grammar; passing seconds
|
||||
@@ -430,9 +413,9 @@ func sweepStaleTokensWithoutContainer(ctx context.Context, reaper OrphanReaper)
|
||||
AND COALESCE(t.last_used_at, t.created_at) < now() - make_interval(secs => $2)
|
||||
AND (
|
||||
cardinality($1::text[]) = 0
|
||||
OR NOT (t.workspace_id::text LIKE ANY($1::text[]))
|
||||
OR NOT (t.workspace_id::text = ANY($1::text[]))
|
||||
)
|
||||
`, pq.Array(likes), graceSeconds)
|
||||
`, pq.Array(candidates), graceSeconds)
|
||||
if qErr != nil {
|
||||
log.Printf("Orphan sweeper: stale-token query failed: %v — skipping stale-token pass", qErr)
|
||||
return
|
||||
|
||||
@@ -299,33 +299,35 @@ func TestStartOrphanSweeper_NilReaperIsNoOp(t *testing.T) {
|
||||
// - ListWorkspaceContainerIDPrefixes returns nothing (no name-filter
|
||||
// matches because we cleared running ws-* in this scenario via the
|
||||
// test setup — irrelevant to second pass)
|
||||
// - ListManagedContainerIDPrefixes returns the two labeled prefixes
|
||||
// - ListManagedContainerIDPrefixes returns the two labeled IDs
|
||||
// (in real Docker these still exist; their label survives daemon
|
||||
// restart)
|
||||
// - The reverse-lookup query returns zero matches (DB is empty)
|
||||
// - The lookup query returns zero matches (DB is empty)
|
||||
// - Sweeper reaps both
|
||||
func TestSweepOnce_WipedDBReapsLabeledOrphans(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
ws1 := "abc123de-f456-4a71-9c00-000000000001"
|
||||
ws2 := "ee001122-3344-4a71-9c00-000000000002"
|
||||
reaper := &fakeReaper{
|
||||
listResponse: nil, // no name-filter matches in this scenario
|
||||
managedListResponse: []string{"abc123def456", "ee0011223344"},
|
||||
managedListResponse: []string{ws1, ws2},
|
||||
}
|
||||
|
||||
// First-pass query is skipped (listResponse is nil → early return
|
||||
// path doesn't even reach a DB call). Second-pass reverse lookup
|
||||
// returns no rows — both prefixes are unknown.
|
||||
mock.ExpectQuery(`SELECT lk\s+FROM unnest`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"lk"}))
|
||||
// path doesn't even reach a DB call). Second-pass lookup
|
||||
// returns no rows — both IDs are unknown.
|
||||
mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}))
|
||||
expectStaleTokenSweepNoOp(mock)
|
||||
|
||||
sweepOnce(context.Background(), reaper)
|
||||
|
||||
if len(reaper.stopCalls) != 2 {
|
||||
t.Fatalf("expected 2 Stop calls (both prefixes reaped), got %v", reaper.stopCalls)
|
||||
t.Fatalf("expected 2 Stop calls (both IDs reaped), got %v", reaper.stopCalls)
|
||||
}
|
||||
wantStops := map[string]struct{}{"abc123def456": {}, "ee0011223344": {}}
|
||||
wantStops := map[string]struct{}{ws1: {}, ws2: {}}
|
||||
for _, c := range reaper.stopCalls {
|
||||
if _, ok := wantStops[c]; !ok {
|
||||
t.Errorf("unexpected Stop call: %q", c)
|
||||
@@ -347,17 +349,19 @@ func TestSweepOnce_WipedDBSkipsLabeledContainersWithRows(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
ws1 := "abc123de-f456-4a71-9c00-000000000001"
|
||||
ws2 := "ee001122-3344-4a71-9c00-000000000002"
|
||||
reaper := &fakeReaper{
|
||||
listResponse: nil,
|
||||
managedListResponse: []string{"abc123def456", "ee0011223344"},
|
||||
managedListResponse: []string{ws1, ws2},
|
||||
}
|
||||
|
||||
// The reverse-lookup returns both prefixes — both have rows in DB.
|
||||
// The lookup returns both IDs — both have rows in DB.
|
||||
// Sweeper should not reap either.
|
||||
mock.ExpectQuery(`SELECT lk\s+FROM unnest`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"lk"}).
|
||||
AddRow("abc123def456%").
|
||||
AddRow("ee0011223344%"))
|
||||
mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}).
|
||||
AddRow(ws1).
|
||||
AddRow(ws2))
|
||||
expectStaleTokenSweepNoOp(mock)
|
||||
|
||||
sweepOnce(context.Background(), reaper)
|
||||
@@ -376,16 +380,17 @@ func TestSweepOnce_WipedDBReapsOnlyTheUnknownOnes(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
const keep = "abcdef012345"
|
||||
const reap = "fedcba543210"
|
||||
const keep = "abcdef01-2345-4a71-9c00-000000000001"
|
||||
const reap = "fedcba54-3210-4a71-9c00-000000000002"
|
||||
reaper := &fakeReaper{
|
||||
listResponse: nil,
|
||||
managedListResponse: []string{keep, reap},
|
||||
}
|
||||
|
||||
mock.ExpectQuery(`SELECT lk\s+FROM unnest`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"lk"}).
|
||||
AddRow(keep + "%"))
|
||||
// Lookup returns only `keep` — `reap` has no row and must be reaped.
|
||||
mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}).
|
||||
AddRow(keep))
|
||||
expectStaleTokenSweepNoOp(mock)
|
||||
|
||||
sweepOnce(context.Background(), reaper)
|
||||
@@ -429,22 +434,22 @@ func TestSweepOnce_WipedDBSkippedOnDockerError(t *testing.T) {
|
||||
// TestSweepOnce_WipedDBSkipsNonUUIDPrefixes — defence-in-depth: if a
|
||||
// non-UUID-shaped name slipped past the label filter (shouldn't happen
|
||||
// because the provisioner only labels ws-* containers, but the sweeper
|
||||
// shouldn't trust upstream invariants), the prefix is dropped before
|
||||
// shouldn't trust upstream invariants), the ID is dropped before
|
||||
// hitting the SQL query.
|
||||
func TestSweepOnce_WipedDBSkipsNonUUIDPrefixes(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
const valid = "abc123def456"
|
||||
const valid = "abc123de-f456-4a71-9c00-000000000001"
|
||||
reaper := &fakeReaper{
|
||||
listResponse: nil,
|
||||
managedListResponse: []string{"hello world", "abc%inject", valid},
|
||||
}
|
||||
|
||||
// Only `valid` survives isLikelyWorkspaceID — it's the only prefix
|
||||
// that should appear in the unnest array.
|
||||
mock.ExpectQuery(`SELECT lk\s+FROM unnest`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"lk"}))
|
||||
// Only `valid` survives isLikelyWorkspaceID — it's the only ID
|
||||
// that should appear in the = ANY array.
|
||||
mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}))
|
||||
expectStaleTokenSweepNoOp(mock)
|
||||
|
||||
sweepOnce(context.Background(), reaper)
|
||||
|
||||
Reference in New Issue
Block a user