From 602f3ef68547d72edb1643ee7fef6d9187f751d0 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 14 Apr 2026 07:32:36 -0700 Subject: [PATCH] fix(provisioner): stop rogue config-missing restart loop (#17) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolves #17. Part A: scripts/cleanup-rogue-workspaces.sh deletes workspaces whose id or name starts with known test placeholder prefixes (aaaaaaaa-, etc.) and force-removes the paired Docker container. Documented in tests/README.md. Part B: add a pre-flight check in provisionWorkspace() — when neither a template path nor in-memory configFiles supplies config.yaml, probe the existing named volume via a throwaway alpine container. If the volume lacks config.yaml, mark the workspace status='failed' with a clear last_sample_error instead of handing it to Docker's unless-stopped restart policy (which otherwise loops forever on FileNotFoundError). New pure helper provisioner.ValidateConfigSource + unit tests. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../internal/handlers/workspace_provision.go | 24 ++++++ platform/internal/provisioner/provisioner.go | 77 +++++++++++++++++++ .../internal/provisioner/provisioner_test.go | 55 +++++++++++++ scripts/cleanup-rogue-workspaces.sh | 59 ++++++++++++++ tests/README.md | 18 +++++ 5 files changed, 233 insertions(+) create mode 100755 scripts/cleanup-rogue-workspaces.sh diff --git a/platform/internal/handlers/workspace_provision.go b/platform/internal/handlers/workspace_provision.go index 536a92c3..312ed5e9 100644 --- a/platform/internal/handlers/workspace_provision.go +++ b/platform/internal/handlers/workspace_provision.go @@ -77,6 +77,30 @@ func (h *WorkspaceHandler) provisionWorkspace(workspaceID, templatePath string, awarenessNamespace := h.loadAwarenessNamespace(ctx, workspaceID) cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace) + // Preflight #17: refuse to start a container we already know will crash on missing config.yaml. + // When the caller supplies neither a template dir nor in-memory configFiles (the auto-restart + // path), probe the existing Docker named volume. If it's empty/missing config.yaml, mark the + // workspace 'failed' instead of handing it to Docker's unless-stopped restart policy, which + // would otherwise loop forever on FileNotFoundError. + if srcErr := provisioner.ValidateConfigSource(templatePath, configFiles); srcErr != nil { + hasConfig, probeErr := h.provisioner.VolumeHasFile(ctx, workspaceID, "config.yaml") + if probeErr != nil { + log.Printf("Provisioner: config.yaml preflight probe failed for %s: %v (proceeding)", workspaceID, probeErr) + } else if !hasConfig { + msg := fmt.Sprintf("cannot start workspace %s: no config.yaml source and config volume is empty — delete the workspace or provide a template", workspaceID) + log.Printf("Provisioner: %s", msg) + if _, dbErr := db.DB.ExecContext(ctx, + `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`, + workspaceID, msg); dbErr != nil { + log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr) + } + h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{ + "error": msg, + }) + return + } + } + url, err := h.provisioner.Start(ctx, cfg) if err != nil { // Persist the error text to last_sample_error so the canvas and diff --git a/platform/internal/provisioner/provisioner.go b/platform/internal/provisioner/provisioner.go index 0993ae54..92f89a1d 100644 --- a/platform/internal/provisioner/provisioner.go +++ b/platform/internal/provisioner/provisioner.go @@ -635,6 +635,83 @@ func (p *Provisioner) Close() error { return p.cli.Close() } +// ValidateConfigSource is a pure check that ensures at least one static +// source of /configs/config.yaml is available before a container starts. +// +// Inputs mirror the fields on WorkspaceConfig: +// - templatePath: host dir expected to contain config.yaml (copied into /configs) +// - configFiles: in-memory files written into /configs at start time +// +// Returns nil if either source will place config.yaml into /configs. +// When both sources are empty, returns ErrNoConfigSource so callers can +// fall through to a volume probe (VolumeHasFile) before giving up. +// +// Used by the platform's provision flow to catch the rogue-restart-loop +// case (#17): a workspace whose config volume was wiped and whose +// auto-restart path passes empty template+configFiles would otherwise +// boot into a FileNotFoundError crash loop under Docker's +// `unless-stopped` restart policy. +func ValidateConfigSource(templatePath string, configFiles map[string][]byte) error { + if templatePath != "" { + // Stat the template's config.yaml; an empty/stale template dir + // without config.yaml is as broken as no template at all. + info, err := os.Stat(filepath.Join(templatePath, "config.yaml")) + if err == nil && !info.IsDir() { + return nil + } + } + if configFiles != nil { + if data, ok := configFiles["config.yaml"]; ok && len(data) > 0 { + return nil + } + } + return ErrNoConfigSource +} + +// ErrNoConfigSource is returned by ValidateConfigSource when neither the +// template path nor the in-memory config files supply a config.yaml. +var ErrNoConfigSource = fmt.Errorf("no config.yaml source: template path missing config.yaml and configFiles empty") + +// VolumeHasFile returns true if the named config volume for a workspace +// already contains the given file path (relative to /configs). Used by +// the auto-restart path to confirm a previously-provisioned volume is +// still populated before reusing it — if the volume was wiped, we must +// regenerate config or fail cleanly rather than loop on FileNotFoundError. +// +// Implementation: run a throwaway alpine `test -f` container bound to the +// volume read-only. Returns (false, nil) if the file is absent and +// (false, err) only on Docker-level failures. +func (p *Provisioner) VolumeHasFile(ctx context.Context, workspaceID, relPath string) (bool, error) { + volName := ConfigVolumeName(workspaceID) + // Confirm the volume exists first — Docker auto-creates on bind otherwise. + if _, err := p.cli.VolumeInspect(ctx, volName); err != nil { + return false, nil + } + resp, err := p.cli.ContainerCreate(ctx, &container.Config{ + Image: "alpine", + Cmd: []string{"test", "-f", "/vol/" + relPath}, + }, &container.HostConfig{ + Binds: []string{volName + ":/vol:ro"}, + }, nil, nil, "") + if err != nil { + return false, err + } + defer p.cli.ContainerRemove(ctx, resp.ID, container.RemoveOptions{Force: true}) + + if err := p.cli.ContainerStart(ctx, resp.ID, container.StartOptions{}); err != nil { + return false, err + } + waitCh, errCh := p.cli.ContainerWait(ctx, resp.ID, container.WaitConditionNotRunning) + select { + case w := <-waitCh: + return w.StatusCode == 0, nil + case err := <-errCh: + return false, err + case <-ctx.Done(): + return false, ctx.Err() + } +} + // isImageNotFoundErr classifies a Docker client error as "image not // available locally." The daemon wraps this message in a generic // SystemError type without exposing a typed sentinel, so we fall back diff --git a/platform/internal/provisioner/provisioner_test.go b/platform/internal/provisioner/provisioner_test.go index 95f99299..9db5f285 100644 --- a/platform/internal/provisioner/provisioner_test.go +++ b/platform/internal/provisioner/provisioner_test.go @@ -1,12 +1,67 @@ package provisioner import ( + "errors" + "os" + "path/filepath" "strings" "testing" "github.com/docker/docker/api/types/container" ) +// TestValidateConfigSource covers issue #17: a workspace restart with no +// template and no in-memory configFiles must be caught before Docker +// starts a container destined to crash-loop on FileNotFoundError. +func TestValidateConfigSource_ConfigFilesPresent(t *testing.T) { + files := map[string][]byte{"config.yaml": []byte("name: test\n")} + if err := ValidateConfigSource("", files); err != nil { + t.Fatalf("expected nil error when configFiles has config.yaml, got %v", err) + } +} + +func TestValidateConfigSource_ConfigFilesEmptyValue(t *testing.T) { + files := map[string][]byte{"config.yaml": {}} + if err := ValidateConfigSource("", files); !errors.Is(err, ErrNoConfigSource) { + t.Fatalf("expected ErrNoConfigSource for empty config.yaml bytes, got %v", err) + } +} + +func TestValidateConfigSource_TemplatePathWithConfig(t *testing.T) { + dir := t.TempDir() + if err := os.WriteFile(filepath.Join(dir, "config.yaml"), []byte("name: x\n"), 0644); err != nil { + t.Fatalf("setup: %v", err) + } + if err := ValidateConfigSource(dir, nil); err != nil { + t.Fatalf("expected nil when template dir has config.yaml, got %v", err) + } +} + +func TestValidateConfigSource_TemplatePathMissingConfig(t *testing.T) { + dir := t.TempDir() // empty dir + if err := ValidateConfigSource(dir, nil); !errors.Is(err, ErrNoConfigSource) { + t.Fatalf("expected ErrNoConfigSource for template dir without config.yaml, got %v", err) + } +} + +func TestValidateConfigSource_BothEmpty(t *testing.T) { + if err := ValidateConfigSource("", nil); !errors.Is(err, ErrNoConfigSource) { + t.Fatalf("expected ErrNoConfigSource when both sources empty, got %v", err) + } +} + +func TestValidateConfigSource_TemplateIsDirName(t *testing.T) { + // If `config.yaml` at the template path is itself a directory (weird + // but possible), the validator should reject it. + dir := t.TempDir() + if err := os.MkdirAll(filepath.Join(dir, "config.yaml"), 0755); err != nil { + t.Fatalf("setup: %v", err) + } + if err := ValidateConfigSource(dir, nil); !errors.Is(err, ErrNoConfigSource) { + t.Fatalf("expected ErrNoConfigSource when config.yaml is a dir, got %v", err) + } +} + // baseHostConfig returns a fresh HostConfig with typical pre-tier binds, // mimicking what Start() builds before calling ApplyTierConfig. func baseHostConfig(pluginsPath string) *container.HostConfig { diff --git a/scripts/cleanup-rogue-workspaces.sh b/scripts/cleanup-rogue-workspaces.sh new file mode 100755 index 00000000..910fd90a --- /dev/null +++ b/scripts/cleanup-rogue-workspaces.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +# cleanup-rogue-workspaces.sh — delete stale test workspaces matching +# well-known placeholder ID patterns (#17). +# +# Usage: +# bash scripts/cleanup-rogue-workspaces.sh # against http://localhost:8080 +# MOLECULE_URL=http://host:8080 bash scripts/cleanup-rogue-workspaces.sh +# +# Deletes any workspace whose id OR name starts with one of the +# hard-coded test prefixes. Also force-removes the matching Docker +# container (`ws-`) in case Docker's unless-stopped policy +# kept it restarting on a missing config.yaml. +# +# Safe to run repeatedly — idempotent. Non-matching workspaces are left +# untouched. + +set -euo pipefail + +PLATFORM_URL="${MOLECULE_URL:-http://localhost:8080}" + +echo "Scanning ${PLATFORM_URL}/workspaces for rogue test workspaces..." + +payload="$(curl -sS --fail "${PLATFORM_URL}/workspaces" || true)" +if [[ -z "${payload}" ]]; then + echo " platform unreachable at ${PLATFORM_URL}" + exit 1 +fi + +# Extract {id\tname} for workspaces whose id or name starts with a test prefix. +# Patterns are passed via env so the heredoc doesn't need shell interpolation. +matches="$(PAYLOAD="${payload}" python3 - <<'PY' +import json, os +data = json.loads(os.environ["PAYLOAD"]) +rows = data if isinstance(data, list) else data.get("workspaces", []) +patterns = ("aaaaaaaa-", "bbbbbbbb-", "cccccccc-", "test-ws-") +for w in rows: + wid = w.get("id", "") or "" + name = w.get("name", "") or "" + if any(wid.startswith(p) or name.startswith(p) for p in patterns): + print(f"{wid}\t{name}") +PY +)" + +if [[ -z "${matches}" ]]; then + echo " no rogue workspaces found" + exit 0 +fi + +while IFS=$'\t' read -r wid wname; do + [[ -z "${wid}" ]] && continue + short="${wid:0:12}" + echo " deleting workspace ${wid} (${wname})" + curl -sS -X DELETE "${PLATFORM_URL}/workspaces/${wid}" >/dev/null || true + if command -v docker >/dev/null 2>&1; then + docker rm -f "ws-${short}" >/dev/null 2>&1 || true + fi +done <<<"${matches}" + +echo "done." diff --git a/tests/README.md b/tests/README.md index bbff0ea9..4c2d4e73 100644 --- a/tests/README.md +++ b/tests/README.md @@ -22,3 +22,21 @@ This repo uses the standard monorepo testing convention: **unit tests live with ## Running E2E Every E2E script here assumes the platform is running at `localhost:8080` and (where noted) provisioned agents are online. See the header comment of each `.sh` for specifics. + +## Cleaning up rogue test workspaces + +If an E2E run aborts before its teardown runs (Ctrl-C, crash, CI timeout), +the platform can be left with workspaces whose config volume is stale or +empty — Docker's `unless-stopped` restart policy then spins those +containers in a FileNotFoundError loop. The platform's pre-flight check +(#17) marks such workspaces `failed` on the next restart, but a manual +cleanup is useful: + +```bash +bash scripts/cleanup-rogue-workspaces.sh # deletes ws with id/name starting aaaaaaaa-, bbbbbbbb-, cccccccc-, test-ws- +MOLECULE_URL=http://host:8080 bash scripts/cleanup-rogue-workspaces.sh +``` + +The script DELETEs each matching workspace via the API and +force-removes the `ws-` container as a belt-and-suspenders +fallback.