fix(provisioner): stop rogue config-missing restart loop (#17)

Resolves #17.

Part A: scripts/cleanup-rogue-workspaces.sh deletes workspaces whose id
or name starts with known test placeholder prefixes (aaaaaaaa-, etc.)
and force-removes the paired Docker container. Documented in
tests/README.md.

Part B: add a pre-flight check in provisionWorkspace() — when neither a
template path nor in-memory configFiles supplies config.yaml, probe the
existing named volume via a throwaway alpine container. If the volume
lacks config.yaml, mark the workspace status='failed' with a clear
last_sample_error instead of handing it to Docker's unless-stopped
restart policy (which otherwise loops forever on FileNotFoundError).

New pure helper provisioner.ValidateConfigSource + unit tests.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-14 07:32:36 -07:00
parent 5ab75532d0
commit f7683e3adf
5 changed files with 233 additions and 0 deletions

View File

@ -77,6 +77,30 @@ func (h *WorkspaceHandler) provisionWorkspace(workspaceID, templatePath string,
awarenessNamespace := h.loadAwarenessNamespace(ctx, workspaceID)
cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
// Preflight #17: refuse to start a container we already know will crash on missing config.yaml.
// When the caller supplies neither a template dir nor in-memory configFiles (the auto-restart
// path), probe the existing Docker named volume. If it's empty/missing config.yaml, mark the
// workspace 'failed' instead of handing it to Docker's unless-stopped restart policy, which
// would otherwise loop forever on FileNotFoundError.
if srcErr := provisioner.ValidateConfigSource(templatePath, configFiles); srcErr != nil {
hasConfig, probeErr := h.provisioner.VolumeHasFile(ctx, workspaceID, "config.yaml")
if probeErr != nil {
log.Printf("Provisioner: config.yaml preflight probe failed for %s: %v (proceeding)", workspaceID, probeErr)
} else if !hasConfig {
msg := fmt.Sprintf("cannot start workspace %s: no config.yaml source and config volume is empty — delete the workspace or provide a template", workspaceID)
log.Printf("Provisioner: %s", msg)
if _, dbErr := db.DB.ExecContext(ctx,
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
workspaceID, msg); dbErr != nil {
log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr)
}
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
"error": msg,
})
return
}
}
url, err := h.provisioner.Start(ctx, cfg)
if err != nil {
// Persist the error text to last_sample_error so the canvas and

View File

@ -635,6 +635,83 @@ func (p *Provisioner) Close() error {
return p.cli.Close()
}
// ErrNoConfigSource is returned by ValidateConfigSource when neither the
// template path nor the in-memory config files supply a config.yaml.
var ErrNoConfigSource = fmt.Errorf("no config.yaml source: template path missing config.yaml and configFiles empty")

// ValidateConfigSource is a pure pre-flight check that at least one
// static source of /configs/config.yaml exists before a container starts.
//
// Inputs mirror the fields on WorkspaceConfig:
//   - templatePath: host dir expected to contain config.yaml (copied into /configs)
//   - configFiles: in-memory files written into /configs at start time
//
// Returns nil if either source will place config.yaml into /configs, and
// ErrNoConfigSource when both are empty so callers can fall through to a
// volume probe (VolumeHasFile) before giving up.
//
// Used by the platform's provision flow to catch the rogue-restart-loop
// case (#17): a workspace whose config volume was wiped and whose
// auto-restart path passes empty template+configFiles would otherwise
// boot into a FileNotFoundError crash loop under Docker's
// `unless-stopped` restart policy.
func ValidateConfigSource(templatePath string, configFiles map[string][]byte) error {
	// Source 1: a template dir on the host. It only counts when it holds
	// a regular config.yaml file — an empty or stale template dir is as
	// broken as no template at all.
	if templatePath != "" {
		if info, err := os.Stat(filepath.Join(templatePath, "config.yaml")); err == nil && !info.IsDir() {
			return nil
		}
	}
	// Source 2: in-memory files. Reading from a nil map is safe in Go,
	// so no explicit nil check is required.
	if data, ok := configFiles["config.yaml"]; ok && len(data) > 0 {
		return nil
	}
	return ErrNoConfigSource
}
// VolumeHasFile reports whether the named config volume for a workspace
// already contains the given file path (relative to /configs). Used by
// the auto-restart path to confirm a previously-provisioned volume is
// still populated before reusing it — if the volume was wiped, we must
// regenerate config or fail cleanly rather than loop on FileNotFoundError.
//
// Implementation: run a throwaway alpine `test -f` container bound to the
// volume read-only. Returns (false, nil) if the file is absent and
// (false, err) only on Docker-level failures.
func (p *Provisioner) VolumeHasFile(ctx context.Context, workspaceID, relPath string) (bool, error) {
	vol := ConfigVolumeName(workspaceID)
	// Inspect before binding: Docker silently auto-creates a missing
	// volume on bind, which would mask "volume does not exist".
	if _, err := p.cli.VolumeInspect(ctx, vol); err != nil {
		return false, nil
	}
	probeCfg := &container.Config{
		Image: "alpine",
		Cmd:   []string{"test", "-f", "/vol/" + relPath},
	}
	hostCfg := &container.HostConfig{
		Binds: []string{vol + ":/vol:ro"},
	}
	created, err := p.cli.ContainerCreate(ctx, probeCfg, hostCfg, nil, nil, "")
	if err != nil {
		return false, err
	}
	// Always tear the probe container down, even on start/wait failure.
	defer p.cli.ContainerRemove(ctx, created.ID, container.RemoveOptions{Force: true})
	if err := p.cli.ContainerStart(ctx, created.ID, container.StartOptions{}); err != nil {
		return false, err
	}
	waitCh, errCh := p.cli.ContainerWait(ctx, created.ID, container.WaitConditionNotRunning)
	select {
	case status := <-waitCh:
		// `test -f` exits 0 exactly when the file exists.
		return status.StatusCode == 0, nil
	case werr := <-errCh:
		return false, werr
	case <-ctx.Done():
		return false, ctx.Err()
	}
}
// isImageNotFoundErr classifies a Docker client error as "image not
// available locally." The daemon wraps this message in a generic
// SystemError type without exposing a typed sentinel, so we fall back

View File

@ -1,12 +1,67 @@
package provisioner
import (
"errors"
"os"
"path/filepath"
"strings"
"testing"
"github.com/docker/docker/api/types/container"
)
// TestValidateConfigSource covers issue #17: a workspace restart with no
// template and no in-memory configFiles must be caught before Docker
// starts a container destined to crash-loop on FileNotFoundError.
func TestValidateConfigSource_ConfigFilesPresent(t *testing.T) {
	cfg := map[string][]byte{"config.yaml": []byte("name: test\n")}
	err := ValidateConfigSource("", cfg)
	if err != nil {
		t.Fatalf("expected nil error when configFiles has config.yaml, got %v", err)
	}
}
func TestValidateConfigSource_ConfigFilesEmptyValue(t *testing.T) {
	// A present key with zero-length bytes must not count as a source.
	emptyBytes := map[string][]byte{"config.yaml": {}}
	err := ValidateConfigSource("", emptyBytes)
	if !errors.Is(err, ErrNoConfigSource) {
		t.Fatalf("expected ErrNoConfigSource for empty config.yaml bytes, got %v", err)
	}
}
func TestValidateConfigSource_TemplatePathWithConfig(t *testing.T) {
	tmpl := t.TempDir()
	cfgPath := filepath.Join(tmpl, "config.yaml")
	if err := os.WriteFile(cfgPath, []byte("name: x\n"), 0644); err != nil {
		t.Fatalf("setup: %v", err)
	}
	if err := ValidateConfigSource(tmpl, nil); err != nil {
		t.Fatalf("expected nil when template dir has config.yaml, got %v", err)
	}
}
func TestValidateConfigSource_TemplatePathMissingConfig(t *testing.T) {
	// A template dir that exists but holds no config.yaml is no source.
	emptyDir := t.TempDir()
	err := ValidateConfigSource(emptyDir, nil)
	if !errors.Is(err, ErrNoConfigSource) {
		t.Fatalf("expected ErrNoConfigSource for template dir without config.yaml, got %v", err)
	}
}
func TestValidateConfigSource_BothEmpty(t *testing.T) {
	err := ValidateConfigSource("", nil)
	if !errors.Is(err, ErrNoConfigSource) {
		t.Fatalf("expected ErrNoConfigSource when both sources empty, got %v", err)
	}
}
// If `config.yaml` at the template path is itself a directory (weird
// but possible), the validator should reject it.
func TestValidateConfigSource_TemplateIsDirName(t *testing.T) {
	tmpl := t.TempDir()
	badDir := filepath.Join(tmpl, "config.yaml")
	if err := os.MkdirAll(badDir, 0755); err != nil {
		t.Fatalf("setup: %v", err)
	}
	err := ValidateConfigSource(tmpl, nil)
	if !errors.Is(err, ErrNoConfigSource) {
		t.Fatalf("expected ErrNoConfigSource when config.yaml is a dir, got %v", err)
	}
}
// baseHostConfig returns a fresh HostConfig with typical pre-tier binds,
// mimicking what Start() builds before calling ApplyTierConfig.
func baseHostConfig(pluginsPath string) *container.HostConfig {

View File

@ -0,0 +1,59 @@
#!/usr/bin/env bash
# cleanup-rogue-workspaces.sh — delete stale test workspaces matching
# well-known placeholder ID patterns (#17).
#
# Usage:
# bash scripts/cleanup-rogue-workspaces.sh # against http://localhost:8080
# MOLECULE_URL=http://host:8080 bash scripts/cleanup-rogue-workspaces.sh
#
# Deletes any workspace whose id OR name starts with one of the
# hard-coded test prefixes. Also force-removes the matching Docker
# container (`ws-<id[:12]>`) in case Docker's unless-stopped policy
# kept it restarting on a missing config.yaml.
#
# Safe to run repeatedly — idempotent. Non-matching workspaces are left
# untouched.
set -euo pipefail
PLATFORM_URL="${MOLECULE_URL:-http://localhost:8080}"
echo "Scanning ${PLATFORM_URL}/workspaces for rogue test workspaces..."
# `|| true` keeps `set -e` from aborting on curl failure; an unreachable
# platform is instead reported via the empty-payload branch below.
payload="$(curl -sS --fail "${PLATFORM_URL}/workspaces" || true)"
if [[ -z "${payload}" ]]; then
    echo " platform unreachable at ${PLATFORM_URL}"
    exit 1
fi
# Extract {id\tname} for workspaces whose id or name starts with a test prefix.
# Patterns are passed via env so the heredoc doesn't need shell interpolation.
matches="$(PAYLOAD="${payload}" python3 - <<'PY'
import json, os
data = json.loads(os.environ["PAYLOAD"])
# Accept either a bare list response or an object wrapping "workspaces".
rows = data if isinstance(data, list) else data.get("workspaces", [])
patterns = ("aaaaaaaa-", "bbbbbbbb-", "cccccccc-", "test-ws-")
for w in rows:
    wid = w.get("id", "") or ""
    name = w.get("name", "") or ""
    if any(wid.startswith(p) or name.startswith(p) for p in patterns):
        print(f"{wid}\t{name}")
PY
)"
if [[ -z "${matches}" ]]; then
    echo " no rogue workspaces found"
    exit 0
fi
# Herestring (<<<) feeds the loop without a subshell, and each line is
# split on the tab the python snippet printed between id and name.
while IFS=$'\t' read -r wid wname; do
    [[ -z "${wid}" ]] && continue
    # Container names use the first 12 chars of the workspace id: ws-<id[:12]>.
    short="${wid:0:12}"
    echo " deleting workspace ${wid} (${wname})"
    curl -sS -X DELETE "${PLATFORM_URL}/workspaces/${wid}" >/dev/null || true
    # Belt-and-suspenders: force-remove the container directly in case the
    # API delete raced with Docker's restart policy. Skipped if docker
    # isn't on PATH (e.g. running against a remote platform).
    if command -v docker >/dev/null 2>&1; then
        docker rm -f "ws-${short}" >/dev/null 2>&1 || true
    fi
done <<<"${matches}"
echo "done."

View File

@ -22,3 +22,21 @@ This repo uses the standard monorepo testing convention: **unit tests live with
## Running E2E
Every E2E script here assumes the platform is running at `localhost:8080` and (where noted) provisioned agents are online. See the header comment of each `.sh` for specifics.
## Cleaning up rogue test workspaces
If an E2E run aborts before its teardown runs (Ctrl-C, crash, CI timeout),
the platform can be left with workspaces whose config volume is stale or
empty — Docker's `unless-stopped` restart policy then spins those
containers in a FileNotFoundError loop. The platform's pre-flight check
(#17) marks such workspaces `failed` on the next restart attempt, but a
manual cleanup is still useful:
```bash
bash scripts/cleanup-rogue-workspaces.sh # deletes ws with id/name starting aaaaaaaa-, bbbbbbbb-, cccccccc-, test-ws-
MOLECULE_URL=http://host:8080 bash scripts/cleanup-rogue-workspaces.sh
```
The script DELETEs each matching workspace via the API and
force-removes the `ws-<id[:12]>` container as a belt-and-suspenders
fallback.