forked from molecule-ai/molecule-core
fix(provisioner): stop rogue config-missing restart loop (#17)
Resolves #17. Part A: scripts/cleanup-rogue-workspaces.sh deletes workspaces whose id or name starts with known test placeholder prefixes (aaaaaaaa-, etc.) and force-removes the paired Docker container. Documented in tests/README.md. Part B: add a pre-flight check in provisionWorkspace() — when neither a template path nor in-memory configFiles supplies config.yaml, probe the existing named volume via a throwaway alpine container. If the volume lacks config.yaml, mark the workspace status='failed' with a clear last_sample_error instead of handing it to Docker's unless-stopped restart policy (which otherwise loops forever on FileNotFoundError). New pure helper provisioner.ValidateConfigSource + unit tests. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
5ab75532d0
commit
f7683e3adf
@ -77,6 +77,30 @@ func (h *WorkspaceHandler) provisionWorkspace(workspaceID, templatePath string,
|
||||
awarenessNamespace := h.loadAwarenessNamespace(ctx, workspaceID)
|
||||
cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
|
||||
|
||||
// Preflight #17: refuse to start a container we already know will crash on missing config.yaml.
|
||||
// When the caller supplies neither a template dir nor in-memory configFiles (the auto-restart
|
||||
// path), probe the existing Docker named volume. If it's empty/missing config.yaml, mark the
|
||||
// workspace 'failed' instead of handing it to Docker's unless-stopped restart policy, which
|
||||
// would otherwise loop forever on FileNotFoundError.
|
||||
if srcErr := provisioner.ValidateConfigSource(templatePath, configFiles); srcErr != nil {
|
||||
hasConfig, probeErr := h.provisioner.VolumeHasFile(ctx, workspaceID, "config.yaml")
|
||||
if probeErr != nil {
|
||||
log.Printf("Provisioner: config.yaml preflight probe failed for %s: %v (proceeding)", workspaceID, probeErr)
|
||||
} else if !hasConfig {
|
||||
msg := fmt.Sprintf("cannot start workspace %s: no config.yaml source and config volume is empty — delete the workspace or provide a template", workspaceID)
|
||||
log.Printf("Provisioner: %s", msg)
|
||||
if _, dbErr := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
|
||||
workspaceID, msg); dbErr != nil {
|
||||
log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr)
|
||||
}
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
|
||||
"error": msg,
|
||||
})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
url, err := h.provisioner.Start(ctx, cfg)
|
||||
if err != nil {
|
||||
// Persist the error text to last_sample_error so the canvas and
|
||||
|
||||
@ -635,6 +635,83 @@ func (p *Provisioner) Close() error {
|
||||
return p.cli.Close()
|
||||
}
|
||||
|
||||
// ErrNoConfigSource is returned by ValidateConfigSource when neither the
// template path nor the in-memory config files supply a config.yaml.
var ErrNoConfigSource = fmt.Errorf("no config.yaml source: template path missing config.yaml and configFiles empty")

// ValidateConfigSource is a pure check that ensures at least one static
// source of /configs/config.yaml is available before a container starts.
//
// Inputs mirror the fields on WorkspaceConfig:
//   - templatePath: host dir expected to contain config.yaml (copied into /configs)
//   - configFiles: in-memory files written into /configs at start time
//
// Returns nil if either source will place config.yaml into /configs.
// When both sources are empty, returns ErrNoConfigSource so callers can
// fall through to a volume probe (VolumeHasFile) before giving up.
//
// Used by the platform's provision flow to catch the rogue-restart-loop
// case (#17): a workspace whose config volume was wiped and whose
// auto-restart path passes empty template+configFiles would otherwise
// boot into a FileNotFoundError crash loop under Docker's
// `unless-stopped` restart policy.
func ValidateConfigSource(templatePath string, configFiles map[string][]byte) error {
	if templatePath != "" {
		// Stat the template's config.yaml; an empty/stale template dir
		// without config.yaml is as broken as no template at all. Any
		// stat failure (not-exist, permission) is treated as "missing"
		// so we fall through to the in-memory source.
		if info, err := os.Stat(filepath.Join(templatePath, "config.yaml")); err == nil && !info.IsDir() {
			return nil
		}
	}
	// Indexing a nil map is safe in Go, so no explicit nil guard is
	// needed. A present-but-zero-length value counts as missing.
	if data, ok := configFiles["config.yaml"]; ok && len(data) > 0 {
		return nil
	}
	return ErrNoConfigSource
}
|
||||
|
||||
// VolumeHasFile returns true if the named config volume for a workspace
|
||||
// already contains the given file path (relative to /configs). Used by
|
||||
// the auto-restart path to confirm a previously-provisioned volume is
|
||||
// still populated before reusing it — if the volume was wiped, we must
|
||||
// regenerate config or fail cleanly rather than loop on FileNotFoundError.
|
||||
//
|
||||
// Implementation: run a throwaway alpine `test -f` container bound to the
|
||||
// volume read-only. Returns (false, nil) if the file is absent and
|
||||
// (false, err) only on Docker-level failures.
|
||||
func (p *Provisioner) VolumeHasFile(ctx context.Context, workspaceID, relPath string) (bool, error) {
|
||||
volName := ConfigVolumeName(workspaceID)
|
||||
// Confirm the volume exists first — Docker auto-creates on bind otherwise.
|
||||
if _, err := p.cli.VolumeInspect(ctx, volName); err != nil {
|
||||
return false, nil
|
||||
}
|
||||
resp, err := p.cli.ContainerCreate(ctx, &container.Config{
|
||||
Image: "alpine",
|
||||
Cmd: []string{"test", "-f", "/vol/" + relPath},
|
||||
}, &container.HostConfig{
|
||||
Binds: []string{volName + ":/vol:ro"},
|
||||
}, nil, nil, "")
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
defer p.cli.ContainerRemove(ctx, resp.ID, container.RemoveOptions{Force: true})
|
||||
|
||||
if err := p.cli.ContainerStart(ctx, resp.ID, container.StartOptions{}); err != nil {
|
||||
return false, err
|
||||
}
|
||||
waitCh, errCh := p.cli.ContainerWait(ctx, resp.ID, container.WaitConditionNotRunning)
|
||||
select {
|
||||
case w := <-waitCh:
|
||||
return w.StatusCode == 0, nil
|
||||
case err := <-errCh:
|
||||
return false, err
|
||||
case <-ctx.Done():
|
||||
return false, ctx.Err()
|
||||
}
|
||||
}
|
||||
|
||||
// isImageNotFoundErr classifies a Docker client error as "image not
|
||||
// available locally." The daemon wraps this message in a generic
|
||||
// SystemError type without exposing a typed sentinel, so we fall back
|
||||
|
||||
@ -1,12 +1,67 @@
|
||||
package provisioner
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/docker/docker/api/types/container"
|
||||
)
|
||||
|
||||
// TestValidateConfigSource covers issue #17: a workspace restart with no
|
||||
// template and no in-memory configFiles must be caught before Docker
|
||||
// starts a container destined to crash-loop on FileNotFoundError.
|
||||
func TestValidateConfigSource_ConfigFilesPresent(t *testing.T) {
|
||||
files := map[string][]byte{"config.yaml": []byte("name: test\n")}
|
||||
if err := ValidateConfigSource("", files); err != nil {
|
||||
t.Fatalf("expected nil error when configFiles has config.yaml, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateConfigSource_ConfigFilesEmptyValue(t *testing.T) {
|
||||
files := map[string][]byte{"config.yaml": {}}
|
||||
if err := ValidateConfigSource("", files); !errors.Is(err, ErrNoConfigSource) {
|
||||
t.Fatalf("expected ErrNoConfigSource for empty config.yaml bytes, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateConfigSource_TemplatePathWithConfig(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
if err := os.WriteFile(filepath.Join(dir, "config.yaml"), []byte("name: x\n"), 0644); err != nil {
|
||||
t.Fatalf("setup: %v", err)
|
||||
}
|
||||
if err := ValidateConfigSource(dir, nil); err != nil {
|
||||
t.Fatalf("expected nil when template dir has config.yaml, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateConfigSource_TemplatePathMissingConfig(t *testing.T) {
|
||||
dir := t.TempDir() // empty dir
|
||||
if err := ValidateConfigSource(dir, nil); !errors.Is(err, ErrNoConfigSource) {
|
||||
t.Fatalf("expected ErrNoConfigSource for template dir without config.yaml, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateConfigSource_BothEmpty(t *testing.T) {
|
||||
if err := ValidateConfigSource("", nil); !errors.Is(err, ErrNoConfigSource) {
|
||||
t.Fatalf("expected ErrNoConfigSource when both sources empty, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateConfigSource_TemplateIsDirName(t *testing.T) {
|
||||
// If `config.yaml` at the template path is itself a directory (weird
|
||||
// but possible), the validator should reject it.
|
||||
dir := t.TempDir()
|
||||
if err := os.MkdirAll(filepath.Join(dir, "config.yaml"), 0755); err != nil {
|
||||
t.Fatalf("setup: %v", err)
|
||||
}
|
||||
if err := ValidateConfigSource(dir, nil); !errors.Is(err, ErrNoConfigSource) {
|
||||
t.Fatalf("expected ErrNoConfigSource when config.yaml is a dir, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// baseHostConfig returns a fresh HostConfig with typical pre-tier binds,
|
||||
// mimicking what Start() builds before calling ApplyTierConfig.
|
||||
func baseHostConfig(pluginsPath string) *container.HostConfig {
|
||||
|
||||
59
scripts/cleanup-rogue-workspaces.sh
Executable file
59
scripts/cleanup-rogue-workspaces.sh
Executable file
@ -0,0 +1,59 @@
|
||||
#!/usr/bin/env bash
# cleanup-rogue-workspaces.sh — delete stale test workspaces matching
# well-known placeholder ID patterns (#17).
#
# Usage:
#   bash scripts/cleanup-rogue-workspaces.sh                 # against http://localhost:8080
#   MOLECULE_URL=http://host:8080 bash scripts/cleanup-rogue-workspaces.sh
#
# Deletes any workspace whose id OR name starts with one of the
# hard-coded test prefixes. Also force-removes the matching Docker
# container (`ws-<id[:12]>`) in case Docker's unless-stopped policy
# kept it restarting on a missing config.yaml.
#
# Safe to run repeatedly — idempotent. Non-matching workspaces are left
# untouched.

set -euo pipefail

API_URL="${MOLECULE_URL:-http://localhost:8080}"

echo "Scanning ${API_URL}/workspaces for rogue test workspaces..."

ws_json="$(curl -sS --fail "${API_URL}/workspaces" || true)"
if [[ -z "${ws_json}" ]]; then
  echo "  platform unreachable at ${API_URL}"
  exit 1
fi

# Emit "id<TAB>name" rows for workspaces whose id or name starts with a
# test prefix. The JSON travels via the environment so the single-quoted
# heredoc needs no shell interpolation.
rogues="$(WS_JSON="${ws_json}" python3 - <<'PY'
import json, os
doc = json.loads(os.environ["WS_JSON"])
items = doc if isinstance(doc, list) else doc.get("workspaces", [])
PREFIXES = ("aaaaaaaa-", "bbbbbbbb-", "cccccccc-", "test-ws-")
for ws in items:
    ident = ws.get("id", "") or ""
    label = ws.get("name", "") or ""
    if any(ident.startswith(p) or label.startswith(p) for p in PREFIXES):
        print(f"{ident}\t{label}")
PY
)"

if [[ -z "${rogues}" ]]; then
  echo "  no rogue workspaces found"
  exit 0
fi

while IFS=$'\t' read -r ws_id ws_name; do
  [[ -z "${ws_id}" ]] && continue
  echo "  deleting workspace ${ws_id} (${ws_name})"
  curl -sS -X DELETE "${API_URL}/workspaces/${ws_id}" >/dev/null || true
  if command -v docker >/dev/null 2>&1; then
    # Belt and suspenders: stop the paired container even if the API
    # delete already handled it.
    docker rm -f "ws-${ws_id:0:12}" >/dev/null 2>&1 || true
  fi
done <<<"${rogues}"

echo "done."
|
||||
@ -22,3 +22,21 @@ This repo uses the standard monorepo testing convention: **unit tests live with
|
||||
## Running E2E
|
||||
|
||||
Every E2E script here assumes the platform is running at `localhost:8080` and (where noted) provisioned agents are online. See the header comment of each `.sh` for specifics.
|
||||
|
||||
## Cleaning up rogue test workspaces
|
||||
|
||||
If an E2E run aborts before its teardown runs (Ctrl-C, crash, CI timeout),
|
||||
the platform can be left with workspaces whose config volume is stale or
|
||||
empty — Docker's `unless-stopped` restart policy then spins those
|
||||
containers in a FileNotFoundError loop. The platform's pre-flight check
|
||||
(#17) marks such workspaces `failed` on the next restart, but a manual
|
||||
cleanup is useful:
|
||||
|
||||
```bash
|
||||
bash scripts/cleanup-rogue-workspaces.sh # deletes ws with id/name starting aaaaaaaa-, bbbbbbbb-, cccccccc-, test-ws-
|
||||
MOLECULE_URL=http://host:8080 bash scripts/cleanup-rogue-workspaces.sh
|
||||
```
|
||||
|
||||
The script DELETEs each matching workspace via the API and
|
||||
force-removes the `ws-<id[:12]>` container as a belt-and-suspenders
|
||||
fallback.
|
||||
|
||||
Loading…
Reference in New Issue
Block a user