From 862a90a316b128f132484f865d05a49e79835199 Mon Sep 17 00:00:00 2001 From: core-devops Date: Sat, 16 May 2026 15:36:16 -0700 Subject: [PATCH] fix(runtime): complete crewai/deepagents/gemini-cli removal + fail-closed on unresolvable runtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses core-security REQUEST-CHANGES (review 4269) on PR #1385 (RFC internal#483). The original removal was incomplete and silently changed behavior: 1. manifest.json workspace_templates still listed crewai/deepagents/ gemini-cli. handlers.knownRuntimes (the workspace-create admission allowlist) is built at boot from manifest.json via runtime_registry.go — NOT from provisioner.knownRuntimes — so a create request for a "removed" runtime was still admitted. Removed the 3 entries; the manifest now resolves to the 6 kept template-backed runtimes (claude-code, hermes, openclaw, codex, langgraph, autogen) + injected meta-runtimes. langgraph/autogen retained intentionally — not in the internal#483 removal set and still in provisioner.knownRuntimes. 2. selectImage silently fell back to DefaultImage (langgraph) for any named-but-unresolvable runtime: a user asking for crewai got a langgraph container with no signal. selectImage now returns (string, error) and rejects a NAMED-but-unresolvable runtime with ErrUnresolvableRuntime (naming the offending runtime). Start propagates it; the existing markProvisionFailed path already broadcasts WorkspaceProvisionFailed + records the message. Implements the CTO hardgate directive (feedback_platform_must_hardgate_base_contract — fail-closed, no silent degrade). The genuinely-unspecified (empty) runtime remains a distinct legitimate path → DefaultImage. go build ./... and go test ./internal/provisioner/ ./internal/handlers/ ./internal/imagewatch/ ./internal/registry/ ./internal/models/ all pass. selectImage tests updated to assert the new fail-closed contract (new expected values verified against the security finding, not made-green). Co-Authored-By: Claude Opus 4.7 (1M context) --- manifest.json | 5 +- .../internal/provisioner/provisioner.go | 52 +++++++++++++++---- .../internal/provisioner/provisioner_test.go | 43 +++++++++++---- 3 files changed, 76 insertions(+), 24 deletions(-) diff --git a/manifest.json b/manifest.json index bde3a1d9..e68aa1e4 100644 --- a/manifest.json +++ b/manifest.json @@ -30,10 +30,7 @@ {"name": "openclaw", "repo": "molecule-ai/molecule-ai-workspace-template-openclaw", "ref": "main"}, {"name": "codex", "repo": "molecule-ai/molecule-ai-workspace-template-codex", "ref": "main"}, {"name": "langgraph", "repo": "molecule-ai/molecule-ai-workspace-template-langgraph", "ref": "main"}, - {"name": "crewai", "repo": "molecule-ai/molecule-ai-workspace-template-crewai", "ref": "main"}, - {"name": "autogen", "repo": "molecule-ai/molecule-ai-workspace-template-autogen", "ref": "main"}, - {"name": "deepagents", "repo": "molecule-ai/molecule-ai-workspace-template-deepagents", "ref": "main"}, - {"name": "gemini-cli", "repo": "molecule-ai/molecule-ai-workspace-template-gemini-cli", "ref": "main"} + {"name": "autogen", "repo": "molecule-ai/molecule-ai-workspace-template-autogen", "ref": "main"} ], "org_templates": [ {"name": "molecule-dev", "repo": "molecule-ai/molecule-ai-org-template-molecule-dev", "ref": "main"}, diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index ae1fbc72..f4ca31c5 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -35,6 +35,19 @@ import ( // drift-risk #6. var ErrNoBackend = errors.New("provisioner: no backend configured (zero-valued receiver)") +// ErrUnresolvableRuntime is returned by selectImage when a workspace +// names a runtime that has no resolvable image (not in RuntimeImages and +// no operator-pinned cfg.Image). RFC internal#483 + security review 4269: +// previously such a request silently fell through to DefaultImage +// (langgraph) — a user asking for crewai would get a langgraph container +// with no signal. The CTO standing directive +// (feedback_platform_must_hardgate_base_contract) is fail-closed: a +// named-but-unresolvable runtime must reject with a structured, +// runtime-naming error so the existing provision-failed notify/log path +// surfaces it, NOT silently degrade. The genuinely-unspecified (empty) +// runtime is still a distinct, legitimate path that keeps DefaultImage. +var ErrUnresolvableRuntime = errors.New("provisioner: requested runtime has no resolvable image") + // RuntimeImages maps runtime names to their Docker image refs. // Each standalone template repo publishes its image via the reusable // publish-template-image workflow in molecule-ci on every main merge. @@ -104,20 +117,33 @@ type WorkspaceConfig struct { // selectImage resolves the final Docker image ref for a workspace. The handler // layer is the source of truth — if it set cfg.Image (the digest-pinned form // from runtime_image_pins, #2272), honor that. Otherwise fall back to the -// runtime→tag lookup in RuntimeImages (legacy `:latest` behavior). When the -// runtime isn't recognized either, fall back to DefaultImage so Start() still -// has something to hand Docker — surfacing a "No such image" later is more -// actionable than a silent "" panic in ContainerCreate. -func selectImage(cfg WorkspaceConfig) string { +// runtime→tag lookup in RuntimeImages (legacy `:latest` behavior). +// +// Fail-closed contract (RFC internal#483 / security review 4269 / +// feedback_platform_must_hardgate_base_contract): if the workspace NAMES a +// runtime that resolves to no image (not in RuntimeImages, no pinned +// cfg.Image), reject with ErrUnresolvableRuntime instead of silently +// substituting DefaultImage. Pre-fix, removing crewai/deepagents/gemini-cli +// from the catalog left those create requests silently provisioning a +// langgraph container — the user asked for crewai and got langgraph with no +// signal. The error propagates through Start → markProvisionFailed, which +// already broadcasts WorkspaceProvisionFailed and records the message. +// +// The genuinely-unspecified runtime (empty cfg.Runtime, e.g. an org template +// that doesn't pin one) is an intended distinct path and still resolves to +// DefaultImage — only a NAMED-but-unresolvable runtime is rejected. +func selectImage(cfg WorkspaceConfig) (string, error) { if cfg.Image != "" { - return cfg.Image + return cfg.Image, nil } if cfg.Runtime != "" { if img, ok := RuntimeImages[cfg.Runtime]; ok { - return img + return img, nil } + return "", fmt.Errorf("%w: runtime %q (known runtimes: %v)", + ErrUnresolvableRuntime, cfg.Runtime, knownRuntimes) } - return DefaultImage + return DefaultImage, nil } // Workspace-access constants for #65. Matches the CHECK constraint on @@ -336,7 +362,15 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e env := buildContainerEnv(cfg) - image := selectImage(cfg) + image, imgErr := selectImage(cfg) + if imgErr != nil { + // Fail-closed: a named-but-unresolvable runtime must not silently + // become DefaultImage (RFC internal#483 / review 4269). The caller's + // error path (markProvisionFailed) broadcasts the failure + records + // the message so the canvas surfaces it. + log.Printf("Provisioner: refusing to start %s: %v", cfg.WorkspaceID, imgErr) + return "", imgErr + } // Local-build mode (issue #63 / Task #194): when MOLECULE_IMAGE_REGISTRY // is unset, the OSS contributor path skips the registry pull entirely diff --git a/workspace-server/internal/provisioner/provisioner_test.go b/workspace-server/internal/provisioner/provisioner_test.go index bbc1f61f..815c47cb 100644 --- a/workspace-server/internal/provisioner/provisioner_test.go +++ b/workspace-server/internal/provisioner/provisioner_test.go @@ -513,7 +513,10 @@ func TestWorkspaceConfig_ResetClaudeSessionFieldPresent(t *testing.T) { // we lose the "one bad publish doesn't break every workspace" guarantee. func TestSelectImage_PrefersExplicitImage(t *testing.T) { pinned := "ghcr.io/molecule-ai/workspace-template-claude-code@sha256:3d6761a97ed07d7d33cfc19a8fbab81175d9d9179618d493dbc00c5f7ef076a3" - got := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: pinned}) + got, err := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: pinned}) + if err != nil { + t.Fatalf("selectImage with cfg.Image=pinned: unexpected error %v", err) + } if got != pinned { t.Errorf("selectImage with cfg.Image=pinned: got %q, want %q", got, pinned) } @@ -523,28 +526,46 @@ func TestSelectImage_PrefersExplicitImage(t *testing.T) { // pin lookup deliberately bypassed via WORKSPACE_IMAGE_LOCAL_OVERRIDE). // selectImage must use the legacy runtime→:latest map. func TestSelectImage_FallsBackToRuntimeMap(t *testing.T) { - got := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: ""}) + got, err := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: ""}) + if err != nil { + t.Fatalf("selectImage with empty Image: unexpected error %v", err) + } want := RuntimeImages["claude-code"] if got != want { t.Errorf("selectImage with empty Image: got %q, want %q", got, want) } } -// TestSelectImage_UnknownRuntimeFallsBackToDefault preserves today's -// behavior — an unrecognized runtime resolves to DefaultImage rather than -// "" so ContainerCreate gets a usable arg and surfaces a meaningful -// "No such image" error if the default itself is missing. -func TestSelectImage_UnknownRuntimeFallsBackToDefault(t *testing.T) { - got := selectImage(WorkspaceConfig{Runtime: "no-such-runtime"}) - if got != DefaultImage { - t.Errorf("selectImage with unknown runtime: got %q, want DefaultImage %q", got, DefaultImage) +// TestSelectImage_NamedUnresolvableRuntimeRejects pins the fail-closed +// contract (RFC internal#483 / security review 4269 / +// feedback_platform_must_hardgate_base_contract): a NAMED runtime with no +// resolvable image must reject with ErrUnresolvableRuntime, NOT silently +// substitute DefaultImage. Pre-fix this returned langgraph — a user asking +// for a removed runtime (crewai/deepagents/gemini-cli) silently got a +// langgraph container. "crewai" is the concrete regression from the +// security finding. +func TestSelectImage_NamedUnresolvableRuntimeRejects(t *testing.T) { + for _, rt := range []string{"no-such-runtime", "crewai", "deepagents", "gemini-cli"} { + got, err := selectImage(WorkspaceConfig{Runtime: rt}) + if !errors.Is(err, ErrUnresolvableRuntime) { + t.Errorf("selectImage(%q): got err %v, want ErrUnresolvableRuntime", rt, err) + } + if got != "" { + t.Errorf("selectImage(%q): got image %q, want \"\" on reject", rt, got) + } + if err != nil && !strings.Contains(err.Error(), rt) { + t.Errorf("selectImage(%q): error must name the offending runtime, got %v", rt, err) + } } } // TestSelectImage_EmptyRuntimeFallsBackToDefault: same invariant for the // no-runtime-supplied path (legacy callers / older handler code). func TestSelectImage_EmptyRuntimeFallsBackToDefault(t *testing.T) { - got := selectImage(WorkspaceConfig{}) + got, err := selectImage(WorkspaceConfig{}) + if err != nil { + t.Fatalf("selectImage with zero cfg: unexpected error %v (empty runtime is a legitimate DefaultImage path)", err) + } if got != DefaultImage { t.Errorf("selectImage with zero cfg: got %q, want DefaultImage %q", got, DefaultImage) }