diff --git a/workspace-server/internal/handlers/handlers_test.go b/workspace-server/internal/handlers/handlers_test.go index dc7b7213..85cff897 100644 --- a/workspace-server/internal/handlers/handlers_test.go +++ b/workspace-server/internal/handlers/handlers_test.go @@ -2,6 +2,7 @@ package handlers import ( "bytes" + "context" "database/sql" "encoding/json" "fmt" @@ -363,6 +364,7 @@ func TestBuildProvisionerConfig_IncludesAwarenessSettings(t *testing.T) { t.Setenv("WORKSPACE_DIR", "/tmp/workspace") cfg := handler.buildProvisionerConfig( + context.Background(), "ws-123", "/tmp/configs/template", map[string][]byte{"config.yaml": []byte("name: test")}, diff --git a/workspace-server/internal/handlers/runtime_image_pin.go b/workspace-server/internal/handlers/runtime_image_pin.go new file mode 100644 index 00000000..2cd90bd4 --- /dev/null +++ b/workspace-server/internal/handlers/runtime_image_pin.go @@ -0,0 +1,67 @@ +package handlers + +import ( + "context" + "database/sql" + "errors" + "log" + "os" + "strings" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" +) + +// resolveRuntimeImage returns the digest-pinned image ref for a runtime when +// an operator has promoted one via the runtime_image_pins table (#2272 layer 1), +// otherwise "" so the caller falls back to the legacy `:latest` lookup in +// provisioner.RuntimeImages. +// +// Policy: availability over pinning. Any DB hiccup (sql.ErrNoRows is the +// steady-state when nothing is pinned, but transient network blips, table +// missing post-rollback, etc.) returns "" and the provision continues on +// the moving tag — better one workspace on a slightly-newer image than a +// provision-blocked tenant. +// +// WORKSPACE_IMAGE_LOCAL_OVERRIDE=1 short-circuits the lookup entirely so a +// developer rebuilding template images locally gets their fresh build via +// `:latest` even when a remote digest is pinned for the same runtime. +func resolveRuntimeImage(ctx context.Context, runtime string) string { + if runtime == "" { + return "" + } + base, ok := provisioner.RuntimeImages[runtime] + if !ok { + // Unknown runtime — let provisioner.Start fall through to its own + // DefaultImage. Querying the pin table for a runtime that doesn't + // exist would only produce noise and a guaranteed ErrNoRows. + return "" + } + if os.Getenv("WORKSPACE_IMAGE_LOCAL_OVERRIDE") != "" { + return "" + } + if db.DB == nil { + return "" + } + + var digest string + err := db.DB.QueryRowContext(ctx, + `SELECT digest FROM runtime_image_pins WHERE template_name = $1`, runtime, + ).Scan(&digest) + if err != nil { + if !errors.Is(err, sql.ErrNoRows) { + log.Printf("resolveRuntimeImage: pin lookup for %q failed (%v) — falling back to :latest", runtime, err) + } + return "" + } + + // Strip the moving tag suffix (`:latest`, `:staging`) before appending + // the immutable digest. Docker treats `name:tag@sha256:...` as valid + // but the tag is ignored; dropping it keeps logs and admin diffs honest + // about what's actually being pulled. + pinned := base + if idx := strings.LastIndex(pinned, ":"); idx > strings.LastIndex(pinned, "/") { + pinned = pinned[:idx] + } + return pinned + "@" + digest +} diff --git a/workspace-server/internal/handlers/runtime_image_pin_test.go b/workspace-server/internal/handlers/runtime_image_pin_test.go new file mode 100644 index 00000000..b7589bc6 --- /dev/null +++ b/workspace-server/internal/handlers/runtime_image_pin_test.go @@ -0,0 +1,138 @@ +package handlers + +import ( + "context" + "database/sql" + "strings" + "testing" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" + sqlmock "github.com/DATA-DOG/go-sqlmock" +) + +// TestResolveRuntimeImage_NoPin: lookup returns sql.ErrNoRows (the steady- +// state when an operator hasn't pinned this runtime). resolveRuntimeImage +// returns "" so the caller falls back to RuntimeImages[runtime] (legacy +// :latest behavior). This is the expected hot path until digest pinning +// is opted into per runtime. +func TestResolveRuntimeImage_NoPin(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatal(err) + } + defer mockDB.Close() + prev := db.DB + db.DB = mockDB + defer func() { db.DB = prev }() + + mock.ExpectQuery(`SELECT digest FROM runtime_image_pins WHERE template_name = \$1`). + WithArgs("claude-code"). + WillReturnError(sql.ErrNoRows) + + got := resolveRuntimeImage(context.Background(), "claude-code") + if got != "" { + t.Errorf("expected empty (no pin = fallback), got %q", got) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("sqlmock expectations: %v", err) + } +} + +// TestResolveRuntimeImage_DBError: an unexpected DB failure (transient +// network blip, table missing post-rollback, etc.) must NOT block the +// provision — log + fall through to the legacy :latest path. This is +// the availability-over-pinning policy spelled out in resolveRuntimeImage's +// doc comment. +func TestResolveRuntimeImage_DBError(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatal(err) + } + defer mockDB.Close() + prev := db.DB + db.DB = mockDB + defer func() { db.DB = prev }() + + mock.ExpectQuery(`SELECT digest FROM runtime_image_pins WHERE template_name = \$1`). + WithArgs("claude-code"). + WillReturnError(sqlmock.ErrCancelled) + + got := resolveRuntimeImage(context.Background(), "claude-code") + if got != "" { + t.Errorf("expected empty on DB error (fallback), got %q", got) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("sqlmock expectations: %v", err) + } +} + +// TestResolveRuntimeImage_WithPin returns image@sha256: when row exists. +func TestResolveRuntimeImage_WithPin(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatal(err) + } + defer mockDB.Close() + prev := db.DB + db.DB = mockDB + defer func() { db.DB = prev }() + + digest := "sha256:3d6761a97ed07d7d33cfc19a8fbab81175d9d9179618d493dbc00c5f7ef076a3" + mock.ExpectQuery(`SELECT digest FROM runtime_image_pins WHERE template_name = \$1`). + WithArgs("claude-code"). + WillReturnRows(sqlmock.NewRows([]string{"digest"}).AddRow(digest)) + + got := resolveRuntimeImage(context.Background(), "claude-code") + if !strings.HasSuffix(got, "@"+digest) { + t.Errorf("expected suffix @%s, got %q", digest, got) + } + if !strings.HasPrefix(got, "ghcr.io/molecule-ai/workspace-template-claude-code") { + t.Errorf("expected GHCR prefix preserved, got %q", got) + } + if strings.Contains(got, ":latest") { + t.Errorf("expected :latest stripped, got %q", got) + } +} + +// TestResolveRuntimeImage_EmptyRuntime short-circuits to "" without DB. +func TestResolveRuntimeImage_EmptyRuntime(t *testing.T) { + got := resolveRuntimeImage(context.Background(), "") + if got != "" { + t.Errorf("expected empty for empty runtime, got %q", got) + } +} + +// TestResolveRuntimeImage_UnknownRuntime returns "" without DB lookup. +func TestResolveRuntimeImage_UnknownRuntime(t *testing.T) { + got := resolveRuntimeImage(context.Background(), "no-such-runtime") + if got != "" { + t.Errorf("expected empty for unknown runtime, got %q", got) + } +} + +// TestResolveRuntimeImage_LocalOverride: when WORKSPACE_IMAGE_LOCAL_OVERRIDE +// is set, the pin lookup is short-circuited even with a row present — +// devs rebuild images locally and want the floating tag to resolve to +// their fresh build, not a remote-pinned digest. +func TestResolveRuntimeImage_LocalOverride(t *testing.T) { + t.Setenv("WORKSPACE_IMAGE_LOCAL_OVERRIDE", "1") + + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatal(err) + } + defer mockDB.Close() + prev := db.DB + db.DB = mockDB + defer func() { db.DB = prev }() + + // No expectation set — if resolveRuntimeImage queries the DB despite + // the override, sqlmock fails the test via ExpectationsWereMet. + got := resolveRuntimeImage(context.Background(), "claude-code") + if got != "" { + t.Errorf("expected empty under WORKSPACE_IMAGE_LOCAL_OVERRIDE=1, got %q", got) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("DB queried despite override: %v", err) + } +} diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go index e34ea315..7734aff0 100644 --- a/workspace-server/internal/handlers/workspace_provision.go +++ b/workspace-server/internal/handlers/workspace_provision.go @@ -127,7 +127,7 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri workspaceID, filepath.Base(runtimeTemplate)) templatePath = runtimeTemplate // Rebuild cfg with the recovered template path so Start() sees it. - cfg = h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, prepared.EnvVars, prepared.PluginsPath, prepared.AwarenessNamespace) + cfg = h.buildProvisionerConfig(ctx, workspaceID, templatePath, configFiles, payload, prepared.EnvVars, prepared.PluginsPath, prepared.AwarenessNamespace) cfg.ResetClaudeSession = resetClaudeSession recovered = true break @@ -243,6 +243,7 @@ func (h *WorkspaceHandler) loadAwarenessNamespace(ctx context.Context, workspace } func (h *WorkspaceHandler) buildProvisionerConfig( + ctx context.Context, workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload, @@ -291,6 +292,7 @@ func (h *WorkspaceHandler) buildProvisionerConfig( PlatformURL: h.platformURL, AwarenessURL: os.Getenv("AWARENESS_URL"), AwarenessNamespace: awarenessNamespace, + Image: resolveRuntimeImage(ctx, payload.Runtime), } } diff --git a/workspace-server/internal/handlers/workspace_provision_shared.go b/workspace-server/internal/handlers/workspace_provision_shared.go index 4f53493e..00e00bd0 100644 --- a/workspace-server/internal/handlers/workspace_provision_shared.go +++ b/workspace-server/internal/handlers/workspace_provision_shared.go @@ -166,7 +166,7 @@ func (h *WorkspaceHandler) prepareProvisionContext( } } - cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace) + cfg := h.buildProvisionerConfig(ctx, workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace) cfg.ResetClaudeSession = resetClaudeSession return &preparedProvisionContext{ diff --git a/workspace-server/internal/handlers/workspace_provision_test.go b/workspace-server/internal/handlers/workspace_provision_test.go index 86bf74fe..18d5777d 100644 --- a/workspace-server/internal/handlers/workspace_provision_test.go +++ b/workspace-server/internal/handlers/workspace_provision_test.go @@ -642,6 +642,7 @@ func TestBuildProvisionerConfig_BasicFields(t *testing.T) { templatePath := filepath.Join(tmpDir, "template") pluginsPath := t.TempDir() cfg := handler.buildProvisionerConfig( + context.Background(), "ws-basic", templatePath, map[string][]byte{"config.yaml": []byte("name: test")}, @@ -687,6 +688,7 @@ func TestBuildProvisionerConfig_WorkspacePathFromEnv(t *testing.T) { pluginsPath := t.TempDir() cfg := handler.buildProvisionerConfig( + context.Background(), "ws-env", "", nil, diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index f414aa9c..0b90c899 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -90,6 +90,33 @@ type WorkspaceConfig struct { AwarenessNamespace string WorkspaceAccess string // #65: "none" (default), "read_only", or "read_write" ResetClaudeSession bool // #12: if true, discard the claude-sessions volume before start (fresh session dir) + + // Image, when non-empty, overrides the runtime→image lookup. The handler + // layer sets this to the digest-pinned form (`@sha256:`) + // when an operator has promoted a specific runtime build via the + // runtime_image_pins table (#2272 layer 1). Empty = legacy behavior, + // fall back to RuntimeImages[Runtime] which resolves to the moving + // `:latest` tag. + Image string +} + +// selectImage resolves the final Docker image ref for a workspace. The handler +// layer is the source of truth — if it set cfg.Image (the digest-pinned form +// from runtime_image_pins, #2272), honor that. Otherwise fall back to the +// runtime→tag lookup in RuntimeImages (legacy `:latest` behavior). When the +// runtime isn't recognized either, fall back to DefaultImage so Start() still +// has something to hand Docker — surfacing a "No such image" later is more +// actionable than a silent "" panic in ContainerCreate. +func selectImage(cfg WorkspaceConfig) string { + if cfg.Image != "" { + return cfg.Image + } + if cfg.Runtime != "" { + if img, ok := RuntimeImages[cfg.Runtime]; ok { + return img + } + } + return DefaultImage } // Workspace-access constants for #65. Matches the CHECK constraint on @@ -290,13 +317,7 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e env := buildContainerEnv(cfg) - // Select image based on runtime (each adapter has its own pre-built image) - image := DefaultImage - if cfg.Runtime != "" { - if img, ok := RuntimeImages[cfg.Runtime]; ok { - image = img - } - } + image := selectImage(cfg) containerCfg := &container.Config{ Image: image, diff --git a/workspace-server/internal/provisioner/provisioner_test.go b/workspace-server/internal/provisioner/provisioner_test.go index 295475cc..290793b3 100644 --- a/workspace-server/internal/provisioner/provisioner_test.go +++ b/workspace-server/internal/provisioner/provisioner_test.go @@ -436,6 +436,62 @@ func TestWorkspaceConfig_ResetClaudeSessionFieldPresent(t *testing.T) { } } +// ---------- selectImage (#2272 layer 1) ---------- + +// TestSelectImage_PrefersExplicitImage: when the handler resolved a digest +// pin via runtime_image_pins, cfg.Image is set. selectImage must honor it +// and ignore the cfg.Runtime → :latest fallback. This is the load-bearing +// invariant for digest pinning — if it ever silently reverts to :latest, +// we lose the "one bad publish doesn't break every workspace" guarantee. +func TestSelectImage_PrefersExplicitImage(t *testing.T) { + pinned := "ghcr.io/molecule-ai/workspace-template-claude-code@sha256:3d6761a97ed07d7d33cfc19a8fbab81175d9d9179618d493dbc00c5f7ef076a3" + got := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: pinned}) + if got != pinned { + t.Errorf("selectImage with cfg.Image=pinned: got %q, want %q", got, pinned) + } +} + +// TestSelectImage_FallsBackToRuntimeMap: handler returned "" (no pin or +// pin lookup deliberately bypassed via WORKSPACE_IMAGE_LOCAL_OVERRIDE). +// selectImage must use the legacy runtime→:latest map. +func TestSelectImage_FallsBackToRuntimeMap(t *testing.T) { + got := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: ""}) + want := RuntimeImages["claude-code"] + if got != want { + t.Errorf("selectImage with empty Image: got %q, want %q", got, want) + } +} + +// TestSelectImage_UnknownRuntimeFallsBackToDefault preserves today's +// behavior — an unrecognized runtime resolves to DefaultImage rather than +// "" so ContainerCreate gets a usable arg and surfaces a meaningful +// "No such image" error if the default itself is missing. +func TestSelectImage_UnknownRuntimeFallsBackToDefault(t *testing.T) { + got := selectImage(WorkspaceConfig{Runtime: "no-such-runtime"}) + if got != DefaultImage { + t.Errorf("selectImage with unknown runtime: got %q, want DefaultImage %q", got, DefaultImage) + } +} + +// TestSelectImage_EmptyRuntimeFallsBackToDefault: same invariant for the +// no-runtime-supplied path (legacy callers / older handler code). +func TestSelectImage_EmptyRuntimeFallsBackToDefault(t *testing.T) { + got := selectImage(WorkspaceConfig{}) + if got != DefaultImage { + t.Errorf("selectImage with zero cfg: got %q, want DefaultImage %q", got, DefaultImage) + } +} + +// TestWorkspaceConfig_ImageFieldPresent compile-time-pins the Image field +// so the handler→provisioner contract for digest-pinned image refs can't +// be silently removed by a future struct refactor (#2272). +func TestWorkspaceConfig_ImageFieldPresent(t *testing.T) { + cfg := WorkspaceConfig{Image: "ghcr.io/example@sha256:abc"} + if cfg.Image == "" { + t.Fatal("Image should round-trip through struct literal") + } +} + // ---------- buildContainerEnv — #67 MOLECULE_URL injection ---------- func TestBuildContainerEnv_InjectsBothPlatformURLAndMoleculeAIURL(t *testing.T) { diff --git a/workspace-server/migrations/047_runtime_image_pins.down.sql b/workspace-server/migrations/047_runtime_image_pins.down.sql new file mode 100644 index 00000000..72af81e0 --- /dev/null +++ b/workspace-server/migrations/047_runtime_image_pins.down.sql @@ -0,0 +1,8 @@ +-- Reverse of 044_runtime_image_pins.up.sql. +-- +-- Drop the index before the column it references to satisfy Postgres +-- ordering. The pins table can drop independently of the column. + +DROP INDEX IF EXISTS idx_workspaces_runtime_image_digest; +ALTER TABLE workspaces DROP COLUMN IF EXISTS runtime_image_digest; +DROP TABLE IF EXISTS runtime_image_pins; diff --git a/workspace-server/migrations/047_runtime_image_pins.up.sql b/workspace-server/migrations/047_runtime_image_pins.up.sql new file mode 100644 index 00000000..2f186af6 --- /dev/null +++ b/workspace-server/migrations/047_runtime_image_pins.up.sql @@ -0,0 +1,46 @@ +-- Layer 1 of the runtime-rollout plan (issue #2272). +-- +-- Decouples publish from promotion: rather than every workspace pulling +-- whatever's on `:latest` at create-time, the provisioner consults a +-- promoted-digest table. Operators (or a cascade-success hook) update +-- the table to roll out a new version; existing workspaces stay on +-- whatever they were created with until they're explicitly recreated. +-- One bad publish no longer breaks every workspace simultaneously. +-- +-- Two pieces: +-- 1. runtime_image_pins — the lookup table. One row per template name. +-- No row = fall back to `:latest` (preserves today's behavior so +-- this migration is non-disruptive to existing deployments). +-- +-- 2. workspaces.runtime_image_digest — what the workspace actually +-- pulled at create/restart time. Diagnostic + foundation for +-- Layer 2's per-ring routing (canary vs stable). Nullable so +-- pre-migration workspaces show up as NULL until next restart. + +CREATE TABLE IF NOT EXISTS runtime_image_pins ( + -- Template name (e.g. "claude-code", "hermes"). Matches the keys + -- in workspace-server/internal/provisioner.RuntimeImages. + template_name TEXT PRIMARY KEY, + + -- Full GHCR digest including the `sha256:` prefix. Provisioner pulls + -- ghcr.io/molecule-ai/workspace-template-