feat(provisioner): digest-pin workspace images via runtime_image_pins (#2272 layer 1)

Layer 1 of the runtime-rollout plan. Decouples publish from promotion by
giving operators a `runtime_image_pins` table the provisioner consults at
container-create time. No row = legacy `:latest` behavior; row present =
provisioner pulls `<base>@sha256:<digest>`. One bad publish no longer
breaks every workspace simultaneously.

Mechanics:

  - Migration 047: `runtime_image_pins` (template_name PK + sha256 digest +
    audit columns) and `workspaces.runtime_image_digest` (nullable, with
    partial index) for "show me workspaces still on the old digest" queries.
  - `resolveRuntimeImage` (handlers/runtime_image_pin.go): looks up the
    pin, returns `<base>@sha256:<digest>` on hit, "" on miss/error so the
    provisioner falls through to the legacy tag map. Availability over
    pinning — any DB error logs and returns "" rather than blocking the
    provision. `WORKSPACE_IMAGE_LOCAL_OVERRIDE=1` short-circuits the
    lookup so devs rebuilding template images locally see their fresh
    build.
  - `WorkspaceConfig.Image` carries the resolved value into the
    provisioner. `selectImage` honors it ahead of the runtime→tag map and
    falls back to DefaultImage on unknown runtime.
  - The existing `imageTagIsMoving` predicate (#215) already returns false
    on `@sha256:` form, so digest pins skip the force-pull path naturally.

Tests:

  - Handler-side (sqlmock): no-pin/db-error/with-pin/empty/unknown/local-
    override paths cover every branch of `resolveRuntimeImage`.
  - Provisioner-side: `selectImage` table covers explicit-image preference,
    runtime-map fallback, unknown-runtime → default, empty-config →
    default. Plus a struct-literal compile-time pin on `Image` so a future
    refactor can't silently drop the field.

Layer 2 (per-ring routing via `workspaces.runtime_image_digest`) and the
admin promote/rollback endpoint ride on top of this and ship separately.
This commit is contained in:
Hongming Wang 2026-05-03 02:30:00 -07:00
parent e4893f5a9a
commit 1bff419833
10 changed files with 351 additions and 9 deletions

View File

@ -2,6 +2,7 @@ package handlers
import (
"bytes"
"context"
"database/sql"
"encoding/json"
"fmt"
@ -363,6 +364,7 @@ func TestBuildProvisionerConfig_IncludesAwarenessSettings(t *testing.T) {
t.Setenv("WORKSPACE_DIR", "/tmp/workspace")
cfg := handler.buildProvisionerConfig(
context.Background(),
"ws-123",
"/tmp/configs/template",
map[string][]byte{"config.yaml": []byte("name: test")},

View File

@ -0,0 +1,67 @@
package handlers
import (
"context"
"database/sql"
"errors"
"log"
"os"
"strings"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
)
// resolveRuntimeImage returns the digest-pinned image ref for a runtime when
// an operator has promoted one via the runtime_image_pins table (#2272 layer 1),
// otherwise "" so the caller falls back to the legacy `:latest` lookup in
// provisioner.RuntimeImages.
//
// Policy: availability over pinning. Any DB hiccup (sql.ErrNoRows is the
// steady-state when nothing is pinned, but transient network blips, table
// missing post-rollback, etc.) returns "" and the provision continues on
// the moving tag — better one workspace on a slightly-newer image than a
// provision-blocked tenant.
//
// WORKSPACE_IMAGE_LOCAL_OVERRIDE=1 short-circuits the lookup entirely so a
// developer rebuilding template images locally gets their fresh build via
// `:latest` even when a remote digest is pinned for the same runtime.
func resolveRuntimeImage(ctx context.Context, runtime string) string {
if runtime == "" {
return ""
}
base, ok := provisioner.RuntimeImages[runtime]
if !ok {
// Unknown runtime — let provisioner.Start fall through to its own
// DefaultImage. Querying the pin table for a runtime that doesn't
// exist would only produce noise and a guaranteed ErrNoRows.
return ""
}
if os.Getenv("WORKSPACE_IMAGE_LOCAL_OVERRIDE") != "" {
return ""
}
if db.DB == nil {
return ""
}
var digest string
err := db.DB.QueryRowContext(ctx,
`SELECT digest FROM runtime_image_pins WHERE template_name = $1`, runtime,
).Scan(&digest)
if err != nil {
if !errors.Is(err, sql.ErrNoRows) {
log.Printf("resolveRuntimeImage: pin lookup for %q failed (%v) — falling back to :latest", runtime, err)
}
return ""
}
// Strip the moving tag suffix (`:latest`, `:staging`) before appending
// the immutable digest. Docker treats `name:tag@sha256:...` as valid
// but the tag is ignored; dropping it keeps logs and admin diffs honest
// about what's actually being pulled.
pinned := base
if idx := strings.LastIndex(pinned, ":"); idx > strings.LastIndex(pinned, "/") {
pinned = pinned[:idx]
}
return pinned + "@" + digest
}

View File

@ -0,0 +1,138 @@
package handlers
import (
"context"
"database/sql"
"strings"
"testing"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
sqlmock "github.com/DATA-DOG/go-sqlmock"
)
// TestResolveRuntimeImage_NoPin: lookup returns sql.ErrNoRows (the steady-
// state when an operator hasn't pinned this runtime). resolveRuntimeImage
// returns "" so the caller falls back to RuntimeImages[runtime] (legacy
// :latest behavior). This is the expected hot path until digest pinning
// is opted into per runtime.
func TestResolveRuntimeImage_NoPin(t *testing.T) {
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatal(err)
}
defer mockDB.Close()
prev := db.DB
db.DB = mockDB
defer func() { db.DB = prev }()
mock.ExpectQuery(`SELECT digest FROM runtime_image_pins WHERE template_name = \$1`).
WithArgs("claude-code").
WillReturnError(sql.ErrNoRows)
got := resolveRuntimeImage(context.Background(), "claude-code")
if got != "" {
t.Errorf("expected empty (no pin = fallback), got %q", got)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("sqlmock expectations: %v", err)
}
}
// TestResolveRuntimeImage_DBError: an unexpected DB failure (transient
// network blip, table missing post-rollback, etc.) must NOT block the
// provision — log + fall through to the legacy :latest path. This is
// the availability-over-pinning policy spelled out in resolveRuntimeImage's
// doc comment.
func TestResolveRuntimeImage_DBError(t *testing.T) {
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatal(err)
}
defer mockDB.Close()
prev := db.DB
db.DB = mockDB
defer func() { db.DB = prev }()
mock.ExpectQuery(`SELECT digest FROM runtime_image_pins WHERE template_name = \$1`).
WithArgs("claude-code").
WillReturnError(sqlmock.ErrCancelled)
got := resolveRuntimeImage(context.Background(), "claude-code")
if got != "" {
t.Errorf("expected empty on DB error (fallback), got %q", got)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("sqlmock expectations: %v", err)
}
}
// TestResolveRuntimeImage_WithPin returns image@sha256:<digest> when row exists.
func TestResolveRuntimeImage_WithPin(t *testing.T) {
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatal(err)
}
defer mockDB.Close()
prev := db.DB
db.DB = mockDB
defer func() { db.DB = prev }()
digest := "sha256:3d6761a97ed07d7d33cfc19a8fbab81175d9d9179618d493dbc00c5f7ef076a3"
mock.ExpectQuery(`SELECT digest FROM runtime_image_pins WHERE template_name = \$1`).
WithArgs("claude-code").
WillReturnRows(sqlmock.NewRows([]string{"digest"}).AddRow(digest))
got := resolveRuntimeImage(context.Background(), "claude-code")
if !strings.HasSuffix(got, "@"+digest) {
t.Errorf("expected suffix @%s, got %q", digest, got)
}
if !strings.HasPrefix(got, "ghcr.io/molecule-ai/workspace-template-claude-code") {
t.Errorf("expected GHCR prefix preserved, got %q", got)
}
if strings.Contains(got, ":latest") {
t.Errorf("expected :latest stripped, got %q", got)
}
}
// TestResolveRuntimeImage_EmptyRuntime short-circuits to "" without DB.
func TestResolveRuntimeImage_EmptyRuntime(t *testing.T) {
got := resolveRuntimeImage(context.Background(), "")
if got != "" {
t.Errorf("expected empty for empty runtime, got %q", got)
}
}
// TestResolveRuntimeImage_UnknownRuntime returns "" without DB lookup.
func TestResolveRuntimeImage_UnknownRuntime(t *testing.T) {
got := resolveRuntimeImage(context.Background(), "no-such-runtime")
if got != "" {
t.Errorf("expected empty for unknown runtime, got %q", got)
}
}
// TestResolveRuntimeImage_LocalOverride: when WORKSPACE_IMAGE_LOCAL_OVERRIDE
// is set, the pin lookup is short-circuited even with a row present —
// devs rebuild images locally and want the floating tag to resolve to
// their fresh build, not a remote-pinned digest.
func TestResolveRuntimeImage_LocalOverride(t *testing.T) {
t.Setenv("WORKSPACE_IMAGE_LOCAL_OVERRIDE", "1")
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatal(err)
}
defer mockDB.Close()
prev := db.DB
db.DB = mockDB
defer func() { db.DB = prev }()
// No expectation set — if resolveRuntimeImage queries the DB despite
// the override, sqlmock fails the test via ExpectationsWereMet.
got := resolveRuntimeImage(context.Background(), "claude-code")
if got != "" {
t.Errorf("expected empty under WORKSPACE_IMAGE_LOCAL_OVERRIDE=1, got %q", got)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("DB queried despite override: %v", err)
}
}

View File

@ -127,7 +127,7 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
workspaceID, filepath.Base(runtimeTemplate))
templatePath = runtimeTemplate
// Rebuild cfg with the recovered template path so Start() sees it.
cfg = h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, prepared.EnvVars, prepared.PluginsPath, prepared.AwarenessNamespace)
cfg = h.buildProvisionerConfig(ctx, workspaceID, templatePath, configFiles, payload, prepared.EnvVars, prepared.PluginsPath, prepared.AwarenessNamespace)
cfg.ResetClaudeSession = resetClaudeSession
recovered = true
break
@ -243,6 +243,7 @@ func (h *WorkspaceHandler) loadAwarenessNamespace(ctx context.Context, workspace
}
func (h *WorkspaceHandler) buildProvisionerConfig(
ctx context.Context,
workspaceID, templatePath string,
configFiles map[string][]byte,
payload models.CreateWorkspacePayload,
@ -291,6 +292,7 @@ func (h *WorkspaceHandler) buildProvisionerConfig(
PlatformURL: h.platformURL,
AwarenessURL: os.Getenv("AWARENESS_URL"),
AwarenessNamespace: awarenessNamespace,
Image: resolveRuntimeImage(ctx, payload.Runtime),
}
}

View File

@ -166,7 +166,7 @@ func (h *WorkspaceHandler) prepareProvisionContext(
}
}
cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
cfg := h.buildProvisionerConfig(ctx, workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
cfg.ResetClaudeSession = resetClaudeSession
return &preparedProvisionContext{

View File

@ -642,6 +642,7 @@ func TestBuildProvisionerConfig_BasicFields(t *testing.T) {
templatePath := filepath.Join(tmpDir, "template")
pluginsPath := t.TempDir()
cfg := handler.buildProvisionerConfig(
context.Background(),
"ws-basic",
templatePath,
map[string][]byte{"config.yaml": []byte("name: test")},
@ -687,6 +688,7 @@ func TestBuildProvisionerConfig_WorkspacePathFromEnv(t *testing.T) {
pluginsPath := t.TempDir()
cfg := handler.buildProvisionerConfig(
context.Background(),
"ws-env",
"",
nil,

View File

@ -90,6 +90,33 @@ type WorkspaceConfig struct {
AwarenessNamespace string
WorkspaceAccess string // #65: "none" (default), "read_only", or "read_write"
ResetClaudeSession bool // #12: if true, discard the claude-sessions volume before start (fresh session dir)
// Image, when non-empty, overrides the runtime→image lookup. The handler
// layer sets this to the digest-pinned form (`<base>@sha256:<digest>`)
// when an operator has promoted a specific runtime build via the
// runtime_image_pins table (#2272 layer 1). Empty = legacy behavior,
// fall back to RuntimeImages[Runtime] which resolves to the moving
// `:latest` tag.
Image string
}
// selectImage resolves the final Docker image ref for a workspace. The handler
// layer is the source of truth — if it set cfg.Image (the digest-pinned form
// from runtime_image_pins, #2272), honor that. Otherwise fall back to the
// runtime→tag lookup in RuntimeImages (legacy `:latest` behavior). When the
// runtime isn't recognized either, fall back to DefaultImage so Start() still
// has something to hand Docker — surfacing a "No such image" later is more
// actionable than a silent "" panic in ContainerCreate.
func selectImage(cfg WorkspaceConfig) string {
if cfg.Image != "" {
return cfg.Image
}
if cfg.Runtime != "" {
if img, ok := RuntimeImages[cfg.Runtime]; ok {
return img
}
}
return DefaultImage
}
// Workspace-access constants for #65. Matches the CHECK constraint on
@ -290,13 +317,7 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
env := buildContainerEnv(cfg)
// Select image based on runtime (each adapter has its own pre-built image)
image := DefaultImage
if cfg.Runtime != "" {
if img, ok := RuntimeImages[cfg.Runtime]; ok {
image = img
}
}
image := selectImage(cfg)
containerCfg := &container.Config{
Image: image,

View File

@ -436,6 +436,62 @@ func TestWorkspaceConfig_ResetClaudeSessionFieldPresent(t *testing.T) {
}
}
// ---------- selectImage (#2272 layer 1) ----------
// TestSelectImage_PrefersExplicitImage: when the handler resolved a digest
// pin via runtime_image_pins, cfg.Image is set. selectImage must honor it
// and ignore the cfg.Runtime → :latest fallback. This is the load-bearing
// invariant for digest pinning — if it ever silently reverts to :latest,
// we lose the "one bad publish doesn't break every workspace" guarantee.
func TestSelectImage_PrefersExplicitImage(t *testing.T) {
pinned := "ghcr.io/molecule-ai/workspace-template-claude-code@sha256:3d6761a97ed07d7d33cfc19a8fbab81175d9d9179618d493dbc00c5f7ef076a3"
got := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: pinned})
if got != pinned {
t.Errorf("selectImage with cfg.Image=pinned: got %q, want %q", got, pinned)
}
}
// TestSelectImage_FallsBackToRuntimeMap: handler returned "" (no pin or
// pin lookup deliberately bypassed via WORKSPACE_IMAGE_LOCAL_OVERRIDE).
// selectImage must use the legacy runtime→:latest map.
func TestSelectImage_FallsBackToRuntimeMap(t *testing.T) {
got := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: ""})
want := RuntimeImages["claude-code"]
if got != want {
t.Errorf("selectImage with empty Image: got %q, want %q", got, want)
}
}
// TestSelectImage_UnknownRuntimeFallsBackToDefault preserves today's
// behavior — an unrecognized runtime resolves to DefaultImage rather than
// "" so ContainerCreate gets a usable arg and surfaces a meaningful
// "No such image" error if the default itself is missing.
func TestSelectImage_UnknownRuntimeFallsBackToDefault(t *testing.T) {
got := selectImage(WorkspaceConfig{Runtime: "no-such-runtime"})
if got != DefaultImage {
t.Errorf("selectImage with unknown runtime: got %q, want DefaultImage %q", got, DefaultImage)
}
}
// TestSelectImage_EmptyRuntimeFallsBackToDefault: same invariant for the
// no-runtime-supplied path (legacy callers / older handler code).
func TestSelectImage_EmptyRuntimeFallsBackToDefault(t *testing.T) {
got := selectImage(WorkspaceConfig{})
if got != DefaultImage {
t.Errorf("selectImage with zero cfg: got %q, want DefaultImage %q", got, DefaultImage)
}
}
// TestWorkspaceConfig_ImageFieldPresent compile-time-pins the Image field
// so the handler→provisioner contract for digest-pinned image refs can't
// be silently removed by a future struct refactor (#2272).
func TestWorkspaceConfig_ImageFieldPresent(t *testing.T) {
cfg := WorkspaceConfig{Image: "ghcr.io/example@sha256:abc"}
if cfg.Image == "" {
t.Fatal("Image should round-trip through struct literal")
}
}
// ---------- buildContainerEnv — #67 MOLECULE_URL injection ----------
func TestBuildContainerEnv_InjectsBothPlatformURLAndMoleculeAIURL(t *testing.T) {

View File

@ -0,0 +1,8 @@
-- Reverse of 044_runtime_image_pins.up.sql.
--
-- Drop the index before the column it references to satisfy Postgres
-- ordering. The pins table can drop independently of the column.
DROP INDEX IF EXISTS idx_workspaces_runtime_image_digest;
ALTER TABLE workspaces DROP COLUMN IF EXISTS runtime_image_digest;
DROP TABLE IF EXISTS runtime_image_pins;

View File

@ -0,0 +1,46 @@
-- Layer 1 of the runtime-rollout plan (issue #2272).
--
-- Decouples publish from promotion: rather than every workspace pulling
-- whatever's on `:latest` at create-time, the provisioner consults a
-- promoted-digest table. Operators (or a cascade-success hook) update
-- the table to roll out a new version; existing workspaces stay on
-- whatever they were created with until they're explicitly recreated.
-- One bad publish no longer breaks every workspace simultaneously.
--
-- Two pieces:
-- 1. runtime_image_pins — the lookup table. One row per template name.
-- No row = fall back to `:latest` (preserves today's behavior so
-- this migration is non-disruptive to existing deployments).
--
-- 2. workspaces.runtime_image_digest — what the workspace actually
-- pulled at create/restart time. Diagnostic + foundation for
-- Layer 2's per-ring routing (canary vs stable). Nullable so
-- pre-migration workspaces show up as NULL until next restart.
CREATE TABLE IF NOT EXISTS runtime_image_pins (
-- Template name (e.g. "claude-code", "hermes"). Matches the keys
-- in workspace-server/internal/provisioner.RuntimeImages.
template_name TEXT PRIMARY KEY,
-- Full GHCR digest including the `sha256:` prefix. Provisioner pulls
-- ghcr.io/molecule-ai/workspace-template-<template>@sha256:<digest>.
digest TEXT NOT NULL CHECK (digest ~ '^sha256:[a-f0-9]{64}$'),
-- Audit fields — populated by the admin promote/rollback endpoint
-- (separate follow-up PR). Operators can grep history by actor
-- when investigating a bad promotion.
updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
updated_by TEXT,
notes TEXT
);
ALTER TABLE workspaces
ADD COLUMN IF NOT EXISTS runtime_image_digest TEXT;
-- Index supports "show me all workspaces still on the old digest" queries
-- after a promotion (used by the canvas admin's stale-workspaces panel
-- + the recreate sweeper that nudges operators when a bad version is
-- still in service).
CREATE INDEX IF NOT EXISTS idx_workspaces_runtime_image_digest
ON workspaces (runtime_image_digest)
WHERE runtime_image_digest IS NOT NULL;