Merge pull request #2276 from Molecule-AI/feat/layer1-runtime-digest-pinning
feat(provisioner): digest-pin runtime images via runtime_image_pins table (Layer 1 of #2272)
This commit is contained in:
commit
15124527da
@ -2,6 +2,7 @@ package handlers
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
@ -363,6 +364,7 @@ func TestBuildProvisionerConfig_IncludesAwarenessSettings(t *testing.T) {
|
||||
t.Setenv("WORKSPACE_DIR", "/tmp/workspace")
|
||||
|
||||
cfg := handler.buildProvisionerConfig(
|
||||
context.Background(),
|
||||
"ws-123",
|
||||
"/tmp/configs/template",
|
||||
map[string][]byte{"config.yaml": []byte("name: test")},
|
||||
|
||||
67
workspace-server/internal/handlers/runtime_image_pin.go
Normal file
67
workspace-server/internal/handlers/runtime_image_pin.go
Normal file
@ -0,0 +1,67 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
)
|
||||
|
||||
// resolveRuntimeImage returns the digest-pinned image ref for a runtime when
|
||||
// an operator has promoted one via the runtime_image_pins table (#2272 layer 1),
|
||||
// otherwise "" so the caller falls back to the legacy `:latest` lookup in
|
||||
// provisioner.RuntimeImages.
|
||||
//
|
||||
// Policy: availability over pinning. Any DB hiccup (sql.ErrNoRows is the
|
||||
// steady-state when nothing is pinned, but transient network blips, table
|
||||
// missing post-rollback, etc.) returns "" and the provision continues on
|
||||
// the moving tag — better one workspace on a slightly-newer image than a
|
||||
// provision-blocked tenant.
|
||||
//
|
||||
// WORKSPACE_IMAGE_LOCAL_OVERRIDE=1 short-circuits the lookup entirely so a
|
||||
// developer rebuilding template images locally gets their fresh build via
|
||||
// `:latest` even when a remote digest is pinned for the same runtime.
|
||||
func resolveRuntimeImage(ctx context.Context, runtime string) string {
|
||||
if runtime == "" {
|
||||
return ""
|
||||
}
|
||||
base, ok := provisioner.RuntimeImages[runtime]
|
||||
if !ok {
|
||||
// Unknown runtime — let provisioner.Start fall through to its own
|
||||
// DefaultImage. Querying the pin table for a runtime that doesn't
|
||||
// exist would only produce noise and a guaranteed ErrNoRows.
|
||||
return ""
|
||||
}
|
||||
if os.Getenv("WORKSPACE_IMAGE_LOCAL_OVERRIDE") != "" {
|
||||
return ""
|
||||
}
|
||||
if db.DB == nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
var digest string
|
||||
err := db.DB.QueryRowContext(ctx,
|
||||
`SELECT digest FROM runtime_image_pins WHERE template_name = $1`, runtime,
|
||||
).Scan(&digest)
|
||||
if err != nil {
|
||||
if !errors.Is(err, sql.ErrNoRows) {
|
||||
log.Printf("resolveRuntimeImage: pin lookup for %q failed (%v) — falling back to :latest", runtime, err)
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// Strip the moving tag suffix (`:latest`, `:staging`) before appending
|
||||
// the immutable digest. Docker treats `name:tag@sha256:...` as valid
|
||||
// but the tag is ignored; dropping it keeps logs and admin diffs honest
|
||||
// about what's actually being pulled.
|
||||
pinned := base
|
||||
if idx := strings.LastIndex(pinned, ":"); idx > strings.LastIndex(pinned, "/") {
|
||||
pinned = pinned[:idx]
|
||||
}
|
||||
return pinned + "@" + digest
|
||||
}
|
||||
138
workspace-server/internal/handlers/runtime_image_pin_test.go
Normal file
138
workspace-server/internal/handlers/runtime_image_pin_test.go
Normal file
@ -0,0 +1,138 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
sqlmock "github.com/DATA-DOG/go-sqlmock"
|
||||
)
|
||||
|
||||
// TestResolveRuntimeImage_NoPin: lookup returns sql.ErrNoRows (the steady-
|
||||
// state when an operator hasn't pinned this runtime). resolveRuntimeImage
|
||||
// returns "" so the caller falls back to RuntimeImages[runtime] (legacy
|
||||
// :latest behavior). This is the expected hot path until digest pinning
|
||||
// is opted into per runtime.
|
||||
func TestResolveRuntimeImage_NoPin(t *testing.T) {
|
||||
mockDB, mock, err := sqlmock.New()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer mockDB.Close()
|
||||
prev := db.DB
|
||||
db.DB = mockDB
|
||||
defer func() { db.DB = prev }()
|
||||
|
||||
mock.ExpectQuery(`SELECT digest FROM runtime_image_pins WHERE template_name = \$1`).
|
||||
WithArgs("claude-code").
|
||||
WillReturnError(sql.ErrNoRows)
|
||||
|
||||
got := resolveRuntimeImage(context.Background(), "claude-code")
|
||||
if got != "" {
|
||||
t.Errorf("expected empty (no pin = fallback), got %q", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveRuntimeImage_DBError: an unexpected DB failure (transient
|
||||
// network blip, table missing post-rollback, etc.) must NOT block the
|
||||
// provision — log + fall through to the legacy :latest path. This is
|
||||
// the availability-over-pinning policy spelled out in resolveRuntimeImage's
|
||||
// doc comment.
|
||||
func TestResolveRuntimeImage_DBError(t *testing.T) {
|
||||
mockDB, mock, err := sqlmock.New()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer mockDB.Close()
|
||||
prev := db.DB
|
||||
db.DB = mockDB
|
||||
defer func() { db.DB = prev }()
|
||||
|
||||
mock.ExpectQuery(`SELECT digest FROM runtime_image_pins WHERE template_name = \$1`).
|
||||
WithArgs("claude-code").
|
||||
WillReturnError(sqlmock.ErrCancelled)
|
||||
|
||||
got := resolveRuntimeImage(context.Background(), "claude-code")
|
||||
if got != "" {
|
||||
t.Errorf("expected empty on DB error (fallback), got %q", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveRuntimeImage_WithPin returns image@sha256:<digest> when row exists.
|
||||
func TestResolveRuntimeImage_WithPin(t *testing.T) {
|
||||
mockDB, mock, err := sqlmock.New()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer mockDB.Close()
|
||||
prev := db.DB
|
||||
db.DB = mockDB
|
||||
defer func() { db.DB = prev }()
|
||||
|
||||
digest := "sha256:3d6761a97ed07d7d33cfc19a8fbab81175d9d9179618d493dbc00c5f7ef076a3"
|
||||
mock.ExpectQuery(`SELECT digest FROM runtime_image_pins WHERE template_name = \$1`).
|
||||
WithArgs("claude-code").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"digest"}).AddRow(digest))
|
||||
|
||||
got := resolveRuntimeImage(context.Background(), "claude-code")
|
||||
if !strings.HasSuffix(got, "@"+digest) {
|
||||
t.Errorf("expected suffix @%s, got %q", digest, got)
|
||||
}
|
||||
if !strings.HasPrefix(got, "ghcr.io/molecule-ai/workspace-template-claude-code") {
|
||||
t.Errorf("expected GHCR prefix preserved, got %q", got)
|
||||
}
|
||||
if strings.Contains(got, ":latest") {
|
||||
t.Errorf("expected :latest stripped, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveRuntimeImage_EmptyRuntime short-circuits to "" without DB.
|
||||
func TestResolveRuntimeImage_EmptyRuntime(t *testing.T) {
|
||||
got := resolveRuntimeImage(context.Background(), "")
|
||||
if got != "" {
|
||||
t.Errorf("expected empty for empty runtime, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveRuntimeImage_UnknownRuntime returns "" without DB lookup.
|
||||
func TestResolveRuntimeImage_UnknownRuntime(t *testing.T) {
|
||||
got := resolveRuntimeImage(context.Background(), "no-such-runtime")
|
||||
if got != "" {
|
||||
t.Errorf("expected empty for unknown runtime, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveRuntimeImage_LocalOverride: when WORKSPACE_IMAGE_LOCAL_OVERRIDE
|
||||
// is set, the pin lookup is short-circuited even with a row present —
|
||||
// devs rebuild images locally and want the floating tag to resolve to
|
||||
// their fresh build, not a remote-pinned digest.
|
||||
func TestResolveRuntimeImage_LocalOverride(t *testing.T) {
|
||||
t.Setenv("WORKSPACE_IMAGE_LOCAL_OVERRIDE", "1")
|
||||
|
||||
mockDB, mock, err := sqlmock.New()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer mockDB.Close()
|
||||
prev := db.DB
|
||||
db.DB = mockDB
|
||||
defer func() { db.DB = prev }()
|
||||
|
||||
// No expectation set — if resolveRuntimeImage queries the DB despite
|
||||
// the override, sqlmock fails the test via ExpectationsWereMet.
|
||||
got := resolveRuntimeImage(context.Background(), "claude-code")
|
||||
if got != "" {
|
||||
t.Errorf("expected empty under WORKSPACE_IMAGE_LOCAL_OVERRIDE=1, got %q", got)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("DB queried despite override: %v", err)
|
||||
}
|
||||
}
|
||||
@ -127,7 +127,7 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
|
||||
workspaceID, filepath.Base(runtimeTemplate))
|
||||
templatePath = runtimeTemplate
|
||||
// Rebuild cfg with the recovered template path so Start() sees it.
|
||||
cfg = h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, prepared.EnvVars, prepared.PluginsPath, prepared.AwarenessNamespace)
|
||||
cfg = h.buildProvisionerConfig(ctx, workspaceID, templatePath, configFiles, payload, prepared.EnvVars, prepared.PluginsPath, prepared.AwarenessNamespace)
|
||||
cfg.ResetClaudeSession = resetClaudeSession
|
||||
recovered = true
|
||||
break
|
||||
@ -243,6 +243,7 @@ func (h *WorkspaceHandler) loadAwarenessNamespace(ctx context.Context, workspace
|
||||
}
|
||||
|
||||
func (h *WorkspaceHandler) buildProvisionerConfig(
|
||||
ctx context.Context,
|
||||
workspaceID, templatePath string,
|
||||
configFiles map[string][]byte,
|
||||
payload models.CreateWorkspacePayload,
|
||||
@ -291,6 +292,7 @@ func (h *WorkspaceHandler) buildProvisionerConfig(
|
||||
PlatformURL: h.platformURL,
|
||||
AwarenessURL: os.Getenv("AWARENESS_URL"),
|
||||
AwarenessNamespace: awarenessNamespace,
|
||||
Image: resolveRuntimeImage(ctx, payload.Runtime),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -166,7 +166,7 @@ func (h *WorkspaceHandler) prepareProvisionContext(
|
||||
}
|
||||
}
|
||||
|
||||
cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
|
||||
cfg := h.buildProvisionerConfig(ctx, workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
|
||||
cfg.ResetClaudeSession = resetClaudeSession
|
||||
|
||||
return &preparedProvisionContext{
|
||||
|
||||
@ -642,6 +642,7 @@ func TestBuildProvisionerConfig_BasicFields(t *testing.T) {
|
||||
templatePath := filepath.Join(tmpDir, "template")
|
||||
pluginsPath := t.TempDir()
|
||||
cfg := handler.buildProvisionerConfig(
|
||||
context.Background(),
|
||||
"ws-basic",
|
||||
templatePath,
|
||||
map[string][]byte{"config.yaml": []byte("name: test")},
|
||||
@ -687,6 +688,7 @@ func TestBuildProvisionerConfig_WorkspacePathFromEnv(t *testing.T) {
|
||||
|
||||
pluginsPath := t.TempDir()
|
||||
cfg := handler.buildProvisionerConfig(
|
||||
context.Background(),
|
||||
"ws-env",
|
||||
"",
|
||||
nil,
|
||||
|
||||
@ -90,6 +90,33 @@ type WorkspaceConfig struct {
|
||||
AwarenessNamespace string
|
||||
WorkspaceAccess string // #65: "none" (default), "read_only", or "read_write"
|
||||
ResetClaudeSession bool // #12: if true, discard the claude-sessions volume before start (fresh session dir)
|
||||
|
||||
// Image, when non-empty, overrides the runtime→image lookup. The handler
|
||||
// layer sets this to the digest-pinned form (`<base>@sha256:<digest>`)
|
||||
// when an operator has promoted a specific runtime build via the
|
||||
// runtime_image_pins table (#2272 layer 1). Empty = legacy behavior,
|
||||
// fall back to RuntimeImages[Runtime] which resolves to the moving
|
||||
// `:latest` tag.
|
||||
Image string
|
||||
}
|
||||
|
||||
// selectImage resolves the final Docker image ref for a workspace. The handler
|
||||
// layer is the source of truth — if it set cfg.Image (the digest-pinned form
|
||||
// from runtime_image_pins, #2272), honor that. Otherwise fall back to the
|
||||
// runtime→tag lookup in RuntimeImages (legacy `:latest` behavior). When the
|
||||
// runtime isn't recognized either, fall back to DefaultImage so Start() still
|
||||
// has something to hand Docker — surfacing a "No such image" later is more
|
||||
// actionable than a silent "" panic in ContainerCreate.
|
||||
func selectImage(cfg WorkspaceConfig) string {
|
||||
if cfg.Image != "" {
|
||||
return cfg.Image
|
||||
}
|
||||
if cfg.Runtime != "" {
|
||||
if img, ok := RuntimeImages[cfg.Runtime]; ok {
|
||||
return img
|
||||
}
|
||||
}
|
||||
return DefaultImage
|
||||
}
|
||||
|
||||
// Workspace-access constants for #65. Matches the CHECK constraint on
|
||||
@ -290,13 +317,7 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
|
||||
|
||||
env := buildContainerEnv(cfg)
|
||||
|
||||
// Select image based on runtime (each adapter has its own pre-built image)
|
||||
image := DefaultImage
|
||||
if cfg.Runtime != "" {
|
||||
if img, ok := RuntimeImages[cfg.Runtime]; ok {
|
||||
image = img
|
||||
}
|
||||
}
|
||||
image := selectImage(cfg)
|
||||
|
||||
containerCfg := &container.Config{
|
||||
Image: image,
|
||||
|
||||
@ -436,6 +436,62 @@ func TestWorkspaceConfig_ResetClaudeSessionFieldPresent(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// ---------- selectImage (#2272 layer 1) ----------
|
||||
|
||||
// TestSelectImage_PrefersExplicitImage: when the handler resolved a digest
|
||||
// pin via runtime_image_pins, cfg.Image is set. selectImage must honor it
|
||||
// and ignore the cfg.Runtime → :latest fallback. This is the load-bearing
|
||||
// invariant for digest pinning — if it ever silently reverts to :latest,
|
||||
// we lose the "one bad publish doesn't break every workspace" guarantee.
|
||||
func TestSelectImage_PrefersExplicitImage(t *testing.T) {
|
||||
pinned := "ghcr.io/molecule-ai/workspace-template-claude-code@sha256:3d6761a97ed07d7d33cfc19a8fbab81175d9d9179618d493dbc00c5f7ef076a3"
|
||||
got := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: pinned})
|
||||
if got != pinned {
|
||||
t.Errorf("selectImage with cfg.Image=pinned: got %q, want %q", got, pinned)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSelectImage_FallsBackToRuntimeMap: handler returned "" (no pin or
|
||||
// pin lookup deliberately bypassed via WORKSPACE_IMAGE_LOCAL_OVERRIDE).
|
||||
// selectImage must use the legacy runtime→:latest map.
|
||||
func TestSelectImage_FallsBackToRuntimeMap(t *testing.T) {
|
||||
got := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: ""})
|
||||
want := RuntimeImages["claude-code"]
|
||||
if got != want {
|
||||
t.Errorf("selectImage with empty Image: got %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSelectImage_UnknownRuntimeFallsBackToDefault preserves today's
|
||||
// behavior — an unrecognized runtime resolves to DefaultImage rather than
|
||||
// "" so ContainerCreate gets a usable arg and surfaces a meaningful
|
||||
// "No such image" error if the default itself is missing.
|
||||
func TestSelectImage_UnknownRuntimeFallsBackToDefault(t *testing.T) {
|
||||
got := selectImage(WorkspaceConfig{Runtime: "no-such-runtime"})
|
||||
if got != DefaultImage {
|
||||
t.Errorf("selectImage with unknown runtime: got %q, want DefaultImage %q", got, DefaultImage)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSelectImage_EmptyRuntimeFallsBackToDefault: same invariant for the
|
||||
// no-runtime-supplied path (legacy callers / older handler code).
|
||||
func TestSelectImage_EmptyRuntimeFallsBackToDefault(t *testing.T) {
|
||||
got := selectImage(WorkspaceConfig{})
|
||||
if got != DefaultImage {
|
||||
t.Errorf("selectImage with zero cfg: got %q, want DefaultImage %q", got, DefaultImage)
|
||||
}
|
||||
}
|
||||
|
||||
// TestWorkspaceConfig_ImageFieldPresent compile-time-pins the Image field
|
||||
// so the handler→provisioner contract for digest-pinned image refs can't
|
||||
// be silently removed by a future struct refactor (#2272).
|
||||
func TestWorkspaceConfig_ImageFieldPresent(t *testing.T) {
|
||||
cfg := WorkspaceConfig{Image: "ghcr.io/example@sha256:abc"}
|
||||
if cfg.Image == "" {
|
||||
t.Fatal("Image should round-trip through struct literal")
|
||||
}
|
||||
}
|
||||
|
||||
// ---------- buildContainerEnv — #67 MOLECULE_URL injection ----------
|
||||
|
||||
func TestBuildContainerEnv_InjectsBothPlatformURLAndMoleculeAIURL(t *testing.T) {
|
||||
|
||||
@ -0,0 +1,8 @@
|
||||
-- Reverse of 044_runtime_image_pins.up.sql.
|
||||
--
|
||||
-- Drop the index before the column it references to satisfy Postgres
|
||||
-- ordering. The pins table can drop independently of the column.
|
||||
|
||||
DROP INDEX IF EXISTS idx_workspaces_runtime_image_digest;
|
||||
ALTER TABLE workspaces DROP COLUMN IF EXISTS runtime_image_digest;
|
||||
DROP TABLE IF EXISTS runtime_image_pins;
|
||||
46
workspace-server/migrations/047_runtime_image_pins.up.sql
Normal file
46
workspace-server/migrations/047_runtime_image_pins.up.sql
Normal file
@ -0,0 +1,46 @@
|
||||
-- Layer 1 of the runtime-rollout plan (issue #2272).
|
||||
--
|
||||
-- Decouples publish from promotion: rather than every workspace pulling
|
||||
-- whatever's on `:latest` at create-time, the provisioner consults a
|
||||
-- promoted-digest table. Operators (or a cascade-success hook) update
|
||||
-- the table to roll out a new version; existing workspaces stay on
|
||||
-- whatever they were created with until they're explicitly recreated.
|
||||
-- One bad publish no longer breaks every workspace simultaneously.
|
||||
--
|
||||
-- Two pieces:
|
||||
-- 1. runtime_image_pins — the lookup table. One row per template name.
|
||||
-- No row = fall back to `:latest` (preserves today's behavior so
|
||||
-- this migration is non-disruptive to existing deployments).
|
||||
--
|
||||
-- 2. workspaces.runtime_image_digest — what the workspace actually
|
||||
-- pulled at create/restart time. Diagnostic + foundation for
|
||||
-- Layer 2's per-ring routing (canary vs stable). Nullable so
|
||||
-- pre-migration workspaces show up as NULL until next restart.
|
||||
|
||||
CREATE TABLE IF NOT EXISTS runtime_image_pins (
|
||||
-- Template name (e.g. "claude-code", "hermes"). Matches the keys
|
||||
-- in workspace-server/internal/provisioner.RuntimeImages.
|
||||
template_name TEXT PRIMARY KEY,
|
||||
|
||||
-- Full GHCR digest including the `sha256:` prefix. Provisioner pulls
|
||||
-- ghcr.io/molecule-ai/workspace-template-<template>@sha256:<digest>.
|
||||
digest TEXT NOT NULL CHECK (digest ~ '^sha256:[a-f0-9]{64}$'),
|
||||
|
||||
-- Audit fields — populated by the admin promote/rollback endpoint
|
||||
-- (separate follow-up PR). Operators can grep history by actor
|
||||
-- when investigating a bad promotion.
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
updated_by TEXT,
|
||||
notes TEXT
|
||||
);
|
||||
|
||||
ALTER TABLE workspaces
|
||||
ADD COLUMN IF NOT EXISTS runtime_image_digest TEXT;
|
||||
|
||||
-- Index supports "show me all workspaces still on the old digest" queries
|
||||
-- after a promotion (used by the canvas admin's stale-workspaces panel
|
||||
-- + the recreate sweeper that nudges operators when a bad version is
|
||||
-- still in service).
|
||||
CREATE INDEX IF NOT EXISTS idx_workspaces_runtime_image_digest
|
||||
ON workspaces (runtime_image_digest)
|
||||
WHERE runtime_image_digest IS NOT NULL;
|
||||
Loading…
Reference in New Issue
Block a user