diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index 1de342b1..f414aa9c 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -388,19 +388,35 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e imgPlatform := parseOCIPlatform(imgPlatformStr) // Log image resolution for debugging stale-image issues, and pull from - // GHCR on miss so tenant hosts don't need a pre-build step anymore. + // GHCR so tenant hosts don't need a pre-build step anymore. Two cases + // trigger a pull: + // 1. Image not present locally — historical behavior (pull-on-miss). + // 2. Image present locally AND tag is moving (`:latest`, no tag, + // `:staging`, etc.) — without this, a tenant that pulled `:latest` + // once is stuck on that snapshot forever even after publish-runtime + // pushes a newer image with the same tag. See task #215; sibling + // task #232 fixed the same class on the platform-tenant redeploy + // path. Pinned tags (semver, sha256) skip the pull because their + // contents are by definition immutable. // The pull is best-effort: if it fails (network, auth, rate limit) the // subsequent ContainerCreate still surfaces the actionable error below. imgInspect, _, imgErr := p.cli.ImageInspectWithRaw(ctx, image) - if imgErr == nil { - log.Printf("Provisioner: creating %s from image %s (ID: %s, created: %s)", - name, image, imgInspect.ID[:19], imgInspect.Created[:19]) - } else { + moving := imageTagIsMoving(image) + switch { + case imgErr != nil: if imgPlatformStr != "" { log.Printf("Provisioner: image %s not present locally (%v) — attempting pull (platform=%s)", image, imgErr, imgPlatformStr) } else { log.Printf("Provisioner: image %s not present locally (%v) — attempting pull", image, imgErr) } + case moving: + log.Printf("Provisioner: image %s present locally (ID: %s, created: %s) but tag is moving — re-pulling to refresh", + image, imgInspect.ID[:19], imgInspect.Created[:19]) + default: + log.Printf("Provisioner: creating %s from image %s (ID: %s, created: %s)", + name, image, imgInspect.ID[:19], imgInspect.Created[:19]) + } + if imgErr != nil || moving { if perr := pullImageAndDrain(ctx, p.cli, image, imgPlatformStr); perr != nil { log.Printf("Provisioner: image pull for %s failed: %v (falling through to create)", image, perr) } else { @@ -1199,6 +1215,55 @@ func isImageNotFoundErr(err error) bool { strings.Contains(m, "not found") && strings.Contains(m, "image") } +// imageTagIsMoving reports whether the tag portion of an image reference +// is one whose contents change over time at the registry — meaning a +// local-cache hit is not safe to trust because the cached snapshot may +// be stale relative to what the registry currently serves under the +// same tag. +// +// Returns true for: +// - References with no tag at all (Docker defaults the missing tag +// to `:latest`, which is the canonical moving tag). +// - Explicit `:latest`, `:staging`, `:main`, `:dev`, `:edge`, `:nightly`, +// `:rolling` — the conventional set of "moves on every publish" +// tags across the org's pipelines. +// +// Returns false for: +// - Digest-pinned references (`@sha256:...`) — by definition immutable. +// - Semver / SHA / build-ID tags (`:0.8.2`, `:abc1234`, `:2026-04-30`) — +// these are conventionally pinned, and even if a publisher mis-uses +// them, the wrong behavior is "stale" not "broken-fleet" because +// the tenant who chose a pinned tag is asking for that snapshot. +// +// The classification is deliberately conservative on the "moving" side +// (only the well-known moving tags) because mis-classifying a pinned +// tag as moving means we re-pull on every provision — wasted bandwidth, +// no correctness loss. Mis-classifying moving as pinned silently bricks +// the fleet on stale snapshots — exactly the bug class that motivated +// task #215. So the bias is: when in doubt, treat as pinned. +// +// Sibling task #232 (Platform-tenant :latest re-pull on redeploy) +// applied the same principle on the controlplane redeploy path. Keep +// the moving-tag list aligned across both implementations if updated. +func imageTagIsMoving(image string) bool { + // Digest-pinned references are immutable by construction. + if strings.Contains(image, "@sha256:") { + return false + } + // Strip everything before the LAST colon to isolate the tag, but + // stop at a `/` to avoid mistaking a port number in a registry + // hostname (e.g. `localhost:5000/foo`) for a tag. + tag := "" + if i := strings.LastIndex(image, ":"); i >= 0 && !strings.Contains(image[i+1:], "/") { + tag = image[i+1:] + } + switch tag { + case "", "latest", "staging", "main", "dev", "edge", "nightly", "rolling": + return true + } + return false +} + // runtimeTagFromImage extracts the runtime name from a workspace-template // image reference for use in user-facing error hints. Handles both the // legacy local tag (`workspace-template:`) and the current GHCR diff --git a/workspace-server/internal/provisioner/provisioner_test.go b/workspace-server/internal/provisioner/provisioner_test.go index f36b77ef..295475cc 100644 --- a/workspace-server/internal/provisioner/provisioner_test.go +++ b/workspace-server/internal/provisioner/provisioner_test.go @@ -732,6 +732,68 @@ func TestRuntimeTagFromImage(t *testing.T) { } } +// ---------- imageTagIsMoving (task #215) ---------- + +// TestImageTagIsMoving pins the moving-tag classifier. The classifier +// gates whether Start() forces a re-pull on a local-cache hit — get +// the classification wrong on the "moving" side and we waste bandwidth +// on every provision; get it wrong on the "pinned" side and the fleet +// silently sticks on a stale `:latest` snapshot (the bug class this +// task closes). +func TestImageTagIsMoving(t *testing.T) { + cases := []struct { + name string + image string + want bool + }{ + // Bare references default to :latest at the registry level. + {"bare repo no tag", "ghcr.io/molecule-ai/workspace-template-hermes", true}, + {"bare local image no tag", "workspace-template", true}, + + // Explicit moving tags. + {"explicit latest", "ghcr.io/molecule-ai/workspace-template-hermes:latest", true}, + {"explicit staging", "ghcr.io/molecule-ai/workspace-template-hermes:staging", true}, + {"explicit main", "ghcr.io/molecule-ai/workspace-template-hermes:main", true}, + {"explicit dev", "ghcr.io/molecule-ai/workspace-template-hermes:dev", true}, + {"explicit edge", "ghcr.io/molecule-ai/workspace-template-hermes:edge", true}, + {"explicit nightly", "ghcr.io/molecule-ai/workspace-template-hermes:nightly", true}, + {"explicit rolling", "ghcr.io/molecule-ai/workspace-template-hermes:rolling", true}, + + // Pinned tags — must NOT be classified as moving. + {"semver tag", "ghcr.io/molecule-ai/workspace-template-hermes:0.8.2", false}, + {"semver with v prefix", "ghcr.io/molecule-ai/workspace-template-hermes:v1.2.3", false}, + {"sha-prefixed commit tag", "ghcr.io/molecule-ai/workspace-template-langgraph:sha-abc1234", false}, + {"date-stamped tag", "ghcr.io/molecule-ai/workspace-template-hermes:2026-04-30", false}, + {"build-id tag", "ghcr.io/molecule-ai/workspace-template-hermes:build-12345", false}, + + // Digest pinning — strongest immutability signal, never moving + // even if a moving-looking tag is also present. + {"digest only", "ghcr.io/molecule-ai/workspace-template-hermes@sha256:abc123def456", false}, + {"tag plus digest", "ghcr.io/molecule-ai/workspace-template-hermes:latest@sha256:abc123def456", false}, + + // Registry hostname with port — the `:` in `:5000` must NOT be + // mistaken for a tag separator. Without this guard, a private + // registry like `localhost:5000/foo` would always re-pull. + {"registry with port no tag", "localhost:5000/workspace-template-hermes", true}, // bare → moving + {"registry with port pinned tag", "localhost:5000/workspace-template-hermes:0.8.2", false}, + {"registry with port latest tag", "localhost:5000/workspace-template-hermes:latest", true}, + + // Legacy local-build tags from `docker build -t workspace-template:`. + // These are arbitrary strings, treated as pinned (they don't + // move from the registry's perspective — there is no registry). + {"legacy local hermes tag", "workspace-template:hermes", false}, + {"legacy local claude-code tag", "workspace-template:claude-code", false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := imageTagIsMoving(tc.image) + if got != tc.want { + t.Errorf("imageTagIsMoving(%q) = %v, want %v", tc.image, got, tc.want) + } + }) + } +} + // ---------- End-to-end error-message shape ---------- // // Verifies the wrapped error that Start() surfaces when ContainerCreate