forked from molecule-ai/molecule-core
Merge pull request #2546 from Molecule-AI/fix/provisioner-repull-moving-tags
fix(provisioner): force re-pull of moving image tags on workspace start
This commit is contained in:
commit
c4f64a11a8
@ -388,19 +388,35 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
|
||||
imgPlatform := parseOCIPlatform(imgPlatformStr)
|
||||
|
||||
// Log image resolution for debugging stale-image issues, and pull from
|
||||
// GHCR on miss so tenant hosts don't need a pre-build step anymore.
|
||||
// GHCR so tenant hosts don't need a pre-build step anymore. Two cases
|
||||
// trigger a pull:
|
||||
// 1. Image not present locally — historical behavior (pull-on-miss).
|
||||
// 2. Image present locally AND tag is moving (`:latest`, no tag,
|
||||
// `:staging`, etc.) — without this, a tenant that pulled `:latest`
|
||||
// once is stuck on that snapshot forever even after publish-runtime
|
||||
// pushes a newer image with the same tag. See task #215; sibling
|
||||
// task #232 fixed the same class on the platform-tenant redeploy
|
||||
// path. Pinned tags (semver, sha256) skip the pull because their
|
||||
// contents are by definition immutable.
|
||||
// The pull is best-effort: if it fails (network, auth, rate limit) the
|
||||
// subsequent ContainerCreate still surfaces the actionable error below.
|
||||
imgInspect, _, imgErr := p.cli.ImageInspectWithRaw(ctx, image)
|
||||
if imgErr == nil {
|
||||
log.Printf("Provisioner: creating %s from image %s (ID: %s, created: %s)",
|
||||
name, image, imgInspect.ID[:19], imgInspect.Created[:19])
|
||||
} else {
|
||||
moving := imageTagIsMoving(image)
|
||||
switch {
|
||||
case imgErr != nil:
|
||||
if imgPlatformStr != "" {
|
||||
log.Printf("Provisioner: image %s not present locally (%v) — attempting pull (platform=%s)", image, imgErr, imgPlatformStr)
|
||||
} else {
|
||||
log.Printf("Provisioner: image %s not present locally (%v) — attempting pull", image, imgErr)
|
||||
}
|
||||
case moving:
|
||||
log.Printf("Provisioner: image %s present locally (ID: %s, created: %s) but tag is moving — re-pulling to refresh",
|
||||
image, imgInspect.ID[:19], imgInspect.Created[:19])
|
||||
default:
|
||||
log.Printf("Provisioner: creating %s from image %s (ID: %s, created: %s)",
|
||||
name, image, imgInspect.ID[:19], imgInspect.Created[:19])
|
||||
}
|
||||
if imgErr != nil || moving {
|
||||
if perr := pullImageAndDrain(ctx, p.cli, image, imgPlatformStr); perr != nil {
|
||||
log.Printf("Provisioner: image pull for %s failed: %v (falling through to create)", image, perr)
|
||||
} else {
|
||||
@ -1199,6 +1215,55 @@ func isImageNotFoundErr(err error) bool {
|
||||
strings.Contains(m, "not found") && strings.Contains(m, "image")
|
||||
}
|
||||
|
||||
// imageTagIsMoving reports whether the tag portion of an image reference
|
||||
// is one whose contents change over time at the registry — meaning a
|
||||
// local-cache hit is not safe to trust because the cached snapshot may
|
||||
// be stale relative to what the registry currently serves under the
|
||||
// same tag.
|
||||
//
|
||||
// Returns true for:
|
||||
// - References with no tag at all (Docker defaults the missing tag
|
||||
// to `:latest`, which is the canonical moving tag).
|
||||
// - Explicit `:latest`, `:staging`, `:main`, `:dev`, `:edge`, `:nightly`,
|
||||
// `:rolling` — the conventional set of "moves on every publish"
|
||||
// tags across the org's pipelines.
|
||||
//
|
||||
// Returns false for:
|
||||
// - Digest-pinned references (`@sha256:...`) — by definition immutable.
|
||||
// - Semver / SHA / build-ID tags (`:0.8.2`, `:abc1234`, `:2026-04-30`) —
|
||||
// these are conventionally pinned, and even if a publisher mis-uses
|
||||
// them, the wrong behavior is "stale" not "broken-fleet" because
|
||||
// the tenant who chose a pinned tag is asking for that snapshot.
|
||||
//
|
||||
// The classification is deliberately conservative on the "moving" side
|
||||
// (only the well-known moving tags) because mis-classifying a pinned
|
||||
// tag as moving means we re-pull on every provision — wasted bandwidth,
|
||||
// no correctness loss. Mis-classifying moving as pinned silently bricks
|
||||
// the fleet on stale snapshots — exactly the bug class that motivated
|
||||
// task #215. So the bias is: when in doubt, treat as pinned.
|
||||
//
|
||||
// Sibling task #232 (Platform-tenant :latest re-pull on redeploy)
|
||||
// applied the same principle on the controlplane redeploy path. Keep
|
||||
// the moving-tag list aligned across both implementations if updated.
|
||||
func imageTagIsMoving(image string) bool {
|
||||
// Digest-pinned references are immutable by construction.
|
||||
if strings.Contains(image, "@sha256:") {
|
||||
return false
|
||||
}
|
||||
// Strip everything before the LAST colon to isolate the tag, but
|
||||
// stop at a `/` to avoid mistaking a port number in a registry
|
||||
// hostname (e.g. `localhost:5000/foo`) for a tag.
|
||||
tag := ""
|
||||
if i := strings.LastIndex(image, ":"); i >= 0 && !strings.Contains(image[i+1:], "/") {
|
||||
tag = image[i+1:]
|
||||
}
|
||||
switch tag {
|
||||
case "", "latest", "staging", "main", "dev", "edge", "nightly", "rolling":
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// runtimeTagFromImage extracts the runtime name from a workspace-template
|
||||
// image reference for use in user-facing error hints. Handles both the
|
||||
// legacy local tag (`workspace-template:<runtime>`) and the current GHCR
|
||||
|
||||
@ -732,6 +732,68 @@ func TestRuntimeTagFromImage(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// ---------- imageTagIsMoving (task #215) ----------
|
||||
|
||||
// TestImageTagIsMoving pins the moving-tag classifier. The classifier
|
||||
// gates whether Start() forces a re-pull on a local-cache hit — get
|
||||
// the classification wrong on the "moving" side and we waste bandwidth
|
||||
// on every provision; get it wrong on the "pinned" side and the fleet
|
||||
// silently sticks on a stale `:latest` snapshot (the bug class this
|
||||
// task closes).
|
||||
func TestImageTagIsMoving(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
image string
|
||||
want bool
|
||||
}{
|
||||
// Bare references default to :latest at the registry level.
|
||||
{"bare repo no tag", "ghcr.io/molecule-ai/workspace-template-hermes", true},
|
||||
{"bare local image no tag", "workspace-template", true},
|
||||
|
||||
// Explicit moving tags.
|
||||
{"explicit latest", "ghcr.io/molecule-ai/workspace-template-hermes:latest", true},
|
||||
{"explicit staging", "ghcr.io/molecule-ai/workspace-template-hermes:staging", true},
|
||||
{"explicit main", "ghcr.io/molecule-ai/workspace-template-hermes:main", true},
|
||||
{"explicit dev", "ghcr.io/molecule-ai/workspace-template-hermes:dev", true},
|
||||
{"explicit edge", "ghcr.io/molecule-ai/workspace-template-hermes:edge", true},
|
||||
{"explicit nightly", "ghcr.io/molecule-ai/workspace-template-hermes:nightly", true},
|
||||
{"explicit rolling", "ghcr.io/molecule-ai/workspace-template-hermes:rolling", true},
|
||||
|
||||
// Pinned tags — must NOT be classified as moving.
|
||||
{"semver tag", "ghcr.io/molecule-ai/workspace-template-hermes:0.8.2", false},
|
||||
{"semver with v prefix", "ghcr.io/molecule-ai/workspace-template-hermes:v1.2.3", false},
|
||||
{"sha-prefixed commit tag", "ghcr.io/molecule-ai/workspace-template-langgraph:sha-abc1234", false},
|
||||
{"date-stamped tag", "ghcr.io/molecule-ai/workspace-template-hermes:2026-04-30", false},
|
||||
{"build-id tag", "ghcr.io/molecule-ai/workspace-template-hermes:build-12345", false},
|
||||
|
||||
// Digest pinning — strongest immutability signal, never moving
|
||||
// even if a moving-looking tag is also present.
|
||||
{"digest only", "ghcr.io/molecule-ai/workspace-template-hermes@sha256:abc123def456", false},
|
||||
{"tag plus digest", "ghcr.io/molecule-ai/workspace-template-hermes:latest@sha256:abc123def456", false},
|
||||
|
||||
// Registry hostname with port — the `:` in `:5000` must NOT be
|
||||
// mistaken for a tag separator. Without this guard, a private
|
||||
// registry like `localhost:5000/foo` would always re-pull.
|
||||
{"registry with port no tag", "localhost:5000/workspace-template-hermes", true}, // bare → moving
|
||||
{"registry with port pinned tag", "localhost:5000/workspace-template-hermes:0.8.2", false},
|
||||
{"registry with port latest tag", "localhost:5000/workspace-template-hermes:latest", true},
|
||||
|
||||
// Legacy local-build tags from `docker build -t workspace-template:<runtime>`.
|
||||
// These are arbitrary strings, treated as pinned (they don't
|
||||
// move from the registry's perspective — there is no registry).
|
||||
{"legacy local hermes tag", "workspace-template:hermes", false},
|
||||
{"legacy local claude-code tag", "workspace-template:claude-code", false},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := imageTagIsMoving(tc.image)
|
||||
if got != tc.want {
|
||||
t.Errorf("imageTagIsMoving(%q) = %v, want %v", tc.image, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ---------- End-to-end error-message shape ----------
|
||||
//
|
||||
// Verifies the wrapped error that Start() surfaces when ContainerCreate
|
||||
|
||||
Loading…
Reference in New Issue
Block a user