Merge branch 'staging' into perf/cache-platform-inbound-secret

This commit is contained in:
Hongming Wang 2026-05-03 00:08:43 -07:00 committed by GitHub
commit 384edb4af0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 132 additions and 5 deletions

View File

@ -388,19 +388,35 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
imgPlatform := parseOCIPlatform(imgPlatformStr)
// Log image resolution for debugging stale-image issues, and pull from
// GHCR on miss so tenant hosts don't need a pre-build step anymore.
// GHCR so tenant hosts don't need a pre-build step anymore. Two cases
// trigger a pull:
// 1. Image not present locally — historical behavior (pull-on-miss).
// 2. Image present locally AND tag is moving (`:latest`, no tag,
// `:staging`, etc.) — without this, a tenant that pulled `:latest`
// once is stuck on that snapshot forever even after publish-runtime
// pushes a newer image with the same tag. See task #215; sibling
// task #232 fixed the same class on the platform-tenant redeploy
// path. Pinned tags (semver, sha256) skip the pull because their
// contents are by definition immutable.
// The pull is best-effort: if it fails (network, auth, rate limit) the
// subsequent ContainerCreate still surfaces the actionable error below.
imgInspect, _, imgErr := p.cli.ImageInspectWithRaw(ctx, image)
if imgErr == nil {
log.Printf("Provisioner: creating %s from image %s (ID: %s, created: %s)",
name, image, imgInspect.ID[:19], imgInspect.Created[:19])
} else {
moving := imageTagIsMoving(image)
switch {
case imgErr != nil:
if imgPlatformStr != "" {
log.Printf("Provisioner: image %s not present locally (%v) — attempting pull (platform=%s)", image, imgErr, imgPlatformStr)
} else {
log.Printf("Provisioner: image %s not present locally (%v) — attempting pull", image, imgErr)
}
case moving:
log.Printf("Provisioner: image %s present locally (ID: %s, created: %s) but tag is moving — re-pulling to refresh",
image, imgInspect.ID[:19], imgInspect.Created[:19])
default:
log.Printf("Provisioner: creating %s from image %s (ID: %s, created: %s)",
name, image, imgInspect.ID[:19], imgInspect.Created[:19])
}
if imgErr != nil || moving {
if perr := pullImageAndDrain(ctx, p.cli, image, imgPlatformStr); perr != nil {
log.Printf("Provisioner: image pull for %s failed: %v (falling through to create)", image, perr)
} else {
@ -1199,6 +1215,55 @@ func isImageNotFoundErr(err error) bool {
strings.Contains(m, "not found") && strings.Contains(m, "image")
}
// imageTagIsMoving reports whether the tag portion of an image reference
// is one whose contents change over time at the registry — meaning a
// local-cache hit is not safe to trust because the cached snapshot may
// be stale relative to what the registry currently serves under the
// same tag.
//
// Returns true for:
// - References with no tag at all (Docker defaults the missing tag
// to `:latest`, which is the canonical moving tag).
// - Explicit `:latest`, `:staging`, `:main`, `:dev`, `:edge`, `:nightly`,
// `:rolling` — the conventional set of "moves on every publish"
// tags across the org's pipelines.
//
// Returns false for:
// - Digest-pinned references (`@sha256:...`) — by definition immutable.
// - Semver / SHA / build-ID tags (`:0.8.2`, `:abc1234`, `:2026-04-30`) —
// these are conventionally pinned, and even if a publisher mis-uses
// them, the wrong behavior is "stale" not "broken-fleet" because
// the tenant who chose a pinned tag is asking for that snapshot.
//
// The classification is deliberately conservative on the "moving" side
// (only the well-known moving tags) because mis-classifying a pinned
// tag as moving means we re-pull on every provision — wasted bandwidth,
// no correctness loss. Mis-classifying moving as pinned silently bricks
// the fleet on stale snapshots — exactly the bug class that motivated
// task #215. So the bias is: when in doubt, treat as pinned.
//
// Sibling task #232 (Platform-tenant :latest re-pull on redeploy)
// applied the same principle on the controlplane redeploy path. Keep
// the moving-tag list aligned across both implementations if updated.
func imageTagIsMoving(image string) bool {
// Digest-pinned references are immutable by construction.
if strings.Contains(image, "@sha256:") {
return false
}
// Strip everything before the LAST colon to isolate the tag, but
// stop at a `/` to avoid mistaking a port number in a registry
// hostname (e.g. `localhost:5000/foo`) for a tag.
tag := ""
if i := strings.LastIndex(image, ":"); i >= 0 && !strings.Contains(image[i+1:], "/") {
tag = image[i+1:]
}
switch tag {
case "", "latest", "staging", "main", "dev", "edge", "nightly", "rolling":
return true
}
return false
}
// runtimeTagFromImage extracts the runtime name from a workspace-template
// image reference for use in user-facing error hints. Handles both the
// legacy local tag (`workspace-template:<runtime>`) and the current GHCR

View File

@ -732,6 +732,68 @@ func TestRuntimeTagFromImage(t *testing.T) {
}
}
// ---------- imageTagIsMoving (task #215) ----------
// TestImageTagIsMoving pins the moving-tag classifier. The classifier
// gates whether Start() forces a re-pull on a local-cache hit — get
// the classification wrong on the "moving" side and we waste bandwidth
// on every provision; get it wrong on the "pinned" side and the fleet
// silently sticks on a stale `:latest` snapshot (the bug class this
// task closes).
func TestImageTagIsMoving(t *testing.T) {
cases := []struct {
name string
image string
want bool
}{
// Bare references default to :latest at the registry level.
{"bare repo no tag", "ghcr.io/molecule-ai/workspace-template-hermes", true},
{"bare local image no tag", "workspace-template", true},
// Explicit moving tags.
{"explicit latest", "ghcr.io/molecule-ai/workspace-template-hermes:latest", true},
{"explicit staging", "ghcr.io/molecule-ai/workspace-template-hermes:staging", true},
{"explicit main", "ghcr.io/molecule-ai/workspace-template-hermes:main", true},
{"explicit dev", "ghcr.io/molecule-ai/workspace-template-hermes:dev", true},
{"explicit edge", "ghcr.io/molecule-ai/workspace-template-hermes:edge", true},
{"explicit nightly", "ghcr.io/molecule-ai/workspace-template-hermes:nightly", true},
{"explicit rolling", "ghcr.io/molecule-ai/workspace-template-hermes:rolling", true},
// Pinned tags — must NOT be classified as moving.
{"semver tag", "ghcr.io/molecule-ai/workspace-template-hermes:0.8.2", false},
{"semver with v prefix", "ghcr.io/molecule-ai/workspace-template-hermes:v1.2.3", false},
{"sha-prefixed commit tag", "ghcr.io/molecule-ai/workspace-template-langgraph:sha-abc1234", false},
{"date-stamped tag", "ghcr.io/molecule-ai/workspace-template-hermes:2026-04-30", false},
{"build-id tag", "ghcr.io/molecule-ai/workspace-template-hermes:build-12345", false},
// Digest pinning — strongest immutability signal, never moving
// even if a moving-looking tag is also present.
{"digest only", "ghcr.io/molecule-ai/workspace-template-hermes@sha256:abc123def456", false},
{"tag plus digest", "ghcr.io/molecule-ai/workspace-template-hermes:latest@sha256:abc123def456", false},
// Registry hostname with port — the `:` in `:5000` must NOT be
// mistaken for a tag separator. Without this guard, a private
// registry like `localhost:5000/foo` would always re-pull.
{"registry with port no tag", "localhost:5000/workspace-template-hermes", true}, // bare → moving
{"registry with port pinned tag", "localhost:5000/workspace-template-hermes:0.8.2", false},
{"registry with port latest tag", "localhost:5000/workspace-template-hermes:latest", true},
// Legacy local-build tags from `docker build -t workspace-template:<runtime>`.
// These are arbitrary strings, treated as pinned (they don't
// move from the registry's perspective — there is no registry).
{"legacy local hermes tag", "workspace-template:hermes", false},
{"legacy local claude-code tag", "workspace-template:claude-code", false},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
got := imageTagIsMoving(tc.image)
if got != tc.want {
t.Errorf("imageTagIsMoving(%q) = %v, want %v", tc.image, got, tc.want)
}
})
}
}
// ---------- End-to-end error-message shape ----------
//
// Verifies the wrapped error that Start() surfaces when ContainerCreate