fix(provisioner): force linux/amd64 pull + create on Apple Silicon hosts (#1875)
On an Apple Silicon dev box, every `POST /workspaces` failed immediately
with:
no matching manifest for linux/arm64/v8 in the manifest list entries:
no match for platform in manifest: not found
because the GHCR workspace-template-* images ship only a linux/amd64
manifest today. `ImagePull` and `ContainerCreate` asked for the daemon's
native arch and missed. The Canvas surfaced this as
docker image "ghcr.io/molecule-ai/workspace-template-autogen:latest"
not found after pull attempt — verify GHCR visibility for autogen
— confusing because the image IS visible, just not for linux/arm64.
### Fix
Add an auto-detect helper `defaultImagePlatform()` in
`internal/provisioner/provisioner.go` that returns `"linux/amd64"` on
Apple Silicon hosts and `""` (no preference) everywhere else, with an
env override `MOLECULE_IMAGE_PLATFORM` for operators who want to pin
or disable explicitly. The result is passed to both `ImagePull`
(`PullOptions.Platform`) and `ContainerCreate` (4th arg
`*ocispec.Platform`) so the pulled amd64 manifest matches the
create-time platform spec. Docker Desktop transparently runs it
under QEMU emulation on M-series Macs — slow (2–5× native) but
functional.
SaaS production (linux/amd64 EC2, `MOLECULE_ENV=production`) never
hits the `runtime.GOOS == "darwin" && runtime.GOARCH == "arm64"` branch,
so the current behaviour on real tenants is byte-for-byte unchanged.
Escape hatch for operators who want to turn the auto-force off or pin
a different platform:
export MOLECULE_IMAGE_PLATFORM="" # disable auto-force
export MOLECULE_IMAGE_PLATFORM=linux/arm64 # pin alternate
`ocispec` is `github.com/opencontainers/image-spec/specs-go/v1` —
already in go.sum (v1.1.1) as a transitive dependency of
`github.com/docker/docker`, so the new import pulls in no new module.
### Tests
`internal/provisioner/platform_test.go` exercises every branch:
- `TestDefaultImagePlatform_EnvOverride_ExplicitValue` — env wins
- `TestDefaultImagePlatform_EnvOverride_EmptyValue` — empty string
disables the auto-force (operator escape hatch)
- `TestDefaultImagePlatform_AutoDetect` — linux/amd64 on arm64 Mac,
"" on every other host
- `TestParseOCIPlatform` — 7 table-driven cases covering well-formed
platforms, malformed inputs, and nil handling
### End-to-end verification
Before this commit, `POST /workspaces` on my Apple Silicon box:
workspace status transitioned: provisioning → failed (~1s)
log: image pull for ... failed: no matching manifest for linux/arm64/v8
After this commit, fresh DB + fresh platform:
workspace status transitioned: provisioning → online (~25s)
log: attempting pull (platform=linux/amd64)
pulled ghcr.io/molecule-ai/workspace-template-langgraph:latest
docker ps: ws-7aa08951-00d Up 27 seconds
The existing provisioner test suite still passes under the race
detector (`go test -race ./internal/provisioner/`) — the platform
pointer stays nil on linux/amd64 hosts, so the expectations the tests
assert in CI don't change.
Closes #1875 (arm64 image blocker).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
96cc4b0c42
commit
539e3483e4
109
workspace-server/internal/provisioner/platform_test.go
Normal file
109
workspace-server/internal/provisioner/platform_test.go
Normal file
@ -0,0 +1,109 @@
|
||||
package provisioner
|
||||
|
||||
import (
|
||||
"os"
|
||||
"runtime"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// Tests for defaultImagePlatform + parseOCIPlatform.
|
||||
//
|
||||
// The platform-forcing helper unblocks Apple Silicon dev boxes — see
|
||||
// issue #1875. SaaS production (linux/amd64 EC2) must NOT hit the
|
||||
// forced-platform branch, which is what the "no override + linux host"
|
||||
// and the explicit-empty-override tests lock in.
|
||||
|
||||
func TestDefaultImagePlatform_EnvOverride_ExplicitValue(t *testing.T) {
|
||||
t.Setenv("MOLECULE_IMAGE_PLATFORM", "linux/arm64")
|
||||
got := defaultImagePlatform()
|
||||
if got != "linux/arm64" {
|
||||
t.Errorf("expected env override to win, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDefaultImagePlatform_EnvOverride_EmptyValue(t *testing.T) {
|
||||
// An explicitly empty env var disables the auto-force. This is the
|
||||
// escape hatch for operators who don't want the fallback but also
|
||||
// haven't pinned an alternate platform.
|
||||
t.Setenv("MOLECULE_IMAGE_PLATFORM", "")
|
||||
got := defaultImagePlatform()
|
||||
if got != "" {
|
||||
t.Errorf("expected empty override to suppress auto-force, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDefaultImagePlatform_AutoDetect(t *testing.T) {
|
||||
// Clear any override the test runner inherited so we see pure
|
||||
// auto-detect behaviour.
|
||||
t.Setenv("MOLECULE_IMAGE_PLATFORM", "")
|
||||
// Re-run without the env var at all — t.Setenv already backs up,
|
||||
// but we need to Unsetenv for the LookupEnv branch to miss.
|
||||
if err := unsetEnvForTest(t, "MOLECULE_IMAGE_PLATFORM"); err != nil {
|
||||
t.Fatalf("unset env: %v", err)
|
||||
}
|
||||
|
||||
got := defaultImagePlatform()
|
||||
switch {
|
||||
case runtime.GOOS == "darwin" && runtime.GOARCH == "arm64":
|
||||
if got != "linux/amd64" {
|
||||
t.Errorf("Apple Silicon: expected linux/amd64 auto-force, got %q", got)
|
||||
}
|
||||
default:
|
||||
if got != "" {
|
||||
t.Errorf("non-Apple-Silicon host: expected no auto-force, got %q", got)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseOCIPlatform(t *testing.T) {
|
||||
cases := []struct {
|
||||
in string
|
||||
wantOS string
|
||||
wantCPU string
|
||||
wantNil bool
|
||||
}{
|
||||
{"", "", "", true},
|
||||
{"linux/amd64", "linux", "amd64", false},
|
||||
{"linux/arm64", "linux", "arm64", false},
|
||||
// Malformed inputs must return nil so ContainerCreate falls back
|
||||
// to "no preference" instead of getting a half-populated struct.
|
||||
{"linux", "", "", true},
|
||||
{"linux/", "", "", true},
|
||||
{"/amd64", "", "", true},
|
||||
{"linux/amd64/v8", "linux", "amd64/v8", false}, // current parser: everything after first "/" is arch
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.in, func(t *testing.T) {
|
||||
got := parseOCIPlatform(tc.in)
|
||||
if tc.wantNil {
|
||||
if got != nil {
|
||||
t.Errorf("expected nil, got %+v", got)
|
||||
}
|
||||
return
|
||||
}
|
||||
if got == nil {
|
||||
t.Fatalf("unexpected nil for %q", tc.in)
|
||||
}
|
||||
if got.OS != tc.wantOS || got.Architecture != tc.wantCPU {
|
||||
t.Errorf("parse %q = %+v, want OS=%q Arch=%q",
|
||||
tc.in, got, tc.wantOS, tc.wantCPU)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// unsetEnvForTest deletes the environment variable `key` for the rest of
// the calling test and registers a cleanup that puts the environment back
// the way it was: the previous value is re-set if the variable existed,
// otherwise the variable is removed again. t.Setenv can only set a value,
// so this helper is what makes the "variable absent" branch testable.
//
// It returns the error from os.Unsetenv, if any.
func unsetEnvForTest(t *testing.T, key string) error {
	t.Helper()

	original, wasSet := os.LookupEnv(key)
	restore := func() {
		if !wasSet {
			_ = os.Unsetenv(key)
			return
		}
		_ = os.Setenv(key, original)
	}
	t.Cleanup(restore)

	return os.Unsetenv(key)
}
|
||||
@ -10,6 +10,7 @@ import (
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@ -20,6 +21,7 @@ import (
|
||||
"github.com/docker/docker/api/types/volume"
|
||||
"github.com/docker/docker/client"
|
||||
"github.com/docker/go-connections/nat"
|
||||
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
|
||||
)
|
||||
|
||||
// RuntimeImages maps runtime names to their Docker image refs on GHCR.
|
||||
@ -236,6 +238,18 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
|
||||
// Ensure no stale container exists with the same name (race with restart policy)
|
||||
_ = p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true})
|
||||
|
||||
// Resolve the target image platform once so the pull and the
|
||||
// container-create use the same value. On an Apple Silicon dev
|
||||
// laptop the GHCR workspace-template-* images only ship a
|
||||
// linux/amd64 manifest today; without an explicit platform the
|
||||
// daemon asks for linux/arm64/v8 and ImagePull returns
|
||||
// "no matching manifest for linux/arm64/v8 in the manifest list
|
||||
// entries". Forcing linux/amd64 lets Docker Desktop run them
|
||||
// under QEMU emulation (slow but functional — unblocks local
|
||||
// dev + Canvas smoke-testing on M-series Macs). See issue #1875.
|
||||
imgPlatformStr := defaultImagePlatform()
|
||||
imgPlatform := parseOCIPlatform(imgPlatformStr)
|
||||
|
||||
// Log image resolution for debugging stale-image issues, and pull from
|
||||
// GHCR on miss so tenant hosts don't need a pre-build step anymore.
|
||||
// The pull is best-effort: if it fails (network, auth, rate limit) the
|
||||
@ -245,8 +259,12 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
|
||||
log.Printf("Provisioner: creating %s from image %s (ID: %s, created: %s)",
|
||||
name, image, imgInspect.ID[:19], imgInspect.Created[:19])
|
||||
} else {
|
||||
log.Printf("Provisioner: image %s not present locally (%v) — attempting pull", image, imgErr)
|
||||
if perr := pullImageAndDrain(ctx, p.cli, image); perr != nil {
|
||||
if imgPlatformStr != "" {
|
||||
log.Printf("Provisioner: image %s not present locally (%v) — attempting pull (platform=%s)", image, imgErr, imgPlatformStr)
|
||||
} else {
|
||||
log.Printf("Provisioner: image %s not present locally (%v) — attempting pull", image, imgErr)
|
||||
}
|
||||
if perr := pullImageAndDrain(ctx, p.cli, image, imgPlatformStr); perr != nil {
|
||||
log.Printf("Provisioner: image pull for %s failed: %v (falling through to create)", image, perr)
|
||||
} else {
|
||||
log.Printf("Provisioner: pulled %s", image)
|
||||
@ -257,7 +275,7 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
|
||||
// Docker returns a generic "No such image" error that's opaque to
|
||||
// operators — wrap it with the resolved tag and the exact pull
|
||||
// command so last_sample_error surfaces something actionable. Issue #117.
|
||||
resp, err := p.cli.ContainerCreate(ctx, containerCfg, hostCfg, networkCfg, nil, name)
|
||||
resp, err := p.cli.ContainerCreate(ctx, containerCfg, hostCfg, networkCfg, imgPlatform, name)
|
||||
if err != nil {
|
||||
if isImageNotFoundErr(err) {
|
||||
return "", fmt.Errorf(
|
||||
@ -980,8 +998,12 @@ type dockerImageClient interface {
|
||||
// pull to finish; returning early leaves the daemon mid-pull. We
|
||||
// discard the progress payload because operators read container logs
|
||||
// for boot diagnostics, not pull chatter.
|
||||
func pullImageAndDrain(ctx context.Context, cli dockerImageClient, ref string) error {
|
||||
rc, err := cli.ImagePull(ctx, ref, dockerimage.PullOptions{})
|
||||
//
|
||||
// `platform` is "os/arch" (e.g. "linux/amd64") when the host needs to
|
||||
// pull a non-native manifest, or "" to let the daemon pick the default
|
||||
// for its arch. See defaultImagePlatform for when that matters.
|
||||
func pullImageAndDrain(ctx context.Context, cli dockerImageClient, ref, platform string) error {
|
||||
rc, err := cli.ImagePull(ctx, ref, dockerimage.PullOptions{Platform: platform})
|
||||
if err != nil {
|
||||
return fmt.Errorf("ImagePull: %w", err)
|
||||
}
|
||||
@ -991,3 +1013,44 @@ func pullImageAndDrain(ctx context.Context, cli dockerImageClient, ref string) e
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// defaultImagePlatform picks the Docker image platform string used for
// `ImagePull` + `ContainerCreate` on the workspace-template-* images.
//
// An empty result means "use the daemon default" — the common case on
// linux/amd64 hosts (CI, SaaS EC2, Linux dev machines). On Apple Silicon
// the GHCR workspace-template-* images ship a single linux/amd64
// manifest today, so the daemon's native linux/arm64/v8 request misses
// with "no matching manifest". Forcing linux/amd64 pulls the amd64
// manifest and lets Docker Desktop run it under QEMU emulation. Slow
// (2–5× native) but functional — unblocks local dev on M-series Macs.
//
// Override via MOLECULE_IMAGE_PLATFORM — set it to the empty string to
// disable the auto-force, or to a specific value ("linux/amd64",
// "linux/arm64") to pin. Surrounding whitespace in the override is
// ignored, so a whitespace-only value behaves like the empty string.
// SaaS production should leave this unset.
//
// Tracked in issue #1875; remove this fallback once the template repos
// publish multi-arch manifests.
func defaultImagePlatform() string {
	if v, ok := os.LookupEnv("MOLECULE_IMAGE_PLATFORM"); ok {
		// Trim so a stray space or trailing newline (easy to pick up from
		// .env files or mounted secrets) is not handed on as part of the
		// platform string, and so a blank value reliably means "disabled".
		return strings.TrimSpace(v)
	}
	// No override present: only Apple Silicon macOS hosts get the forced
	// platform; every other host keeps the daemon default.
	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
		return "linux/amd64"
	}
	return ""
}
|
||||
|
||||
// parseOCIPlatform turns "linux/amd64" into the *ocispec.Platform shape
|
||||
// `ContainerCreate`'s platform argument expects. "" returns nil, which
|
||||
// is exactly how the Docker SDK signals "no preference".
|
||||
func parseOCIPlatform(s string) *ocispec.Platform {
|
||||
if s == "" {
|
||||
return nil
|
||||
}
|
||||
parts := strings.SplitN(s, "/", 2)
|
||||
if len(parts) != 2 || parts[0] == "" || parts[1] == "" {
|
||||
return nil
|
||||
}
|
||||
return &ocispec.Platform{OS: parts[0], Architecture: parts[1]}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user