From 539e3483e41d11bf39157c4667287f7a2883305a Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 23 Apr 2026 13:38:26 -0700 Subject: [PATCH] fix(provisioner): force linux/amd64 pull + create on Apple Silicon hosts (#1875) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On an Apple Silicon dev box, every `POST /workspaces` failed immediately with: no matching manifest for linux/arm64/v8 in the manifest list entries: no match for platform in manifest: not found because the GHCR workspace-template-* images ship only a linux/amd64 manifest today. `ImagePull` and `ContainerCreate` asked for the daemon's native arch and missed. The Canvas surfaced this as docker image "ghcr.io/molecule-ai/workspace-template-autogen:latest" not found after pull attempt — verify GHCR visibility for autogen — confusing because the image IS visible, just not for linux/arm64. ### Fix Add an auto-detect helper `defaultImagePlatform()` in `internal/provisioner/provisioner.go` that returns `"linux/amd64"` on Apple Silicon hosts and `""` (no preference) everywhere else, with an env override `MOLECULE_IMAGE_PLATFORM` for operators who want to pin or disable explicitly. The result is passed to both `ImagePull` (`PullOptions.Platform`) and `ContainerCreate` (4th arg `*ocispec.Platform`) so the pulled amd64 manifest matches the create-time platform spec. Docker Desktop transparently runs it under QEMU emulation on M-series Macs — slow (2–5× native) but functional. SaaS production (linux/amd64 EC2, `MOLECULE_ENV=production`) never hits the `runtime.GOOS == "darwin" && runtime.GOARCH == "arm64"` branch, so the current behaviour on real tenants is byte-for-byte unchanged. 
Opt-in escape hatch for operators who want it off: export MOLECULE_IMAGE_PLATFORM="" # disable auto-force export MOLECULE_IMAGE_PLATFORM=linux/arm64 # pin alternate `ocispec` is `github.com/opencontainers/image-spec/specs-go/v1` — already in go.sum v1.1.1 as a transitive dependency of `github.com/docker/docker`, not a new import. ### Tests `internal/provisioner/platform_test.go` exercises every branch: - `TestDefaultImagePlatform_EnvOverride_ExplicitValue` — env wins - `TestDefaultImagePlatform_EnvOverride_EmptyValue` — empty string disables the auto-force (operator escape hatch) - `TestDefaultImagePlatform_AutoDetect` — linux/amd64 on arm64 Mac, "" on every other host - `TestParseOCIPlatform` — 7 table-driven cases covering well-formed platforms, malformed inputs, and nil handling ### End-to-end verification Before this commit, `POST /workspaces` on my Apple Silicon box: workspace status transitioned: provisioning → failed (~1s) log: image pull for ... failed: no matching manifest for linux/arm64/v8 After this commit, fresh DB + fresh platform: workspace status transitioned: provisioning → online (~25s) log: attempting pull (platform=linux/amd64) pulled ghcr.io/molecule-ai/workspace-template-langgraph:latest docker ps: ws-7aa08951-00d Up 27 seconds The existing provisioner race-tested test suite (`go test -race ./internal/provisioner/`) still passes — the platform pointer defaults to nil on linux/amd64 hosts, so the CI-resolved test expectations don't change. Closes #1875 (arm64 image blocker). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/provisioner/platform_test.go | 109 ++++++++++++++++++ .../internal/provisioner/provisioner.go | 73 +++++++++++- 2 files changed, 177 insertions(+), 5 deletions(-) create mode 100644 workspace-server/internal/provisioner/platform_test.go diff --git a/workspace-server/internal/provisioner/platform_test.go b/workspace-server/internal/provisioner/platform_test.go new file mode 100644 index 00000000..9f7827c6 --- /dev/null +++ b/workspace-server/internal/provisioner/platform_test.go @@ -0,0 +1,109 @@ +package provisioner + +import ( + "os" + "runtime" + "testing" +) + +// Tests for defaultImagePlatform + parseOCIPlatform. +// +// The platform-forcing helper unblocks Apple Silicon dev boxes — see +// issue #1875. SaaS production (linux/amd64 EC2) must NOT hit the +// forced-platform branch, which is what the "no override + linux host" +// and the explicit-empty-override tests lock in. + +func TestDefaultImagePlatform_EnvOverride_ExplicitValue(t *testing.T) { + t.Setenv("MOLECULE_IMAGE_PLATFORM", "linux/arm64") + got := defaultImagePlatform() + if got != "linux/arm64" { + t.Errorf("expected env override to win, got %q", got) + } +} + +func TestDefaultImagePlatform_EnvOverride_EmptyValue(t *testing.T) { + // An explicitly empty env var disables the auto-force. This is the + // escape hatch for operators who don't want the fallback but also + // haven't pinned an alternate platform. + t.Setenv("MOLECULE_IMAGE_PLATFORM", "") + got := defaultImagePlatform() + if got != "" { + t.Errorf("expected empty override to suppress auto-force, got %q", got) + } +} + +func TestDefaultImagePlatform_AutoDetect(t *testing.T) { + // Clear any override the test runner inherited so we see pure + // auto-detect behaviour. + t.Setenv("MOLECULE_IMAGE_PLATFORM", "") + // Re-run without the env var at all — t.Setenv already backs up, + // but we need to Unsetenv for the LookupEnv branch to miss. 
+ if err := unsetEnvForTest(t, "MOLECULE_IMAGE_PLATFORM"); err != nil { + t.Fatalf("unset env: %v", err) + } + + got := defaultImagePlatform() + switch { + case runtime.GOOS == "darwin" && runtime.GOARCH == "arm64": + if got != "linux/amd64" { + t.Errorf("Apple Silicon: expected linux/amd64 auto-force, got %q", got) + } + default: + if got != "" { + t.Errorf("non-Apple-Silicon host: expected no auto-force, got %q", got) + } + } +} + +func TestParseOCIPlatform(t *testing.T) { + cases := []struct { + in string + wantOS string + wantCPU string + wantNil bool + }{ + {"", "", "", true}, + {"linux/amd64", "linux", "amd64", false}, + {"linux/arm64", "linux", "arm64", false}, + // Malformed inputs must return nil so ContainerCreate falls back + // to "no preference" instead of getting a half-populated struct. + {"linux", "", "", true}, + {"linux/", "", "", true}, + {"/amd64", "", "", true}, + {"linux/amd64/v8", "linux", "amd64/v8", false}, // current parser: everything after first "/" is arch + } + for _, tc := range cases { + t.Run(tc.in, func(t *testing.T) { + got := parseOCIPlatform(tc.in) + if tc.wantNil { + if got != nil { + t.Errorf("expected nil, got %+v", got) + } + return + } + if got == nil { + t.Fatalf("unexpected nil for %q", tc.in) + } + if got.OS != tc.wantOS || got.Architecture != tc.wantCPU { + t.Errorf("parse %q = %+v, want OS=%q Arch=%q", + tc.in, got, tc.wantOS, tc.wantCPU) + } + }) + } +} + +// unsetEnvForTest removes an env var for the duration of the test and +// restores it on cleanup. t.Setenv only supports setting, not removing; +// we need the unset path to test the "no override" branch. 
+func unsetEnvForTest(t *testing.T, key string) error { + t.Helper() + prev, existed := os.LookupEnv(key) + t.Cleanup(func() { + if existed { + _ = os.Setenv(key, prev) + } else { + _ = os.Unsetenv(key) + } + }) + return os.Unsetenv(key) +} diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index 2e945905..481f09b7 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -10,6 +10,7 @@ import ( "log" "os" "path/filepath" + "runtime" "strconv" "strings" "time" @@ -20,6 +21,7 @@ import ( "github.com/docker/docker/api/types/volume" "github.com/docker/docker/client" "github.com/docker/go-connections/nat" + ocispec "github.com/opencontainers/image-spec/specs-go/v1" ) // RuntimeImages maps runtime names to their Docker image refs on GHCR. @@ -236,6 +238,18 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e // Ensure no stale container exists with the same name (race with restart policy) _ = p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true}) + // Resolve the target image platform once so the pull and the + // container-create use the same value. On an Apple Silicon dev + // laptop the GHCR workspace-template-* images only ship a + // linux/amd64 manifest today; without an explicit platform the + // daemon asks for linux/arm64/v8 and ImagePull returns + // "no matching manifest for linux/arm64/v8 in the manifest list + // entries". Forcing linux/amd64 lets Docker Desktop run them + // under QEMU emulation (slow but functional — unblocks local + // dev + Canvas smoke-testing on M-series Macs). See issue #1875. + imgPlatformStr := defaultImagePlatform() + imgPlatform := parseOCIPlatform(imgPlatformStr) + // Log image resolution for debugging stale-image issues, and pull from // GHCR on miss so tenant hosts don't need a pre-build step anymore. 
// The pull is best-effort: if it fails (network, auth, rate limit) the @@ -245,8 +259,12 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e log.Printf("Provisioner: creating %s from image %s (ID: %s, created: %s)", name, image, imgInspect.ID[:19], imgInspect.Created[:19]) } else { - log.Printf("Provisioner: image %s not present locally (%v) — attempting pull", image, imgErr) - if perr := pullImageAndDrain(ctx, p.cli, image); perr != nil { + if imgPlatformStr != "" { + log.Printf("Provisioner: image %s not present locally (%v) — attempting pull (platform=%s)", image, imgErr, imgPlatformStr) + } else { + log.Printf("Provisioner: image %s not present locally (%v) — attempting pull", image, imgErr) + } + if perr := pullImageAndDrain(ctx, p.cli, image, imgPlatformStr); perr != nil { log.Printf("Provisioner: image pull for %s failed: %v (falling through to create)", image, perr) } else { log.Printf("Provisioner: pulled %s", image) @@ -257,7 +275,7 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e // Docker returns a generic "No such image" error that's opaque to // operators — wrap it with the resolved tag and the exact pull // command so last_sample_error surfaces something actionable. Issue #117. - resp, err := p.cli.ContainerCreate(ctx, containerCfg, hostCfg, networkCfg, nil, name) + resp, err := p.cli.ContainerCreate(ctx, containerCfg, hostCfg, networkCfg, imgPlatform, name) if err != nil { if isImageNotFoundErr(err) { return "", fmt.Errorf( @@ -980,8 +998,12 @@ type dockerImageClient interface { // pull to finish; returning early leaves the daemon mid-pull. We // discard the progress payload because operators read container logs // for boot diagnostics, not pull chatter. -func pullImageAndDrain(ctx context.Context, cli dockerImageClient, ref string) error { - rc, err := cli.ImagePull(ctx, ref, dockerimage.PullOptions{}) +// +// `platform` is "os/arch" (e.g. 
"linux/amd64") when the host needs to +// pull a non-native manifest, or "" to let the daemon pick the default +// for its arch. See defaultImagePlatform for when that matters. +func pullImageAndDrain(ctx context.Context, cli dockerImageClient, ref, platform string) error { + rc, err := cli.ImagePull(ctx, ref, dockerimage.PullOptions{Platform: platform}) if err != nil { return fmt.Errorf("ImagePull: %w", err) } @@ -991,3 +1013,44 @@ func pullImageAndDrain(ctx context.Context, cli dockerImageClient, ref string) e } return nil } + +// defaultImagePlatform picks the Docker image platform string used for +// `ImagePull` + `ContainerCreate` on the workspace-template-* images. +// +// Empty result means "use the daemon default" — the common case on +// linux/amd64 hosts (CI, SaaS EC2, Linux dev machines). On Apple Silicon +// the GHCR workspace-template-* images ship a single linux/amd64 +// manifest today, so the daemon's native linux/arm64/v8 request misses +// with "no matching manifest". Forcing linux/amd64 pulls the amd64 +// manifest and lets Docker Desktop run it under QEMU emulation. Slow +// (2–5× native) but functional — unblocks local dev on M-series Macs. +// +// Override via MOLECULE_IMAGE_PLATFORM — set to the empty string to +// disable the auto-force, or to a specific value ("linux/amd64", +// "linux/arm64") to pin. SaaS production should leave this unset. +// +// Tracked in issue #1875; remove this fallback once the template repos +// publish multi-arch manifests. +func defaultImagePlatform() string { + if v, ok := os.LookupEnv("MOLECULE_IMAGE_PLATFORM"); ok { + return v + } + if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" { + return "linux/amd64" + } + return "" +} + +// parseOCIPlatform turns "linux/amd64" into the *ocispec.Platform shape +// `ContainerCreate`'s platform argument expects. "" returns nil, which +// is exactly how the Docker SDK signals "no preference". 
+func parseOCIPlatform(s string) *ocispec.Platform {
+	// Only the first "/" separates OS from arch; any later "/" stays in the arch.
+	sep := strings.Index(s, "/")
+	// Reject "", a missing separator, an empty OS, and an empty arch.
+	if sep <= 0 || sep == len(s)-1 {
+		return nil
+	}
+	osName, arch := s[:sep], s[sep+1:]
+	return &ocispec.Platform{OS: osName, Architecture: arch}
+}