fix(provisioner): force linux/amd64 pull + create on Apple Silicon hosts (#1875)

On an Apple Silicon dev box, every `POST /workspaces` failed immediately
with:

  no matching manifest for linux/arm64/v8 in the manifest list entries:
  no match for platform in manifest: not found

because the GHCR workspace-template-* images ship only a linux/amd64
manifest today. `ImagePull` and `ContainerCreate` asked for the daemon's
native arch and missed. The Canvas surfaced this as

  docker image "ghcr.io/molecule-ai/workspace-template-autogen:latest"
  not found after pull attempt — verify GHCR visibility for autogen

— confusing because the image IS visible, just not for linux/arm64.

### Fix

Add an auto-detect helper `defaultImagePlatform()` in
`internal/provisioner/provisioner.go` that returns `"linux/amd64"` on
Apple Silicon hosts and `""` (no preference) everywhere else, with an
env override `MOLECULE_IMAGE_PLATFORM` for operators who want to pin
or disable explicitly. The result is passed to both `ImagePull`
(`PullOptions.Platform`) and `ContainerCreate` (4th arg
`*ocispec.Platform`) so the pulled amd64 manifest matches the
create-time platform spec. Docker Desktop transparently runs it
under QEMU emulation on M-series Macs — slow (2–5× native) but
functional.

SaaS production (linux/amd64 EC2, `MOLECULE_ENV=production`) never
hits the `runtime.GOOS == "darwin" && runtime.GOARCH == "arm64"`
branch, so the current behaviour on real tenants is byte-for-byte
unchanged. Escape hatch for operators who want to disable the
auto-force or pin a different platform:

  export MOLECULE_IMAGE_PLATFORM=""     # disable auto-force
  export MOLECULE_IMAGE_PLATFORM=linux/arm64   # pin alternate

`ocispec` is `github.com/opencontainers/image-spec/specs-go/v1` —
already in go.sum v1.1.1 as a transitive dependency of
`github.com/docker/docker`, not a new import.

### Tests

`internal/provisioner/platform_test.go` exercises every branch:

  - `TestDefaultImagePlatform_EnvOverride_ExplicitValue` — env wins
  - `TestDefaultImagePlatform_EnvOverride_EmptyValue` — empty string
    disables the auto-force (operator escape hatch)
  - `TestDefaultImagePlatform_AutoDetect` — linux/amd64 on arm64 Mac,
    "" on every other host
  - `TestParseOCIPlatform` — 7 table-driven cases covering well-formed
    platforms, malformed inputs, and nil handling

### End-to-end verification

Before this commit, `POST /workspaces` on my Apple Silicon box:

  workspace status transitioned: provisioning → failed (~1s)
  log: image pull for ... failed: no matching manifest for linux/arm64/v8

After this commit, fresh DB + fresh platform:

  workspace status transitioned: provisioning → online (~25s)
  log: attempting pull (platform=linux/amd64)
       pulled ghcr.io/molecule-ai/workspace-template-langgraph:latest
  docker ps: ws-7aa08951-00d  Up 27 seconds

The existing provisioner test suite still passes under the race
detector (`go test -race ./internal/provisioner/`) — the platform
pointer defaults to nil on linux/amd64 hosts, so the expectations
exercised in CI don't change.

Closes #1875 (arm64 image blocker).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-23 13:38:26 -07:00
parent 96cc4b0c42
commit 539e3483e4
2 changed files with 177 additions and 5 deletions

View File

@ -0,0 +1,109 @@
package provisioner
import (
"os"
"runtime"
"testing"
)
// Tests for defaultImagePlatform + parseOCIPlatform.
//
// The platform-forcing helper unblocks Apple Silicon dev boxes — see
// issue #1875. SaaS production (linux/amd64 EC2) must NOT hit the
// forced-platform branch, which is what the "no override + linux host"
// and the explicit-empty-override tests lock in.
// TestDefaultImagePlatform_EnvOverride_ExplicitValue verifies that a
// non-empty MOLECULE_IMAGE_PLATFORM is returned as-is by
// defaultImagePlatform, whatever host the test runs on.
func TestDefaultImagePlatform_EnvOverride_ExplicitValue(t *testing.T) {
	const pinned = "linux/arm64"
	t.Setenv("MOLECULE_IMAGE_PLATFORM", pinned)

	if got := defaultImagePlatform(); got != pinned {
		t.Errorf("expected env override to win, got %q", got)
	}
}
// TestDefaultImagePlatform_EnvOverride_EmptyValue verifies the operator
// escape hatch: MOLECULE_IMAGE_PLATFORM present but empty must yield ""
// (no forced platform) rather than falling through to host auto-detect.
func TestDefaultImagePlatform_EnvOverride_EmptyValue(t *testing.T) {
	// Set-but-empty is distinct from unset: the variable exists, so the
	// override path is taken and its empty value is what comes back.
	t.Setenv("MOLECULE_IMAGE_PLATFORM", "")

	if got := defaultImagePlatform(); got != "" {
		t.Errorf("expected empty override to suppress auto-force, got %q", got)
	}
}
// TestDefaultImagePlatform_AutoDetect checks the no-override path: with
// MOLECULE_IMAGE_PLATFORM absent from the environment, the result must
// be "linux/amd64" on darwin/arm64 and "" on every other host.
func TestDefaultImagePlatform_AutoDetect(t *testing.T) {
	const envKey = "MOLECULE_IMAGE_PLATFORM"

	// t.Setenv records whatever value the test runner inherited so it is
	// put back when the test ends. It can only set a variable, though,
	// so the variable is then removed outright to make the lookup in
	// defaultImagePlatform miss.
	t.Setenv(envKey, "")
	if err := unsetEnvForTest(t, envKey); err != nil {
		t.Fatalf("unset env: %v", err)
	}

	got := defaultImagePlatform()

	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
		if got != "linux/amd64" {
			t.Errorf("Apple Silicon: expected linux/amd64 auto-force, got %q", got)
		}
		return
	}
	if got != "" {
		t.Errorf("non-Apple-Silicon host: expected no auto-force, got %q", got)
	}
}
// TestParseOCIPlatform is a table-driven check of parseOCIPlatform:
// well-formed "os/arch" strings populate OS and Architecture, while the
// empty string and malformed inputs produce nil.
func TestParseOCIPlatform(t *testing.T) {
	type testCase struct {
		in       string
		wantOS   string
		wantArch string
		wantNil  bool
	}

	table := []testCase{
		{in: "", wantNil: true},
		{in: "linux/amd64", wantOS: "linux", wantArch: "amd64"},
		{in: "linux/arm64", wantOS: "linux", wantArch: "arm64"},
		// Malformed inputs must return nil so ContainerCreate falls back
		// to "no preference" instead of getting a half-populated struct.
		{in: "linux", wantNil: true},
		{in: "linux/", wantNil: true},
		{in: "/amd64", wantNil: true},
		// Current parser: everything after the first "/" is the arch.
		{in: "linux/amd64/v8", wantOS: "linux", wantArch: "amd64/v8"},
	}

	for _, tc := range table {
		t.Run(tc.in, func(t *testing.T) {
			got := parseOCIPlatform(tc.in)

			switch {
			case tc.wantNil:
				if got != nil {
					t.Errorf("expected nil, got %+v", got)
				}
			case got == nil:
				t.Fatalf("unexpected nil for %q", tc.in)
			case got.OS != tc.wantOS || got.Architecture != tc.wantArch:
				t.Errorf("parse %q = %+v, want OS=%q Arch=%q",
					tc.in, got, tc.wantOS, tc.wantArch)
			}
		})
	}
}
// unsetEnvForTest deletes the environment variable key for the rest of
// the test and registers a cleanup that puts the environment back the
// way it was: the previous value is re-set if the variable existed,
// otherwise the variable is removed again. t.Setenv can only assign a
// value, so this helper covers the "variable is absent" case.
//
// It returns the error from os.Unsetenv. The cleanup is registered
// before the variable is removed, so it runs even when removal fails.
func unsetEnvForTest(t *testing.T, key string) error {
	t.Helper()

	saved, wasSet := os.LookupEnv(key)
	restore := func() {
		// Errors are ignored on purpose: this is best-effort teardown.
		if !wasSet {
			_ = os.Unsetenv(key)
			return
		}
		_ = os.Setenv(key, saved)
	}
	t.Cleanup(restore)

	return os.Unsetenv(key)
}

View File

@ -10,6 +10,7 @@ import (
"log"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"time"
@ -20,6 +21,7 @@ import (
"github.com/docker/docker/api/types/volume"
"github.com/docker/docker/client"
"github.com/docker/go-connections/nat"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// RuntimeImages maps runtime names to their Docker image refs on GHCR.
@ -236,6 +238,18 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
// Ensure no stale container exists with the same name (race with restart policy)
_ = p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true})
// Resolve the target image platform once so the pull and the
// container-create use the same value. On an Apple Silicon dev
// laptop the GHCR workspace-template-* images only ship a
// linux/amd64 manifest today; without an explicit platform the
// daemon asks for linux/arm64/v8 and ImagePull returns
// "no matching manifest for linux/arm64/v8 in the manifest list
// entries". Forcing linux/amd64 lets Docker Desktop run them
// under QEMU emulation (slow but functional — unblocks local
// dev + Canvas smoke-testing on M-series Macs). See issue #1875.
imgPlatformStr := defaultImagePlatform()
imgPlatform := parseOCIPlatform(imgPlatformStr)
// Log image resolution for debugging stale-image issues, and pull from
// GHCR on miss so tenant hosts don't need a pre-build step anymore.
// The pull is best-effort: if it fails (network, auth, rate limit) the
@ -245,8 +259,12 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
log.Printf("Provisioner: creating %s from image %s (ID: %s, created: %s)",
name, image, imgInspect.ID[:19], imgInspect.Created[:19])
} else {
log.Printf("Provisioner: image %s not present locally (%v) — attempting pull", image, imgErr)
if perr := pullImageAndDrain(ctx, p.cli, image); perr != nil {
if imgPlatformStr != "" {
log.Printf("Provisioner: image %s not present locally (%v) — attempting pull (platform=%s)", image, imgErr, imgPlatformStr)
} else {
log.Printf("Provisioner: image %s not present locally (%v) — attempting pull", image, imgErr)
}
if perr := pullImageAndDrain(ctx, p.cli, image, imgPlatformStr); perr != nil {
log.Printf("Provisioner: image pull for %s failed: %v (falling through to create)", image, perr)
} else {
log.Printf("Provisioner: pulled %s", image)
@ -257,7 +275,7 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
// Docker returns a generic "No such image" error that's opaque to
// operators — wrap it with the resolved tag and the exact pull
// command so last_sample_error surfaces something actionable. Issue #117.
resp, err := p.cli.ContainerCreate(ctx, containerCfg, hostCfg, networkCfg, nil, name)
resp, err := p.cli.ContainerCreate(ctx, containerCfg, hostCfg, networkCfg, imgPlatform, name)
if err != nil {
if isImageNotFoundErr(err) {
return "", fmt.Errorf(
@ -980,8 +998,12 @@ type dockerImageClient interface {
// pull to finish; returning early leaves the daemon mid-pull. We
// discard the progress payload because operators read container logs
// for boot diagnostics, not pull chatter.
func pullImageAndDrain(ctx context.Context, cli dockerImageClient, ref string) error {
rc, err := cli.ImagePull(ctx, ref, dockerimage.PullOptions{})
//
// `platform` is "os/arch" (e.g. "linux/amd64") when the host needs to
// pull a non-native manifest, or "" to let the daemon pick the default
// for its arch. See defaultImagePlatform for when that matters.
func pullImageAndDrain(ctx context.Context, cli dockerImageClient, ref, platform string) error {
rc, err := cli.ImagePull(ctx, ref, dockerimage.PullOptions{Platform: platform})
if err != nil {
return fmt.Errorf("ImagePull: %w", err)
}
@ -991,3 +1013,44 @@ func pullImageAndDrain(ctx context.Context, cli dockerImageClient, ref string) e
}
return nil
}
// defaultImagePlatform returns the platform string ("os/arch") to use
// for both `ImagePull` and `ContainerCreate` on the
// workspace-template-* images, or "" for "use the daemon default".
//
// Resolution order:
//
//  1. If MOLECULE_IMAGE_PLATFORM is present in the environment, its
//     value is returned verbatim. An empty value therefore disables the
//     auto-force; a value such as "linux/amd64" or "linux/arm64" pins
//     the platform. SaaS production should leave the variable unset.
//  2. Otherwise, on Apple Silicon (darwin/arm64) the result is
//     "linux/amd64". The GHCR workspace-template-* images ship a single
//     linux/amd64 manifest today, so the daemon's native linux/arm64/v8
//     request misses with "no matching manifest". Forcing linux/amd64
//     pulls the amd64 manifest and lets Docker Desktop run it under
//     QEMU emulation: slow (2–5× native) but functional, which unblocks
//     local dev on M-series Macs.
//  3. Everywhere else (CI, SaaS EC2, Linux dev machines) the result is
//     "", the common case.
//
// Tracked in issue #1875; remove this fallback once the template repos
// publish multi-arch manifests.
func defaultImagePlatform() string {
	override, pinned := os.LookupEnv("MOLECULE_IMAGE_PLATFORM")

	switch {
	case pinned:
		// Present-but-empty is honoured too: it is the operator's way of
		// turning the auto-force off.
		return override
	case runtime.GOOS == "darwin" && runtime.GOARCH == "arm64":
		return "linux/amd64"
	default:
		return ""
	}
}
// parseOCIPlatform converts an "os/arch" string such as "linux/amd64"
// into the *ocispec.Platform value passed as `ContainerCreate`'s
// platform argument.
//
// It returns nil, which the Docker SDK treats as "no preference", when
// s is empty, contains no "/", or has an empty component on either side
// of the first "/". Only the first "/" is significant: everything after
// it is stored in Architecture unchanged, so "linux/amd64/v8" yields
// Architecture "amd64/v8" and the Variant field is left unset.
func parseOCIPlatform(s string) *ocispec.Platform {
	osName, arch, hasSep := strings.Cut(s, "/")
	if !hasSep || osName == "" || arch == "" {
		return nil
	}
	return &ocispec.Platform{
		OS:           osName,
		Architecture: arch,
	}
}