From 0846ebc1f66ccd51da0588a877bdaf76e97a6959 Mon Sep 17 00:00:00 2001 From: core-be Date: Sun, 10 May 2026 04:21:27 -0700 Subject: [PATCH] fix(workspace-server): respect MOLECULE_IMAGE_REGISTRY in imagewatch + admin_workspace_images (RFC #229 P2-4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two surfaces in workspace-server hardcoded `ghcr.io` and silently bypassed the `MOLECULE_IMAGE_REGISTRY` env override that flips every other image operation to the configured private mirror (e.g. AWS ECR in production): 1. internal/imagewatch/watch.go — image-auto-refresh polled `https://ghcr.io/v2/...` and `https://ghcr.io/token` directly. Post- suspension, with the platform pointed at ECR, the watcher silently stopped seeing digest changes (every poll either 404'd or hung on a registry it has no business talking to). 2. internal/handlers/admin_workspace_images.go — Docker Engine auth payload pinned `serveraddress: "ghcr.io"`, so when the operator sets `MOLECULE_IMAGE_REGISTRY=…ecr…/molecule-ai` the engine matched the wrong credential entry on every authenticated pull. Fix: extract `provisioner.RegistryHost()` returning the host portion of `RegistryPrefix()` (e.g. `ghcr.io` ← `ghcr.io/molecule-ai`, or `004947743811.dkr.ecr.us-east-2.amazonaws.com` ← the ECR mirror prefix), and route both surfaces through it. Default behavior is unchanged for OSS users on GHCR. Tests - New `TestRegistryHost_SplitsHostFromOrgPath` and `TestRegistryHost_NeverEmpty` pin the helper across GHCR / ECR / self-hosted Gitea / bare-host edge cases. - New `TestGHCRAuthHeader_RespectsRegistryEnv` asserts the Docker auth payload's `serveraddress` follows MOLECULE_IMAGE_REGISTRY (and never leaks the org-path suffix). - New `TestRemoteDigest_RegistryHostFollowsEnv` stands up an httptest server, points MOLECULE_IMAGE_REGISTRY at it, and confirms both the token endpoint and the manifest HEAD land there — i.e. the full image- watch loop respects the env override end-to-end. Both new tests were verified to FAIL on the pre-fix code path before the helper was wired in, so a future revert can't silently re-introduce the bug. Out of scope (followup needed) ECR uses `aws ecr get-authorization-token` (SigV4 + basic-auth) instead of GHCR's `/token?service=…&scope=…` flow. This PR makes the URL host- configurable; the bearer-token negotiation in `fetchPullToken` still speaks the GHCR flavor. On ECR with `IMAGE_AUTO_REFRESH=true`, the watcher will now fail loudly at the token fetch (logged per tick) rather than silently hitting ghcr.io. Operators on ECR should keep IMAGE_AUTO_REFRESH=false until ECR auth is wired — tracked as a separate task. Net effect of this PR alone is strictly better than pre-fix: fail-loud > silent-broken. Refs: RFC #229 P2-4 tier:low Co-Authored-By: Claude Opus 4.7 (1M context) --- .../handlers/admin_workspace_images.go | 13 ++- .../handlers/admin_workspace_images_test.go | 39 ++++++++ workspace-server/internal/imagewatch/watch.go | 39 ++++++-- .../internal/imagewatch/watch_test.go | 97 +++++++++++++++++++ .../internal/provisioner/registry.go | 27 ++++++ .../internal/provisioner/registry_test.go | 44 +++++++++ 6 files changed, 247 insertions(+), 12 deletions(-) diff --git a/workspace-server/internal/handlers/admin_workspace_images.go b/workspace-server/internal/handlers/admin_workspace_images.go index 68bc50f1..95af3c91 100644 --- a/workspace-server/internal/handlers/admin_workspace_images.go +++ b/workspace-server/internal/handlers/admin_workspace_images.go @@ -71,10 +71,17 @@ func TemplateImageRef(runtime string) string { // ghcrAuthHeader returns the base64-encoded JSON auth payload Docker's // ImagePull expects in PullOptions.RegistryAuth, or empty string when no -// GHCR_USER/GHCR_TOKEN env is set (lets public images pull through). +// GHCR_USER/GHCR_TOKEN env is set (lets public images pull through and lets +// ECR's credential-helper-driven flow take over without a stale GHCR +// payload masking it). // // The Docker SDK doesn't read ~/.docker/config.json — every authenticated -// pull needs an explicit RegistryAuth string. +// pull needs an explicit RegistryAuth string. The serveraddress field is +// resolved from provisioner.RegistryHost() so it tracks MOLECULE_IMAGE_REGISTRY +// when the operator points the platform at a private mirror (e.g. ECR). +// Leaving it hardcoded to "ghcr.io" caused the engine to match the wrong +// auth entry post-suspension when MOLECULE_IMAGE_REGISTRY was flipped to +// the AWS ECR mirror (RFC #229). func ghcrAuthHeader() string { user := strings.TrimSpace(os.Getenv("GHCR_USER")) token := strings.TrimSpace(os.Getenv("GHCR_TOKEN")) @@ -84,7 +91,7 @@ func ghcrAuthHeader() string { payload := map[string]string{ "username": user, "password": token, - "serveraddress": "ghcr.io", + "serveraddress": provisioner.RegistryHost(), } js, err := json.Marshal(payload) if err != nil { diff --git a/workspace-server/internal/handlers/admin_workspace_images_test.go b/workspace-server/internal/handlers/admin_workspace_images_test.go index 26e61f95..411cba5a 100644 --- a/workspace-server/internal/handlers/admin_workspace_images_test.go +++ b/workspace-server/internal/handlers/admin_workspace_images_test.go @@ -9,6 +9,7 @@ import ( func TestGHCRAuthHeader_NoEnvReturnsEmpty(t *testing.T) { t.Setenv("GHCR_USER", "") t.Setenv("GHCR_TOKEN", "") + t.Setenv("MOLECULE_IMAGE_REGISTRY", "") if got := ghcrAuthHeader(); got != "" { t.Errorf("expected empty (no auth → public-only), got %q", got) } @@ -29,6 +30,10 @@ func TestGHCRAuthHeader_PartialEnvReturnsEmpty(t *testing.T) { } func TestGHCRAuthHeader_EncodesDockerEnginePayload(t *testing.T) { + // Default registry env (unset → ghcr.io/molecule-ai) means the + // serveraddress field should resolve to ghcr.io. Pin both env vars so the + // test is hermetic regardless of the host's MOLECULE_IMAGE_REGISTRY. + t.Setenv("MOLECULE_IMAGE_REGISTRY", "") t.Setenv("GHCR_USER", "alice") t.Setenv("GHCR_TOKEN", "fake-tok-value") got := ghcrAuthHeader() @@ -54,7 +59,41 @@ func TestGHCRAuthHeader_EncodesDockerEnginePayload(t *testing.T) { } } +// TestGHCRAuthHeader_RespectsRegistryEnv pins the RFC #229 fix: when +// MOLECULE_IMAGE_REGISTRY points at a private mirror (e.g. AWS ECR), the +// Docker engine auth payload's serveraddress must reflect that mirror's +// host so credential matching lands on the right entry. Pre-fix this was +// hardcoded to "ghcr.io" and silently dropped the override. +func TestGHCRAuthHeader_RespectsRegistryEnv(t *testing.T) { + t.Setenv("GHCR_USER", "alice") + t.Setenv("GHCR_TOKEN", "fake-tok-value") + t.Setenv("MOLECULE_IMAGE_REGISTRY", "004947743811.dkr.ecr.us-east-2.amazonaws.com/molecule-ai") + + got := ghcrAuthHeader() + if got == "" { + t.Fatal("expected non-empty auth header") + } + raw, err := base64.URLEncoding.DecodeString(got) + if err != nil { + t.Fatalf("auth header is not valid base64-url: %v", err) + } + var payload map[string]string + if err := json.Unmarshal(raw, &payload); err != nil { + t.Fatalf("decoded auth is not valid JSON: %v (raw=%s)", err, raw) + } + want := "004947743811.dkr.ecr.us-east-2.amazonaws.com" + if payload["serveraddress"] != want { + t.Errorf("serveraddress: got %q, want %q (must follow MOLECULE_IMAGE_REGISTRY host)", + payload["serveraddress"], want) + } + // Sanity: the org-path portion must NOT leak into serveraddress. + if payload["serveraddress"] == "004947743811.dkr.ecr.us-east-2.amazonaws.com/molecule-ai" { + t.Error("serveraddress must be host-only, not host+org-path") + } +} + func TestGHCRAuthHeader_TrimsWhitespace(t *testing.T) { + t.Setenv("MOLECULE_IMAGE_REGISTRY", "") // .env lines often have trailing newlines or accidental spaces. Without // trimming, a stray space would produce an auth payload the engine // rejects with a confusing 401. diff --git a/workspace-server/internal/imagewatch/watch.go b/workspace-server/internal/imagewatch/watch.go index d39d57f3..7e038b35 100644 --- a/workspace-server/internal/imagewatch/watch.go +++ b/workspace-server/internal/imagewatch/watch.go @@ -29,6 +29,7 @@ import ( "time" "github.com/Molecule-AI/molecule-monorepo/platform/internal/handlers" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" ) // DefaultInterval is the polling cadence. Runtime publishes happen at most @@ -127,20 +128,32 @@ func (w *Watcher) tick(ctx context.Context, fetch digestFetcher) { } } -// remoteDigest queries GHCR for the current manifest digest of the -// workspace-template-:latest image. Uses the Docker Registry V2 -// HTTP API: get a bearer token, then HEAD the manifest. +// remoteDigest queries the configured registry for the current manifest +// digest of the workspace-template-:latest image. Uses the Docker +// Registry V2 HTTP API: get a bearer token, then HEAD the manifest. +// +// Registry host is resolved from provisioner.RegistryHost() so the watcher +// follows MOLECULE_IMAGE_REGISTRY in production tenants. Pre-RFC #229 this +// was hardcoded to ghcr.io, which silently broke image-watch in tenants +// pointed at the AWS ECR mirror. // // Auth: if GHCR_USER+GHCR_TOKEN are set, basic-auth the token request // (works for both public and private images). If unset, anonymous token // (works for public images only — every workspace template is public). +// +// NOTE: the bearer-token negotiation in fetchPullToken speaks GHCR's +// `/token` flavor of the Docker Registry V2 spec. ECR uses a different +// auth path (`aws ecr get-authorization-token` → SigV4 + basic-auth header). +// Wiring ECR auth here is tracked as a follow-up; until then, operators on +// ECR should keep IMAGE_AUTO_REFRESH=false and the watcher will fail loudly +// at the token fetch instead of pulling from ghcr.io behind their back. func (w *Watcher) remoteDigest(ctx context.Context, runtime string) (string, error) { repo := "molecule-ai/workspace-template-" + runtime tok, err := w.fetchPullToken(ctx, repo) if err != nil { return "", fmt.Errorf("pull token: %w", err) } - manifestURL := fmt.Sprintf("https://ghcr.io/v2/%s/manifests/latest", repo) + manifestURL := fmt.Sprintf("https://%s/v2/%s/manifests/latest", provisioner.RegistryHost(), repo) req, err := http.NewRequestWithContext(ctx, "HEAD", manifestURL, nil) if err != nil { return "", err @@ -171,14 +184,22 @@ func (w *Watcher) remoteDigest(ctx context.Context, runtime string) (string, err return digest, nil } -// fetchPullToken negotiates a short-lived bearer token from GHCR's token -// endpoint scoped to repo:pull. GHCR requires a token even for anonymous -// pulls of public images. +// fetchPullToken negotiates a short-lived bearer token from the registry's +// `/token` endpoint scoped to repo:pull. GHCR requires a token even for +// anonymous pulls of public images. +// +// Registry host follows provisioner.RegistryHost() so the request goes to +// the same registry the rest of the platform pulls from. The `service` +// query parameter mirrors the host because GHCR (and most registries +// implementing the Docker Registry V2 token spec) validate it against the +// realm/service the auth challenge advertised. ECR doesn't implement this +// flow — see remoteDigest's note on the ECR auth follow-up. func (w *Watcher) fetchPullToken(ctx context.Context, repo string) (string, error) { + host := provisioner.RegistryHost() q := url.Values{} - q.Set("service", "ghcr.io") + q.Set("service", host) q.Set("scope", "repository:"+repo+":pull") - tokURL := "https://ghcr.io/token?" + q.Encode() + tokURL := "https://" + host + "/token?" + q.Encode() req, err := http.NewRequestWithContext(ctx, "GET", tokURL, nil) if err != nil { return "", err diff --git a/workspace-server/internal/imagewatch/watch_test.go b/workspace-server/internal/imagewatch/watch_test.go index b29d17a3..662e5113 100644 --- a/workspace-server/internal/imagewatch/watch_test.go +++ b/workspace-server/internal/imagewatch/watch_test.go @@ -3,6 +3,9 @@ package imagewatch import ( "context" "errors" + "net/http" + "net/http/httptest" + "strings" "sync" "testing" @@ -160,6 +163,100 @@ func TestTick_DigestFetchErrorSkipsRuntime(t *testing.T) { } } +// TestRemoteDigest_RegistryHostFollowsEnv pins the RFC #229 fix: with +// MOLECULE_IMAGE_REGISTRY pointed at a private mirror, the watcher's HTTP +// calls (token endpoint + manifest HEAD) must hit that mirror's host, not +// the hardcoded ghcr.io of the pre-fix code path. We stand up an httptest +// server, point MOLECULE_IMAGE_REGISTRY at its host, and assert both +// endpoints get hit on it. +// +// Without this test, a future refactor could revert the helper indirection +// and the watcher would silently go back to talking to ghcr.io even when +// the platform is configured for ECR — exactly the bug RFC #229 is closing. +func TestRemoteDigest_RegistryHostFollowsEnv(t *testing.T) { + var ( + mu sync.Mutex + tokenHits int + manifestHits int + lastTokenURL string + lastManifestURL string + ) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + mu.Lock() + defer mu.Unlock() + switch { + case strings.HasPrefix(r.URL.Path, "/token"): + tokenHits++ + lastTokenURL = r.URL.String() + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"token":"fake-bearer"}`)) + case strings.HasPrefix(r.URL.Path, "/v2/") && strings.Contains(r.URL.Path, "/manifests/latest"): + manifestHits++ + lastManifestURL = r.URL.Path + w.Header().Set("Docker-Content-Digest", "sha256:cafef00d") + w.WriteHeader(http.StatusOK) + default: + w.WriteHeader(http.StatusNotFound) + } + })) + defer srv.Close() + + // httptest.Server.URL is "http://127.0.0.1:NNNN". RegistryHost() works + // over the host:port portion (provisioner.RegistryPrefix takes the env + // verbatim), so we strip the scheme and append "/molecule-ai" to mimic + // the prefix shape MOLECULE_IMAGE_REGISTRY actually uses in production. + host := strings.TrimPrefix(srv.URL, "http://") + t.Setenv("MOLECULE_IMAGE_REGISTRY", host+"/molecule-ai") + + w := newTestWatcher(&fakeRefresher{}, "claude-code") + // Use the test-server URL scheme by overriding the http client only — + // remoteDigest constructs https:///... internally. We need the + // watcher to hit our http server, so swap the URL scheme by injecting + // a transport that rewrites https→http for this test. + w.http = &http.Client{Transport: rewriteToHTTP{}} + + digest, err := w.remoteDigest(context.Background(), "claude-code") + if err != nil { + t.Fatalf("remoteDigest failed: %v", err) + } + if digest != "sha256:cafef00d" { + t.Errorf("digest: got %q, want sha256:cafef00d", digest) + } + + mu.Lock() + defer mu.Unlock() + if tokenHits != 1 { + t.Errorf("token endpoint hits: got %d, want 1 (watcher must hit configured registry, not ghcr.io)", tokenHits) + } + if manifestHits != 1 { + t.Errorf("manifest HEAD hits: got %d, want 1 (watcher must hit configured registry, not ghcr.io)", manifestHits) + } + // service= query param must reflect the configured host so registries + // that validate the param (GHCR-style spec) accept the request. + if !strings.Contains(lastTokenURL, "service="+host) && !strings.Contains(lastTokenURL, "service=127.0.0.1") { + t.Errorf("token URL service param not host-derived: got %q", lastTokenURL) + } + wantManifestPath := "/v2/molecule-ai/workspace-template-claude-code/manifests/latest" + if lastManifestURL != wantManifestPath { + t.Errorf("manifest path: got %q, want %q", lastManifestURL, wantManifestPath) + } +} + +// rewriteToHTTP is a tiny RoundTripper that flips https→http so the watcher +// (which builds https URLs from the configured registry host) can target an +// httptest.Server that only speaks http. Production code paths still go +// over https; this is a unit-test seam only. +type rewriteToHTTP struct{} + +func (rewriteToHTTP) RoundTrip(req *http.Request) (*http.Response, error) { + if req.URL.Scheme == "https" { + clone := req.Clone(req.Context()) + clone.URL.Scheme = "http" + req = clone + } + return http.DefaultTransport.RoundTrip(req) +} + func TestShortDigest(t *testing.T) { cases := map[string]string{ "sha256:abcdef0123456789": "sha256:abcdef012345", diff --git a/workspace-server/internal/provisioner/registry.go b/workspace-server/internal/provisioner/registry.go index 209411a4..74334882 100644 --- a/workspace-server/internal/provisioner/registry.go +++ b/workspace-server/internal/provisioner/registry.go @@ -3,6 +3,7 @@ package provisioner import ( "fmt" "os" + "strings" ) // defaultRegistryPrefix is the upstream OSS face for all workspace template @@ -62,6 +63,32 @@ func RegistryPrefix() string { return defaultRegistryPrefix } +// RegistryHost returns just the registry host portion of RegistryPrefix() — +// i.e. everything before the first "/" separator. This is the value that +// belongs in: +// +// - Docker Engine PullOptions.RegistryAuth payloads (`serveraddress` field) +// — the engine matches credentials against host, not host+org-path. +// - Docker Registry V2 HTTP API base URLs (e.g. `https:///v2/...`) +// — the V2 API is host-rooted; the org-path lives in the manifest path. +// +// Examples: +// +// "ghcr.io/molecule-ai" → "ghcr.io" +// "123456789012.dkr.ecr.us-east-2.amazonaws.com/molecule-ai" → "123456789012.dkr.ecr.us-east-2.amazonaws.com" +// "git.moleculesai.app/molecule-ai" → "git.moleculesai.app" +// +// If RegistryPrefix() ever returns a bare host (no `/`), we return it as-is +// rather than letting strings.SplitN produce an empty string — defensive +// against a misconfiguration where the operator sets just the host. +func RegistryHost() string { + prefix := RegistryPrefix() + if i := strings.IndexByte(prefix, '/'); i > 0 { + return prefix[:i] + } + return prefix +} + // RuntimeImage returns the canonical image reference for the given runtime, // using the current RegistryPrefix() and the moving `:latest` tag. // diff --git a/workspace-server/internal/provisioner/registry_test.go b/workspace-server/internal/provisioner/registry_test.go index 885a6b99..f9c6611c 100644 --- a/workspace-server/internal/provisioner/registry_test.go +++ b/workspace-server/internal/provisioner/registry_test.go @@ -127,6 +127,50 @@ func TestComputeRuntimeImages_ReflectsCurrentEnv(t *testing.T) { } } +// TestRegistryHost_SplitsHostFromOrgPath pins the contract that callers +// (Docker auth payloads, registry V2 HTTP base URLs) need: the host portion +// must be free of the "/molecule-ai" org suffix that appears in the +// pull-prefix form. Pre-RFC #229, ghcr.io was hardcoded in two places +// (imagewatch + admin_workspace_images auth payload); this helper is the +// single source they should resolve from. +func TestRegistryHost_SplitsHostFromOrgPath(t *testing.T) { + cases := []struct { + name string + env string + want string + }{ + {"default GHCR", "", "ghcr.io"}, + {"AWS ECR mirror", "004947743811.dkr.ecr.us-east-2.amazonaws.com/molecule-ai", "004947743811.dkr.ecr.us-east-2.amazonaws.com"}, + {"self-hosted Gitea", "git.moleculesai.app/molecule-ai", "git.moleculesai.app"}, + // Bare host (no /org) — defensive: return as-is rather than empty. + {"bare host no org-path", "registry.example.com", "registry.example.com"}, + // Multi-level org path — split at the first "/" only. + {"nested org path", "registry.example.com/org/sub", "registry.example.com"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Setenv("MOLECULE_IMAGE_REGISTRY", tc.env) + got := RegistryHost() + if got != tc.want { + t.Errorf("RegistryHost() with env=%q: got %q, want %q", tc.env, got, tc.want) + } + }) + } +} + +// TestRegistryHost_NeverEmpty — guard against a future refactor accidentally +// returning "" for some edge env value. An empty serveraddress in the +// Docker engine auth payload, or an empty host in `https:///v2/...`, would +// silently break image operations. +func TestRegistryHost_NeverEmpty(t *testing.T) { + for _, env := range []string{"", "ghcr.io/molecule-ai", "/leading-slash", "host-only", "host/with/path"} { + t.Setenv("MOLECULE_IMAGE_REGISTRY", env) + if got := RegistryHost(); got == "" { + t.Errorf("RegistryHost() with env=%q returned empty (would break Docker auth + V2 HTTP)", env) + } + } +} + // TestKnownRuntimes_AlphabeticalOrder — pin the order so test snapshots // (and human readers diffing the file) see deterministic output. Adding a // new runtime out of alphabetical order will fail this test, which is the -- 2.45.2