Merge pull request 'fix(platform): A2A proxy ResponseHeaderTimeout 60s → 180s default, env-configurable' (#322) from fix/a2a-proxy-response-header-timeout-clean into staging
All checks were successful
Secret scan / Scan diff for credential-shaped strings (push) Successful in 3s
All checks were successful
Secret scan / Scan diff for credential-shaped strings (push) Successful in 3s
This commit is contained in:
commit
de5d8585c7
@ -57,6 +57,25 @@ jobs:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
# Health check: verify Docker daemon is accessible before attempting any
|
||||
# build steps. This fails loudly at step 1 when the runner's docker.sock
|
||||
# is inaccessible (e.g. permission change, daemon restart, or group-membership
|
||||
# drift) rather than silently continuing to step 2 where `docker build`
|
||||
# fails deep in the process with a cryptic ECR auth error that doesn't
|
||||
# surface the root cause. Also reports the daemon version so operator
|
||||
# can correlate with runner host logs.
|
||||
- name: Verify Docker daemon access
|
||||
run: |
|
||||
set -euo pipefail
|
||||
echo "::group::Docker daemon health check"
|
||||
docker info 2>&1 | head -5 || {
|
||||
echo "::error::Docker daemon is not accessible at /var/run/docker.sock"
|
||||
echo "::error::Check: (1) daemon is running, (2) runner user is in docker group, (3) sock permissions are 660+"
|
||||
exit 1
|
||||
}
|
||||
echo "Docker daemon OK"
|
||||
echo "::endgroup::"
|
||||
|
||||
# Pre-clone manifest deps before docker build.
|
||||
#
|
||||
# Why: workspace-template-* repos on Gitea are private. The pre-fix
|
||||
|
||||
16
.github/workflows/publish-canvas-image.yml
vendored
16
.github/workflows/publish-canvas-image.yml
vendored
@ -54,6 +54,22 @@ jobs:
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0
|
||||
|
||||
# Health check: verify Docker daemon is accessible before attempting any
|
||||
# build steps. This fails loudly at step 1 when the runner's docker.sock
|
||||
# is inaccessible rather than silently continuing to the build step
|
||||
# where docker build fails deep in ECR auth with a cryptic error.
|
||||
- name: Verify Docker daemon access
|
||||
run: |
|
||||
set -euo pipefail
|
||||
echo "::group::Docker daemon health check"
|
||||
docker info 2>&1 | head -5 || {
|
||||
echo "::error::Docker daemon is not accessible at /var/run/docker.sock"
|
||||
echo "::error::Check: (1) daemon running, (2) runner user in docker group, (3) sock perms 660+"
|
||||
exit 1
|
||||
}
|
||||
echo "Docker daemon OK"
|
||||
echo "::endgroup::"
|
||||
|
||||
- name: Compute tags
|
||||
id: tags
|
||||
shell: bash
|
||||
|
||||
@ -107,6 +107,22 @@ jobs:
|
||||
run: |
|
||||
echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# Health check: verify Docker daemon is accessible before attempting any
|
||||
# build steps. This fails loudly at step 1 when the runner's docker.sock
|
||||
# is inaccessible rather than silently continuing to the build step
|
||||
# where docker build fails deep in ECR auth with a cryptic error.
|
||||
- name: Verify Docker daemon access
|
||||
run: |
|
||||
set -euo pipefail
|
||||
echo "::group::Docker daemon health check"
|
||||
docker info 2>&1 | head -5 || {
|
||||
echo "::error::Docker daemon is not accessible at /var/run/docker.sock"
|
||||
echo "::error::Check: (1) daemon running, (2) runner user in docker group, (3) sock perms 660+"
|
||||
exit 1
|
||||
}
|
||||
echo "Docker daemon OK"
|
||||
echo "::endgroup::"
|
||||
|
||||
# Pre-clone manifest deps before docker build (Task #173 fix).
|
||||
#
|
||||
# Why pre-clone: post-2026-05-06, every workspace-template-* repo on
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
services:
|
||||
# digest-pinned 2026-05-10 (sha256:4941ef97aaa2633ce9808f7766f8b8d746dd039ce8c51ca6da185c3dc63ab579, linux/amd64)
|
||||
postgres:
|
||||
image: postgres:16-alpine
|
||||
image: postgres@sha256:4941ef97aaa2633ce9808f7766f8b8d746dd039ce8c51ca6da185c3dc63ab579
|
||||
environment:
|
||||
POSTGRES_USER: ${POSTGRES_USER:-dev}
|
||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-dev}
|
||||
@ -17,7 +18,7 @@ services:
|
||||
retries: 10
|
||||
|
||||
langfuse-db-init:
|
||||
image: postgres:16-alpine
|
||||
image: postgres@sha256:4941ef97aaa2633ce9808f7766f8b8d746dd039ce8c51ca6da185c3dc63ab579
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
@ -36,8 +37,9 @@ services:
|
||||
psql -h postgres -U "$${POSTGRES_USER}" -d postgres -c "CREATE DATABASE langfuse"
|
||||
fi
|
||||
|
||||
# digest-pinned 2026-05-10 (sha256:b1addbe72465a718643cff9e60a58e6df1841e29d6d7d60c9a85d8d72f08d1a7, linux/amd64)
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
image: redis@sha256:b1addbe72465a718643cff9e60a58e6df1841e29d6d7d60c9a85d8d72f08d1a7
|
||||
command: ["redis-server", "--notify-keyspace-events", "KEA"]
|
||||
ports:
|
||||
- "6379:6379"
|
||||
@ -49,8 +51,9 @@ services:
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
|
||||
# digest-pinned 2026-05-10 (sha256:5b296e0ba1da74efea3143c773ddd60245f249fb7c72eb1d866c2d6ebc759fbe, linux/amd64)
|
||||
clickhouse:
|
||||
image: clickhouse/clickhouse-server:24-alpine
|
||||
image: clickhouse/clickhouse-server@sha256:5b296e0ba1da74efea3143c773ddd60245f249fb7c72eb1d866c2d6ebc759fbe
|
||||
environment:
|
||||
CLICKHOUSE_DB: langfuse
|
||||
CLICKHOUSE_USER: langfuse
|
||||
@ -64,8 +67,9 @@ services:
|
||||
retries: 10
|
||||
|
||||
# dev-only: no-auth on 0.0.0.0:7233; production must gate via mTLS or API key
|
||||
# digest-pinned 2026-05-10 (sha256:9ce78f5a7ba7169acb659a8bb7a174a64251c3bfe1553d1fefdd669a59d41df5, linux/amd64)
|
||||
temporal:
|
||||
image: temporalio/auto-setup:1.25
|
||||
image: temporalio/auto-setup@sha256:9ce78f5a7ba7169acb659a8bb7a174a64251c3bfe1553d1fefdd669a59d41df5
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
@ -85,8 +89,9 @@ services:
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
|
||||
# digest-pinned 2026-05-10 (sha256:7be8d6e41d4846ccb718c4f35956c9557512f8085e94a73954286a4e95113703, linux/amd64)
|
||||
temporal-ui:
|
||||
image: temporalio/ui:2.31.2
|
||||
image: temporalio/ui@sha256:7be8d6e41d4846ccb718c4f35956c9557512f8085e94a73954286a4e95113703
|
||||
depends_on:
|
||||
- temporal
|
||||
environment:
|
||||
@ -95,8 +100,9 @@ services:
|
||||
ports:
|
||||
- "8233:8080"
|
||||
|
||||
# digest-pinned 2026-05-10 (sha256:e7aafd3ccf721821b40f8b2251220b4bb8af5e4877b5c5a8846af5b3318aaf1d, linux/amd64)
|
||||
langfuse-web:
|
||||
image: langfuse/langfuse:2
|
||||
image: langfuse/langfuse@sha256:e7aafd3ccf721821b40f8b2251220b4bb8af5e4877b5c5a8846af5b3318aaf1d
|
||||
depends_on:
|
||||
clickhouse:
|
||||
condition: service_healthy
|
||||
|
||||
@ -4,8 +4,9 @@ include:
|
||||
|
||||
services:
|
||||
# --- Infrastructure ---
|
||||
# digest-pinned 2026-05-10 (sha256:4941ef97aaa2633ce9808f7766f8b8d746dd039ce8c51ca6da185c3dc63ab579, linux/amd64)
|
||||
postgres:
|
||||
image: postgres:16-alpine
|
||||
image: postgres@sha256:4941ef97aaa2633ce9808f7766f8b8d746dd039ce8c51ca6da185c3dc63ab579
|
||||
environment:
|
||||
POSTGRES_USER: ${POSTGRES_USER:-dev}
|
||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-dev}
|
||||
@ -25,7 +26,7 @@ services:
|
||||
retries: 10
|
||||
|
||||
langfuse-db-init:
|
||||
image: postgres:16-alpine
|
||||
image: postgres@sha256:4941ef97aaa2633ce9808f7766f8b8d746dd039ce8c51ca6da185c3dc63ab579
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
@ -46,8 +47,9 @@ services:
|
||||
networks:
|
||||
- molecule-core-net
|
||||
|
||||
# digest-pinned 2026-05-10 (sha256:b1addbe72465a718643cff9e60a58e6df1841e29d6d7d60c9a85d8d72f08d1a7, linux/amd64)
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
image: redis@sha256:b1addbe72465a718643cff9e60a58e6df1841e29d6d7d60c9a85d8d72f08d1a7
|
||||
command: ["redis-server", "--notify-keyspace-events", "KEA"]
|
||||
ports:
|
||||
- "6379:6379"
|
||||
@ -63,8 +65,9 @@ services:
|
||||
retries: 10
|
||||
|
||||
# --- Observability ---
|
||||
# digest-pinned 2026-05-10 (sha256:5b296e0ba1da74efea3143c773ddd60245f249fb7c72eb1d866c2d6ebc759fbe, linux/amd64)
|
||||
langfuse-clickhouse:
|
||||
image: clickhouse/clickhouse-server:24-alpine
|
||||
image: clickhouse/clickhouse-server@sha256:5b296e0ba1da74efea3143c773ddd60245f249fb7c72eb1d866c2d6ebc759fbe
|
||||
environment:
|
||||
CLICKHOUSE_DB: langfuse
|
||||
CLICKHOUSE_USER: langfuse
|
||||
@ -79,8 +82,9 @@ services:
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
|
||||
# digest-pinned 2026-05-10 (sha256:e7aafd3ccf721821b40f8b2251220b4bb8af5e4877b5c5a8846af5b3318aaf1d, linux/amd64)
|
||||
langfuse:
|
||||
image: langfuse/langfuse:2
|
||||
image: langfuse/langfuse@sha256:e7aafd3ccf721821b40f8b2251220b4bb8af5e4877b5c5a8846af5b3318aaf1d
|
||||
depends_on:
|
||||
langfuse-clickhouse:
|
||||
condition: service_healthy
|
||||
@ -239,6 +243,8 @@ services:
|
||||
# First-time local setup or testing unreleased changes — build from source:
|
||||
# docker compose build canvas && docker compose up -d canvas
|
||||
# Note: ECR images require AWS auth — `aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 153263036946.dkr.ecr.us-east-2.amazonaws.com` before pull.
|
||||
# Digest-pin requires: aws ecr describe-images --repository-name molecule-ai/canvas --image-tags latest --query 'imageDetails[0].imageDigest'
|
||||
# TODO: pin canvas ECR image digest once AWS creds are available in CI.
|
||||
image: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/canvas:latest
|
||||
build:
|
||||
context: ./canvas
|
||||
@ -279,8 +285,10 @@ services:
|
||||
# And use model names from infra/litellm_config.yml (e.g. "claude-opus-4-5",
|
||||
# "gpt-4o", "openrouter/deepseek-r1", "ollama/llama3.2").
|
||||
# Edit infra/litellm_config.yml to add/remove providers and models.
|
||||
# digest-pinned 2026-05-10 (sha256:7c311546c25e7bb6e8cafede9fcd3d0d622ac636b5c9418befaa32e85dfb0186)
|
||||
# Refresh: curl -sI https://ghcr.io/v2/berriai/litellm/manifests/main-latest (Docker-Content-Digest header)
|
||||
litellm:
|
||||
image: ghcr.io/berriai/litellm:main-latest
|
||||
image: ghcr.io/berriai/litellm/main-latest@sha256:7c311546c25e7bb6e8cafede9fcd3d0d622ac636b5c9418befaa32e85dfb0186
|
||||
profiles:
|
||||
- multi-provider
|
||||
ports:
|
||||
@ -311,8 +319,10 @@ services:
|
||||
# docker compose exec ollama ollama pull qwen2.5-coder:7b
|
||||
# Then set MODEL_PROVIDER=ollama:llama3.2 in your workspace config.yaml
|
||||
# Workspace agents reach Ollama at http://ollama:11434 (internal Docker network).
|
||||
# digest-pinned 2026-05-10 (sha256:90bd8ed1ad1853fbfb1ef5835f9d7a24fe890e05ace521e2d8d7a6f56bb667dd, linux/amd64)
|
||||
# Refresh: curl -s https://hub.docker.com/v2/repositories/ollama/ollama/tags/latest | python3 -c "import json,sys; ..."
|
||||
ollama:
|
||||
image: ollama/ollama:latest
|
||||
image: ollama/ollama@sha256:90bd8ed1ad1853fbfb1ef5835f9d7a24fe890e05ace521e2d8d7a6f56bb667dd
|
||||
profiles:
|
||||
- local-models
|
||||
ports:
|
||||
|
||||
@ -21,6 +21,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/envx"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
@ -110,11 +111,14 @@ const maxProxyResponseBody = 10 << 20
|
||||
// a generic 502 page to canvas. 10s is well above realistic intra-region
|
||||
// latencies and well below CF's edge timeout.
|
||||
//
|
||||
// 3. Transport.ResponseHeaderTimeout — 60s. From request-body-end to
|
||||
// response-headers-start. Covers cold-start first-byte (the 30-60s OAuth
|
||||
// flow above), with margin. Body streaming after headers is governed by
|
||||
// the per-request context deadline, NOT this timeout — so multi-minute
|
||||
// agent responses still work fine.
|
||||
// 3. Transport.ResponseHeaderTimeout — 180s default. From request-body-end
|
||||
// to response-headers-start. Configurable via
|
||||
// A2A_PROXY_RESPONSE_HEADER_TIMEOUT (envx.Duration). Covers cold-start
|
||||
// first-byte (30-60s OAuth flow above) with enough room for Opus agent
|
||||
// turns (big context + internal delegate_task round-trips routinely exceed
|
||||
// the old 60s ceiling). Body streaming after headers is governed by the
|
||||
// per-request context deadline, NOT this timeout — so multi-minute agent
|
||||
// responses still work fine.
|
||||
//
|
||||
// The point of (2) and (3) is to surface a *structured* 503 from
|
||||
// handleA2ADispatchError when the workspace agent is unreachable, so canvas
|
||||
@ -127,7 +131,7 @@ var a2aClient = &http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
KeepAlive: 30 * time.Second,
|
||||
}).DialContext,
|
||||
ResponseHeaderTimeout: 60 * time.Second,
|
||||
ResponseHeaderTimeout: envx.Duration("A2A_PROXY_RESPONSE_HEADER_TIMEOUT", 180*time.Second),
|
||||
TLSHandshakeTimeout: 10 * time.Second,
|
||||
// MaxIdleConns / IdleConnTimeout: stdlib defaults are fine; agent
|
||||
// fan-in is bounded by the platform's broadcaster fan-out, not by
|
||||
|
||||
@ -2276,3 +2276,43 @@ func TestProxyA2A_PollMode_FailsClosedToPush(t *testing.T) {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== a2aClient ResponseHeaderTimeout config ====================
|
||||
|
||||
func TestA2AClientResponseHeaderTimeout(t *testing.T) {
|
||||
const defaultTimeout = 180 * time.Second
|
||||
|
||||
// Default (unset env) — a2aClient was initialised at package load time.
|
||||
if a2aClient.Transport.(*http.Transport).ResponseHeaderTimeout != defaultTimeout {
|
||||
t.Errorf("a2aClient default ResponseHeaderTimeout = %v, want %v",
|
||||
a2aClient.Transport.(*http.Transport).ResponseHeaderTimeout, defaultTimeout)
|
||||
}
|
||||
|
||||
// Env var override — verify parsing logic inline since a2aClient is
|
||||
// initialised once at package load (env already consumed at import time).
|
||||
t.Run("A2A_PROXY_RESPONSE_HEADER_TIMEOUT parsed correctly", func(t *testing.T) {
|
||||
// We can't re-initialise a2aClient, but we can verify the same
|
||||
// envx.Duration logic inline for the 5m override case.
|
||||
t.Setenv("A2A_PROXY_RESPONSE_HEADER_TIMEOUT", "5m")
|
||||
if d, err := time.ParseDuration("5m"); err == nil && d > 0 {
|
||||
if d != 5*time.Minute {
|
||||
t.Errorf("ParseDuration(\"5m\") = %v, want 5m", d)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("invalid A2A_PROXY_RESPONSE_HEADER_TIMEOUT falls back to default", func(t *testing.T) {
|
||||
t.Setenv("A2A_PROXY_RESPONSE_HEADER_TIMEOUT", "not-a-duration")
|
||||
// Simulate what envx.Duration does with an invalid value.
|
||||
var fallback = 180 * time.Second
|
||||
override := fallback
|
||||
if v := os.Getenv("A2A_PROXY_RESPONSE_HEADER_TIMEOUT"); v != "" {
|
||||
if d, err := time.ParseDuration(v); err == nil && d > 0 {
|
||||
override = d
|
||||
}
|
||||
}
|
||||
if override != fallback {
|
||||
t.Errorf("invalid env var: got %v, want fallback %v", override, fallback)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@ -71,10 +71,17 @@ func TemplateImageRef(runtime string) string {
|
||||
|
||||
// ghcrAuthHeader returns the base64-encoded JSON auth payload Docker's
|
||||
// ImagePull expects in PullOptions.RegistryAuth, or empty string when no
|
||||
// GHCR_USER/GHCR_TOKEN env is set (lets public images pull through).
|
||||
// GHCR_USER/GHCR_TOKEN env is set (lets public images pull through and lets
|
||||
// ECR's credential-helper-driven flow take over without a stale GHCR
|
||||
// payload masking it).
|
||||
//
|
||||
// The Docker SDK doesn't read ~/.docker/config.json — every authenticated
|
||||
// pull needs an explicit RegistryAuth string.
|
||||
// pull needs an explicit RegistryAuth string. The serveraddress field is
|
||||
// resolved from provisioner.RegistryHost() so it tracks MOLECULE_IMAGE_REGISTRY
|
||||
// when the operator points the platform at a private mirror (e.g. ECR).
|
||||
// Leaving it hardcoded to "ghcr.io" caused the engine to match the wrong
|
||||
// auth entry post-suspension when MOLECULE_IMAGE_REGISTRY was flipped to
|
||||
// the AWS ECR mirror (RFC #229).
|
||||
func ghcrAuthHeader() string {
|
||||
user := strings.TrimSpace(os.Getenv("GHCR_USER"))
|
||||
token := strings.TrimSpace(os.Getenv("GHCR_TOKEN"))
|
||||
@ -84,7 +91,7 @@ func ghcrAuthHeader() string {
|
||||
payload := map[string]string{
|
||||
"username": user,
|
||||
"password": token,
|
||||
"serveraddress": "ghcr.io",
|
||||
"serveraddress": provisioner.RegistryHost(),
|
||||
}
|
||||
js, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
|
||||
@ -9,6 +9,7 @@ import (
|
||||
func TestGHCRAuthHeader_NoEnvReturnsEmpty(t *testing.T) {
|
||||
t.Setenv("GHCR_USER", "")
|
||||
t.Setenv("GHCR_TOKEN", "")
|
||||
t.Setenv("MOLECULE_IMAGE_REGISTRY", "")
|
||||
if got := ghcrAuthHeader(); got != "" {
|
||||
t.Errorf("expected empty (no auth → public-only), got %q", got)
|
||||
}
|
||||
@ -29,6 +30,10 @@ func TestGHCRAuthHeader_PartialEnvReturnsEmpty(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestGHCRAuthHeader_EncodesDockerEnginePayload(t *testing.T) {
|
||||
// Default registry env (unset → ghcr.io/molecule-ai) means the
|
||||
// serveraddress field should resolve to ghcr.io. Pin both env vars so the
|
||||
// test is hermetic regardless of the host's MOLECULE_IMAGE_REGISTRY.
|
||||
t.Setenv("MOLECULE_IMAGE_REGISTRY", "")
|
||||
t.Setenv("GHCR_USER", "alice")
|
||||
t.Setenv("GHCR_TOKEN", "fake-tok-value")
|
||||
got := ghcrAuthHeader()
|
||||
@ -54,7 +59,41 @@ func TestGHCRAuthHeader_EncodesDockerEnginePayload(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestGHCRAuthHeader_RespectsRegistryEnv pins the RFC #229 fix: when
|
||||
// MOLECULE_IMAGE_REGISTRY points at a private mirror (e.g. AWS ECR), the
|
||||
// Docker engine auth payload's serveraddress must reflect that mirror's
|
||||
// host so credential matching lands on the right entry. Pre-fix this was
|
||||
// hardcoded to "ghcr.io" and silently dropped the override.
|
||||
func TestGHCRAuthHeader_RespectsRegistryEnv(t *testing.T) {
|
||||
t.Setenv("GHCR_USER", "alice")
|
||||
t.Setenv("GHCR_TOKEN", "fake-tok-value")
|
||||
t.Setenv("MOLECULE_IMAGE_REGISTRY", "004947743811.dkr.ecr.us-east-2.amazonaws.com/molecule-ai")
|
||||
|
||||
got := ghcrAuthHeader()
|
||||
if got == "" {
|
||||
t.Fatal("expected non-empty auth header")
|
||||
}
|
||||
raw, err := base64.URLEncoding.DecodeString(got)
|
||||
if err != nil {
|
||||
t.Fatalf("auth header is not valid base64-url: %v", err)
|
||||
}
|
||||
var payload map[string]string
|
||||
if err := json.Unmarshal(raw, &payload); err != nil {
|
||||
t.Fatalf("decoded auth is not valid JSON: %v (raw=%s)", err, raw)
|
||||
}
|
||||
want := "004947743811.dkr.ecr.us-east-2.amazonaws.com"
|
||||
if payload["serveraddress"] != want {
|
||||
t.Errorf("serveraddress: got %q, want %q (must follow MOLECULE_IMAGE_REGISTRY host)",
|
||||
payload["serveraddress"], want)
|
||||
}
|
||||
// Sanity: the org-path portion must NOT leak into serveraddress.
|
||||
if payload["serveraddress"] == "004947743811.dkr.ecr.us-east-2.amazonaws.com/molecule-ai" {
|
||||
t.Error("serveraddress must be host-only, not host+org-path")
|
||||
}
|
||||
}
|
||||
|
||||
func TestGHCRAuthHeader_TrimsWhitespace(t *testing.T) {
|
||||
t.Setenv("MOLECULE_IMAGE_REGISTRY", "")
|
||||
// .env lines often have trailing newlines or accidental spaces. Without
|
||||
// trimming, a stray space would produce an auth payload the engine
|
||||
// rejects with a confusing 401.
|
||||
|
||||
@ -121,7 +121,7 @@ curl -fsS -X POST "{{PLATFORM_URL}}/registry/register" \
|
||||
// operators whose external agent IS a Claude Code session (laptop or
|
||||
// remote dev VM); routes the workspace's A2A traffic into the running
|
||||
// Claude Code session as conversation turns via MCP. The plugin source
|
||||
// lives at github.com/Molecule-AI/molecule-mcp-claude-channel — polling
|
||||
// lives at git.moleculesai.app/molecule-ai/molecule-mcp-claude-channel — polling
|
||||
// based, no tunnel required (uses /workspaces/:id/activity?since_secs=,
|
||||
// platform-side support shipped in #2300).
|
||||
const externalChannelTemplate = `# Claude Code channel — bridges this workspace's A2A traffic into your
|
||||
@ -134,8 +134,8 @@ const externalChannelTemplate = `# Claude Code channel — bridges this workspac
|
||||
# The plugin is NOT on Anthropic's default allowlist, so a one-time
|
||||
# marketplace-add is needed before install:
|
||||
#
|
||||
# /plugin marketplace add Molecule-AI/molecule-mcp-claude-channel
|
||||
# /plugin install molecule@molecule-mcp-claude-channel
|
||||
# /plugin marketplace add https://git.moleculesai.app/molecule-ai/molecule-mcp-claude-channel.git
|
||||
# /plugin install molecule@molecule-channel
|
||||
#
|
||||
# Then either run /reload-plugins or restart Claude Code so the
|
||||
# plugin is registered.
|
||||
@ -154,7 +154,7 @@ chmod 600 ~/.claude/channels/molecule/.env
|
||||
# flag to opt in — without it, you'll see "not on the approved channels
|
||||
# allowlist" on startup.
|
||||
claude --dangerously-load-development-channels \
|
||||
--channels plugin:molecule@molecule-mcp-claude-channel
|
||||
--channels plugin:molecule@molecule-channel
|
||||
|
||||
# You should see on stderr:
|
||||
# molecule channel: connected — watching 1 workspace(s) at {{PLATFORM_URL}}
|
||||
@ -176,7 +176,7 @@ claude --dangerously-load-development-channels \
|
||||
# add the plugin to allowedChannelPlugins in claude.ai admin settings.
|
||||
#
|
||||
# Multi-workspace: comma-separate IDs and tokens (same order). See
|
||||
# https://github.com/Molecule-AI/molecule-mcp-claude-channel for
|
||||
# https://git.moleculesai.app/molecule-ai/molecule-mcp-claude-channel for
|
||||
# pairing flow, push-mode upgrade, and v0.2 roadmap.
|
||||
|
||||
# Need help?
|
||||
@ -258,7 +258,7 @@ claude mcp add molecule -s user -- env \
|
||||
// externalPythonTemplate uses molecule-sdk-python's RemoteAgentClient +
|
||||
// A2AServer (PR #13 in that repo). Until the SDK cuts a v0.y release
|
||||
// to PyPI the snippet pins git+main.
|
||||
const externalPythonTemplate = `# pip install 'git+https://github.com/Molecule-AI/molecule-sdk-python.git@main'
|
||||
const externalPythonTemplate = `# pip install 'git+https://git.moleculesai.app/molecule-ai/molecule-sdk-python.git@main'
|
||||
|
||||
import asyncio
|
||||
from molecule_agent import RemoteAgentClient, A2AServer
|
||||
@ -307,7 +307,7 @@ if __name__ == "__main__":
|
||||
// A2A traffic into the running hermes gateway as platform messages
|
||||
// via the molecule-channel plugin.
|
||||
//
|
||||
// The plugin (Molecule-AI/hermes-channel-molecule) is a hermes
|
||||
// The plugin (molecule-ai/hermes-channel-molecule on Gitea) is a hermes
|
||||
// platform adapter that:
|
||||
// 1. Spawns ``python -m molecule_runtime.a2a_mcp_server`` as a
|
||||
// stdio MCP subprocess (separate from any hermes-side MCP
|
||||
@ -336,7 +336,7 @@ const externalHermesChannelTemplate = `# Hermes channel — bridges this workspa
|
||||
#
|
||||
# 1. Install the runtime + plugin:
|
||||
pip install molecule-ai-workspace-runtime
|
||||
pip install 'git+https://github.com/Molecule-AI/hermes-channel-molecule.git'
|
||||
pip install 'git+https://git.moleculesai.app/molecule-ai/hermes-channel-molecule.git'
|
||||
|
||||
# 2. Export the workspace credentials:
|
||||
export MOLECULE_WORKSPACE_ID={{WORKSPACE_ID}}
|
||||
@ -366,7 +366,7 @@ hermes gateway --replace
|
||||
# by the plugin's molecule_runtime MCP subprocess).
|
||||
#
|
||||
# Source + issue tracker:
|
||||
# https://github.com/Molecule-AI/hermes-channel-molecule
|
||||
# https://git.moleculesai.app/molecule-ai/hermes-channel-molecule
|
||||
|
||||
# Need help?
|
||||
# Documentation: https://doc.moleculesai.app/docs/guides/external-agent-registration
|
||||
|
||||
@ -75,3 +75,46 @@ func TestExternalMcpTemplates_UseMoleculeMcpWrapper(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestExternalTemplates_NoBrokenMoleculeAIGitHubURLs pins the invariant
|
||||
// that operator-facing snippets never embed github.com URLs pointing at
|
||||
// Molecule-AI repos.
|
||||
//
|
||||
// Why: the Molecule-AI GitHub org was suspended 2026-05-06 and the
|
||||
// canonical SCM is now git.moleculesai.app. Any `pip install
|
||||
// git+https://github.com/Molecule-AI/...` or marketplace-add Molecule-AI/
|
||||
// URL emitted to an external operator hits a 404 / org-suspended page,
|
||||
// breaking onboarding silently. RFC #229 P2-5.
|
||||
//
|
||||
// Third-party github URLs (gin, openai/codex, NousResearch/hermes-agent
|
||||
// upstream issue trackers, npm @openai/codex) remain valid — only
|
||||
// Molecule-AI/ paths are broken.
|
||||
func TestExternalTemplates_NoBrokenMoleculeAIGitHubURLs(t *testing.T) {
|
||||
templates := map[string]string{
|
||||
"externalCurlTemplate": externalCurlTemplate,
|
||||
"externalChannelTemplate": externalChannelTemplate,
|
||||
"externalUniversalMcpTemplate": externalUniversalMcpTemplate,
|
||||
"externalPythonTemplate": externalPythonTemplate,
|
||||
"externalHermesChannelTemplate": externalHermesChannelTemplate,
|
||||
"externalCodexTemplate": externalCodexTemplate,
|
||||
"externalOpenClawTemplate": externalOpenClawTemplate,
|
||||
}
|
||||
// Substrings that imply the snippet is pointing an operator at the
|
||||
// suspended Molecule-AI GitHub org.
|
||||
bannedSubstrings := []string{
|
||||
"github.com/Molecule-AI/",
|
||||
"github.com/molecule-ai/",
|
||||
// Bare `Molecule-AI/<repo>` form used by `/plugin marketplace add`
|
||||
// resolves through GitHub by default — explicit Gitea URL is
|
||||
// required post-suspension.
|
||||
"marketplace add Molecule-AI/",
|
||||
"marketplace add molecule-ai/",
|
||||
}
|
||||
for name, body := range templates {
|
||||
for _, banned := range bannedSubstrings {
|
||||
if strings.Contains(body, banned) {
|
||||
t.Errorf("%s contains %q — Molecule-AI GitHub org is suspended; use git.moleculesai.app/molecule-ai/<repo> instead (RFC #229 P2-5)", name, banned)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -29,6 +29,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/handlers"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
)
|
||||
|
||||
// DefaultInterval is the polling cadence. Runtime publishes happen at most
|
||||
@ -127,20 +128,32 @@ func (w *Watcher) tick(ctx context.Context, fetch digestFetcher) {
|
||||
}
|
||||
}
|
||||
|
||||
// remoteDigest queries GHCR for the current manifest digest of the
|
||||
// workspace-template-<runtime>:latest image. Uses the Docker Registry V2
|
||||
// HTTP API: get a bearer token, then HEAD the manifest.
|
||||
// remoteDigest queries the configured registry for the current manifest
|
||||
// digest of the workspace-template-<runtime>:latest image. Uses the Docker
|
||||
// Registry V2 HTTP API: get a bearer token, then HEAD the manifest.
|
||||
//
|
||||
// Registry host is resolved from provisioner.RegistryHost() so the watcher
|
||||
// follows MOLECULE_IMAGE_REGISTRY in production tenants. Pre-RFC #229 this
|
||||
// was hardcoded to ghcr.io, which silently broke image-watch in tenants
|
||||
// pointed at the AWS ECR mirror.
|
||||
//
|
||||
// Auth: if GHCR_USER+GHCR_TOKEN are set, basic-auth the token request
|
||||
// (works for both public and private images). If unset, anonymous token
|
||||
// (works for public images only — every workspace template is public).
|
||||
//
|
||||
// NOTE: the bearer-token negotiation in fetchPullToken speaks GHCR's
|
||||
// `/token` flavor of the Docker Registry V2 spec. ECR uses a different
|
||||
// auth path (`aws ecr get-authorization-token` → SigV4 + basic-auth header).
|
||||
// Wiring ECR auth here is tracked as a follow-up; until then, operators on
|
||||
// ECR should keep IMAGE_AUTO_REFRESH=false and the watcher will fail loudly
|
||||
// at the token fetch instead of pulling from ghcr.io behind their back.
|
||||
func (w *Watcher) remoteDigest(ctx context.Context, runtime string) (string, error) {
|
||||
repo := "molecule-ai/workspace-template-" + runtime
|
||||
tok, err := w.fetchPullToken(ctx, repo)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("pull token: %w", err)
|
||||
}
|
||||
manifestURL := fmt.Sprintf("https://ghcr.io/v2/%s/manifests/latest", repo)
|
||||
manifestURL := fmt.Sprintf("https://%s/v2/%s/manifests/latest", provisioner.RegistryHost(), repo)
|
||||
req, err := http.NewRequestWithContext(ctx, "HEAD", manifestURL, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
@ -171,14 +184,22 @@ func (w *Watcher) remoteDigest(ctx context.Context, runtime string) (string, err
|
||||
return digest, nil
|
||||
}
|
||||
|
||||
// fetchPullToken negotiates a short-lived bearer token from GHCR's token
|
||||
// endpoint scoped to repo:pull. GHCR requires a token even for anonymous
|
||||
// pulls of public images.
|
||||
// fetchPullToken negotiates a short-lived bearer token from the registry's
|
||||
// `/token` endpoint scoped to repo:pull. GHCR requires a token even for
|
||||
// anonymous pulls of public images.
|
||||
//
|
||||
// Registry host follows provisioner.RegistryHost() so the request goes to
|
||||
// the same registry the rest of the platform pulls from. The `service`
|
||||
// query parameter mirrors the host because GHCR (and most registries
|
||||
// implementing the Docker Registry V2 token spec) validate it against the
|
||||
// realm/service the auth challenge advertised. ECR doesn't implement this
|
||||
// flow — see remoteDigest's note on the ECR auth follow-up.
|
||||
func (w *Watcher) fetchPullToken(ctx context.Context, repo string) (string, error) {
|
||||
host := provisioner.RegistryHost()
|
||||
q := url.Values{}
|
||||
q.Set("service", "ghcr.io")
|
||||
q.Set("service", host)
|
||||
q.Set("scope", "repository:"+repo+":pull")
|
||||
tokURL := "https://ghcr.io/token?" + q.Encode()
|
||||
tokURL := "https://" + host + "/token?" + q.Encode()
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", tokURL, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
|
||||
@ -3,6 +3,9 @@ package imagewatch
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
|
||||
@ -160,6 +163,100 @@ func TestTick_DigestFetchErrorSkipsRuntime(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestRemoteDigest_RegistryHostFollowsEnv pins the RFC #229 fix: with
|
||||
// MOLECULE_IMAGE_REGISTRY pointed at a private mirror, the watcher's HTTP
|
||||
// calls (token endpoint + manifest HEAD) must hit that mirror's host, not
|
||||
// the hardcoded ghcr.io of the pre-fix code path. We stand up an httptest
|
||||
// server, point MOLECULE_IMAGE_REGISTRY at its host, and assert both
|
||||
// endpoints get hit on it.
|
||||
//
|
||||
// Without this test, a future refactor could revert the helper indirection
|
||||
// and the watcher would silently go back to talking to ghcr.io even when
|
||||
// the platform is configured for ECR — exactly the bug RFC #229 is closing.
|
||||
func TestRemoteDigest_RegistryHostFollowsEnv(t *testing.T) {
|
||||
var (
|
||||
mu sync.Mutex
|
||||
tokenHits int
|
||||
manifestHits int
|
||||
lastTokenURL string
|
||||
lastManifestURL string
|
||||
)
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
switch {
|
||||
case strings.HasPrefix(r.URL.Path, "/token"):
|
||||
tokenHits++
|
||||
lastTokenURL = r.URL.String()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write([]byte(`{"token":"fake-bearer"}`))
|
||||
case strings.HasPrefix(r.URL.Path, "/v2/") && strings.Contains(r.URL.Path, "/manifests/latest"):
|
||||
manifestHits++
|
||||
lastManifestURL = r.URL.Path
|
||||
w.Header().Set("Docker-Content-Digest", "sha256:cafef00d")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
default:
|
||||
w.WriteHeader(http.StatusNotFound)
|
||||
}
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
// httptest.Server.URL is "http://127.0.0.1:NNNN". RegistryHost() works
|
||||
// over the host:port portion (provisioner.RegistryPrefix takes the env
|
||||
// verbatim), so we strip the scheme and append "/molecule-ai" to mimic
|
||||
// the prefix shape MOLECULE_IMAGE_REGISTRY actually uses in production.
|
||||
host := strings.TrimPrefix(srv.URL, "http://")
|
||||
t.Setenv("MOLECULE_IMAGE_REGISTRY", host+"/molecule-ai")
|
||||
|
||||
w := newTestWatcher(&fakeRefresher{}, "claude-code")
|
||||
// Use the test-server URL scheme by overriding the http client only —
|
||||
// remoteDigest constructs https://<host>/... internally. We need the
|
||||
// watcher to hit our http server, so swap the URL scheme by injecting
|
||||
// a transport that rewrites https→http for this test.
|
||||
w.http = &http.Client{Transport: rewriteToHTTP{}}
|
||||
|
||||
digest, err := w.remoteDigest(context.Background(), "claude-code")
|
||||
if err != nil {
|
||||
t.Fatalf("remoteDigest failed: %v", err)
|
||||
}
|
||||
if digest != "sha256:cafef00d" {
|
||||
t.Errorf("digest: got %q, want sha256:cafef00d", digest)
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
if tokenHits != 1 {
|
||||
t.Errorf("token endpoint hits: got %d, want 1 (watcher must hit configured registry, not ghcr.io)", tokenHits)
|
||||
}
|
||||
if manifestHits != 1 {
|
||||
t.Errorf("manifest HEAD hits: got %d, want 1 (watcher must hit configured registry, not ghcr.io)", manifestHits)
|
||||
}
|
||||
// service= query param must reflect the configured host so registries
|
||||
// that validate the param (GHCR-style spec) accept the request.
|
||||
if !strings.Contains(lastTokenURL, "service="+host) && !strings.Contains(lastTokenURL, "service=127.0.0.1") {
|
||||
t.Errorf("token URL service param not host-derived: got %q", lastTokenURL)
|
||||
}
|
||||
wantManifestPath := "/v2/molecule-ai/workspace-template-claude-code/manifests/latest"
|
||||
if lastManifestURL != wantManifestPath {
|
||||
t.Errorf("manifest path: got %q, want %q", lastManifestURL, wantManifestPath)
|
||||
}
|
||||
}
|
||||
|
||||
// rewriteToHTTP is a tiny RoundTripper that flips https→http so the watcher
|
||||
// (which builds https URLs from the configured registry host) can target an
|
||||
// httptest.Server that only speaks http. Production code paths still go
|
||||
// over https; this is a unit-test seam only.
|
||||
type rewriteToHTTP struct{}
|
||||
|
||||
func (rewriteToHTTP) RoundTrip(req *http.Request) (*http.Response, error) {
|
||||
if req.URL.Scheme == "https" {
|
||||
clone := req.Clone(req.Context())
|
||||
clone.URL.Scheme = "http"
|
||||
req = clone
|
||||
}
|
||||
return http.DefaultTransport.RoundTrip(req)
|
||||
}
|
||||
|
||||
func TestShortDigest(t *testing.T) {
|
||||
cases := map[string]string{
|
||||
"sha256:abcdef0123456789": "sha256:abcdef012345",
|
||||
|
||||
@ -3,6 +3,7 @@ package provisioner
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// defaultRegistryPrefix is the upstream OSS face for all workspace template
|
||||
@ -62,6 +63,32 @@ func RegistryPrefix() string {
|
||||
return defaultRegistryPrefix
|
||||
}
|
||||
|
||||
// RegistryHost returns just the registry host portion of RegistryPrefix() —
|
||||
// i.e. everything before the first "/" separator. This is the value that
|
||||
// belongs in:
|
||||
//
|
||||
// - Docker Engine PullOptions.RegistryAuth payloads (`serveraddress` field)
|
||||
// — the engine matches credentials against host, not host+org-path.
|
||||
// - Docker Registry V2 HTTP API base URLs (e.g. `https://<host>/v2/...`)
|
||||
// — the V2 API is host-rooted; the org-path lives in the manifest path.
|
||||
//
|
||||
// Examples:
|
||||
//
|
||||
// "ghcr.io/molecule-ai" → "ghcr.io"
|
||||
// "123456789012.dkr.ecr.us-east-2.amazonaws.com/molecule-ai" → "123456789012.dkr.ecr.us-east-2.amazonaws.com"
|
||||
// "git.moleculesai.app/molecule-ai" → "git.moleculesai.app"
|
||||
//
|
||||
// If RegistryPrefix() ever returns a bare host (no `/`), we return it as-is
|
||||
// rather than letting strings.SplitN produce an empty string — defensive
|
||||
// against a misconfiguration where the operator sets just the host.
|
||||
func RegistryHost() string {
|
||||
prefix := RegistryPrefix()
|
||||
if i := strings.IndexByte(prefix, '/'); i > 0 {
|
||||
return prefix[:i]
|
||||
}
|
||||
return prefix
|
||||
}
|
||||
|
||||
// RuntimeImage returns the canonical image reference for the given runtime,
|
||||
// using the current RegistryPrefix() and the moving `:latest` tag.
|
||||
//
|
||||
|
||||
@ -127,6 +127,50 @@ func TestComputeRuntimeImages_ReflectsCurrentEnv(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestRegistryHost_SplitsHostFromOrgPath pins the contract that callers
|
||||
// (Docker auth payloads, registry V2 HTTP base URLs) need: the host portion
|
||||
// must be free of the "/molecule-ai" org suffix that appears in the
|
||||
// pull-prefix form. Pre-RFC #229, ghcr.io was hardcoded in two places
|
||||
// (imagewatch + admin_workspace_images auth payload); this helper is the
|
||||
// single source they should resolve from.
|
||||
func TestRegistryHost_SplitsHostFromOrgPath(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
env string
|
||||
want string
|
||||
}{
|
||||
{"default GHCR", "", "ghcr.io"},
|
||||
{"AWS ECR mirror", "004947743811.dkr.ecr.us-east-2.amazonaws.com/molecule-ai", "004947743811.dkr.ecr.us-east-2.amazonaws.com"},
|
||||
{"self-hosted Gitea", "git.moleculesai.app/molecule-ai", "git.moleculesai.app"},
|
||||
// Bare host (no /org) — defensive: return as-is rather than empty.
|
||||
{"bare host no org-path", "registry.example.com", "registry.example.com"},
|
||||
// Multi-level org path — split at the first "/" only.
|
||||
{"nested org path", "registry.example.com/org/sub", "registry.example.com"},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
t.Setenv("MOLECULE_IMAGE_REGISTRY", tc.env)
|
||||
got := RegistryHost()
|
||||
if got != tc.want {
|
||||
t.Errorf("RegistryHost() with env=%q: got %q, want %q", tc.env, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestRegistryHost_NeverEmpty — guard against a future refactor accidentally
|
||||
// returning "" for some edge env value. An empty serveraddress in the
|
||||
// Docker engine auth payload, or an empty host in `https:///v2/...`, would
|
||||
// silently break image operations.
|
||||
func TestRegistryHost_NeverEmpty(t *testing.T) {
|
||||
for _, env := range []string{"", "ghcr.io/molecule-ai", "/leading-slash", "host-only", "host/with/path"} {
|
||||
t.Setenv("MOLECULE_IMAGE_REGISTRY", env)
|
||||
if got := RegistryHost(); got == "" {
|
||||
t.Errorf("RegistryHost() with env=%q returned empty (would break Docker auth + V2 HTTP)", env)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestKnownRuntimes_AlphabeticalOrder — pin the order so test snapshots
|
||||
// (and human readers diffing the file) see deterministic output. Adding a
|
||||
// new runtime out of alphabetical order will fail this test, which is the
|
||||
|
||||
Loading…
Reference in New Issue
Block a user