forked from molecule-ai/molecule-core
Replaces the legacy nohup `go run ./cmd/server` setup with a fully
containerized local stack: postgres + redis + platform + canvas, all
with `restart: unless-stopped` so they survive Mac sleep/wake and
Docker Desktop daemon restarts.
## Changes
- **docker-compose.yml**
- `restart: unless-stopped` on platform/postgres/redis
- `BIND_ADDR=0.0.0.0` for platform — the dev-mode fail-open default
of 127.0.0.1 (PR #7) binds the server to the container's own loopback
interface, so the host cannot reach it even with the port published.
The container's network namespace is already isolated, so binding
all interfaces inside it is safe.
- Healthchecks switched from `wget --spider` (HEAD → 404 forever
because /health is GET-only) to `wget -qO /dev/null` (GET).
Same regression existed on canvas; fixed both.
- **workspace-server/Dockerfile.dev**
- `CGO_ENABLED=1` → `0` to match prod Dockerfile + Dockerfile.tenant.
Without this, the alpine dev image fails with "gcc: not found"
because workspace-server has no actual cgo deps but the env was
forcing the cgo build path. Closes a divergence introduced in
9d50a6da (today's air hot-reload PR).
- **canvas/Dockerfile**
- `npm install` → `npm ci --include=optional` for lockfile-exact
installs that include platform-specific @tailwindcss/oxide native
binaries. Without these, `next build` fails with "Cannot read
properties of undefined (reading 'All')" on the
`@import "tailwindcss"` directive.
- **canvas/.dockerignore** (new)
- Excludes `node_modules` and `.next` so the Dockerfile's
`COPY . .` step doesn't clobber the freshly-installed container
node_modules with the host's (potentially stale or wrong-arch)
copy. This was the actual root cause of the canvas build break.
- **workspace-server/.gitignore**
- Adds `/tmp/` for air's live-reload build cache.
## Stage A verified
```
container status restart
postgres-1 Up (healthy) unless-stopped
redis-1 Up (healthy) unless-stopped
platform-1 Up (healthy, air-mode) unless-stopped
canvas-1 Up (healthy) unless-stopped
GET :8080/health → 200
GET :3000/ → 200
DB preserved: 407 workspace rows + 5 named personas
Persona mount: 28 dirs at /etc/molecule-bootstrap/personas
```
## Stage B — N/A
This is local-dev infrastructure only. None of these files ship to
SaaS tenants — production EC2s use `Dockerfile.tenant` + `ec2.go`
user-data, not docker-compose.
## Out of scope
- The decorative-but-broken `wget --spider` healthcheck has presumably
also been silently 404'ing on prod tenants. A follow-up should audit
and fix the prod path; that is not done here, to keep this PR scoped.
- Docker Desktop "Start at login" is a per-machine GUI setting that
must be toggled manually (Settings → General).
- The legacy heartbeat-all.sh that pinged 5 persona workspaces from
the host has been deleted (~/.molecule-ai/heartbeat-all.sh).
Per Hongming: each workspace is responsible for its own heartbeat.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
337 lines
15 KiB
YAML
337 lines
15 KiB
YAML
services:
|
||
# --- Infrastructure ---
|
||
postgres:
  # Postgres server for the local stack. User, password and database name
  # come from the environment and fall back to dev / dev / molecule.
  image: postgres:16-alpine
  environment:
    POSTGRES_USER: ${POSTGRES_USER:-dev}
    POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-dev}
    POSTGRES_DB: ${POSTGRES_DB:-molecule}
  # Run the server with wal_level=logical (the WAL level that logical
  # decoding / logical replication requires).
  command: ["postgres", "-c", "wal_level=logical"]
  ports:
    - "5432:5432"
  volumes:
    - pgdata:/var/lib/postgresql/data
  networks:
    - molecule-monorepo-net
  restart: unless-stopped
  healthcheck:
    # Name the database explicitly. Without -d, pg_isready falls back to a
    # database named after the user ("dev" by default). That database does
    # not exist here, so every probe (one per `interval`) makes the server
    # log a failed connection attempt even though the check itself passes.
    test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-dev} -d ${POSTGRES_DB:-molecule}"]
    interval: 2s
    timeout: 5s
    retries: 10
|
||
|
||
langfuse-db-init:
  # One-shot job: waits for the shared Postgres server, creates the
  # `langfuse` database if it is not there yet, then exits.
  image: postgres:16-alpine
  networks:
    - molecule-monorepo-net
  depends_on:
    postgres:
      condition: service_healthy
  environment:
    POSTGRES_USER: ${POSTGRES_USER:-dev}
    POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-dev}
  # `$$` is Compose's escape for a literal `$`: the variables below are
  # expanded by the shell inside the container, not by Compose itself.
  command:
    - /bin/sh
    - "-c"
    - |
      export PGPASSWORD="$${POSTGRES_PASSWORD}"
      until pg_isready -h postgres -U "$${POSTGRES_USER}" -d postgres >/dev/null 2>&1; do
        sleep 1
      done
      if ! psql -h postgres -U "$${POSTGRES_USER}" -d postgres -tAc "SELECT 1 FROM pg_database WHERE datname = 'langfuse'" | grep -q 1; then
        psql -h postgres -U "$${POSTGRES_USER}" -d postgres -c "CREATE DATABASE langfuse"
      fi
|
||
|
||
redis:
  # Redis for the local stack, started with keyspace notifications on.
  image: redis:7-alpine
  restart: unless-stopped
  # K = keyspace events, E = keyevent events, A = alias covering the
  # standard event classes.
  command:
    - redis-server
    - "--notify-keyspace-events"
    - KEA
  networks:
    - molecule-monorepo-net
  ports:
    - "6379:6379"
  volumes:
    - redisdata:/data
  healthcheck:
    test:
      - CMD
      - redis-cli
      - ping
    interval: 2s
    timeout: 5s
    retries: 10
|
||
|
||
# --- Observability ---
|
||
langfuse-clickhouse:
  # ClickHouse backing store for Langfuse. It publishes no host ports and
  # is reachable only on the compose network.
  image: clickhouse/clickhouse-server:24-alpine
  networks:
    - molecule-monorepo-net
  volumes:
    - clickhousedata:/var/lib/clickhouse
  environment:
    # Local-only credentials; the langfuse service uses the same pair.
    CLICKHOUSE_DB: langfuse
    CLICKHOUSE_USER: langfuse
    CLICKHOUSE_PASSWORD: langfuse
  healthcheck:
    # Probe the HTTP interface's ping endpoint from inside the container.
    test:
      - CMD-SHELL
      - "wget --no-verbose --tries=1 --spider http://127.0.0.1:8123/ping || exit 1"
    interval: 5s
    timeout: 5s
    retries: 10
|
||
|
||
langfuse:
  # Langfuse observability UI, published on host port 3001. Starts only
  # after ClickHouse is healthy and the db-init job has exited successfully.
  image: langfuse/langfuse:2
  networks:
    - molecule-monorepo-net
  ports:
    - "3001:3000"
  depends_on:
    langfuse-db-init:
      condition: service_completed_successfully
    langfuse-clickhouse:
      condition: service_healthy
  environment:
    DATABASE_URL: "postgres://${POSTGRES_USER:-dev}:${POSTGRES_PASSWORD:-dev}@postgres:5432/langfuse"
    # Langfuse v2 wants ClickHouse's HTTP interface (port 8123). The
    # earlier clickhouse://...:9000 native-protocol URL was rejected with
    # "ClickHouse URL protocol must be either http or https".
    CLICKHOUSE_URL: "http://langfuse-clickhouse:8123"
    CLICKHOUSE_MIGRATION_URL: "clickhouse://langfuse-clickhouse:9000"
    CLICKHOUSE_USER: langfuse
    CLICKHOUSE_PASSWORD: langfuse
    # The changeme-* fallbacks are placeholders for local use; override
    # LANGFUSE_SECRET / LANGFUSE_SALT anywhere that matters.
    NEXTAUTH_SECRET: "${LANGFUSE_SECRET:-changeme-langfuse-secret}"
    NEXTAUTH_URL: "http://localhost:3001"
    SALT: "${LANGFUSE_SALT:-changeme-langfuse-salt}"
  healthcheck:
    test:
      - CMD-SHELL
      - "wget --no-verbose --tries=1 --spider http://localhost:3000/api/public/health || exit 1"
    interval: 10s
    timeout: 5s
    retries: 10
|
||
|
||
# --- Platform ---
|
||
platform:
  # The workspace-server ("platform") API. Built from the repo root, talks
  # to postgres + redis, and drives workspace containers through the
  # mounted Docker socket.
  build:
    # The context has to be the repository root rather than ./platform:
    # the Dockerfile COPYs `workspace-server/migrations`,
    # `workspace-server/go.mod`, `workspace-configs-templates/`, etc. by
    # repo-relative path so that templates and migrations are baked in
    # next to the platform binary. With the old ./platform context, docker
    # silently reused an earlier cached image (the COPY of
    # workspace-server/migrations resolved to nothing under
    # ./workspace-server/, so layers stopped invalidating), which showed
    # up as migration 023 not landing after PR #417 merged. CI already
    # builds with context=. , so this keeps local aligned with CI.
    context: .
    dockerfile: workspace-server/Dockerfile
  depends_on:
    postgres:
      condition: service_healthy
    redis:
      condition: service_healthy
  restart: unless-stopped
  networks:
    - molecule-monorepo-net
  ports:
    - '${PLATFORM_PUBLISH_PORT:-8080}:${PLATFORM_PORT:-8080}'
  healthcheck:
    # Use a real GET: `--spider` would send HEAD, and /health is
    # registered for GET only, so HEAD comes back 404.
    test:
      - CMD-SHELL
      - 'wget -qO /dev/null --tries=1 http://localhost:${PLATFORM_PORT:-8080}/health || exit 1'
    interval: 5s
    timeout: 5s
    retries: 10
  environment:
    DATABASE_URL: 'postgres://${POSTGRES_USER:-dev}:${POSTGRES_PASSWORD:-dev}@postgres:5432/${POSTGRES_DB:-molecule}?sslmode=disable'
    REDIS_URL: 'redis://redis:6379'
    PORT: '${PLATFORM_PORT:-8080}'
    PLATFORM_URL: 'http://platform:${PLATFORM_PORT:-8080}'
    # The container's network namespace is already isolated, so "all
    # interfaces" in here means the bridge interface only. Leaving the
    # fail-open default (127.0.0.1) in place would block host-to-container
    # access.
    BIND_ADDR: '${BIND_ADDR:-0.0.0.0}'
    # MOLECULE_ENV defaults to development so that the WorkspaceAuth /
    # AdminAuth middleware takes its fail-open path when ADMIN_TOKEN is
    # unset. Otherwise the canvas, which sends no bearer in pure local
    # dev, gets 401 "missing workspace auth token" on every request.
    # Set it to "production" for SaaS/staged deploys; in those modes
    # ADMIN_TOKEN must be set as well or every request is rejected.
    MOLECULE_ENV: '${MOLECULE_ENV:-development}'
    CORS_ORIGINS: '${CORS_ORIGINS:-http://localhost:${CANVAS_PUBLISH_PORT:-3000},http://127.0.0.1:${CANVAS_PUBLISH_PORT:-3000},http://localhost:3001}'
    RATE_LIMIT: '${RATE_LIMIT:-1000}'
    CONFIGS_DIR: '/configs'
    CONFIGS_HOST_DIR: '${CONFIGS_HOST_DIR:-${PWD}/workspace-configs-templates}'
    PLUGINS_HOST_DIR: '${PLUGINS_HOST_DIR:-${PWD}/plugins}'
    # github-app-auth plugin: injects GITHUB_TOKEN / GH_TOKEN into every
    # workspace env from the App installation token. The host-side key
    # path is remapped to /secrets/github-app.pem inside the container
    # (the key itself is bind-mounted read-only under `volumes`).
    # Soft dependency: skipped entirely when GITHUB_APP_ID is unset.
    GITHUB_APP_ID: '${GITHUB_APP_ID:-}'
    GITHUB_APP_INSTALLATION_ID: '${GITHUB_APP_INSTALLATION_ID:-}'
    GITHUB_APP_PRIVATE_KEY_FILE: '/secrets/github-app.pem'
    # ADMIN_TOKEN is required to fully close issue #684 (AdminAuth bearer
    # bypass, PR #729).
    #   Set   -> only this exact value is accepted on all /admin/* and
    #            /approvals/* routes; workspace bearer tokens stop working
    #            as admin credentials.
    #   Unset -> (default) backward-compatible fallback: any valid
    #            workspace token passes AdminAuth, i.e. the pre-PR #729
    #            behaviour, still vulnerable to #684.
    # Generate one with: openssl rand -base64 32
    # Keep it in fly secrets / the deployment env. NEVER commit the value.
    ADMIN_TOKEN: '${ADMIN_TOKEN:-}'
    # Workspace hibernation default (issue #724 / PR #724): platform-wide
    # idle threshold in minutes; the per-workspace column wins over it.
    # Leave empty to rely on per-workspace config only (current behaviour;
    # the global-default code is still pending).
    HIBERNATION_IDLE_MINUTES: '${HIBERNATION_IDLE_MINUTES:-}'
    # Plugin supply-chain hardening (issue #768 / PR #775). Never set this
    # in production.
    PLUGIN_ALLOW_UNPINNED: '${PLUGIN_ALLOW_UNPINNED:-}'
    # Makes ImagePull/ContainerCreate ask for linux/amd64 manifests for the
    # workspace-template-* images. The templates are single-arch amd64
    # today; without this, an arm64 host (Apple Silicon) asks the daemon
    # for linux/arm64/v8, nothing matches, and the pull fails with
    # "no matching manifest". Apple Silicon then runs the amd64 image
    # under Rosetta: slower (~2-3×) but functional. Override with "" or
    # another platform once the templates ship multi-arch, at which point
    # the hardcoded amd64 is no longer needed.
    MOLECULE_IMAGE_PLATFORM: '${MOLECULE_IMAGE_PLATFORM:-linux/amd64}'
    # GHCR credentials for the workspace-images refresh endpoint
    # (POST /admin/workspace-images/refresh). With both set, the
    # platform's Docker SDK ImagePull works on private
    # workspace-template-* images without a per-host `docker login`.
    # GHCR_USER is the GitHub username; GHCR_TOKEN is a fine-grained PAT
    # with `read:packages` on the Molecule-AI org. With both unset the
    # endpoint can only pull public images (the current state of all 8
    # templates).
    GHCR_USER: '${GHCR_USER:-}'
    GHCR_TOKEN: '${GHCR_TOKEN:-}'
    # Auto-refresh of workspace-template-* images. A watcher polls GHCR
    # every 5 min; when a digest moves it pulls and force-recreates every
    # matching ws-* container (reusing the /admin/workspace-images/refresh
    # logic). That closes the runtime CD chain: merge -> containers on new
    # code with no operator step. On by default for local dev, where the
    # runtime -> ws iteration loop is tightest. Set to "false" if you do
    # not want ws-* containers changed underneath a long-running test.
    IMAGE_AUTO_REFRESH: '${IMAGE_AUTO_REFRESH:-true}'
  volumes:
    - './workspace-configs-templates:/configs'
    - './org-templates:/org-templates:ro'
    - './plugins:/plugins:ro'
    - '/var/run/docker.sock:/var/run/docker.sock'
    # GitHub App private key, mounted read-only. The host-side path is
    # covered by .gitignore (/.secrets/ + *.pem).
    # NOTE(review): with short-syntax bind mounts a missing host path is
    # created as a directory, so on a machine without the .pem this leaves
    # a `github-app.pem` directory behind. Confirm that is acceptable.
    - './.secrets/github-app.pem:/secrets/github-app.pem:ro'
    # Per-role persona credentials (molecule-core#242 local surface).
    # org_import.go::loadPersonaEnvFile reads them at workspace creation
    # time when a workspace.yaml carries `role: <name>`. The host
    # directory is filled by the operator-host bootstrap kit (28 dev-tree
    # personas). /etc/molecule-bootstrap/personas is the in-container path
    # the platform expects; it matches the prod tenant-EC2 path so the
    # same code runs in both modes.
    #
    # Mounted read-only because workspace-server only ever reads from it.
    # When the host directory is empty or missing, loadPersonaEnvFile
    # silently no-ops (its existing semantics), so the mount is harmless
    # on a fresh machine that has not run the bootstrap kit yet.
    - '${MOLECULE_PERSONA_ROOT_HOST:-${HOME}/.molecule-ai/personas}:/etc/molecule-bootstrap/personas:ro'
|
||
|
||
# --- Canvas ---
|
||
canvas:
  # The publish-canvas-image CI workflow publishes a fresh image on every
  # canvas/** merge to main. To update the running container:
  #   docker compose pull canvas && docker compose up -d canvas
  # First-time local setup, or testing unreleased changes (build from source):
  #   docker compose build canvas && docker compose up -d canvas
  # Pulling from ECR requires AWS auth first:
  #   aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 153263036946.dkr.ecr.us-east-2.amazonaws.com
  # NOTE(review): this comment used to name GHCR as the push target, but
  # `image:` below points at ECR. Confirm which registry CI publishes to.
  image: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/canvas:latest
  build:
    context: ./canvas
    dockerfile: Dockerfile
    args:
      NEXT_PUBLIC_PLATFORM_URL: ${NEXT_PUBLIC_PLATFORM_URL:-http://localhost:${PLATFORM_PUBLISH_PORT:-8080}}
      NEXT_PUBLIC_WS_URL: ${NEXT_PUBLIC_WS_URL:-ws://localhost:${PLATFORM_PUBLISH_PORT:-8080}/ws}
      # WARNING: NEXT_PUBLIC_* values are baked into the client JS bundle
      # at `next build` time, so a non-empty ADMIN_TOKEN here is readable
      # by anyone who can load the canvas. Only build with it set for
      # local use; never push an image built this way.
      NEXT_PUBLIC_ADMIN_TOKEN: ${ADMIN_TOKEN:-}
  depends_on:
    platform:
      condition: service_healthy
  environment:
    PORT: "${CANVAS_PORT:-3000}"
    # Local dev — relaxes CSP to allow cross-port fetches (canvas:3000 → platform:8080).
    CSP_DEV_MODE: "${CSP_DEV_MODE:-1}"
    # NOTE: NEXT_PUBLIC_* are baked into the JS bundle at `next build` time —
    # these runtime values are ignored by the standalone output. They're kept
    # here for documentation / override during `docker compose build`.
    NEXT_PUBLIC_PLATFORM_URL: ${NEXT_PUBLIC_PLATFORM_URL:-http://localhost:${PLATFORM_PUBLISH_PORT:-8080}}
    NEXT_PUBLIC_WS_URL: ${NEXT_PUBLIC_WS_URL:-ws://localhost:${PLATFORM_PUBLISH_PORT:-8080}/ws}
  ports:
    - "${CANVAS_PUBLISH_PORT:-3000}:${CANVAS_PORT:-3000}"
  networks:
    - molecule-monorepo-net
  # Same restart policy as postgres / redis / platform, so the canvas also
  # comes back after a Docker daemon restart or host sleep/wake.
  restart: unless-stopped
  healthcheck:
    # Plain GET (no `--spider`, which would send HEAD instead).
    test: ["CMD-SHELL", "wget -qO /dev/null --tries=1 http://127.0.0.1:${CANVAS_PORT:-3000} || exit 1"]
    interval: 10s
    timeout: 5s
    retries: 10
|
||
|
||
# --- Optional: LiteLLM Proxy (unified OpenAI-compatible API for all providers) ---
|
||
# Start with: docker compose --profile multi-provider up
|
||
#
|
||
# Workspace agents then set:
|
||
# OPENAI_BASE_URL=http://litellm:4000
|
||
# OPENAI_API_KEY=${LITELLM_MASTER_KEY:-sk-molecule}
|
||
#
|
||
# And use model names from infra/litellm_config.yml (e.g. "claude-opus-4-5",
|
||
# "gpt-4o", "openrouter/deepseek-r1", "ollama/llama3.2").
|
||
# Edit infra/litellm_config.yml to add/remove providers and models.
|
||
litellm:
  # Optional LiteLLM proxy; only started with `--profile multi-provider`.
  image: ghcr.io/berriai/litellm:main-latest
  profiles:
    - multi-provider
  ports:
    - "4000:4000"
  volumes:
    - ./infra/litellm_config.yml:/app/config.yaml:ro
  command: ["--config", "/app/config.yaml", "--port", "4000", "--num_workers", "4"]
  environment:
    # Pass provider API keys through — only the ones you have are needed
    ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-}
    OPENAI_API_KEY: ${OPENAI_API_KEY:-}
    OPENROUTER_API_KEY: ${OPENROUTER_API_KEY:-}
    LITELLM_MASTER_KEY: ${LITELLM_MASTER_KEY:-sk-molecule}
  networks:
    - molecule-monorepo-net
  restart: unless-stopped
  healthcheck:
    # Plain GET against the unauthenticated liveness route. The previous
    # probe used `wget --spider` (a HEAD request) on /health; /health is
    # protected by the master key and fans out real calls to every
    # configured model, so it is the wrong endpoint for a container
    # liveness check even when it answers.
    test: ["CMD-SHELL", "wget -qO /dev/null --tries=1 http://localhost:4000/health/liveliness || exit 1"]
    interval: 10s
    timeout: 5s
    retries: 5
    start_period: 15s
|
||
|
||
# --- Optional: Local LLM Models via Ollama ---
|
||
# Start with: docker compose --profile local-models up
|
||
# After first start, pull a model:
|
||
# docker compose exec ollama ollama pull llama3.2
|
||
# docker compose exec ollama ollama pull qwen2.5-coder:7b
|
||
# Then set MODEL_PROVIDER=ollama:llama3.2 in your workspace config.yaml
|
||
# Workspace agents reach Ollama at http://ollama:11434 (internal Docker network).
|
||
ollama:
  # Optional local-model server; only started with `--profile local-models`.
  image: ollama/ollama:latest
  profiles:
    - local-models
  restart: unless-stopped
  networks:
    - molecule-monorepo-net
  ports:
    - "11434:11434"
  volumes:
    # Keeps Ollama's data directory on a named volume across recreates.
    - ollamadata:/root/.ollama
  healthcheck:
    # Healthy once the CLI inside the container can list models.
    test:
      - CMD-SHELL
      - "ollama list || exit 1"
    interval: 10s
    timeout: 5s
    retries: 5
    start_period: 20s
|
||
|
||
# Single shared network for every service in this file. The explicit
# `name` pins the network name instead of letting Compose derive one
# from the project name.
networks:
  molecule-monorepo-net:
    name: "molecule-monorepo-net"
|
||
|
||
# Named volumes with default settings, one per stateful service
# (postgres, redis, langfuse-clickhouse, ollama).
volumes:
  pgdata: {}
  redisdata: {}
  clickhousedata: {}
  ollamadata: {}
|