From 479f1776a811f95a1fa23da64df5a3539d78c903 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 14 Apr 2026 10:49:37 -0700 Subject: [PATCH] feat(provisioner): configurable per-tier memory/CPU limits (#14) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolves #14. ApplyTierConfig now reads TIER{2,3,4}_MEMORY_MB and TIER{2,3,4}_CPU_SHARES env vars, falling back to the compiled defaults agreed in the issue: - T2: 512 MiB / 1024 shares (1 CPU) — unchanged baseline - T3: 2048 MiB / 2048 shares (2 CPU) — new cap (previously uncapped) - T4: 4096 MiB / 4096 shares (4 CPU) — new cap (previously uncapped) CPU_SHARES follows Docker's 1024 = 1 CPU convention; internally the value is translated to NanoCPUs for a hard allocation so behaviour remains deterministic across hosts. Malformed or non-positive env values silently fall back to the default. Behaviour change note: T3 and T4 previously had no explicit cap. Operators who relied on unlimited can set very large TIERn_MEMORY_MB / TIERn_CPU_SHARES values; a follow-up can add unset-means-unlimited semantics if required. Tests: - TestGetTierMemoryMB_DefaultsMatchLegacy - TestGetTierMemoryMB_EnvOverride (covers malformed + zero fallback) - TestGetTierCPUShares_EnvOverride - TestApplyTierConfig_T3_UsesEnvOverride (wiring) - TestApplyTierConfig_T3_DefaultCap (documents the new cap) Docs: .env.example section + CLAUDE.md platform env-vars list updated. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .env.example | 12 +++ CLAUDE.md | 5 + platform/internal/provisioner/provisioner.go | 100 ++++++++++++++++-- .../internal/provisioner/provisioner_test.go | 95 +++++++++++++++++ 4 files changed, 204 insertions(+), 8 deletions(-) diff --git a/.env.example b/.env.example index 7f89b84c..9e4c6599 100644 --- a/.env.example +++ b/.env.example @@ -71,6 +71,18 @@ MAX_TOKENS=2048 # Max output tokens for OpenRouter requests (defa LANGGRAPH_RECURSION_LIMIT=500 # LangGraph/DeepAgents max ReAct steps per turn (lib default: 25; raised to 500 — PM fan-out to 6+ reports + synthesis routinely exceeds 100) MODEL_PROVIDER=anthropic:claude-sonnet-4-6 # Format: provider:model. Providers: anthropic, openai, openrouter, groq, cerebras, google_genai, ollama +# ---- Workspace tier resource limits (issue #14) ---- +# Per-tier memory/CPU caps applied to each workspace Docker container. +# CPU_SHARES uses the scale 1024 shares == 1 CPU and is enforced as a hard CPU limit (Docker NanoCPUs), not as a relative --cpu-shares weight. +# Any value <=0 or malformed falls back to the compiled default shown. +# Tier 1 is sandboxed (tmpfs, readonly) and is not resource-capped here. +TIER2_MEMORY_MB=512 # Standard tier memory cap (default 512 MiB) +TIER2_CPU_SHARES=1024 # Standard tier CPU (default 1024 = 1 CPU) +TIER3_MEMORY_MB=2048 # Privileged tier memory cap (default 2048 MiB; previously uncapped) +TIER3_CPU_SHARES=2048 # Privileged tier CPU (default 2048 = 2 CPU; previously uncapped) +TIER4_MEMORY_MB=4096 # Full-host tier memory cap (default 4096 MiB; previously uncapped) +TIER4_CPU_SHARES=4096 # Full-host tier CPU (default 4096 = 4 CPU; previously uncapped) + # Social Channels (optional — configure per-workspace via API or Canvas) TELEGRAM_BOT_TOKEN= # Telegram Bot API token (talk to @BotFather). Used as default for new Telegram channels. 
diff --git a/CLAUDE.md b/CLAUDE.md index 56d23dcd..28544e01 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -55,6 +55,11 @@ go build -o molecli ./cmd/cli # Build TUI dashboard ``` Must run from `platform/` directory (not repo root). Env vars: `DATABASE_URL`, `REDIS_URL`, `PORT`, `PLATFORM_URL` (default `http://host.docker.internal:PORT` — passed to agent containers so they can reach the platform), `SECRETS_ENCRYPTION_KEY` (optional AES-256, 32 bytes), `CONFIGS_DIR` (auto-discovered), `PLUGINS_DIR` (deprecated — plugins are now installed per-workspace via API; the `plugins/` registry at repo root is auto-discovered), `ACTIVITY_RETENTION_DAYS` (default `7`), `ACTIVITY_CLEANUP_INTERVAL_HOURS` (default `6`), `CORS_ORIGINS` (comma-separated, default `http://localhost:3000,http://localhost:3001`), `RATE_LIMIT` (requests/min, default `600`), `WORKSPACE_DIR` (optional — global fallback host path for `/workspace` bind-mount; overridden by per-workspace `workspace_dir` column in DB; if neither is set, each workspace gets an isolated Docker named volume), `AWARENESS_URL` (optional — if set, injected into workspace containers along with a deterministic `AWARENESS_NAMESPACE` derived from workspace ID), `MOLECULE_IN_DOCKER` (optional — set to `1` when the platform itself runs inside Docker so the A2A proxy rewrites `127.0.0.1:` URLs to container hostnames; auto-detected via `/.dockerenv`), `MOLECULE_ENV` (optional — set to `production` to hide the `/admin/workspaces/:id/test-token` E2E helper endpoint; unset or any other value leaves it enabled), `MOLECULE_ENABLE_TEST_TOKENS` (optional — set to `1` to force-enable the test-token endpoint even when `MOLECULE_ENV=production`; intended for staging runs only). 
+**Workspace tier resource limits** (issue #14 — override the per-tier memory/CPU caps in `provisioner.ApplyTierConfig`; CPU_SHARES follows Docker's 1024 = 1 CPU convention, translated to NanoCPUs for a hard cap): +- `TIER2_MEMORY_MB` / `TIER2_CPU_SHARES` — Standard tier (defaults `512` / `1024`) +- `TIER3_MEMORY_MB` / `TIER3_CPU_SHARES` — Privileged tier (defaults `2048` / `2048`; previously uncapped) +- `TIER4_MEMORY_MB` / `TIER4_CPU_SHARES` — Full-host tier (defaults `4096` / `4096`; previously uncapped) + **Plugin install safeguards** (bound the cost of a single `POST /workspaces/:id/plugins` install so a slow/malicious source can't tie up a handler): - `PLUGIN_INSTALL_BODY_MAX_BYTES` — max request body size (default `65536` = 64 KiB) - `PLUGIN_INSTALL_FETCH_TIMEOUT` — duration string; whole fetch+copy deadline (default `5m`) diff --git a/platform/internal/provisioner/provisioner.go b/platform/internal/provisioner/provisioner.go index 92f89a1d..3caf76c4 100644 --- a/platform/internal/provisioner/provisioner.go +++ b/platform/internal/provisioner/provisioner.go @@ -10,6 +10,7 @@ import ( "log" "os" "path/filepath" + "strconv" "strings" "time" @@ -349,13 +350,95 @@ func buildContainerEnv(cfg WorkspaceConfig) []string { return env } +// Per-tier resource defaults. Configurable via TIERn_MEMORY_MB and +// TIERn_CPU_SHARES env vars (n in {2,3,4}). CPU shares follow the convention +// 1024 shares == 1 CPU; internally translated to NanoCPUs for a hard cap. 
+// +// Defaults reflect the tier sizing agreed in issue #14: +// - T2: 512 MiB, 1024 shares (1 CPU) — unchanged historical default +// - T3: 2048 MiB, 2048 shares (2 CPU) — new cap (previously uncapped) +// - T4: 4096 MiB, 4096 shares (4 CPU) — new cap (previously uncapped) +const ( + defaultTier2MemoryMB = 512 + defaultTier2CPUShares = 1024 + defaultTier3MemoryMB = 2048 + defaultTier3CPUShares = 2048 + defaultTier4MemoryMB = 4096 + defaultTier4CPUShares = 4096 +) + +// getTierMemoryMB returns the memory cap (MiB) for the given tier, reading +// TIERn_MEMORY_MB env var with fallback to the hardcoded default. Returns 0 +// for tiers with no cap (e.g. tier 1). +func getTierMemoryMB(tier int) int64 { + var def int64 + switch tier { + case 2: + def = defaultTier2MemoryMB + case 3: + def = defaultTier3MemoryMB + case 4: + def = defaultTier4MemoryMB + default: + return 0 + } + if v := os.Getenv(fmt.Sprintf("TIER%d_MEMORY_MB", tier)); v != "" { + if n, err := strconv.ParseInt(v, 10, 64); err == nil && n > 0 { + return n + } + } + return def +} + +// getTierCPUShares returns the CPU allocation (shares, where 1024 == 1 CPU) +// for the given tier, reading TIERn_CPU_SHARES env var with fallback to the +// hardcoded default. Returns 0 for tiers with no cap. +func getTierCPUShares(tier int) int64 { + var def int64 + switch tier { + case 2: + def = defaultTier2CPUShares + case 3: + def = defaultTier3CPUShares + case 4: + def = defaultTier4CPUShares + default: + return 0 + } + if v := os.Getenv(fmt.Sprintf("TIER%d_CPU_SHARES", tier)); v != "" { + if n, err := strconv.ParseInt(v, 10, 64); err == nil && n > 0 { + return n + } + } + return def +} + +// applyTierResources writes Memory + NanoCPUs to hostCfg from the tier's +// configured limits (env override or default). Returns the resolved values +// for logging. 
+func applyTierResources(hostCfg *container.HostConfig, tier int) (memMB, cpuShares int64) { + memMB = getTierMemoryMB(tier) + cpuShares = getTierCPUShares(tier) + if memMB > 0 { + hostCfg.Resources.Memory = memMB * 1024 * 1024 + } + if cpuShares > 0 { + // shares -> NanoCPUs: 1024 shares == 1 CPU == 1e9 NanoCPUs + hostCfg.Resources.NanoCPUs = (cpuShares * 1_000_000_000) / 1024 + } + return memMB, cpuShares +} + // ApplyTierConfig configures a HostConfig based on the workspace tier. // Extracted from Start() so it can be tested independently. // // - Tier 1 (Sandboxed): readonly rootfs, tmpfs /tmp, strip /workspace mount -// - Tier 2 (Standard): resource limits (512 MiB memory, 1 CPU), no special flags (default) -// - Tier 3 (Privileged): privileged mode, host PID, Docker network (not host) -// - Tier 4 (Full access): privileged, host PID, host network, Docker socket mount, all capabilities +// - Tier 2 (Standard): resource limits (default 512 MiB, 1 CPU) +// - Tier 3 (Privileged): privileged + host PID, Docker network, capped resources +// - Tier 4 (Full access): privileged, host PID, host network, Docker socket, capped resources +// +// Per-tier memory/CPU caps are overridable via TIERn_MEMORY_MB / +// TIERn_CPU_SHARES env vars (n in {2,3,4}). // // Unknown/zero tiers default to Tier 2 behavior (safe resource-limited container). func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configMount, name string) { @@ -378,7 +461,8 @@ func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configM // causes port collisions when multiple T3 containers run simultaneously. 
hostCfg.Privileged = true hostCfg.PidMode = "host" - log.Printf("Provisioner: T3 privileged mode for %s (privileged, host PID)", name) + memMB, shares := applyTierResources(hostCfg, 3) + log.Printf("Provisioner: T3 privileged mode for %s (privileged, host PID, %dm memory, %d CPU shares)", name, memMB, shares) case 4: // Full host access: everything from T3 + host network + Docker socket + all capabilities. @@ -388,14 +472,14 @@ func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configM hostCfg.NetworkMode = "host" // Mount Docker socket so workspace can manage containers hostCfg.Binds = append(hostCfg.Binds, "/var/run/docker.sock:/var/run/docker.sock") - log.Printf("Provisioner: T4 full-host mode for %s (privileged, host PID, host network, docker socket)", name) + memMB, shares := applyTierResources(hostCfg, 4) + log.Printf("Provisioner: T4 full-host mode for %s (privileged, host PID, host network, docker socket, %dm memory, %d CPU shares)", name, memMB, shares) default: // Tier 2 (Standard) and unknown tiers: normal container with resource limits. // This is the safe default — no privileged access, reasonable resource caps. 
- hostCfg.Resources.Memory = 512 * 1024 * 1024 // 512 MiB - hostCfg.Resources.NanoCPUs = 1_000_000_000 // 1.0 CPU - log.Printf("Provisioner: T2 standard mode for %s (512m memory, 1 CPU)", name) + memMB, shares := applyTierResources(hostCfg, 2) + log.Printf("Provisioner: T2 standard mode for %s (%dm memory, %d CPU shares)", name, memMB, shares) } } diff --git a/platform/internal/provisioner/provisioner_test.go b/platform/internal/provisioner/provisioner_test.go index 9db5f285..7bca0f73 100644 --- a/platform/internal/provisioner/provisioner_test.go +++ b/platform/internal/provisioner/provisioner_test.go @@ -662,3 +662,98 @@ func TestImageNotFoundErrorIncludesBuildHint(t *testing.T) { } } } + +// ---- issue #14: configurable per-tier memory/CPU limits ---- + +// TestGetTierMemoryMB_DefaultsMatchLegacy asserts that with no env overrides, +// getTierMemoryMB returns the agreed (issue #14) defaults. +func TestGetTierMemoryMB_DefaultsMatchLegacy(t *testing.T) { + for _, k := range []string{"TIER2_MEMORY_MB", "TIER3_MEMORY_MB", "TIER4_MEMORY_MB"} { + os.Unsetenv(k) + } + cases := map[int]int64{ + 1: 0, // no cap + 2: 512, + 3: 2048, + 4: 4096, + 9: 0, // unknown + } + for tier, want := range cases { + if got := getTierMemoryMB(tier); got != want { + t.Errorf("getTierMemoryMB(%d): got %d, want %d", tier, got, want) + } + } +} + +// TestGetTierMemoryMB_EnvOverride asserts TIERn_MEMORY_MB takes precedence, +// and that malformed / non-positive values fall back to the default. 
+func TestGetTierMemoryMB_EnvOverride(t *testing.T) { + t.Setenv("TIER3_MEMORY_MB", "512") + if got := getTierMemoryMB(3); got != 512 { + t.Errorf("with TIER3_MEMORY_MB=512, got %d, want 512", got) + } + t.Setenv("TIER3_MEMORY_MB", "not-a-number") + if got := getTierMemoryMB(3); got != defaultTier3MemoryMB { + t.Errorf("malformed TIER3_MEMORY_MB: got %d, want default %d", got, defaultTier3MemoryMB) + } + t.Setenv("TIER3_MEMORY_MB", "0") + if got := getTierMemoryMB(3); got != defaultTier3MemoryMB { + t.Errorf("zero TIER3_MEMORY_MB: got %d, want default %d", got, defaultTier3MemoryMB) + } +} + +// TestGetTierCPUShares_EnvOverride asserts TIERn_CPU_SHARES takes precedence. +func TestGetTierCPUShares_EnvOverride(t *testing.T) { + t.Setenv("TIER3_CPU_SHARES", "4096") + if got := getTierCPUShares(3); got != 4096 { + t.Errorf("with TIER3_CPU_SHARES=4096, got %d, want 4096", got) + } + os.Unsetenv("TIER3_CPU_SHARES") + if got := getTierCPUShares(3); got != defaultTier3CPUShares { + t.Errorf("unset TIER3_CPU_SHARES: got %d, want default %d", got, defaultTier3CPUShares) + } +} + +// TestApplyTierConfig_T3_UsesEnvOverride is the wiring test: env vars must +// flow through ApplyTierConfig into hostCfg.Resources. 
+func TestApplyTierConfig_T3_UsesEnvOverride(t *testing.T) { + t.Setenv("TIER3_MEMORY_MB", "8192") + t.Setenv("TIER3_CPU_SHARES", "4096") // 4 CPU == 4e9 NanoCPUs + + hc := baseHostConfig("") + cfg := WorkspaceConfig{WorkspaceID: "abc123", Tier: 3} + ApplyTierConfig(hc, cfg, "ws-abc123-configs:/configs", "ws-abc123") + + wantMem := int64(8192) * 1024 * 1024 + if hc.Resources.Memory != wantMem { + t.Errorf("T3 memory override: got %d, want %d", hc.Resources.Memory, wantMem) + } + wantCPU := int64(4_000_000_000) + if hc.Resources.NanoCPUs != wantCPU { + t.Errorf("T3 CPU override: got %d NanoCPUs, want %d", hc.Resources.NanoCPUs, wantCPU) + } + if !hc.Privileged || hc.PidMode != "host" { + t.Errorf("T3 override should preserve privileged/pid-host flags, got Privileged=%v PidMode=%q", + hc.Privileged, hc.PidMode) + } +} + +// TestApplyTierConfig_T3_DefaultCap asserts T3 now gets a memory/CPU cap by +// default (previously uncapped — behaviour change per issue #14). +func TestApplyTierConfig_T3_DefaultCap(t *testing.T) { + for _, k := range []string{"TIER3_MEMORY_MB", "TIER3_CPU_SHARES"} { + os.Unsetenv(k) + } + hc := baseHostConfig("") + cfg := WorkspaceConfig{WorkspaceID: "abc123", Tier: 3} + ApplyTierConfig(hc, cfg, "ws-abc123-configs:/configs", "ws-abc123") + + wantMem := int64(defaultTier3MemoryMB) * 1024 * 1024 + if hc.Resources.Memory != wantMem { + t.Errorf("T3 default memory: got %d, want %d", hc.Resources.Memory, wantMem) + } + wantCPU := int64(defaultTier3CPUShares) * 1_000_000_000 / 1024 + if hc.Resources.NanoCPUs != wantCPU { + t.Errorf("T3 default NanoCPUs: got %d, want %d", hc.Resources.NanoCPUs, wantCPU) + } +}