From 479f1776a811f95a1fa23da64df5a3539d78c903 Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwang.rabbit@users.noreply.github.com>
Date: Tue, 14 Apr 2026 10:49:37 -0700
Subject: [PATCH] feat(provisioner): configurable per-tier memory/CPU limits
 (#14)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Resolves #14. ApplyTierConfig now reads TIER{2,3,4}_MEMORY_MB and
TIER{2,3,4}_CPU_SHARES env vars, falling back to the compiled defaults
agreed in the issue:

  - T2: 512 MiB  / 1024 shares (1 CPU)  — unchanged baseline
  - T3: 2048 MiB / 2048 shares (2 CPU)  — new cap (previously uncapped)
  - T4: 4096 MiB / 4096 shares (4 CPU)  — new cap (previously uncapped)

CPU_SHARES follows Docker's 1024 = 1 CPU convention; internally the
value is translated to NanoCPUs for a hard allocation so behaviour
remains deterministic across hosts. Malformed or non-positive env
values silently fall back to the default.

Behaviour change note: T3 and T4 previously had no explicit cap.
Operators who relied on unlimited behaviour can set large TIERn_MEMORY_MB /
TIERn_CPU_SHARES values (CPU is applied as a hard NanoCPUs limit, so keep it
within what the Docker host accepts); a follow-up can add
unset-means-unlimited semantics if required.

Tests:
  - TestGetTierMemoryMB_DefaultsMatchLegacy
  - TestGetTierMemoryMB_EnvOverride (covers malformed + zero fallback)
  - TestGetTierCPUShares_EnvOverride
  - TestApplyTierConfig_T3_UsesEnvOverride (wiring)
  - TestApplyTierConfig_T3_DefaultCap (documents the new cap)

Docs: .env.example section + CLAUDE.md platform env-vars list updated.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .env.example                                  |  12 +++
 CLAUDE.md                                     |   5 +
 platform/internal/provisioner/provisioner.go  | 100 ++++++++++++++++--
 .../internal/provisioner/provisioner_test.go  |  95 +++++++++++++++++
 4 files changed, 204 insertions(+), 8 deletions(-)

diff --git a/.env.example b/.env.example
index 7f89b84c..9e4c6599 100644
--- a/.env.example
+++ b/.env.example
@@ -71,6 +71,18 @@ MAX_TOKENS=2048                # Max output tokens for OpenRouter requests (defa
 LANGGRAPH_RECURSION_LIMIT=500  # LangGraph/DeepAgents max ReAct steps per turn (lib default: 25; raised to 500 — PM fan-out to 6+ reports + synthesis routinely exceeds 100)
 MODEL_PROVIDER=anthropic:claude-sonnet-4-6   # Format: provider:model. Providers: anthropic, openai, openrouter, groq, cerebras, google_genai, ollama
 
+# ---- Workspace tier resource limits (issue #14) ----
+# Per-tier memory/CPU caps applied to each workspace Docker container.
+# CPU_SHARES uses the 1024 shares == 1 CPU scale and is enforced as a hard limit (NanoCPUs), not a relative weight.
+# Any value <=0 or malformed falls back to the compiled default shown.
+# Tier 1 is sandboxed (tmpfs, readonly) and is not resource-capped here.
+TIER2_MEMORY_MB=512            # Standard tier memory cap (default 512 MiB)
+TIER2_CPU_SHARES=1024          # Standard tier CPU (default 1024 = 1 CPU)
+TIER3_MEMORY_MB=2048           # Privileged tier memory cap (default 2048 MiB; previously uncapped)
+TIER3_CPU_SHARES=2048          # Privileged tier CPU (default 2048 = 2 CPU; previously uncapped)
+TIER4_MEMORY_MB=4096           # Full-host tier memory cap (default 4096 MiB; previously uncapped)
+TIER4_CPU_SHARES=4096          # Full-host tier CPU (default 4096 = 4 CPU; previously uncapped)
+
 # Social Channels (optional — configure per-workspace via API or Canvas)
 TELEGRAM_BOT_TOKEN=            # Telegram Bot API token (talk to @BotFather). Used as default for new Telegram channels.
 
diff --git a/CLAUDE.md b/CLAUDE.md
index 56d23dcd..28544e01 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -55,6 +55,11 @@ go build -o molecli ./cmd/cli  # Build TUI dashboard
 ```
 Must run from `platform/` directory (not repo root). Env vars: `DATABASE_URL`, `REDIS_URL`, `PORT`, `PLATFORM_URL` (default `http://host.docker.internal:PORT` — passed to agent containers so they can reach the platform), `SECRETS_ENCRYPTION_KEY` (optional AES-256, 32 bytes), `CONFIGS_DIR` (auto-discovered), `PLUGINS_DIR` (deprecated — plugins are now installed per-workspace via API; the `plugins/` registry at repo root is auto-discovered), `ACTIVITY_RETENTION_DAYS` (default `7`), `ACTIVITY_CLEANUP_INTERVAL_HOURS` (default `6`), `CORS_ORIGINS` (comma-separated, default `http://localhost:3000,http://localhost:3001`), `RATE_LIMIT` (requests/min, default `600`), `WORKSPACE_DIR` (optional — global fallback host path for `/workspace` bind-mount; overridden by per-workspace `workspace_dir` column in DB; if neither is set, each workspace gets an isolated Docker named volume), `AWARENESS_URL` (optional — if set, injected into workspace containers along with a deterministic `AWARENESS_NAMESPACE` derived from workspace ID), `MOLECULE_IN_DOCKER` (optional — set to `1` when the platform itself runs inside Docker so the A2A proxy rewrites `127.0.0.1:<port>` URLs to container hostnames; auto-detected via `/.dockerenv`), `MOLECULE_ENV` (optional — set to `production` to hide the `/admin/workspaces/:id/test-token` E2E helper endpoint; unset or any other value leaves it enabled), `MOLECULE_ENABLE_TEST_TOKENS` (optional — set to `1` to force-enable the test-token endpoint even when `MOLECULE_ENV=production`; intended for staging runs only).
 
+**Workspace tier resource limits** (issue #14 — override the per-tier memory/CPU caps in `provisioner.ApplyTierConfig`; CPU_SHARES follows Docker's 1024 = 1 CPU convention, translated to NanoCPUs for a hard cap):
+- `TIER2_MEMORY_MB` / `TIER2_CPU_SHARES` — Standard tier (defaults `512` / `1024`)
+- `TIER3_MEMORY_MB` / `TIER3_CPU_SHARES` — Privileged tier (defaults `2048` / `2048`; previously uncapped)
+- `TIER4_MEMORY_MB` / `TIER4_CPU_SHARES` — Full-host tier (defaults `4096` / `4096`; previously uncapped)
+
 **Plugin install safeguards** (bound the cost of a single `POST /workspaces/:id/plugins` install so a slow/malicious source can't tie up a handler):
 - `PLUGIN_INSTALL_BODY_MAX_BYTES` — max request body size (default `65536` = 64 KiB)
 - `PLUGIN_INSTALL_FETCH_TIMEOUT` — duration string; whole fetch+copy deadline (default `5m`)
diff --git a/platform/internal/provisioner/provisioner.go b/platform/internal/provisioner/provisioner.go
index 92f89a1d..3caf76c4 100644
--- a/platform/internal/provisioner/provisioner.go
+++ b/platform/internal/provisioner/provisioner.go
@@ -10,6 +10,7 @@ import (
 	"log"
 	"os"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"time"
 
@@ -349,13 +350,95 @@ func buildContainerEnv(cfg WorkspaceConfig) []string {
 	return env
 }
 
+// Per-tier resource defaults, used when TIERn_MEMORY_MB / TIERn_CPU_SHARES
+// (n in {2,3,4}) is unset or does not hold a usable positive integer. CPU shares
+// use 1024 shares == 1 CPU and are translated to NanoCPUs for a hard cap.
+//
+// Defaults reflect the tier sizing agreed in issue #14:
+//   - T2: 512 MiB,  1024 shares (1 CPU)  — unchanged historical default
+//   - T3: 2048 MiB, 2048 shares (2 CPU)  — new cap (previously uncapped)
+//   - T4: 4096 MiB, 4096 shares (4 CPU)  — new cap (previously uncapped)
+const (
+	defaultTier2MemoryMB  = 512
+	defaultTier2CPUShares = 1024
+	defaultTier3MemoryMB  = 2048
+	defaultTier3CPUShares = 2048
+	defaultTier4MemoryMB  = 4096
+	defaultTier4CPUShares = 4096
+)
+
+// getTierMemoryMB returns the memory cap (MiB) for the given tier. A
+// TIERn_MEMORY_MB override wins when it is a positive integer whose byte count
+// fits in int64; else the default applies. Returns 0 for uncapped tiers (e.g. 1).
+func getTierMemoryMB(tier int) int64 {
+	var def int64
+	switch tier {
+	case 2:
+		def = defaultTier2MemoryMB
+	case 3:
+		def = defaultTier3MemoryMB
+	case 4:
+		def = defaultTier4MemoryMB
+	default:
+		return 0
+	}
+	if v := os.Getenv(fmt.Sprintf("TIER%d_MEMORY_MB", tier)); v != "" {
+		if n, err := strconv.ParseInt(v, 10, 64); err == nil && n > 0 && n < 1<<43 { // 2^43 MiB or more would overflow int64 once converted to bytes
+			return n
+		}
+	}
+	return def
+}
+
+// getTierCPUShares returns the CPU allocation (shares, where 1024 == 1 CPU)
+// for the given tier. A TIERn_CPU_SHARES override wins when it is positive and
+// small enough to scale by 1e9 within int64; else the default. 0 means no cap.
+func getTierCPUShares(tier int) int64 {
+	var def int64
+	switch tier {
+	case 2:
+		def = defaultTier2CPUShares
+	case 3:
+		def = defaultTier3CPUShares
+	case 4:
+		def = defaultTier4CPUShares
+	default:
+		return 0
+	}
+	if v := os.Getenv(fmt.Sprintf("TIER%d_CPU_SHARES", tier)); v != "" {
+		if n, err := strconv.ParseInt(v, 10, 64); err == nil && n > 0 && n <= 9_223_372_036 { // MaxInt64 / 1e9: larger values overflow the shares -> NanoCPUs math
+			return n
+		}
+	}
+	return def
+}
+
+// applyTierResources writes Memory + NanoCPUs to hostCfg from the tier's
+// limits (env override or default) and returns the resolved values for logging.
+// NOTE(review): the int64 multiplications assume bounded inputs — TODO confirm.
+func applyTierResources(hostCfg *container.HostConfig, tier int) (memMB, cpuShares int64) {
+	memMB = getTierMemoryMB(tier)
+	cpuShares = getTierCPUShares(tier)
+	if memMB > 0 {
+		hostCfg.Resources.Memory = memMB * 1024 * 1024 // MiB -> bytes
+	}
+	if cpuShares > 0 {
+		// shares -> NanoCPUs: 1024 shares == 1 CPU == 1e9 NanoCPUs
+		hostCfg.Resources.NanoCPUs = (cpuShares * 1_000_000_000) / 1024
+	}
+	return memMB, cpuShares
+}
+
 // ApplyTierConfig configures a HostConfig based on the workspace tier.
 // Extracted from Start() so it can be tested independently.
 //
 //   - Tier 1 (Sandboxed):  readonly rootfs, tmpfs /tmp, strip /workspace mount
-//   - Tier 2 (Standard):   resource limits (512 MiB memory, 1 CPU), no special flags (default)
-//   - Tier 3 (Privileged):  privileged mode, host PID, Docker network (not host)
-//   - Tier 4 (Full access): privileged, host PID, host network, Docker socket mount, all capabilities
+//   - Tier 2 (Standard):   resource limits (default 512 MiB, 1 CPU)
+//   - Tier 3 (Privileged): privileged + host PID, Docker network, capped resources
+//   - Tier 4 (Full access): privileged, host PID, host network, Docker socket, capped resources
+//
+// Per-tier memory/CPU caps are overridable via TIERn_MEMORY_MB /
+// TIERn_CPU_SHARES env vars (n in {2,3,4}).
 //
 // Unknown/zero tiers default to Tier 2 behavior (safe resource-limited container).
 func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configMount, name string) {
@@ -378,7 +461,8 @@ func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configM
 		// causes port collisions when multiple T3 containers run simultaneously.
 		hostCfg.Privileged = true
 		hostCfg.PidMode = "host"
-		log.Printf("Provisioner: T3 privileged mode for %s (privileged, host PID)", name)
+		memMB, shares := applyTierResources(hostCfg, 3)
+		log.Printf("Provisioner: T3 privileged mode for %s (privileged, host PID, %dm memory, %d CPU shares)", name, memMB, shares)
 
 	case 4:
 		// Full host access: everything from T3 + host network + Docker socket + all capabilities.
@@ -388,14 +472,14 @@ func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configM
 		hostCfg.NetworkMode = "host"
 		// Mount Docker socket so workspace can manage containers
 		hostCfg.Binds = append(hostCfg.Binds, "/var/run/docker.sock:/var/run/docker.sock")
-		log.Printf("Provisioner: T4 full-host mode for %s (privileged, host PID, host network, docker socket)", name)
+		memMB, shares := applyTierResources(hostCfg, 4)
+		log.Printf("Provisioner: T4 full-host mode for %s (privileged, host PID, host network, docker socket, %dm memory, %d CPU shares)", name, memMB, shares)
 
 	default:
 		// Tier 2 (Standard) and unknown tiers: normal container with resource limits.
 		// This is the safe default — no privileged access, reasonable resource caps.
-		hostCfg.Resources.Memory = 512 * 1024 * 1024    // 512 MiB
-		hostCfg.Resources.NanoCPUs = 1_000_000_000       // 1.0 CPU
-		log.Printf("Provisioner: T2 standard mode for %s (512m memory, 1 CPU)", name)
+		memMB, shares := applyTierResources(hostCfg, 2)
+		log.Printf("Provisioner: T2 standard mode for %s (%dm memory, %d CPU shares)", name, memMB, shares)
 	}
 }
 
diff --git a/platform/internal/provisioner/provisioner_test.go b/platform/internal/provisioner/provisioner_test.go
index 9db5f285..7bca0f73 100644
--- a/platform/internal/provisioner/provisioner_test.go
+++ b/platform/internal/provisioner/provisioner_test.go
@@ -662,3 +662,98 @@ func TestImageNotFoundErrorIncludesBuildHint(t *testing.T) {
 		}
 	}
 }
+
+// ---- issue #14: configurable per-tier memory/CPU limits ----
+
+// TestGetTierMemoryMB_DefaultsMatchLegacy asserts that with no env overrides,
+// getTierMemoryMB returns the agreed (issue #14) defaults.
+func TestGetTierMemoryMB_DefaultsMatchLegacy(t *testing.T) {
+	for _, k := range []string{"TIER2_MEMORY_MB", "TIER3_MEMORY_MB", "TIER4_MEMORY_MB"} {
+		t.Setenv(k, "") // empty is treated as "no override"; restored automatically after the test
+	}
+	cases := map[int]int64{
+		1: 0, // no cap
+		2: 512,
+		3: 2048,
+		4: 4096,
+		9: 0, // unknown
+	}
+	for tier, want := range cases {
+		if got := getTierMemoryMB(tier); got != want {
+			t.Errorf("getTierMemoryMB(%d): got %d, want %d", tier, got, want)
+		}
+	}
+}
+
+// TestGetTierMemoryMB_EnvOverride asserts TIERn_MEMORY_MB takes precedence,
+// and that a non-numeric or zero value falls back to the default.
+func TestGetTierMemoryMB_EnvOverride(t *testing.T) {
+	t.Setenv("TIER3_MEMORY_MB", "512")
+	if got := getTierMemoryMB(3); got != 512 {
+		t.Errorf("with TIER3_MEMORY_MB=512, got %d, want 512", got)
+	}
+	t.Setenv("TIER3_MEMORY_MB", "not-a-number")
+	if got := getTierMemoryMB(3); got != defaultTier3MemoryMB {
+		t.Errorf("malformed TIER3_MEMORY_MB: got %d, want default %d", got, defaultTier3MemoryMB)
+	}
+	t.Setenv("TIER3_MEMORY_MB", "0")
+	if got := getTierMemoryMB(3); got != defaultTier3MemoryMB {
+		t.Errorf("zero TIER3_MEMORY_MB: got %d, want default %d", got, defaultTier3MemoryMB)
+	}
+}
+
+// TestGetTierCPUShares_EnvOverride asserts TIERn_CPU_SHARES takes precedence, then that the default applies once it is unset.
+func TestGetTierCPUShares_EnvOverride(t *testing.T) {
+	t.Setenv("TIER3_CPU_SHARES", "4096")
+	if got := getTierCPUShares(3); got != 4096 {
+		t.Errorf("with TIER3_CPU_SHARES=4096, got %d, want 4096", got)
+	}
+	os.Unsetenv("TIER3_CPU_SHARES") // drop the override; the earlier t.Setenv still restores the prior value at cleanup
+	if got := getTierCPUShares(3); got != defaultTier3CPUShares {
+		t.Errorf("unset TIER3_CPU_SHARES: got %d, want default %d", got, defaultTier3CPUShares)
+	}
+}
+
+// TestApplyTierConfig_T3_UsesEnvOverride is the wiring test: env vars must
+// flow through ApplyTierConfig into hostCfg.Resources (Memory and NanoCPUs).
+func TestApplyTierConfig_T3_UsesEnvOverride(t *testing.T) {
+	t.Setenv("TIER3_MEMORY_MB", "8192")
+	t.Setenv("TIER3_CPU_SHARES", "4096") // 4 CPU == 4e9 NanoCPUs
+
+	hc := baseHostConfig("")
+	cfg := WorkspaceConfig{WorkspaceID: "abc123", Tier: 3}
+	ApplyTierConfig(hc, cfg, "ws-abc123-configs:/configs", "ws-abc123")
+
+	wantMem := int64(8192) * 1024 * 1024 // MiB -> bytes
+	if hc.Resources.Memory != wantMem {
+		t.Errorf("T3 memory override: got %d, want %d", hc.Resources.Memory, wantMem)
+	}
+	wantCPU := int64(4_000_000_000) // 4096 shares * 1e9 / 1024
+	if hc.Resources.NanoCPUs != wantCPU {
+		t.Errorf("T3 CPU override: got %d NanoCPUs, want %d", hc.Resources.NanoCPUs, wantCPU)
+	}
+	if !hc.Privileged || hc.PidMode != "host" {
+		t.Errorf("T3 override should preserve privileged/pid-host flags, got Privileged=%v PidMode=%q",
+			hc.Privileged, hc.PidMode)
+	}
+}
+
+// TestApplyTierConfig_T3_DefaultCap asserts T3 now gets a memory/CPU cap by
+// default (previously uncapped — behaviour change per issue #14).
+func TestApplyTierConfig_T3_DefaultCap(t *testing.T) {
+	for _, k := range []string{"TIER3_MEMORY_MB", "TIER3_CPU_SHARES"} {
+		t.Setenv(k, "") // empty is treated as "no override"; restored automatically after the test
+	}
+	hc := baseHostConfig("")
+	cfg := WorkspaceConfig{WorkspaceID: "abc123", Tier: 3}
+	ApplyTierConfig(hc, cfg, "ws-abc123-configs:/configs", "ws-abc123")
+
+	wantMem := int64(defaultTier3MemoryMB) * 1024 * 1024
+	if hc.Resources.Memory != wantMem {
+		t.Errorf("T3 default memory: got %d, want %d", hc.Resources.Memory, wantMem)
+	}
+	wantCPU := int64(defaultTier3CPUShares) * 1_000_000_000 / 1024
+	if hc.Resources.NanoCPUs != wantCPU {
+		t.Errorf("T3 default NanoCPUs: got %d, want %d", hc.Resources.NanoCPUs, wantCPU)
+	}
+}