Merge pull request #58 from Molecule-AI/feat/issue-14-configurable-tier-limits

noteworthy: behavior-change — T3/T4 caps introduced where previously unlimited; defaults match issue #14 spec; operators can override via env
This commit is contained in:
Hongming Wang 2026-04-14 12:25:00 -07:00 committed by GitHub
commit 49cd28ac17
4 changed files with 204 additions and 8 deletions

View File

@ -71,6 +71,18 @@ MAX_TOKENS=2048 # Max output tokens for OpenRouter requests (defa
LANGGRAPH_RECURSION_LIMIT=500 # LangGraph/DeepAgents max ReAct steps per turn (lib default: 25; raised to 500 — PM fan-out to 6+ reports + synthesis routinely exceeds 100)
MODEL_PROVIDER=anthropic:claude-sonnet-4-6 # Format: provider:model. Providers: anthropic, openai, openrouter, groq, cerebras, google_genai, ollama
# ---- Workspace tier resource limits (issue #14) ----
# Per-tier memory/CPU caps applied to each workspace Docker container.
# CPU_SHARES follows the Docker convention: 1024 shares == 1 CPU. Note: the value is converted to a hard NanoCPUs quota (an absolute cap), not a relative --cpu-shares weight.
# Any value <=0 or malformed falls back to the compiled default shown.
# Tier 1 is sandboxed (tmpfs, readonly) and is not resource-capped here.
TIER2_MEMORY_MB=512 # Standard tier memory cap (default 512 MiB)
TIER2_CPU_SHARES=1024 # Standard tier CPU (default 1024 = 1 CPU)
TIER3_MEMORY_MB=2048 # Privileged tier memory cap (default 2048 MiB; previously uncapped)
TIER3_CPU_SHARES=2048 # Privileged tier CPU (default 2048 = 2 CPU; previously uncapped)
TIER4_MEMORY_MB=4096 # Full-host tier memory cap (default 4096 MiB; previously uncapped)
TIER4_CPU_SHARES=4096 # Full-host tier CPU (default 4096 = 4 CPU; previously uncapped)
# Social Channels (optional — configure per-workspace via API or Canvas)
TELEGRAM_BOT_TOKEN= # Telegram Bot API token (talk to @BotFather). Used as default for new Telegram channels.

View File

@ -55,6 +55,11 @@ go build -o molecli ./cmd/cli # Build TUI dashboard
```
Must run from `platform/` directory (not repo root). Env vars: `DATABASE_URL`, `REDIS_URL`, `PORT`, `PLATFORM_URL` (default `http://host.docker.internal:PORT` — passed to agent containers so they can reach the platform), `SECRETS_ENCRYPTION_KEY` (optional AES-256, 32 bytes), `CONFIGS_DIR` (auto-discovered), `PLUGINS_DIR` (deprecated — plugins are now installed per-workspace via API; the `plugins/` registry at repo root is auto-discovered), `ACTIVITY_RETENTION_DAYS` (default `7`), `ACTIVITY_CLEANUP_INTERVAL_HOURS` (default `6`), `CORS_ORIGINS` (comma-separated, default `http://localhost:3000,http://localhost:3001`), `RATE_LIMIT` (requests/min, default `600`), `WORKSPACE_DIR` (optional — global fallback host path for `/workspace` bind-mount; overridden by per-workspace `workspace_dir` column in DB; if neither is set, each workspace gets an isolated Docker named volume), `AWARENESS_URL` (optional — if set, injected into workspace containers along with a deterministic `AWARENESS_NAMESPACE` derived from workspace ID), `MOLECULE_IN_DOCKER` (optional — set to `1` when the platform itself runs inside Docker so the A2A proxy rewrites `127.0.0.1:<port>` URLs to container hostnames; auto-detected via `/.dockerenv`), `MOLECULE_ENV` (optional — set to `production` to hide the `/admin/workspaces/:id/test-token` E2E helper endpoint; unset or any other value leaves it enabled), `MOLECULE_ENABLE_TEST_TOKENS` (optional — set to `1` to force-enable the test-token endpoint even when `MOLECULE_ENV=production`; intended for staging runs only).
**Workspace tier resource limits** (issue #14 — override the per-tier memory/CPU caps in `provisioner.ApplyTierConfig`; CPU_SHARES follows Docker's 1024 = 1 CPU convention, translated to NanoCPUs for a hard cap):
- `TIER2_MEMORY_MB` / `TIER2_CPU_SHARES` — Standard tier (defaults `512` / `1024`)
- `TIER3_MEMORY_MB` / `TIER3_CPU_SHARES` — Privileged tier (defaults `2048` / `2048`; previously uncapped)
- `TIER4_MEMORY_MB` / `TIER4_CPU_SHARES` — Full-host tier (defaults `4096` / `4096`; previously uncapped)
**Plugin install safeguards** (bound the cost of a single `POST /workspaces/:id/plugins` install so a slow/malicious source can't tie up a handler):
- `PLUGIN_INSTALL_BODY_MAX_BYTES` — max request body size (default `65536` = 64 KiB)
- `PLUGIN_INSTALL_FETCH_TIMEOUT` — duration string; whole fetch+copy deadline (default `5m`)

View File

@ -10,6 +10,7 @@ import (
"log"
"os"
"path/filepath"
"strconv"
"strings"
"time"
@ -349,13 +350,95 @@ func buildContainerEnv(cfg WorkspaceConfig) []string {
return env
}
// Per-tier resource defaults. Configurable via TIERn_MEMORY_MB and
// TIERn_CPU_SHARES env vars (n in {2,3,4}). CPU shares follow the convention
// 1024 shares == 1 CPU; internally translated to NanoCPUs for a hard cap.
//
// Defaults reflect the tier sizing agreed in issue #14:
//   - T2: 512 MiB, 1024 shares (1 CPU) — unchanged historical default
//   - T3: 2048 MiB, 2048 shares (2 CPU) — new cap (previously uncapped)
//   - T4: 4096 MiB, 4096 shares (4 CPU) — new cap (previously uncapped)
const (
	defaultTier2MemoryMB  = 512
	defaultTier2CPUShares = 1024
	defaultTier3MemoryMB  = 2048
	defaultTier3CPUShares = 2048
	defaultTier4MemoryMB  = 4096
	defaultTier4CPUShares = 4096
)

// tierEnvInt64 returns the value of the env var name when it parses as a
// positive int64; otherwise (unset, empty, malformed, zero, or negative) it
// returns def. Shared by getTierMemoryMB and getTierCPUShares so both knobs
// keep identical override/fallback semantics.
func tierEnvInt64(name string, def int64) int64 {
	v := os.Getenv(name)
	if v == "" {
		return def
	}
	n, err := strconv.ParseInt(v, 10, 64)
	if err != nil || n <= 0 {
		return def
	}
	return n
}

// getTierMemoryMB returns the memory cap (MiB) for the given tier, reading
// the TIERn_MEMORY_MB env var with fallback to the hardcoded default. Returns
// 0 for tiers with no cap (e.g. tier 1 and unknown tiers).
func getTierMemoryMB(tier int) int64 {
	var def int64
	switch tier {
	case 2:
		def = defaultTier2MemoryMB
	case 3:
		def = defaultTier3MemoryMB
	case 4:
		def = defaultTier4MemoryMB
	default:
		return 0
	}
	return tierEnvInt64(fmt.Sprintf("TIER%d_MEMORY_MB", tier), def)
}

// getTierCPUShares returns the CPU allocation (shares, where 1024 == 1 CPU)
// for the given tier, reading the TIERn_CPU_SHARES env var with fallback to
// the hardcoded default. Returns 0 for tiers with no cap.
func getTierCPUShares(tier int) int64 {
	var def int64
	switch tier {
	case 2:
		def = defaultTier2CPUShares
	case 3:
		def = defaultTier3CPUShares
	case 4:
		def = defaultTier4CPUShares
	default:
		return 0
	}
	return tierEnvInt64(fmt.Sprintf("TIER%d_CPU_SHARES", tier), def)
}
// applyTierResources resolves the tier's memory/CPU limits (env override or
// compiled default) and writes them into hostCfg: Memory in bytes, plus a
// hard NanoCPUs quota derived from the share count. A zero limit (uncapped
// tier) leaves the corresponding field untouched. The resolved values are
// returned so the caller can include them in its log line.
func applyTierResources(hostCfg *container.HostConfig, tier int) (memMB, cpuShares int64) {
	memMB, cpuShares = getTierMemoryMB(tier), getTierCPUShares(tier)
	if memMB > 0 {
		hostCfg.Resources.Memory = memMB << 20 // MiB -> bytes
	}
	if cpuShares > 0 {
		// shares -> NanoCPUs: 1024 shares == 1 CPU == 1e9 NanoCPUs
		hostCfg.Resources.NanoCPUs = cpuShares * 1_000_000_000 / 1024
	}
	return memMB, cpuShares
}
// ApplyTierConfig configures a HostConfig based on the workspace tier.
// Extracted from Start() so it can be tested independently.
//
// - Tier 1 (Sandboxed): readonly rootfs, tmpfs /tmp, strip /workspace mount
// - Tier 2 (Standard): resource limits (512 MiB memory, 1 CPU), no special flags (default)
// - Tier 3 (Privileged): privileged mode, host PID, Docker network (not host)
// - Tier 4 (Full access): privileged, host PID, host network, Docker socket mount, all capabilities
// - Tier 2 (Standard): resource limits (default 512 MiB, 1 CPU)
// - Tier 3 (Privileged): privileged + host PID, Docker network, capped resources
// - Tier 4 (Full access): privileged, host PID, host network, Docker socket, capped resources
//
// Per-tier memory/CPU caps are overridable via TIERn_MEMORY_MB /
// TIERn_CPU_SHARES env vars (n in {2,3,4}).
//
// Unknown/zero tiers default to Tier 2 behavior (safe resource-limited container).
func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configMount, name string) {
@ -378,7 +461,8 @@ func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configM
// causes port collisions when multiple T3 containers run simultaneously.
hostCfg.Privileged = true
hostCfg.PidMode = "host"
log.Printf("Provisioner: T3 privileged mode for %s (privileged, host PID)", name)
memMB, shares := applyTierResources(hostCfg, 3)
log.Printf("Provisioner: T3 privileged mode for %s (privileged, host PID, %dm memory, %d CPU shares)", name, memMB, shares)
case 4:
// Full host access: everything from T3 + host network + Docker socket + all capabilities.
@ -388,14 +472,14 @@ func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configM
hostCfg.NetworkMode = "host"
// Mount Docker socket so workspace can manage containers
hostCfg.Binds = append(hostCfg.Binds, "/var/run/docker.sock:/var/run/docker.sock")
log.Printf("Provisioner: T4 full-host mode for %s (privileged, host PID, host network, docker socket)", name)
memMB, shares := applyTierResources(hostCfg, 4)
log.Printf("Provisioner: T4 full-host mode for %s (privileged, host PID, host network, docker socket, %dm memory, %d CPU shares)", name, memMB, shares)
default:
// Tier 2 (Standard) and unknown tiers: normal container with resource limits.
// This is the safe default — no privileged access, reasonable resource caps.
hostCfg.Resources.Memory = 512 * 1024 * 1024 // 512 MiB
hostCfg.Resources.NanoCPUs = 1_000_000_000 // 1.0 CPU
log.Printf("Provisioner: T2 standard mode for %s (512m memory, 1 CPU)", name)
memMB, shares := applyTierResources(hostCfg, 2)
log.Printf("Provisioner: T2 standard mode for %s (%dm memory, %d CPU shares)", name, memMB, shares)
}
}

View File

@ -662,3 +662,98 @@ func TestImageNotFoundErrorIncludesBuildHint(t *testing.T) {
}
}
}
// ---- issue #14: configurable per-tier memory/CPU limits ----
// TestGetTierMemoryMB_DefaultsMatchLegacy asserts that with no env overrides,
// getTierMemoryMB returns the agreed (issue #14) defaults.
func TestGetTierMemoryMB_DefaultsMatchLegacy(t *testing.T) {
for _, k := range []string{"TIER2_MEMORY_MB", "TIER3_MEMORY_MB", "TIER4_MEMORY_MB"} {
os.Unsetenv(k)
}
cases := map[int]int64{
1: 0, // no cap
2: 512,
3: 2048,
4: 4096,
9: 0, // unknown
}
for tier, want := range cases {
if got := getTierMemoryMB(tier); got != want {
t.Errorf("getTierMemoryMB(%d): got %d, want %d", tier, got, want)
}
}
}
// TestGetTierMemoryMB_EnvOverride asserts TIERn_MEMORY_MB takes precedence,
// and that malformed / non-positive values fall back to the default.
func TestGetTierMemoryMB_EnvOverride(t *testing.T) {
	// t.Setenv restores the pre-test value automatically on cleanup.
	set := func(v string) { t.Setenv("TIER3_MEMORY_MB", v) }

	// A well-formed positive value overrides the compiled default.
	set("512")
	if got := getTierMemoryMB(3); got != 512 {
		t.Errorf("with TIER3_MEMORY_MB=512, got %d, want 512", got)
	}

	// A value that fails to parse falls back to the default.
	set("not-a-number")
	if got := getTierMemoryMB(3); got != defaultTier3MemoryMB {
		t.Errorf("malformed TIER3_MEMORY_MB: got %d, want default %d", got, defaultTier3MemoryMB)
	}

	// Non-positive values are rejected and fall back too.
	set("0")
	if got := getTierMemoryMB(3); got != defaultTier3MemoryMB {
		t.Errorf("zero TIER3_MEMORY_MB: got %d, want default %d", got, defaultTier3MemoryMB)
	}
}
// TestGetTierCPUShares_EnvOverride asserts TIERn_CPU_SHARES takes precedence.
func TestGetTierCPUShares_EnvOverride(t *testing.T) {
	// Override wins over the compiled default.
	t.Setenv("TIER3_CPU_SHARES", "4096")
	if got := getTierCPUShares(3); got != 4096 {
		t.Errorf("with TIER3_CPU_SHARES=4096, got %d, want 4096", got)
	}

	// Removing the variable restores the default. Clearing mid-test is safe:
	// t.Setenv above already registered a cleanup that restores the pre-test
	// value when this test finishes.
	os.Unsetenv("TIER3_CPU_SHARES")
	if got := getTierCPUShares(3); got != defaultTier3CPUShares {
		t.Errorf("unset TIER3_CPU_SHARES: got %d, want default %d", got, defaultTier3CPUShares)
	}
}
// TestApplyTierConfig_T3_UsesEnvOverride is the wiring test: env vars must
// flow through ApplyTierConfig into hostCfg.Resources.
func TestApplyTierConfig_T3_UsesEnvOverride(t *testing.T) {
	t.Setenv("TIER3_MEMORY_MB", "8192")
	t.Setenv("TIER3_CPU_SHARES", "4096") // 4 CPU == 4e9 NanoCPUs

	hc := baseHostConfig("")
	ApplyTierConfig(hc, WorkspaceConfig{WorkspaceID: "abc123", Tier: 3}, "ws-abc123-configs:/configs", "ws-abc123")

	// 8192 MiB in bytes, and 4096 shares translated to NanoCPUs.
	const (
		wantMem = int64(8192) * 1024 * 1024
		wantCPU = int64(4_000_000_000)
	)
	if hc.Resources.Memory != wantMem {
		t.Errorf("T3 memory override: got %d, want %d", hc.Resources.Memory, wantMem)
	}
	if hc.Resources.NanoCPUs != wantCPU {
		t.Errorf("T3 CPU override: got %d NanoCPUs, want %d", hc.Resources.NanoCPUs, wantCPU)
	}

	// Resource caps must not displace the tier's privilege flags.
	if !hc.Privileged || hc.PidMode != "host" {
		t.Errorf("T3 override should preserve privileged/pid-host flags, got Privileged=%v PidMode=%q",
			hc.Privileged, hc.PidMode)
	}
}
// TestApplyTierConfig_T3_DefaultCap asserts T3 now gets a memory/CPU cap by
// default (previously uncapped — behaviour change per issue #14).
func TestApplyTierConfig_T3_DefaultCap(t *testing.T) {
for _, k := range []string{"TIER3_MEMORY_MB", "TIER3_CPU_SHARES"} {
os.Unsetenv(k)
}
hc := baseHostConfig("")
cfg := WorkspaceConfig{WorkspaceID: "abc123", Tier: 3}
ApplyTierConfig(hc, cfg, "ws-abc123-configs:/configs", "ws-abc123")
wantMem := int64(defaultTier3MemoryMB) * 1024 * 1024
if hc.Resources.Memory != wantMem {
t.Errorf("T3 default memory: got %d, want %d", hc.Resources.Memory, wantMem)
}
wantCPU := int64(defaultTier3CPUShares) * 1_000_000_000 / 1024
if hc.Resources.NanoCPUs != wantCPU {
t.Errorf("T3 default NanoCPUs: got %d, want %d", hc.Resources.NanoCPUs, wantCPU)
}
}