From 6d7a7fc86faafbd5555ddc957c75d5e092090ea4 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 16:32:56 -0700 Subject: [PATCH 1/2] feat(org-import): make provision concurrency configurable via env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Org-import was hard-capped at 3 concurrent workspace provisions (#1084), calibrated for Docker-mode workspaces where each provision was a docker-run. Now that workspaces are EC2 instances, AWS RunInstances parallelises happily and the artificial cap of 3 makes a 7-workspace org-import take 3-4× longer than necessary (3 batches × ~70s/provision ≈ 4 min wall time when AWS could absorb all 7 in parallel for ~70s). This PR makes the cap configurable via MOLECULE_PROVISION_CONCURRENCY: unset → 3 (Docker-mode default, unchanged) "0" → effectively unlimited (SaaS / EC2 backend; AWS rate-limit + vCPU quota are the real backpressure) N>0 → exactly N N<0 → fall back to default 3 + warning log garbage → fall back to default 3 + warning log The "0 = unlimited" mapping is the user-facing convention requested for SaaS deployments — operators don't have to pick an arbitrary large number. Implementation hands off 1<<20 internally so the channel-based semaphore stays a no-op without infinite-buffer risk. Test coverage (org_provision_concurrency_test.go, 6 cases / 15 subtests): - unset → default - "0" → large unlimited cap - positive integer exact (1, 5, 10, 50) - negative → default + warning - non-numeric → default + warning - whitespace-trimmed (" 7 " → 7) Boot-time log line confirms the resolved cap so an operator can verify their env is being honored without re-deploying. Does NOT address the separate 600s "never registered" timeout the user also reported during org-import — that's filed as molecule-core#2793 for proper investigation (parallel-provision contention, network routing, register-retry budget, or container-start failure are all candidates and need live SSM capture to bisect). Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace-server/internal/handlers/org.go | 74 ++++++++++++-- .../org_provision_concurrency_test.go | 96 +++++++++++++++++++ 2 files changed, 164 insertions(+), 6 deletions(-) create mode 100644 workspace-server/internal/handlers/org_provision_concurrency_test.go diff --git a/workspace-server/internal/handlers/org.go b/workspace-server/internal/handlers/org.go index c28785b2..5f57b24c 100644 --- a/workspace-server/internal/handlers/org.go +++ b/workspace-server/internal/handlers/org.go @@ -11,6 +11,8 @@ import ( "net/http" "os" "path/filepath" + "strconv" + "strings" "github.com/Molecule-AI/molecule-monorepo/platform/internal/channels" "github.com/Molecule-AI/molecule-monorepo/platform/internal/events" @@ -25,10 +27,62 @@ import ( // during org import. Prevents overwhelming Docker when creating many containers. const workspaceCreatePacingMs = 2000 -// provisionConcurrency limits how many Docker containers can be provisioned -// simultaneously during org import. Without this, importing 39+ workspaces -// fires 39 goroutines that all hit Docker at once, causing timeouts (#1084). -const provisionConcurrency = 3 +// defaultProvisionConcurrency is the fallback cap for parallel +// workspace-provision goroutines when MOLECULE_PROVISION_CONCURRENCY +// is unset. Originally a hard constant of 3 (PR #1084) calibrated for +// Docker-mode workspaces. The constant is now a default — operators +// running on EC2 (where each provision is a RunInstances call AWS +// happily parallelises) typically want a much higher cap, while +// Docker-mode dev environments still prefer the conservative 3. +// +// 3 keeps the existing Docker-mode behavior. SaaS deployments override +// via env (see resolveProvisionConcurrency below). +const defaultProvisionConcurrency = 3 + +// resolveProvisionConcurrency returns the effective semaphore size for +// org-import workspace provisioning, honoring MOLECULE_PROVISION_CONCURRENCY: +// +// - unset / empty / non-numeric → defaultProvisionConcurrency (3) +// - "0" → unlimited (a very large cap; +// practically no semaphore — used on +// SaaS where AWS RunInstances is the +// rate-limiter, not us) +// - any positive integer N → N +// - negative integer → defaultProvisionConcurrency (3), +// log warning so operator notices +// the misconfiguration +// +// The "0 = unlimited" mapping was a deliberate choice: an env var of "0" +// is the natural shorthand for "no cap" without forcing operators to +// type a magic large number. The implementation hands off a large but +// finite value (1<<20) so the channel still works as a regular +// buffered chan; goroutines will never block on the semaphore in +// practice. +func resolveProvisionConcurrency() int { + raw := strings.TrimSpace(os.Getenv("MOLECULE_PROVISION_CONCURRENCY")) + if raw == "" { + return defaultProvisionConcurrency + } + n, err := strconv.Atoi(raw) + if err != nil { + log.Printf("org_import: MOLECULE_PROVISION_CONCURRENCY=%q is not an integer; falling back to default %d", + raw, defaultProvisionConcurrency) + return defaultProvisionConcurrency + } + if n < 0 { + log.Printf("org_import: MOLECULE_PROVISION_CONCURRENCY=%d is negative; falling back to default %d", + n, defaultProvisionConcurrency) + return defaultProvisionConcurrency + } + if n == 0 { + // Unlimited semantics — use a large but finite cap so the + // chan-based semaphore stays a no-op. 1M is well past any + // realistic org-import size; AWS RunInstances rate-limit and + // account vCPU quota are the real backpressure here. + return 1 << 20 + } + return n +} // Child grid layout constants — kept in sync with canvas-topology.ts on // the client. Children laid on import use the same 2-column grid so the @@ -600,8 +654,16 @@ func (h *OrgHandler) Import(c *gin.Context) { results := []map[string]interface{}{} var createErr error - // Semaphore limits concurrent Docker provisioning (#1084). - provisionSem := make(chan struct{}, provisionConcurrency) + // Semaphore limits concurrent provision goroutines (#1084). + // Cap is configurable via MOLECULE_PROVISION_CONCURRENCY: + // unset → 3 (Docker-mode default) + // "0" → effectively unlimited (SaaS / EC2 backend) + // N>0 → exactly N + // See resolveProvisionConcurrency for the full env-parse contract. + concurrency := resolveProvisionConcurrency() + provisionSem := make(chan struct{}, concurrency) + log.Printf("org_import: provision concurrency cap=%d (env MOLECULE_PROVISION_CONCURRENCY=%q)", + concurrency, os.Getenv("MOLECULE_PROVISION_CONCURRENCY")) // Recursively create workspaces. Root workspaces keep their YAML // canvas coords; children are positioned by createWorkspaceTree diff --git a/workspace-server/internal/handlers/org_provision_concurrency_test.go b/workspace-server/internal/handlers/org_provision_concurrency_test.go new file mode 100644 index 00000000..a07e7477 --- /dev/null +++ b/workspace-server/internal/handlers/org_provision_concurrency_test.go @@ -0,0 +1,96 @@ +package handlers + +import ( + "testing" +) + +// Tests for resolveProvisionConcurrency — the env-parse contract that +// turns MOLECULE_PROVISION_CONCURRENCY into the channel-buffer size for +// the org-import provision semaphore. +// +// Why this matters: with the wrong cap, org-import either serializes +// (cap=1, slow) or stampedes the provider (cap=infinity on a backend +// that can't take it). The defaults — 3 for Docker, "0=unlimited" for +// EC2/SaaS — are what most operators want; the parse logic exists to +// route the env var to the right behavior without surprise. +// +// The "0 → unlimited" mapping is the user-facing piece worth pinning +// in tests: easy to misread as "0 means stop entirely" if someone +// re-reads the constant block years later. + +func TestResolveProvisionConcurrency_UnsetUsesDefault(t *testing.T) { + t.Setenv("MOLECULE_PROVISION_CONCURRENCY", "") + if got := resolveProvisionConcurrency(); got != defaultProvisionConcurrency { + t.Errorf("unset env: got %d, want %d", got, defaultProvisionConcurrency) + } +} + +func TestResolveProvisionConcurrency_ZeroIsUnlimited(t *testing.T) { + // "0" is the user-facing shorthand for "no cap". The implementation + // returns a large but finite cap so the channel-based semaphore + // stays a no-op without infinite-buffer risk. + t.Setenv("MOLECULE_PROVISION_CONCURRENCY", "0") + got := resolveProvisionConcurrency() + if got <= defaultProvisionConcurrency { + t.Errorf("0 should map to large 'unlimited' cap, got %d", got) + } + // 1<<20 today; pin the lower bound rather than the exact value so + // future tuning of the magic number doesn't break this test. + if got < 1024 { + t.Errorf("0 should map to a cap >= 1024 (effectively unlimited), got %d", got) + } +} + +func TestResolveProvisionConcurrency_PositiveIntegerExact(t *testing.T) { + cases := []struct { + env string + want int + }{ + {"1", 1}, + {"5", 5}, + {"10", 10}, + {"50", 50}, + } + for _, tc := range cases { + t.Run(tc.env, func(t *testing.T) { + t.Setenv("MOLECULE_PROVISION_CONCURRENCY", tc.env) + if got := resolveProvisionConcurrency(); got != tc.want { + t.Errorf("env=%q: got %d, want %d", tc.env, got, tc.want) + } + }) + } +} + +func TestResolveProvisionConcurrency_NegativeFallsBackToDefault(t *testing.T) { + // Negative values are operator misconfiguration. Fall back to the + // safe default rather than passing through to make(chan, -5) which + // panics. The handler logs a warning so the operator notices. + t.Setenv("MOLECULE_PROVISION_CONCURRENCY", "-5") + if got := resolveProvisionConcurrency(); got != defaultProvisionConcurrency { + t.Errorf("negative env: got %d, want default %d", got, defaultProvisionConcurrency) + } +} + +func TestResolveProvisionConcurrency_NonNumericFallsBackToDefault(t *testing.T) { + // Garbage in env shouldn't crash org-import. Common in dev when an + // operator types `MOLECULE_PROVISION_CONCURRENCY=true` or similar. + cases := []string{"true", "yes", "infinity", "ten", "3.5", "0x10"} + for _, raw := range cases { + t.Run(raw, func(t *testing.T) { + t.Setenv("MOLECULE_PROVISION_CONCURRENCY", raw) + if got := resolveProvisionConcurrency(); got != defaultProvisionConcurrency { + t.Errorf("non-numeric env=%q: got %d, want default %d", + raw, got, defaultProvisionConcurrency) + } + }) + } +} + +func TestResolveProvisionConcurrency_WhitespaceTrimmed(t *testing.T) { + // Operators frequently set env vars with stray whitespace from + // copy-paste. Trim before parse so " 7 " == "7". + t.Setenv("MOLECULE_PROVISION_CONCURRENCY", " 7 ") + if got := resolveProvisionConcurrency(); got != 7 { + t.Errorf("whitespace env: got %d, want 7", got) + } +} From 26fa220befe5be0b2a3e506f4ab82f41e6c97cdd Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 16:35:21 -0700 Subject: [PATCH 2/2] ci(coverage): per-file 75% floor for MCP/inbox/auth Python critical paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes part of #2790 (Phase A). The Python total floor at 86% (set in workspace/pytest.ini, issue #1817) averages over ~6000 lines, so a single MCP-critical file could regress to ~50% with no CI complaint as long as other modules compensate. This is the same distribution gap that #1823 closed Go-side: total floor passes while a critical handler sits at 0%. Added gates for these five files (per-file floor 75%): - workspace/a2a_mcp_server.py — MCP dispatcher (PR #2766 / #2771) - workspace/mcp_cli.py — molecule-mcp standalone CLI entry - workspace/a2a_tools.py — workspace-scoped tool implementations - workspace/inbox.py — multi-workspace inbox + per-workspace cursors - workspace/platform_auth.py — per-workspace token resolver These handle multi-tenant routing, auth tokens, and inbox dispatch. Risk shape mirrors Go-side tokens*/secrets* — a 0%/50% file here is exactly where the PR #2766 dispatcher bug class slips through without a structural test. Floor 75% is strictly additive — current actuals 80-96% (measured 2026-05-04). No existing PR fails. Ratchet plan in COVERAGE_FLOOR.md target 90% by 2026-08-04. Implementation: pytest already writes .coverage; new step emits a JSON view scoped to the critical files via `coverage json --include="*name"`, then jq extracts each file's percent_covered. Exact key match by basename so workspace/builtin_tools/a2a_tools.py (a different 100% file) doesn't shadow workspace/a2a_tools.py. Verified locally with the actual coverage data: - floor=75 → 0 failures (matches current state) - floor=81 → 1 failure (a2a_tools.py at 80%) — proves the gate trips Pairs with PR #2791 (Phase B — schema↔dispatcher AST drift gate). Phase C (molecule-mcp e2e harness) remains the largest piece in #2790. YAML validated locally before commit per feedback_validate_yaml_before_commit. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 66 ++++++++++++++++++++++++++++++++++++++++ COVERAGE_FLOOR.md | 52 +++++++++++++++++++++++++++++-- 2 files changed, 116 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7f0c72bb..a642e1a3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -358,6 +358,72 @@ jobs: - if: needs.changes.outputs.python == 'true' run: python -m pytest --tb=short + - if: needs.changes.outputs.python == 'true' + name: Per-file critical-path coverage (MCP / inbox / auth) + # MCP-critical Python files have a per-file floor on top of the + # 86% total floor in pytest.ini. Rationale (issue #2790, after + # the PR #2766 → PR #2771 cycle): the total floor averages ~6000 + # lines, so a single MCP file could regress to ~50% with no + # complaint as long as other modules compensate. These five + # files handle multi-tenant routing + auth + inbox dispatch — + # a coverage drop here is the same risk shape as a Go-side + # workspace-server token/secrets file dropping below 10%. + # + # Floor 75% sits below current actuals (80-96%) so this gate is + # strictly additive — no existing PR fails. Ratchet plan in + # COVERAGE_FLOOR.md. + run: | + set -e + PER_FILE_FLOOR=75 + CRITICAL_FILES=( + "a2a_mcp_server.py" + "mcp_cli.py" + "a2a_tools.py" + "inbox.py" + "platform_auth.py" + ) + + # pytest already wrote .coverage; emit a JSON view scoped to + # the critical files so jq/python can read the per-file pct + # without parsing tabular text. --include uses fnmatch, and + # the leading "*" allows the file to live anywhere under the + # workspace root (today they sit at workspace/.py). + INCLUDES=$(printf '*%s,' "${CRITICAL_FILES[@]}") + INCLUDES="${INCLUDES%,}" + python -m coverage json -o /tmp/critical-cov.json --include="$INCLUDES" + + FAILED=0 + for f in "${CRITICAL_FILES[@]}"; do + # Match by top-level path key (e.g. "a2a_tools.py", not + # "builtin_tools/a2a_tools.py" — different file at 100%). + # The keys in coverage.json are paths relative to the run + # cwd (workspace/), so the critical-path entry sits at the + # bare basename. + pct=$(jq -r --arg f "$f" '.files | to_entries | map(select(.key == $f)) | .[0].value.summary.percent_covered // "MISSING"' /tmp/critical-cov.json) + if [ "$pct" = "MISSING" ]; then + echo "::error file=workspace/$f::No coverage data — file may have moved or test exclusion mis-set." + FAILED=$((FAILED+1)) + continue + fi + echo "$f: ${pct}%" + if awk "BEGIN{exit !($pct < $PER_FILE_FLOOR)}"; then + echo "::error file=workspace/$f::${pct}% < ${PER_FILE_FLOOR}% per-file floor (MCP critical path). See COVERAGE_FLOOR.md." + FAILED=$((FAILED+1)) + fi + done + + if [ "$FAILED" -gt 0 ]; then + echo "" + echo "$FAILED MCP critical-path file(s) below the ${PER_FILE_FLOOR}% per-file floor." + echo "These paths handle multi-tenant routing, auth tokens, and inbox dispatch." + echo "A coverage drop here is the same risk shape as Go-side tokens/secrets files" + echo "dropping below 10% (see COVERAGE_FLOOR.md). Either:" + echo " (a) add tests to raise coverage back above ${PER_FILE_FLOOR}%, or" + echo " (b) if this is unavoidable historical debt, file an issue and propose" + echo " adjusting the floor with rationale in COVERAGE_FLOOR.md." + exit 1 + fi + # SDK + plugin validation moved to standalone repo: # github.com/Molecule-AI/molecule-sdk-python diff --git a/COVERAGE_FLOOR.md b/COVERAGE_FLOOR.md index 2870a649..c768b318 100644 --- a/COVERAGE_FLOOR.md +++ b/COVERAGE_FLOOR.md @@ -1,7 +1,7 @@ # Coverage Floor -CI enforces three coverage gates on `workspace-server` (Go). All defined in -`.github/workflows/ci.yml` → `platform-build` job. +CI enforces coverage gates on two surfaces — `workspace-server` (Go) and +`workspace/` (Python). All defined in `.github/workflows/ci.yml`. ## Current floors (2026-04-23) @@ -76,3 +76,51 @@ This gate makes "no untested critical paths merged" a mechanical property of the CI, not a behavioural property of QA agents or individual reviewers — which is the only way to make it survive fleet outages, agent rotations, or QA process changes. + +## Python (workspace/) — added 2026-05-04 from #2790 + +The Python side has its own gates in the `python-lint` job: + +| Gate | Threshold | Where | +|---|---|---| +| **Total floor** | `86%` | `workspace/pytest.ini` `--cov-fail-under=86` (issue #1817) | +| **Critical-path per-file floor** | `75%` | Inline shell step after the pytest run | + +### Critical-path Python files + +These handle multi-tenant routing, auth tokens, and inbox dispatch. A +coverage drop here is the same risk shape as a Go-side `tokens*` / +`secrets*` file regressing below 10%. + +- `workspace/a2a_mcp_server.py` — MCP dispatcher (PR #2766 / #2771) +- `workspace/mcp_cli.py` — molecule-mcp standalone CLI entry +- `workspace/a2a_tools.py` — workspace-scoped tool implementations +- `workspace/inbox.py` — multi-workspace inbox + per-workspace cursors +- `workspace/platform_auth.py` — per-workspace token resolver + +### Why 75% (vs 86% total) + +The total floor averages ~6000 lines across `workspace/`. A single MCP +file could drop to ~50% with no CI complaint as long as other modules +compensate. The per-file floor closes that distribution gap. 75% sits +below current actuals (80–96% as of 2026-05-04) — strictly additive, +no existing PR fails. + +### Python ratchet plan + +| Date | Total | Per-file critical | Notes | +|---|---|---|---| +| 2026-05-04 | 86% | 75% | Initial gate (this file). | +| 2026-06-04 | 86% | 80% | First ratchet — at-floor files must catch up. | +| 2026-07-04 | 88% | 85% | | +| 2026-08-04 | 90% | 90% | Target steady-state. | + +### Why this Python gate exists + +Issue #2790, after the PR #2766 → PR #2771 cycle. PR #2766 added +multi-workspace routing through `a2a_tools.py` + `a2a_mcp_server.py`, +shipped to main with green CI, but the dispatcher silently dropped a +load-bearing kwarg for 4 of 9 tools — caught only by post-merge code +review. The structural drift gate (`test_dispatcher_schema_drift.py`, +PR #2791) catches the schema↔dispatcher mismatch class; this floor +catches the broader "MCP-critical file regressed" class.