From 6d7a7fc86faafbd5555ddc957c75d5e092090ea4 Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangalt@gmail.com>
Date: Mon, 4 May 2026 16:32:56 -0700
Subject: [PATCH 1/2] feat(org-import): make provision concurrency configurable
 via env
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Org-import was hard-capped at 3 concurrent workspace provisions (#1084),
calibrated for Docker-mode workspaces where each provision was a
docker-run. Now that workspaces are EC2 instances, AWS RunInstances
parallelises happily and the artificial cap of 3 makes a 7-workspace
org-import take 3-4× longer than necessary (3 batches × ~70s/provision
≈ 4 min wall time when AWS could absorb all 7 in parallel for ~70s).

This PR makes the cap configurable via MOLECULE_PROVISION_CONCURRENCY:
  unset    → 3 (Docker-mode default, unchanged)
  "0"      → effectively unlimited (SaaS / EC2 backend; AWS rate-limit
             + vCPU quota are the real backpressure)
  N>0      → exactly N
  N<0      → fall back to default 3 + warning log
  garbage  → fall back to default 3 + warning log

The "0 = unlimited" mapping is the user-facing convention requested for
SaaS deployments — operators don't have to pick an arbitrary large
number. Implementation hands off 1<<20 internally so the channel-based
semaphore stays a no-op without infinite-buffer risk.

Test coverage (org_provision_concurrency_test.go, 6 cases / 15 subtests):
- unset → default
- "0" → large unlimited cap
- positive integer exact (1, 5, 10, 50)
- negative → default + warning
- non-numeric → default + warning
- whitespace-trimmed (" 7 " → 7)

Boot-time log line confirms the resolved cap so an operator can verify
their env is being honored without re-deploying.

Does NOT address the separate 600s "never registered" timeout the user
also reported during org-import — that's filed as molecule-core#2793
for proper investigation (parallel-provision contention, network
routing, register-retry budget, or container-start failure are all
candidates and need live SSM capture to bisect).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 workspace-server/internal/handlers/org.go     | 74 ++++++++++++--
 .../org_provision_concurrency_test.go         | 96 +++++++++++++++++++
 2 files changed, 164 insertions(+), 6 deletions(-)
 create mode 100644 workspace-server/internal/handlers/org_provision_concurrency_test.go

diff --git a/workspace-server/internal/handlers/org.go b/workspace-server/internal/handlers/org.go
index c28785b2..5f57b24c 100644
--- a/workspace-server/internal/handlers/org.go
+++ b/workspace-server/internal/handlers/org.go
@@ -11,6 +11,8 @@ import (
 	"net/http"
 	"os"
 	"path/filepath"
+	"strconv"
+	"strings"
 
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/channels"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
@@ -25,10 +27,62 @@ import (
 // during org import. Prevents overwhelming Docker when creating many containers.
 const workspaceCreatePacingMs = 2000
 
-// provisionConcurrency limits how many Docker containers can be provisioned
-// simultaneously during org import. Without this, importing 39+ workspaces
-// fires 39 goroutines that all hit Docker at once, causing timeouts (#1084).
-const provisionConcurrency = 3
+// defaultProvisionConcurrency is the fallback cap for parallel
+// workspace-provision goroutines when MOLECULE_PROVISION_CONCURRENCY
+// is unset. Originally a hard constant of 3 (PR #1084) calibrated for
+// Docker-mode workspaces. The constant is now a default — operators
+// running on EC2 (where each provision is a RunInstances call AWS
+// happily parallelises) typically want a much higher cap, while
+// Docker-mode dev environments still prefer the conservative 3.
+//
+// 3 keeps the existing Docker-mode behavior. SaaS deployments override
+// via env (see resolveProvisionConcurrency below).
+const defaultProvisionConcurrency = 3
+
+// resolveProvisionConcurrency returns the effective semaphore size for
+// org-import workspace provisioning, honoring MOLECULE_PROVISION_CONCURRENCY:
+//
+//   - unset / empty / non-numeric → defaultProvisionConcurrency (3)
+//   - "0"                          → unlimited (a very large cap;
+//                                    practically no semaphore — used on
+//                                    SaaS where AWS RunInstances is the
+//                                    rate-limiter, not us)
+//   - any positive integer N       → N
+//   - negative integer             → defaultProvisionConcurrency (3),
+//                                    log warning so operator notices
+//                                    the misconfiguration
+//
+// The "0 = unlimited" mapping was a deliberate choice: an env var of "0"
+// is the natural shorthand for "no cap" without forcing operators to
+// type a magic large number. The implementation hands off a large but
+// finite value (1<<20) so the channel still works as a regular
+// buffered chan; goroutines will never block on the semaphore in
+// practice.
+func resolveProvisionConcurrency() int {
+	raw := strings.TrimSpace(os.Getenv("MOLECULE_PROVISION_CONCURRENCY"))
+	if raw == "" {
+		return defaultProvisionConcurrency
+	}
+	n, err := strconv.Atoi(raw)
+	if err != nil {
+		log.Printf("org_import: MOLECULE_PROVISION_CONCURRENCY=%q is not an integer; falling back to default %d",
+			raw, defaultProvisionConcurrency)
+		return defaultProvisionConcurrency
+	}
+	if n < 0 {
+		log.Printf("org_import: MOLECULE_PROVISION_CONCURRENCY=%d is negative; falling back to default %d",
+			n, defaultProvisionConcurrency)
+		return defaultProvisionConcurrency
+	}
+	if n == 0 {
+		// Unlimited semantics — use a large but finite cap so the
+		// chan-based semaphore stays a no-op. 1M is well past any
+		// realistic org-import size; AWS RunInstances rate-limit and
+		// account vCPU quota are the real backpressure here.
+		return 1 << 20
+	}
+	return n
+}
 
 // Child grid layout constants — kept in sync with canvas-topology.ts on
 // the client. Children laid on import use the same 2-column grid so the
@@ -600,8 +654,16 @@ func (h *OrgHandler) Import(c *gin.Context) {
 	results := []map[string]interface{}{}
 	var createErr error
 
-	// Semaphore limits concurrent Docker provisioning (#1084).
-	provisionSem := make(chan struct{}, provisionConcurrency)
+	// Semaphore limits concurrent provision goroutines (#1084).
+	// Cap is configurable via MOLECULE_PROVISION_CONCURRENCY:
+	//   unset → 3 (Docker-mode default)
+	//   "0"   → effectively unlimited (SaaS / EC2 backend)
+	//   N>0   → exactly N
+	// See resolveProvisionConcurrency for the full env-parse contract.
+	concurrency := resolveProvisionConcurrency()
+	provisionSem := make(chan struct{}, concurrency)
+	log.Printf("org_import: provision concurrency cap=%d (env MOLECULE_PROVISION_CONCURRENCY=%q)",
+		concurrency, os.Getenv("MOLECULE_PROVISION_CONCURRENCY"))
 
 	// Recursively create workspaces. Root workspaces keep their YAML
 	// canvas coords; children are positioned by createWorkspaceTree
diff --git a/workspace-server/internal/handlers/org_provision_concurrency_test.go b/workspace-server/internal/handlers/org_provision_concurrency_test.go
new file mode 100644
index 00000000..a07e7477
--- /dev/null
+++ b/workspace-server/internal/handlers/org_provision_concurrency_test.go
@@ -0,0 +1,96 @@
+package handlers
+
+import (
+	"testing"
+)
+
+// Tests for resolveProvisionConcurrency — the env-parse contract that
+// turns MOLECULE_PROVISION_CONCURRENCY into the channel-buffer size for
+// the org-import provision semaphore.
+//
+// Why this matters: with the wrong cap, org-import either serializes
+// (cap=1, slow) or stampedes the provider (cap=infinity on a backend
+// that can't take it). The defaults — 3 for Docker, "0=unlimited" for
+// EC2/SaaS — are what most operators want; the parse logic exists to
+// route the env var to the right behavior without surprise.
+//
+// The "0 → unlimited" mapping is the user-facing piece worth pinning
+// in tests: easy to misread as "0 means stop entirely" if someone
+// re-reads the constant block years later.
+
+func TestResolveProvisionConcurrency_UnsetUsesDefault(t *testing.T) {
+	t.Setenv("MOLECULE_PROVISION_CONCURRENCY", "")
+	if got := resolveProvisionConcurrency(); got != defaultProvisionConcurrency {
+		t.Errorf("unset env: got %d, want %d", got, defaultProvisionConcurrency)
+	}
+}
+
+func TestResolveProvisionConcurrency_ZeroIsUnlimited(t *testing.T) {
+	// "0" is the user-facing shorthand for "no cap". The implementation
+	// returns a large but finite cap so the channel-based semaphore
+	// stays a no-op without infinite-buffer risk.
+	t.Setenv("MOLECULE_PROVISION_CONCURRENCY", "0")
+	got := resolveProvisionConcurrency()
+	if got <= defaultProvisionConcurrency {
+		t.Errorf("0 should map to large 'unlimited' cap, got %d", got)
+	}
+	// 1<<20 today; pin the lower bound rather than the exact value so
+	// future tuning of the magic number doesn't break this test.
+	if got < 1024 {
+		t.Errorf("0 should map to a cap >= 1024 (effectively unlimited), got %d", got)
+	}
+}
+
+func TestResolveProvisionConcurrency_PositiveIntegerExact(t *testing.T) {
+	cases := []struct {
+		env  string
+		want int
+	}{
+		{"1", 1},
+		{"5", 5},
+		{"10", 10},
+		{"50", 50},
+	}
+	for _, tc := range cases {
+		t.Run(tc.env, func(t *testing.T) {
+			t.Setenv("MOLECULE_PROVISION_CONCURRENCY", tc.env)
+			if got := resolveProvisionConcurrency(); got != tc.want {
+				t.Errorf("env=%q: got %d, want %d", tc.env, got, tc.want)
+			}
+		})
+	}
+}
+
+func TestResolveProvisionConcurrency_NegativeFallsBackToDefault(t *testing.T) {
+	// Negative values are operator misconfiguration. Fall back to the
+	// safe default rather than passing through to make(chan, -5) which
+	// panics. The handler logs a warning so the operator notices.
+	t.Setenv("MOLECULE_PROVISION_CONCURRENCY", "-5")
+	if got := resolveProvisionConcurrency(); got != defaultProvisionConcurrency {
+		t.Errorf("negative env: got %d, want default %d", got, defaultProvisionConcurrency)
+	}
+}
+
+func TestResolveProvisionConcurrency_NonNumericFallsBackToDefault(t *testing.T) {
+	// Garbage in env shouldn't crash org-import. Common in dev when an
+	// operator types `MOLECULE_PROVISION_CONCURRENCY=true` or similar.
+	cases := []string{"true", "yes", "infinity", "ten", "3.5", "0x10"}
+	for _, raw := range cases {
+		t.Run(raw, func(t *testing.T) {
+			t.Setenv("MOLECULE_PROVISION_CONCURRENCY", raw)
+			if got := resolveProvisionConcurrency(); got != defaultProvisionConcurrency {
+				t.Errorf("non-numeric env=%q: got %d, want default %d",
+					raw, got, defaultProvisionConcurrency)
+			}
+		})
+	}
+}
+
+func TestResolveProvisionConcurrency_WhitespaceTrimmed(t *testing.T) {
+	// Operators frequently set env vars with stray whitespace from
+	// copy-paste. Trim before parse so " 7 " == "7".
+	t.Setenv("MOLECULE_PROVISION_CONCURRENCY", "  7  ")
+	if got := resolveProvisionConcurrency(); got != 7 {
+		t.Errorf("whitespace env: got %d, want 7", got)
+	}
+}

From 26fa220befe5be0b2a3e506f4ab82f41e6c97cdd Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangalt@gmail.com>
Date: Mon, 4 May 2026 16:35:21 -0700
Subject: [PATCH 2/2] ci(coverage): per-file 75% floor for MCP/inbox/auth
 Python critical paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes part of #2790 (Phase A). The Python total floor at 86% (set in
workspace/pytest.ini, issue #1817) averages over ~6000 lines, so a
single MCP-critical file could regress to ~50% with no CI complaint as
long as other modules compensate. This is the same distribution gap
that #1823 closed Go-side: total floor passes while a critical handler
sits at 0%.

Added gates for these five files (per-file floor 75%):
- workspace/a2a_mcp_server.py — MCP dispatcher (PR #2766 / #2771)
- workspace/mcp_cli.py — molecule-mcp standalone CLI entry
- workspace/a2a_tools.py — workspace-scoped tool implementations
- workspace/inbox.py — multi-workspace inbox + per-workspace cursors
- workspace/platform_auth.py — per-workspace token resolver

These handle multi-tenant routing, auth tokens, and inbox dispatch.
Risk shape mirrors Go-side tokens*/secrets* — a 0%/50% file here is
exactly where the PR #2766 dispatcher bug class slips through without
a structural test.

Floor 75% is strictly additive — current actuals 80-96% (measured
2026-05-04). No existing PR fails. Ratchet plan in COVERAGE_FLOOR.md
target 90% by 2026-08-04.

Implementation: pytest already writes .coverage; new step emits a JSON
view scoped to the critical files via `coverage json --include="*name"`,
then jq extracts each file's percent_covered. Exact key match by
basename so workspace/builtin_tools/a2a_tools.py (a different 100%
file) doesn't shadow workspace/a2a_tools.py.

Verified locally with the actual coverage data:
- floor=75 → 0 failures (matches current state)
- floor=81 → 1 failure (a2a_tools.py at 80%) — proves the gate trips

Pairs with PR #2791 (Phase B — schema↔dispatcher AST drift gate). Phase
C (molecule-mcp e2e harness) remains the largest piece in #2790.

YAML validated locally before commit per
feedback_validate_yaml_before_commit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci.yml | 66 ++++++++++++++++++++++++++++++++++++++++
 COVERAGE_FLOOR.md        | 52 +++++++++++++++++++++++++++++--
 2 files changed, 116 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7f0c72bb..a642e1a3 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -358,6 +358,72 @@ jobs:
       - if: needs.changes.outputs.python == 'true'
         run: python -m pytest --tb=short
 
+      - if: needs.changes.outputs.python == 'true'
+        name: Per-file critical-path coverage (MCP / inbox / auth)
+        # MCP-critical Python files have a per-file floor on top of the
+        # 86% total floor in pytest.ini. Rationale (issue #2790, after
+        # the PR #2766 → PR #2771 cycle): the total floor averages ~6000
+        # lines, so a single MCP file could regress to ~50% with no
+        # complaint as long as other modules compensate. These five
+        # files handle multi-tenant routing + auth + inbox dispatch —
+        # a coverage drop here is the same risk shape as a Go-side
+        # workspace-server token/secrets file dropping below 10%.
+        #
+        # Floor 75% sits below current actuals (80-96%) so this gate is
+        # strictly additive — no existing PR fails. Ratchet plan in
+        # COVERAGE_FLOOR.md.
+        run: |
+          set -e
+          PER_FILE_FLOOR=75
+          CRITICAL_FILES=(
+            "a2a_mcp_server.py"
+            "mcp_cli.py"
+            "a2a_tools.py"
+            "inbox.py"
+            "platform_auth.py"
+          )
+
+          # pytest already wrote .coverage; emit a JSON view scoped to
+          # the critical files so jq/python can read the per-file pct
+          # without parsing tabular text. --include uses fnmatch, and
+          # the leading "*" allows the file to live anywhere under the
+          # workspace root (today they sit at workspace/<name>.py).
+          INCLUDES=$(printf '*%s,' "${CRITICAL_FILES[@]}")
+          INCLUDES="${INCLUDES%,}"
+          python -m coverage json -o /tmp/critical-cov.json --include="$INCLUDES"
+
+          FAILED=0
+          for f in "${CRITICAL_FILES[@]}"; do
+            # Match by top-level path key (e.g. "a2a_tools.py", not
+            # "builtin_tools/a2a_tools.py" — different file at 100%).
+            # The keys in coverage.json are paths relative to the run
+            # cwd (workspace/), so the critical-path entry sits at the
+            # bare basename.
+            pct=$(jq -r --arg f "$f" '.files | to_entries | map(select(.key == $f)) | .[0].value.summary.percent_covered // "MISSING"' /tmp/critical-cov.json)
+            if [ "$pct" = "MISSING" ]; then
+              echo "::error file=workspace/$f::No coverage data — file may have moved or test exclusion mis-set."
+              FAILED=$((FAILED+1))
+              continue
+            fi
+            echo "$f: ${pct}%"
+            if awk "BEGIN{exit !($pct < $PER_FILE_FLOOR)}"; then
+              echo "::error file=workspace/$f::${pct}% < ${PER_FILE_FLOOR}% per-file floor (MCP critical path). See COVERAGE_FLOOR.md."
+              FAILED=$((FAILED+1))
+            fi
+          done
+
+          if [ "$FAILED" -gt 0 ]; then
+            echo ""
+            echo "$FAILED MCP critical-path file(s) below the ${PER_FILE_FLOOR}% per-file floor."
+            echo "These paths handle multi-tenant routing, auth tokens, and inbox dispatch."
+            echo "A coverage drop here is the same risk shape as Go-side tokens/secrets files"
+            echo "dropping below 10% (see COVERAGE_FLOOR.md). Either:"
+            echo "  (a) add tests to raise coverage back above ${PER_FILE_FLOOR}%, or"
+            echo "  (b) if this is unavoidable historical debt, file an issue and propose"
+            echo "      adjusting the floor with rationale in COVERAGE_FLOOR.md."
+            exit 1
+          fi
+
       # SDK + plugin validation moved to standalone repo:
       # github.com/Molecule-AI/molecule-sdk-python
 
diff --git a/COVERAGE_FLOOR.md b/COVERAGE_FLOOR.md
index 2870a649..c768b318 100644
--- a/COVERAGE_FLOOR.md
+++ b/COVERAGE_FLOOR.md
@@ -1,7 +1,7 @@
 # Coverage Floor
 
-CI enforces three coverage gates on `workspace-server` (Go). All defined in
-`.github/workflows/ci.yml` → `platform-build` job.
+CI enforces coverage gates on two surfaces — `workspace-server` (Go) and
+`workspace/` (Python). All defined in `.github/workflows/ci.yml`.
 
 ## Current floors (2026-04-23)
 
@@ -76,3 +76,51 @@ This gate makes "no untested critical paths merged" a mechanical property of
 the CI, not a behavioural property of QA agents or individual reviewers —
 which is the only way to make it survive fleet outages, agent rotations, or
 QA process changes.
+
+## Python (workspace/) — added 2026-05-04 from #2790
+
+The Python side has its own gates in the `python-lint` job:
+
+| Gate | Threshold | Where |
+|---|---|---|
+| **Total floor** | `86%` | `workspace/pytest.ini` `--cov-fail-under=86` (issue #1817) |
+| **Critical-path per-file floor** | `75%` | Inline shell step after the pytest run |
+
+### Critical-path Python files
+
+These handle multi-tenant routing, auth tokens, and inbox dispatch. A
+coverage drop here is the same risk shape as a Go-side `tokens*` /
+`secrets*` file regressing below 10%.
+
+- `workspace/a2a_mcp_server.py` — MCP dispatcher (PR #2766 / #2771)
+- `workspace/mcp_cli.py` — molecule-mcp standalone CLI entry
+- `workspace/a2a_tools.py` — workspace-scoped tool implementations
+- `workspace/inbox.py` — multi-workspace inbox + per-workspace cursors
+- `workspace/platform_auth.py` — per-workspace token resolver
+
+### Why 75% (vs 86% total)
+
+The total floor averages ~6000 lines across `workspace/`. A single MCP
+file could drop to ~50% with no CI complaint as long as other modules
+compensate. The per-file floor closes that distribution gap. 75% sits
+below current actuals (80–96% as of 2026-05-04) — strictly additive,
+no existing PR fails.
+
+### Python ratchet plan
+
+| Date | Total | Per-file critical | Notes |
+|---|---|---|---|
+| 2026-05-04 | 86% | 75% | Initial gate (this file). |
+| 2026-06-04 | 86% | 80% | First ratchet — at-floor files must catch up. |
+| 2026-07-04 | 88% | 85% | |
+| 2026-08-04 | 90% | 90% | Target steady-state. |
+
+### Why this Python gate exists
+
+Issue #2790, after the PR #2766 → PR #2771 cycle. PR #2766 added
+multi-workspace routing through `a2a_tools.py` + `a2a_mcp_server.py`,
+shipped to main with green CI, but the dispatcher silently dropped a
+load-bearing kwarg for 4 of 9 tools — caught only by post-merge code
+review. The structural drift gate (`test_dispatcher_schema_drift.py`,
+PR #2791) catches the schema↔dispatcher mismatch class; this floor
+catches the broader "MCP-critical file regressed" class.