fix(github-token): add HTTP client timeout to prevent indefinite blocking

http.DefaultClient has no timeout, so a slow or unresponsive GitHub API could block the handler goroutine forever. Use an http.Client with a 30-second timeout in generateAppInstallationToken.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
fix(migrations): renumber workspace_compute to avoid collision with main
2026-05-23 05:08:29 +00:00 · 2026-05-23 04:53:14 +00:00 · 2026-05-23 04:11:01 +00:00 · 2026-05-23 04:04:18 +00:00 · 2026-05-23 01:05:10 +00:00 · 2026-05-23 00:33:15 +00:00
24 changed files with 738 additions and 143 deletions
@@ -98,10 +98,10 @@ jobs:
            --base-ref "$PR_BASE_REF" \
            --push-before "${GITHUB_EVENT_BEFORE:-$PUSH_BEFORE}"

-  # Platform (Go) — Go build/vet/test/lint + coverage gates. The always-run
-  # + per-step gating shape preserves the GitHub-side required-check name
-  # contract (so when this Gitea port becomes a required check in Phase 4,
-  # the name match works on PRs that don't touch workspace-server/).
+  # Platform (Go) — Go build/vet/test/lint + coverage gates. The job always
+  # emits the required context, but expensive steps are path-scoped on every
+  # event so docs/E2E/Canvas-only main pushes do not block deploy on unrelated
+  # Go bootstrap work.
  platform-build:
    name: Platform (Go)
    needs: changes
@@ -125,29 +125,29 @@ jobs:
      run:
        working-directory: workspace-server
    steps:
-      - if: ${{ github.event_name == 'pull_request' && needs.changes.outputs.platform != 'true' }}
+      - if: ${{ needs.changes.outputs.platform != 'true' }}
        working-directory: .
-        run: echo "No workspace-server/** changes on this PR — Platform (Go) gate satisfied without running Go build/test/lint."
-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.platform == 'true' }}
+        run: echo "No workspace-server/** changes — Platform (Go) gate satisfied without running Go build/test/lint."
+      - if: ${{ needs.changes.outputs.platform == 'true' }}
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.platform == 'true' }}
+      - if: ${{ needs.changes.outputs.platform == 'true' }}
        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
        with:
          go-version: 'stable'
-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.platform == 'true' }}
+      - if: ${{ needs.changes.outputs.platform == 'true' }}
        run: go mod download
-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.platform == 'true' }}
+      - if: ${{ needs.changes.outputs.platform == 'true' }}
        run: go build ./cmd/server
      # CLI (molecli) moved to standalone repo: git.moleculesai.app/molecule-ai/molecule-cli
-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.platform == 'true' }}
+      - if: ${{ needs.changes.outputs.platform == 'true' }}
        run: go vet ./...
-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.platform == 'true' }}
+      - if: ${{ needs.changes.outputs.platform == 'true' }}
        name: Install golangci-lint
        run: go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.12.2
-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.platform == 'true' }}
+      - if: ${{ needs.changes.outputs.platform == 'true' }}
        name: Run golangci-lint
        run: $(go env GOPATH)/bin/golangci-lint run --timeout 3m ./...
-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.platform == 'true' }}
+      - if: ${{ needs.changes.outputs.platform == 'true' }}
        name: Diagnostic — per-package verbose 60s
        run: |
          set +e
@@ -163,7 +163,7 @@ jobs:
          echo "::endgroup::"
        # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true
-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.platform == 'true' }}
+      - if: ${{ needs.changes.outputs.platform == 'true' }}
        name: Run tests with race detection and coverage
        # Explicit timeout: cold runner cache causes OOM kills at ~4m39s on the
        # full ./... suite with race detection + coverage. A 10m per-step timeout
@@ -171,7 +171,7 @@ jobs:
        # instead of OOM-killing. The job-level timeout (15m) is a backstop.
        run: go test -race -timeout 10m -coverprofile=coverage.out ./...

-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.platform == 'true' }}
+      - if: ${{ needs.changes.outputs.platform == 'true' }}
        name: Per-file coverage report
        # Advisory — lists every source file with its coverage so reviewers
        # can see at-a-glance where gaps are. Sorted ascending so the worst
@@ -185,7 +185,7 @@ jobs:
                   END {for (f in s) printf "%6.1f%%  %s\n", s[f]/c[f], f}' \
            | sort -n

-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.platform == 'true' }}
+      - if: ${{ needs.changes.outputs.platform == 'true' }}
        name: Check coverage thresholds
        # Enforces two gates from #1823 Layer 1:
        #   1. Total floor (25% — ratchet plan in COVERAGE_FLOOR.md).
@@ -282,20 +282,20 @@ jobs:
      run:
        working-directory: canvas
    steps:
-      - if: ${{ github.event_name == 'pull_request' && needs.changes.outputs.canvas != 'true' }}
+      - if: ${{ needs.changes.outputs.canvas != 'true' }}
        working-directory: .
-        run: echo "No canvas/** changes on this PR — Canvas (Next.js) gate satisfied without running npm build/test."
-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.canvas == 'true' }}
+        run: echo "No canvas/** changes — Canvas (Next.js) gate satisfied without running npm build/test."
+      - if: ${{ needs.changes.outputs.canvas == 'true' }}
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.canvas == 'true' }}
+      - if: ${{ needs.changes.outputs.canvas == 'true' }}
        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: '22'
-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.canvas == 'true' }}
+      - if: ${{ needs.changes.outputs.canvas == 'true' }}
        run: npm ci --include=optional --prefer-offline
-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.canvas == 'true' }}
+      - if: ${{ needs.changes.outputs.canvas == 'true' }}
        run: npm run build
-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.canvas == 'true' }}
+      - if: ${{ needs.changes.outputs.canvas == 'true' }}
        name: Run tests with coverage
        # Coverage instrumentation is configured in canvas/vitest.config.ts
        # (provider: v8, reporters: text + html + json-summary). Step 2 of
@@ -304,7 +304,7 @@ jobs:
        # tracked in #1815) after the team sees what current coverage is.
        run: npx vitest run --coverage
      - name: Upload coverage summary as artifact
-        if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.canvas == 'true' }}
+        if: ${{ needs.changes.outputs.canvas == 'true' }}
        # Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses
        # the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT
        # implement, surfacing as `GHESNotSupportedError: @actions/artifact
@@ -318,7 +318,7 @@ jobs:
          retention-days: 7
          if-no-files-found: warn

-  # Shellcheck (E2E scripts) — required check, always runs.
+  # Shellcheck (E2E scripts) — required context, path-scoped heavy steps.
  shellcheck:
    name: Shellcheck (E2E scripts)
    needs: changes
@@ -326,11 +326,11 @@ jobs:
    # Phase 4 (RFC #219 §1): confirmed green on main 2026-05-12.
    continue-on-error: false
    steps:
-      - if: ${{ github.event_name == 'pull_request' && needs.changes.outputs.scripts != 'true' }}
-        run: echo "No tests/e2e, scripts, or infra/scripts changes on this PR — Shellcheck gate satisfied without running script checks."
-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.scripts == 'true' }}
+      - if: ${{ needs.changes.outputs.scripts != 'true' }}
+        run: echo "No tests/e2e, scripts, or infra/scripts changes — Shellcheck gate satisfied without running script checks."
+      - if: ${{ needs.changes.outputs.scripts == 'true' }}
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.scripts == 'true' }}
+      - if: ${{ needs.changes.outputs.scripts == 'true' }}
        name: Run shellcheck on tests/e2e/*.sh and infra/scripts/*.sh
        # shellcheck is pre-installed on ubuntu-latest runners (via apt).
        # infra/scripts/ is included because setup.sh + nuke.sh gate the
@@ -341,16 +341,16 @@ jobs:
          find tests/e2e infra/scripts -type f -name '*.sh' -print0 \
            | xargs -0 shellcheck --severity=warning

-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.scripts == 'true' }}
+      - if: ${{ needs.changes.outputs.scripts == 'true' }}
        name: Lint cleanup-trap hygiene (RFC #2873)
        run: bash tests/e2e/lint_cleanup_traps.sh

-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.scripts == 'true' }}
+      - if: ${{ needs.changes.outputs.scripts == 'true' }}
        name: Run E2E bash unit tests (no live infra)
        run: |
          bash tests/e2e/test_model_slug.sh

-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.scripts == 'true' }}
+      - if: ${{ needs.changes.outputs.scripts == 'true' }}
        name: Test ECR promote-tenant-image script (mock-driven, no live infra)
        # Covers scripts/promote-tenant-image.sh — the codified
        # :staging-latest → :latest ECR promote + tenant fleet redeploy
@@ -360,7 +360,7 @@ jobs:
        run: |
          bash scripts/test-promote-tenant-image.sh

-      - if: ${{ github.event_name != 'pull_request' || needs.changes.outputs.scripts == 'true' }}
+      - if: ${{ needs.changes.outputs.scripts == 'true' }}
        name: Shellcheck promote-tenant-image script
        # scripts/ is excluded from the bulk shellcheck pass above (legacy
        # SC3040/SC3043 cleanup pending). Run shellcheck explicitly on
@@ -118,7 +118,7 @@ jobs:
    timeout-minutes: 20
    env:
      # claude-code default: cold-start ~5 min (comparable to langgraph),
-      # but uses MiniMax-M2.7-highspeed via the template's third-party-
+      # but uses MiniMax-M2 via the template's third-party-
      # Anthropic-compat path (workspace-configs-templates/claude-code-
      # default/config.yaml:64-69). MiniMax is ~5-10x cheaper than
      # gpt-4.1-mini per token AND avoids the recurring OpenAI quota-
@@ -131,9 +131,9 @@ jobs:
      # on the per-runtime default ("sonnet" → routes to direct
      # Anthropic, defeats the cost saving). Operators can override
      # via workflow_dispatch by setting a different E2E_MODEL_SLUG
-      # input if they need to exercise a specific model. M2.7-highspeed
-      # is "Token Plan only" but cheap-per-token and fast.
-      E2E_MODEL_SLUG: ${{ github.event.inputs.model_slug || 'MiniMax-M2.7-highspeed' }}
+      # input if they need to exercise a specific model. MiniMax-M2 is the
+      # stable staging MiniMax path used by the full-SaaS smoke.
+      E2E_MODEL_SLUG: ${{ github.event.inputs.model_slug || 'MiniMax-M2' }}
      # Bound to 10 min so a stuck provision fails the run instead of
      # holding up the next cron firing. 15-min default in the script
      # is for the on-PR full lifecycle where we have more headroom.
@@ -161,7 +161,7 @@ jobs:
      # and defeats the cost saving. Operators can override via the
      # workflow_dispatch flow (no input wired here yet — runtime
      # override is enough for ad-hoc).
-      E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'langgraph' && 'openai:gpt-4o' || 'MiniMax-M2.7-highspeed' }}
+      E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'langgraph' && 'openai:gpt-4o' || 'MiniMax-M2' }}
      E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
      E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}

@@ -112,9 +112,9 @@ jobs:
      E2E_RUNTIME: claude-code
      # Pin the smoke to a specific MiniMax model rather than relying
      # on the per-runtime default (which could resolve to "sonnet" →
-      # direct Anthropic and defeat the cost saving). M2.7-highspeed
-      # is "Token Plan only" but cheap-per-token and fast.
-      E2E_MODEL_SLUG: MiniMax-M2.7-highspeed
+      # direct Anthropic and defeat the cost saving). MiniMax-M2 is the
+      # stable staging MiniMax path used by the full-SaaS smoke.
+      E2E_MODEL_SLUG: MiniMax-M2
      E2E_RUN_ID: "smoke-${{ github.run_id }}"
      # Debug-only: when an operator dispatches with keep_on_failure=true,
      # the smoke script's E2E_KEEP_ORG=1 path skips teardown so the
@@ -19,11 +19,18 @@
 #                                    PR #2558+#2563+#2567 cleared the
 #                                    masking layers.)
 #
-#   claude-code → "sonnet"         (entry-id form: claude-code template's
-#                                    config.yaml uses bare model names,
-#                                    auth comes via CLAUDE_CODE_OAUTH_TOKEN
-#                                    or ANTHROPIC_API_KEY rather than the
-#                                    slug.)
+#   claude-code → auth-aware:
+#                  E2E_MINIMAX_API_KEY    → "MiniMax-M2"
+#                  E2E_ANTHROPIC_API_KEY  → "claude-sonnet-4-6"
+#                  otherwise              → "sonnet"
+#
+#                  claude-code provider routing is model-driven. The bare
+#                  "sonnet" alias selects the OAuth provider, so it is only a
+#                  good default when the canary is using Claude Code OAuth or
+#                  intentionally exercising the missing-auth path. MiniMax and
+#                  direct Anthropic API keys need model IDs that resolve to
+#                  their provider entries, otherwise the workspace boots
+#                  reachable but the first A2A call hits the wrong auth path.
 #
 # When E2E_MODEL_SLUG is set, it overrides this dispatch — useful when an
 # operator dispatches the workflow to test a specific slug.
@@ -45,7 +52,15 @@ pick_model_slug() {
  case "$runtime" in
    hermes)      printf 'openai/gpt-4o' ;;
    langgraph)   printf 'openai:gpt-4o' ;;
-    claude-code) printf 'sonnet' ;;
+    claude-code)
+      if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
+        printf 'MiniMax-M2'
+      elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
+        printf 'claude-sonnet-4-6'
+      else
+        printf 'sonnet'
+      fi
+      ;;
    *)           printf 'openai/gpt-4o' ;;  # safest fallback (matches hermes)
  esac
 }
@@ -16,7 +16,7 @@ set -uo pipefail
 # Resolve to the lib relative to this test file so the test runs from
 # any cwd (CI, local invocation, repo root).
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-# shellcheck source=lib/model_slug.sh
+# shellcheck source=tests/e2e/lib/model_slug.sh
 source "$SCRIPT_DIR/lib/model_slug.sh"

 PASS=0
@@ -48,7 +48,16 @@ echo
 # ── Per-runtime branches (the load-bearing ones for synth-E2E) ──
 run_test "hermes → slash-form (derive-provider.sh contract)"       hermes      "openai/gpt-4o"
 run_test "langgraph → colon-form (init_chat_model contract)"       langgraph   "openai:gpt-4o"
-run_test "claude-code → bare model name (entry-id form)"           claude-code "sonnet"
+run_test "claude-code → OAuth/default alias"                      claude-code "sonnet"
+
+got=$(unset E2E_MODEL_SLUG E2E_ANTHROPIC_API_KEY; E2E_MINIMAX_API_KEY="mx-test" pick_model_slug claude-code)
+assert_eq "claude-code + MiniMax key → MiniMax model"             "$got" "MiniMax-M2"
+
+got=$(unset E2E_MODEL_SLUG E2E_MINIMAX_API_KEY; E2E_ANTHROPIC_API_KEY="sk-ant-test" pick_model_slug claude-code)
+assert_eq "claude-code + Anthropic API key → Anthropic API model" "$got" "claude-sonnet-4-6"
+
+got=$(unset E2E_MODEL_SLUG; E2E_MINIMAX_API_KEY="mx-priority" E2E_ANTHROPIC_API_KEY="sk-ant-loser" pick_model_slug claude-code)
+assert_eq "claude-code + both keys → MiniMax priority"            "$got" "MiniMax-M2"

 # ── Fallback for unknown runtime ──
 # Picks slash-form (hermes-shaped) since hermes is the historical
@@ -25,6 +25,11 @@
 # Optional env:
 #   E2E_RUNTIME                  hermes (default) | claude-code | langgraph
 #   E2E_PROVISION_TIMEOUT_SECS   default 900 (15 min cold EC2 budget)
+#   E2E_WORKSPACE_ONLINE_TIMEOUT_SECS  default 3600 (60 min — hermes
+#                                cold-boot worst-case + slack). Raised from
+#                                1800 (#1646) because flaky tenant-provisioning
+#                                latency (not a code regression) causes
+#                                alternating pass/fail on identical SHAs.
 #   E2E_KEEP_ORG                 1 → skip teardown (debugging only)
 #   E2E_RUN_ID                   Slug suffix; CI: ${GITHUB_RUN_ID}
 #   E2E_MODE                     full (default) | smoke
@@ -56,6 +61,7 @@ CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"
 ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}"
 RUNTIME="${E2E_RUNTIME:-hermes}"
 PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
+WORKSPACE_ONLINE_TIMEOUT_SECS="${E2E_WORKSPACE_ONLINE_TIMEOUT_SECS:-3600}"
 RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}"
 MODE="${E2E_MODE:-full}"
 # `canary` is a legacy alias for `smoke` retained for back-compat with
@@ -350,6 +356,75 @@ tenant_call() {
    "$@"
 }

+# sanitize_http_body — read an HTTP response body on stdin and print a copy
+# that is safe to put in CI logs: Bearer credentials, secret-looking JSON
+# string fields, and secret-looking key=value pairs are replaced with
+# "[redacted]", then the result is truncated to the first 4000 characters.
+sanitize_http_body() {
+  python3 -c '
+import re, sys
+s = sys.stdin.read()
+s = re.sub(r"(?i)(Authorization:\s*Bearer\s+)[A-Za-z0-9._~+/=-]+", r"\1[redacted]", s)
+s = re.sub(r"(?i)(\"(?:auth_token|access_token|refresh_token|token|api_key|secret|password)\"\s*:\s*\")[^\"]+\"", r"\1[redacted]\"", s)
+# Same key list as the JSON pattern, so a bare token=... pair is redacted too.
+s = re.sub(r"(?i)((?:auth_token|access_token|refresh_token|token|api_key|secret|password)=)[^&\s]+", r"\1[redacted]", s)
+print(s[:4000])
+'
+}
+
+# wait_workspaces_online_routable LABEL WID...
+# Logs LABEL, then for each workspace id in turn polls GET /workspaces/<wid>
+# every 10s until it reports status=online together with a non-empty url.
+# A single deadline (now + WORKSPACE_ONLINE_TIMEOUT_SECS) is shared by all
+# ids; once it passes, fail is called with the last status, url and
+# last_sample_error seen.
+wait_workspaces_online_routable() {
+  local label="$1"; shift
+  local deadline=$(( $(date +%s) + WORKSPACE_ONLINE_TIMEOUT_SECS ))
+  local wid ws_last_status ws_last_url ws_url_missing_logged ws_failed_logged
+  local ws_json ws_status ws_url ws_last_err
+
+  log "$label"
+  for wid in "$@"; do
+    ws_last_status=""
+    ws_last_url=""
+    ws_url_missing_logged=0
+    ws_failed_logged=0
+    while true; do
+      if [ "$(date +%s)" -gt "$deadline" ]; then
+        # `or ''` (not a .get default) so a JSON null prints as empty
+        # instead of the literal "None", matching the status/url reads.
+        ws_last_err=$(tenant_call GET "/workspaces/$wid" 2>/dev/null | \
+          python3 -c "import json,sys; print(json.load(sys.stdin).get('last_sample_error') or '')" 2>/dev/null || echo "")
+        fail "Workspace $wid never reached online with a routable URL within ${WORKSPACE_ONLINE_TIMEOUT_SECS}s (~$((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min) (last status=$ws_last_status, url=$ws_last_url, err=$ws_last_err)"
+      fi
+      ws_json=$(tenant_call GET "/workspaces/$wid" 2>/dev/null || echo '{}')
+      ws_status=$(echo "$ws_json" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status') or '')" 2>/dev/null)
+      ws_url=$(echo "$ws_json" | python3 -c "import json,sys; print(json.load(sys.stdin).get('url') or '')" 2>/dev/null)
+      if [ "$ws_status" != "$ws_last_status" ]; then
+        log "    $wid → $ws_status"
+        ws_last_status="$ws_status"
+      fi
+      if [ -n "$ws_url" ] && [ "$ws_url" != "$ws_last_url" ]; then
+        log "    $wid url ready: $ws_url"
+        ws_last_url="$ws_url"
+      fi
+      case "$ws_status" in
+        online)
+          if [ -n "$ws_url" ]; then
+            break
+          fi
+          if [ "$ws_url_missing_logged" = "0" ]; then
+            log "    $wid online but URL is not assigned yet — waiting for workspace routing readiness"
+            ws_url_missing_logged=1
+          fi
+          sleep 10
+          ;;
+        failed)
+          # Not a hard fail — bootstrap-watcher frequently marks failed at
+          # 5 min on hermes, then heartbeat recovers to online around 10-13
+          # min when install.sh finishes. Log once per workspace so the CI
+          # output isn't spammy.
+          if [ "$ws_failed_logged" = "0" ]; then
+            log "    $wid transiently failed — waiting for heartbeat recovery (bootstrap-watcher deadline, see cp#245)"
+            ws_failed_logged=1
+          fi
+          sleep 10
+          ;;
+        *)      sleep 10 ;;
+      esac
+    done
+    ok "    $wid online and routable"
+  done
+}
+
 # ─── 5. Provision parent workspace ─────────────────────────────────────
 # Inject the LLM provider key so the runtime can authenticate at boot.
 # Branch by which secret is set so the script supports multiple paths
@@ -402,9 +477,9 @@ elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
  # is still independent of MOLECULE_STAGING_OPENAI_API_KEY, so an OpenAI
  # quota collapse doesn't wedge this path. Pinned to the claude-code
  # runtime: hermes/langgraph use OpenAI-shaped envs and won't honour
-  # ANTHROPIC_API_KEY without further wiring (out of scope for this
-  # branch; if you need a hermes/Anthropic path, dispatch with
-  # E2E_RUNTIME=hermes + E2E_OPENAI_API_KEY pointing at a working key).
+  # ANTHROPIC_API_KEY without further wiring. pick_model_slug maps this
+  # branch to claude-sonnet-4-6 so the claude-code provider registry
+  # selects anthropic-api instead of the OAuth-only sonnet alias.
  SECRETS_JSON=$(python3 -c "
 import json, os
 k = os.environ['E2E_ANTHROPIC_API_KEY']
@@ -429,6 +504,7 @@ print(json.dumps({
 fi

 MODEL_SLUG=$(pick_model_slug "$RUNTIME")
+log "    MODEL_SLUG=$MODEL_SLUG"

 log "5/11 Provisioning parent workspace (runtime=$RUNTIME)..."
 PARENT_RESP=$(tenant_call POST /workspaces \
@@ -456,48 +532,16 @@ fi
 # deadline fires at 5 min and sets status=failed prematurely; heartbeat
 # then transitions failed → online after install.sh finishes. So:
 #
-#   - 20 min deadline (hermes worst-case + slack)
+#   - WORKSPACE_ONLINE_TIMEOUT_SECS deadline (default 3600s, ~60 min:
+#     hermes worst-case + slack). Configurable via
+#     E2E_WORKSPACE_ONLINE_TIMEOUT_SECS (#1646).
 #   - 'failed' is a TRANSIENT state we must tolerate — log and keep
 #     polling, only hard-fail at the deadline. Pre-bootstrap-watcher-fix
 #     (controlplane#245) this was a flake generator: workspace went
 #     failed→online inside our window but we bailed at the failed read.
-log "7/11 Waiting for workspace(s) to reach status=online (up to 30 min — hermes cold boot)..."
-WS_DEADLINE=$(( $(date +%s) + 1800 ))
-WS_TO_CHECK="$PARENT_ID"
-[ -n "$CHILD_ID" ] && WS_TO_CHECK="$WS_TO_CHECK $CHILD_ID"
-for wid in $WS_TO_CHECK; do
-  WS_LAST_STATUS=""
-  WS_FAILED_LOGGED=0
-  while true; do
-    if [ "$(date +%s)" -gt "$WS_DEADLINE" ]; then
-      WS_LAST_ERR=$(tenant_call GET "/workspaces/$wid" 2>/dev/null | \
-        python3 -c "import json,sys; print(json.load(sys.stdin).get('last_sample_error',''))" 2>/dev/null || echo "")
-      fail "Workspace $wid never reached online within 20 min (last status=$WS_LAST_STATUS, err=$WS_LAST_ERR)"
-    fi
-    WS_JSON=$(tenant_call GET "/workspaces/$wid" 2>/dev/null || echo '{}')
-    WS_STATUS=$(echo "$WS_JSON" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))" 2>/dev/null)
-    if [ "$WS_STATUS" != "$WS_LAST_STATUS" ]; then
-      log "    $wid → $WS_STATUS"
-      WS_LAST_STATUS="$WS_STATUS"
-    fi
-    case "$WS_STATUS" in
-      online) break ;;
-      failed)
-        # Not a hard fail — bootstrap-watcher frequently marks failed at
-        # 5 min on hermes, then heartbeat recovers to online around 10-13
-        # min when install.sh finishes. Log once per workspace so the CI
-        # output isn't spammy.
-        if [ "$WS_FAILED_LOGGED" = "0" ]; then
-          log "    $wid transiently failed — waiting for heartbeat recovery (bootstrap-watcher deadline, see cp#245)"
-          WS_FAILED_LOGGED=1
-        fi
-        sleep 10
-        ;;
-      *)      sleep 10 ;;
-    esac
-  done
-  ok "    $wid online"
-done
+WS_TO_CHECK=("$PARENT_ID")
+[ -n "$CHILD_ID" ] && WS_TO_CHECK+=("$CHILD_ID")
+wait_workspaces_online_routable "7/11 Waiting for workspace(s) to reach status=online (up to $((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min — hermes cold boot)..." "${WS_TO_CHECK[@]}"

 # ─── 7b. Canvas-terminal diagnose (EIC chain probe) ────────────────────
 # This step exists because the canvas-terminal failure of 2026-05-03
@@ -523,7 +567,7 @@ done
 # probes docker.Ping + container exec; we still expect ok=true there
 # since local-docker is the alternative production path.
 log "7b/11 Canvas-terminal EIC diagnose probe..."
-for wid in $WS_TO_CHECK; do
+for wid in "${WS_TO_CHECK[@]}"; do
  DIAG_JSON=$(tenant_call GET "/workspaces/$wid/terminal/diagnose" 2>/dev/null || echo '{}')
  DIAG_OK=$(echo "$DIAG_JSON" | python3 -c "import json,sys; d=json.load(sys.stdin); print('true' if d.get('ok') else 'false')" 2>/dev/null || echo "false")
  if [ "$DIAG_OK" = "true" ]; then
@@ -559,7 +603,7 @@ CONFIG_PAYLOAD="${CONFIG_MARKER}
 name: synth-canary
 runtime: ${RUNTIME}
 "
-for wid in $WS_TO_CHECK; do
+for wid in "${WS_TO_CHECK[@]}"; do
  PUT_BODY=$(python3 -c "import json,sys; print(json.dumps({'content': sys.stdin.read()}))" <<< "$CONFIG_PAYLOAD")
  # Capture body to a tempfile so curl's -w '%{http_code}' is the only
  # thing on stdout. The first version used `-w '\n%{http_code}\n'` and
@@ -592,6 +636,12 @@ for wid in $WS_TO_CHECK; do
  ok "    $wid config.yaml PUT OK (HTTP $PUT_CODE)"
 done

+# Saving config.yaml follows the same path as Canvas Config Save & Restart.
+# The controlplane can briefly put the workspace back into provisioning and
+# clear its route while the runtime restarts, so A2A must wait on the same
+# externally routable readiness boundary again.
+wait_workspaces_online_routable "7d/11 Waiting for workspace(s) to recover routing after config.yaml PUT..." "${WS_TO_CHECK[@]}"
+
 # ─── 8. A2A round-trip on parent ───────────────────────────────────────
 log "8/11 Sending A2A message to parent — expecting agent response..."
 # Smoke prompt phrasing — DO NOT trim back to the bare "Reply with exactly: PONG"
@@ -631,10 +681,44 @@ print(json.dumps({
 # 90s gives ~3x headroom over observed cold-call P95 (~25-30s).
 # Subsequent A2A turns hit the same workspace and are sub-second, so
 # this only widens the window for step 8/11 of the canary's first turn.
-A2A_RESP=$(tenant_call POST "/workspaces/$PARENT_ID/a2a" \
-  --max-time 90 \
-  -H "Content-Type: application/json" \
-  -d "$A2A_PAYLOAD")
+# Up to 12 attempts. The body goes to a temp file so that -w '%{http_code}'
+# is the only thing captured on stdout. A retry happens only when the status
+# is 502/503/504 AND the sanitized body matches a known cold-start marker;
+# the pause is 10s, or 30s when the body reports a busy agent/native_session.
+# Any other failure, or the 12th attempt, falls through to the fail below.
+# NOTE(review): the retry decision greps the response BODY. If tenant_call's
+# curl runs with --fail (TODO confirm), curl discards the body on HTTP >= 400,
+# so a 502/503/504 would never match a marker and would never be retried;
+# --fail-with-body keeps the body.
+A2A_TMP=$(mktemp -t synth_a2a.XXXXXX)
+for A2A_ATTEMPT in $(seq 1 12); do
+  : >"$A2A_TMP"
+  set +e
+  A2A_CODE=$(tenant_call POST "/workspaces/$PARENT_ID/a2a" \
+    --max-time 90 \
+    -H "Content-Type: application/json" \
+    -d "$A2A_PAYLOAD" \
+    -o "$A2A_TMP" \
+    -w '%{http_code}' \
+    2>/dev/null)
+  A2A_RC=$?
+  set -e
+  # Empty output (curl printed nothing) is normalised to 000 so the numeric
+  # comparisons below always receive a value.
+  A2A_CODE=${A2A_CODE:-000}
+  A2A_RESP=$(cat "$A2A_TMP" 2>/dev/null || echo "")
+  if [ "$A2A_RC" = "0" ] && [ "$A2A_CODE" -ge 200 ] && [ "$A2A_CODE" -lt 300 ]; then
+    break
+  fi
+
+  # Only the redacted, truncated body is ever logged.
+  A2A_SAFE_BODY=$(printf '%s' "$A2A_RESP" | sanitize_http_body)
+  if echo "$A2A_CODE" | grep -Eq '^(502|503|504)$' && echo "$A2A_SAFE_BODY" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|error code: 502|error code: 504|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session'; then
+    log "    A2A cold-start probe attempt $A2A_ATTEMPT/12 returned $A2A_CODE: $A2A_SAFE_BODY"
+    if [ "$A2A_ATTEMPT" -lt 12 ]; then
+      A2A_SLEEP=10
+      if echo "$A2A_SAFE_BODY" | grep -Eqi 'workspace agent busy|native_session'; then
+        A2A_SLEEP=30
+      fi
+      sleep "$A2A_SLEEP"
+      continue
+    fi
+  fi
+  break
+done
+rm -f "$A2A_TMP"
+if [ "$A2A_RC" != "0" ] || [ "$A2A_CODE" -lt 200 ] || [ "$A2A_CODE" -ge 300 ]; then
+  A2A_SAFE_BODY=$(printf '%s' "$A2A_RESP" | sanitize_http_body)
+  fail "A2A POST /workspaces/$PARENT_ID/a2a failed after $A2A_ATTEMPT attempt(s) (curl_rc=$A2A_RC, http=$A2A_CODE): $A2A_SAFE_BODY"
+fi
 AGENT_TEXT=$(echo "$A2A_RESP" | python3 -c "
 import json, sys
 d = json.load(sys.stdin)
@@ -831,20 +915,50 @@ print(json.dumps({
    }
 }))
 ")
-  set +e
-  # Raw curl (not tenant_call) because this call carries an extra
-  # X-Source-Workspace-Id header. Must still send X-Molecule-Org-Id
-  # or TenantGuard 404s — previously missing, caused section 10 to
-  # fail rc=22 despite everything upstream being correct (2026-04-21).
-  DELEG_RESP=$(curl "${CURL_COMMON[@]}" -X POST "$TENANT_URL/workspaces/$CHILD_ID/a2a" \
-    -H "Authorization: Bearer $EFFECTIVE_TENANT_TOKEN" \
-    -H "X-Molecule-Org-Id: $ORG_ID" \
-    -H "X-Source-Workspace-Id: $PARENT_ID" \
-    -H "Content-Type: application/json" \
-    -d "$DELEG_PAYLOAD")
-  DELEG_RC=$?
-  set -e
-  [ $DELEG_RC -ne 0 ] && fail "Delegation A2A POST failed (rc=$DELEG_RC)"
+  # Up to 12 attempts. The body goes to a temp file so that -w '%{http_code}'
+  # is the only thing captured on stdout. A retry happens only when the
+  # status is 502/503/504 AND the sanitized body matches a known cold-start
+  # marker; the pause is 10s, or 30s when the body reports a busy
+  # agent/native_session. Anything else falls through to the fail below.
+  # NOTE(review): the retry decision greps the response BODY. The rc=22
+  # mentioned in the comment inside the loop is curl's --fail exit status,
+  # which suggests CURL_COMMON carries --fail (TODO confirm). With --fail curl
+  # discards the body on HTTP >= 400, so a 502/503/504 would never match a
+  # marker and would never be retried; --fail-with-body keeps the body.
+  DELEG_TMP=$(mktemp -t deleg_a2a.XXXXXX)
+  for DELEG_ATTEMPT in $(seq 1 12); do
+    : >"$DELEG_TMP"
+    set +e
+    # Raw curl (not tenant_call) because this call carries an extra
+    # X-Source-Workspace-Id header. Must still send X-Molecule-Org-Id
+    # or TenantGuard 404s — previously missing, caused section 10 to
+    # fail rc=22 despite everything upstream being correct (2026-04-21).
+    DELEG_CODE=$(curl "${CURL_COMMON[@]}" -X POST "$TENANT_URL/workspaces/$CHILD_ID/a2a" \
+      -H "Authorization: Bearer $EFFECTIVE_TENANT_TOKEN" \
+      -H "X-Molecule-Org-Id: $ORG_ID" \
+      -H "X-Source-Workspace-Id: $PARENT_ID" \
+      -H "Content-Type: application/json" \
+      -d "$DELEG_PAYLOAD" \
+      -o "$DELEG_TMP" \
+      -w '%{http_code}' \
+      2>/dev/null)
+    DELEG_RC=$?
+    set -e
+    # Empty output (curl printed nothing) is normalised to 000 so the numeric
+    # comparisons below always receive a value.
+    DELEG_CODE=${DELEG_CODE:-000}
+    DELEG_RESP=$(cat "$DELEG_TMP" 2>/dev/null || echo "")
+    if [ "$DELEG_RC" = "0" ] && [ "$DELEG_CODE" -ge 200 ] && [ "$DELEG_CODE" -lt 300 ]; then
+      break
+    fi
+
+    # Only the redacted, truncated body is ever logged.
+    DELEG_SAFE_BODY=$(printf '%s' "$DELEG_RESP" | sanitize_http_body)
+    if echo "$DELEG_CODE" | grep -Eq '^(502|503|504)$' && echo "$DELEG_SAFE_BODY" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|error code: 502|error code: 504|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session'; then
+      log "    Delegation A2A cold-start attempt $DELEG_ATTEMPT/12 returned $DELEG_CODE: $DELEG_SAFE_BODY"
+      if [ "$DELEG_ATTEMPT" -lt 12 ]; then
+        DELEG_SLEEP=10
+        if echo "$DELEG_SAFE_BODY" | grep -Eqi 'workspace agent busy|native_session'; then
+          DELEG_SLEEP=30
+        fi
+        sleep "$DELEG_SLEEP"
+        continue
+      fi
+    fi
+    break
+  done
+  rm -f "$DELEG_TMP"
+  if [ "$DELEG_RC" != "0" ] || [ "$DELEG_CODE" -lt 200 ] || [ "$DELEG_CODE" -ge 300 ]; then
+    DELEG_SAFE_BODY=$(printf '%s' "$DELEG_RESP" | sanitize_http_body)
+    fail "Delegation A2A POST failed after $DELEG_ATTEMPT attempt(s) (curl_rc=$DELEG_RC, http=$DELEG_CODE): $DELEG_SAFE_BODY"
+  fi
  DELEG_TEXT=$(echo "$DELEG_RESP" | python3 -c "
 import json, sys
 try:
@@ -0,0 +1,18 @@
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]
+
+
+def test_staging_e2e_workflows_use_stable_minimax_default() -> None:
+    """Keep cron/push E2E on the same MiniMax model as the smoke-tested script."""
+    workflow_paths = [
+        ".gitea/workflows/e2e-staging-saas.yml",
+        ".gitea/workflows/staging-smoke.yml",
+        ".gitea/workflows/continuous-synth-e2e.yml",
+    ]
+
+    for rel in workflow_paths:
+        text = (ROOT / rel).read_text()
+        assert "MiniMax-M2.7-highspeed" not in text
+        assert "MiniMax-M2" in text
@@ -705,7 +705,7 @@ def test_ci_change_detector_docs_and_meta_scripts_do_not_trigger_surfaces():
    }


-def test_ci_platform_go_pr_steps_are_path_scoped():
+def test_ci_platform_go_steps_are_path_scoped_on_all_events():
    doc = yaml.safe_load(CI_WORKFLOW.read_text(encoding="utf-8"))
    platform = doc["jobs"]["platform-build"]
    assert platform.get("needs") == "changes"
@@ -720,11 +720,11 @@ def test_ci_platform_go_pr_steps_are_path_scoped():
    assert expensive_steps
    for step in expensive_steps:
        expr = step.get("if", "")
-        assert "github.event_name != 'pull_request'" in expr
        assert "needs.changes.outputs.platform == 'true'" in expr
+        assert "github.event_name != 'pull_request'" not in expr


-def test_ci_canvas_nextjs_pr_steps_are_path_scoped():
+def test_ci_canvas_nextjs_steps_are_path_scoped_on_all_events():
    doc = yaml.safe_load(CI_WORKFLOW.read_text(encoding="utf-8"))
    canvas = doc["jobs"]["canvas-build"]
    assert canvas.get("needs") == "changes"
@@ -739,11 +739,11 @@ def test_ci_canvas_nextjs_pr_steps_are_path_scoped():
    assert expensive_steps
    for step in expensive_steps:
        expr = step.get("if", "")
-        assert "github.event_name != 'pull_request'" in expr
        assert "needs.changes.outputs.canvas == 'true'" in expr
+        assert "github.event_name != 'pull_request'" not in expr


-def test_ci_shellcheck_pr_steps_are_path_scoped():
+def test_ci_shellcheck_steps_are_path_scoped_on_all_events():
    doc = yaml.safe_load(CI_WORKFLOW.read_text(encoding="utf-8"))
    shellcheck = doc["jobs"]["shellcheck"]
    assert shellcheck.get("needs") == "changes"
@@ -756,5 +756,5 @@ def test_ci_shellcheck_pr_steps_are_path_scoped():
    assert expensive_steps
    for step in expensive_steps:
        expr = step.get("if", "")
-        assert "github.event_name != 'pull_request'" in expr
        assert "needs.changes.outputs.scripts == 'true'" in expr
+        assert "github.event_name != 'pull_request'" not in expr
@@ -159,7 +159,8 @@ func generateAppInstallationToken() (string, time.Time, error) {
 	req, _ := http.NewRequest("POST", fmt.Sprintf("https://api.github.com/app/installations/%d/access_tokens", installID), nil)
 	req.Header.Set("Authorization", "Bearer "+signed)
 	req.Header.Set("Accept", "application/vnd.github+json")
-	resp, err := http.DefaultClient.Do(req)
+	client := &http.Client{Timeout: 30 * time.Second}
+	resp, err := client.Do(req)
 	if err != nil {
 		return "", time.Time{}, err
 	}
@@ -33,7 +33,7 @@ func TestWorkspaceCreate_WithParentID(t *testing.T) {
 	// Default tier is 3 (Privileged) — see workspace.go create-handler comment.
 	// delivery_mode defaults to "push" when payload omits it (#2339).
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "Child Agent", nil, 3, "langgraph", sqlmock.AnyArg(), &parentID, nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push").
+		WithArgs(sqlmock.AnyArg(), "Child Agent", nil, 3, "langgraph", sqlmock.AnyArg(), &parentID, nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push", (*string)(nil), (*int)(nil)).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 	mock.ExpectExec("INSERT INTO canvas_layouts").
@@ -69,7 +69,7 @@ func TestWorkspaceCreate_ExplicitClaudeCodeRuntime(t *testing.T) {
 	mock.ExpectBegin()
 	// delivery_mode defaults to "push" when payload omits it (#2339).
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "CC Agent", nil, 2, "claude-code", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push").
+		WithArgs(sqlmock.AnyArg(), "CC Agent", nil, 2, "claude-code", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push", (*string)(nil), (*int)(nil)).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 	mock.ExpectExec("INSERT INTO canvas_layouts").
@@ -291,7 +291,7 @@ func TestWorkspaceCreate_MaxConcurrentTasksOverride(t *testing.T) {

 	mock.ExpectBegin()
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "Leader Agent", nil, 3, "claude-code", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), 3, "push").
+		WithArgs(sqlmock.AnyArg(), "Leader Agent", nil, 3, "claude-code", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), 3, "push", (*string)(nil), (*int)(nil)).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 	mock.ExpectExec("INSERT INTO canvas_layouts").
@@ -368,7 +368,7 @@ func TestWorkspaceCreate(t *testing.T) {
 	// Default tier is 3 (Privileged) — see workspace.go create-handler comment.
 	// delivery_mode defaults to "push" when payload omits it (#2339).
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "Test Agent", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push").
+		WithArgs(sqlmock.AnyArg(), "Test Agent", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push", (*string)(nil), (*int)(nil)).
 		WillReturnResult(sqlmock.NewResult(0, 1))

 	// Expect transaction commit (no secrets in this payload)
@@ -214,6 +214,11 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
 		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid workspace fields"})
 		return
 	}
+	// #1686 Phase 1: validate per-workspace compute overrides.
+	if err := models.ValidateComputeConfig(payload.Compute); err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		return
+	}

 	id := uuid.New().String()
 	awarenessNamespace := workspaceAwarenessNamespace(id)
@@ -398,11 +403,22 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
 	// double-click. Helper retries with " (2)", " (3)", … up to maxNameSuffix,
 	// returns the actually-persisted name (which we MUST thread back into
 	// payload + broadcast so the canvas displays what the DB has).
+	var computeInstanceType *string
+	var computeVolumeRootGB *int
+	if payload.Compute != nil {
+		if payload.Compute.InstanceType != "" {
+			computeInstanceType = &payload.Compute.InstanceType
+		}
+		if payload.Compute.Volume.RootGB != 0 {
+			computeVolumeRootGB = &payload.Compute.Volume.RootGB
+		}
+	}
+
 	const insertWorkspaceSQL = `
-		INSERT INTO workspaces (id, name, role, tier, runtime, awareness_namespace, status, parent_id, workspace_dir, workspace_access, budget_limit, max_concurrent_tasks, delivery_mode)
-		VALUES ($1, $2, $3, $4, $5, $6, 'provisioning', $7, $8, $9, $10, $11, $12)
+		INSERT INTO workspaces (id, name, role, tier, runtime, awareness_namespace, status, parent_id, workspace_dir, workspace_access, budget_limit, max_concurrent_tasks, delivery_mode, compute_instance_type, compute_volume_root_gb)
+		VALUES ($1, $2, $3, $4, $5, $6, 'provisioning', $7, $8, $9, $10, $11, $12, $13, $14)
 	`
-	insertArgs := []any{id, payload.Name, role, payload.Tier, payload.Runtime, awarenessNamespace, payload.ParentID, workspaceDir, workspaceAccess, payload.BudgetLimit, maxConcurrent, deliveryMode}
+	insertArgs := []any{id, payload.Name, role, payload.Tier, payload.Runtime, awarenessNamespace, payload.ParentID, workspaceDir, workspaceAccess, payload.BudgetLimit, maxConcurrent, deliveryMode, computeInstanceType, computeVolumeRootGB}
 	persistedName, currentTx, err := insertWorkspaceWithNameRetry(
 		ctx,
 		tx,
@@ -157,6 +157,8 @@ func TestWorkspaceBudget_Create_WithLimit(t *testing.T) {
 			&budgetVal,       // budget_limit ($10)
 			models.DefaultMaxConcurrentTasks, // max_concurrent_tasks default
 			"push",           // delivery_mode default (#2339)
+			(*string)(nil),   // compute_instance_type default
+			(*int)(nil),      // compute_volume_root_gb default
 		).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
@@ -309,9 +309,31 @@ func (h *WorkspaceHandler) buildProvisionerConfig(
 		// RuntimeImages[Runtime] :latest lookup, which is what the dead
 		// reader's sql.ErrNoRows path was producing already.
 		Image: "",
+		// Compute overrides (nullable — omitted = platform-managed default).
+		// Issue #1686 Phase 1.
+		InstanceType: extractComputeInstanceType(payload.Compute),
+		VolumeRootGB: extractComputeVolumeRootGB(payload.Compute),
 	}
 }

+// extractComputeInstanceType yields the instance-type override carried by cfg.
+// It returns nil when cfg is nil or InstanceType is empty; otherwise the
+// result points at cfg's own InstanceType field (no copy is made).
+func extractComputeInstanceType(cfg *models.ComputeConfig) *string {
+	if cfg == nil || cfg.InstanceType == "" {
+		return nil
+	}
+	return &cfg.InstanceType
+}
+
+// extractComputeVolumeRootGB yields the root-volume size override carried by
+// cfg. It returns nil when cfg is nil or Volume.RootGB is zero; otherwise the
+// result points at cfg's own Volume.RootGB field (no copy is made).
+func extractComputeVolumeRootGB(cfg *models.ComputeConfig) *int {
+	if cfg == nil || cfg.Volume.RootGB == 0 {
+		return nil
+	}
+	return &cfg.Volume.RootGB
+}
+
 // issueAndInjectToken rotates the workspace auth token and injects the
 // plaintext into cfg.ConfigFiles[".auth_token"] so it is written into the
 // /configs volume by WriteFilesToContainer immediately after the container
@@ -779,6 +779,75 @@ func TestBuildProvisionerConfig_WorkspacePathFromEnv(t *testing.T) {
 	}
 }

+// TestBuildProvisionerConfig_ComputeOverrides verifies that #1686 Phase 1
+// compute fields (instance_type + volume.root_gb) are threaded from the
+// create payload into the provisioner config.
+func TestBuildProvisionerConfig_ComputeOverrides(t *testing.T) {
+	mock := setupTestDB(t)
+	mock.ExpectQuery(`SELECT COALESCE\(workspace_dir`).
+		WithArgs("ws-compute").
+		WillReturnRows(sqlmock.NewRows([]string{"workspace_dir", "workspace_access"}).AddRow("", "none"))
+
+	broadcaster := newTestBroadcaster()
+	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+	cfg := handler.buildProvisionerConfig(
+		context.Background(),
+		"ws-compute",
+		"",
+		nil,
+		models.CreateWorkspacePayload{
+			Tier:    2,
+			Runtime: "python",
+			Compute: &models.ComputeConfig{
+				InstanceType: "g4dn.xlarge",
+				Volume:       models.ComputeVolume{RootGB: 256},
+			},
+		},
+		nil,
+		"",
+		"workspace:ws-compute",
+	)
+
+	if cfg.InstanceType == nil || *cfg.InstanceType != "g4dn.xlarge" {
+		t.Errorf("InstanceType = %v, want g4dn.xlarge", cfg.InstanceType)
+	}
+	if cfg.VolumeRootGB == nil || *cfg.VolumeRootGB != 256 {
+		t.Errorf("VolumeRootGB = %v, want 256", cfg.VolumeRootGB)
+	}
+}
+
+// TestBuildProvisionerConfig_ComputeNil verifies backward compat: when the
+// payload omits compute, the provisioner config fields are nil so the CP
+// applies its own defaults.
+func TestBuildProvisionerConfig_ComputeNil(t *testing.T) {
+	mock := setupTestDB(t)
+	mock.ExpectQuery(`SELECT COALESCE\(workspace_dir`).
+		WithArgs("ws-no-compute").
+		WillReturnRows(sqlmock.NewRows([]string{"workspace_dir", "workspace_access"}).AddRow("", "none"))
+
+	broadcaster := newTestBroadcaster()
+	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+	cfg := handler.buildProvisionerConfig(
+		context.Background(),
+		"ws-no-compute",
+		"",
+		nil,
+		models.CreateWorkspacePayload{Tier: 1, Runtime: "python"},
+		nil,
+		"",
+		"workspace:ws-no-compute",
+	)
+
+	// Report the unexpected value itself; formatting the pointer with %v
+	// would only print its address.
+	if cfg.InstanceType != nil {
+		t.Errorf("InstanceType = %q, want nil", *cfg.InstanceType)
+	}
+	if cfg.VolumeRootGB != nil {
+		t.Errorf("VolumeRootGB = %d, want nil", *cfg.VolumeRootGB)
+	}
+}
+
 // ==================== issueAndInjectToken (issue #418) ====================

 // TestIssueAndInjectToken_HappyPath verifies that on a normal (re)provision the
@@ -8,6 +8,7 @@ import (
 	"net/http/httptest"
 	"os"
 	"path/filepath"
+	"strings"
 	"testing"
 	"time"

@@ -342,7 +343,7 @@ func TestWorkspaceCreate_DBInsertError(t *testing.T) {
 	// Transaction begins, workspace INSERT fails, transaction is rolled back.
 	mock.ExpectBegin()
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "Failing Agent", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push").
+		WithArgs(sqlmock.AnyArg(), "Failing Agent", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push", (*string)(nil), (*int)(nil)).
 		WillReturnError(sql.ErrConnDone)
 	mock.ExpectRollback()

@@ -364,6 +365,94 @@ func TestWorkspaceCreate_DBInsertError(t *testing.T) {
 	}
 }

+// TestWorkspaceCreate_InvalidCompute verifies #1686 Phase 1 create-time
+// validation: bad instance_type or volume.root_gb returns 400 before any
+// DB call.
+func TestWorkspaceCreate_InvalidCompute(t *testing.T) {
+	broadcaster := newTestBroadcaster()
+	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+	cases := []struct {
+		name string
+		body string
+		want string
+	}{
+		{
+			name: "instance_type too long",
+			body: `{"name":"Bad Type","compute":{"instance_type":"` + strings.Repeat("x", 65) + `"}}`,
+			want: "compute.instance_type too long",
+		},
+		{
+			name: "root_gb too small",
+			body: `{"name":"Small Disk","compute":{"volume":{"root_gb":16}}}`,
+			want: "compute.volume.root_gb must be at least 32",
+		},
+		{
+			name: "root_gb too large",
+			body: `{"name":"Big Disk","compute":{"volume":{"root_gb":4096}}}`,
+			want: "compute.volume.root_gb exceeds maximum 2048",
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			w := httptest.NewRecorder()
+			c, _ := gin.CreateTestContext(w)
+			c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(tc.body))
+			c.Request.Header.Set("Content-Type", "application/json")
+
+			handler.Create(c)
+			if w.Code != http.StatusBadRequest {
+				t.Errorf("expected 400, got %d: %s", w.Code, w.Body.String())
+			}
+			if !strings.Contains(w.Body.String(), tc.want) {
+				t.Errorf("body %q should contain %q", w.Body.String(), tc.want)
+			}
+		})
+	}
+}
+
+// TestWorkspaceCreate_WithComputeOverrides checks that a create request with
+// valid #1686 Phase 1 compute fields passes both values through as the last
+// two arguments of the workspaces INSERT and responds with 201.
+func TestWorkspaceCreate_WithComputeOverrides(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+
+	wantType, wantRootGB := "g4dn.xlarge", 256
+
+	// Transactional part: the workspace row carries the compute overrides.
+	mock.ExpectBegin()
+	mock.ExpectExec("INSERT INTO workspaces").
+		WithArgs(sqlmock.AnyArg(), "GPU Agent", nil, 3, "python", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push", &wantType, &wantRootGB).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+	mock.ExpectCommit()
+
+	// Follow-up statements after the commit, in the order they are expected.
+	followUps := []string{
+		"INSERT INTO canvas_layouts",
+		"INSERT INTO structure_events",
+		"INSERT INTO structure_events",
+		`UPDATE workspaces SET status =`,
+		"INSERT INTO workspace_config",
+	}
+	for _, stmt := range followUps {
+		mock.ExpectExec(stmt).
+			WillReturnResult(sqlmock.NewResult(0, 1))
+	}
+
+	rec := httptest.NewRecorder()
+	ginCtx, _ := gin.CreateTestContext(rec)
+	payload := `{"name":"GPU Agent","runtime":"python","compute":{"instance_type":"g4dn.xlarge","volume":{"root_gb":256}}}`
+	req := httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(payload))
+	req.Header.Set("Content-Type", "application/json")
+	ginCtx.Request = req
+
+	handler.Create(ginCtx)
+
+	if rec.Code != http.StatusCreated {
+		t.Errorf("expected 201, got %d: %s", rec.Code, rec.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
 func TestWorkspaceCreate_DefaultsApplied(t *testing.T) {
 	mock := setupTestDB(t)
 	setupTestRedis(t)
@@ -375,7 +464,7 @@ func TestWorkspaceCreate_DefaultsApplied(t *testing.T) {
 	// Expect workspace INSERT with defaulted tier=3 (Privileged — the
 	// handler default in workspace.go), runtime="langgraph"
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "Default Agent", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push").
+		WithArgs(sqlmock.AnyArg(), "Default Agent", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push", (*string)(nil), (*int)(nil)).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()

@@ -423,7 +512,7 @@ func TestWorkspaceCreate_SaaSHardForcesTier4(t *testing.T) {

 	mock.ExpectBegin()
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "SaaS External Agent", nil, 4, "external", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push").
+		WithArgs(sqlmock.AnyArg(), "SaaS External Agent", nil, 4, "external", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push", (*string)(nil), (*int)(nil)).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 	mock.ExpectExec("INSERT INTO canvas_layouts").
@@ -464,7 +553,7 @@ func TestWorkspaceCreate_WithSecrets_Persists(t *testing.T) {

 	mock.ExpectBegin()
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "Hermes Agent", nil, 3, "hermes", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push").
+		WithArgs(sqlmock.AnyArg(), "Hermes Agent", nil, 3, "hermes", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push", (*string)(nil), (*int)(nil)).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	// Secret inserted inside the same transaction.
 	mock.ExpectExec("INSERT INTO workspace_secrets").
@@ -576,7 +665,7 @@ func TestWorkspaceCreate_ExternalURL_SSRFSafe(t *testing.T) {

 	mock.ExpectBegin()
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "Ext Agent", nil, 3, "external", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push").
+		WithArgs(sqlmock.AnyArg(), "Ext Agent", nil, 3, "external", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push", (*string)(nil), (*int)(nil)).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 	// External URL update (localhost is explicitly allowed by validateAgentURL).
@@ -615,7 +704,7 @@ func TestWorkspaceCreate_KimiRuntime_PreservesLabel(t *testing.T) {

 	mock.ExpectBegin()
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "Kimi Agent", nil, 3, "kimi", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push").
+		WithArgs(sqlmock.AnyArg(), "Kimi Agent", nil, 3, "kimi", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push", (*string)(nil), (*int)(nil)).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 	// Pre-register flow: awaiting_agent + runtime preserved as "kimi"
@@ -1639,7 +1728,7 @@ runtime_config:
 	mock.ExpectExec("INSERT INTO workspaces").
 		WithArgs(
 			sqlmock.AnyArg(), "Hermes Agent", nil, 3, "hermes",
-			sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push").
+			sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push", (*string)(nil), (*int)(nil)).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 	mock.ExpectExec("INSERT INTO canvas_layouts").
@@ -1696,7 +1785,7 @@ model: anthropic:claude-sonnet-4-5
 	mock.ExpectExec("INSERT INTO workspaces").
 		WithArgs(
 			sqlmock.AnyArg(), "Legacy Agent", nil, 3, "langgraph",
-			sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push").
+			sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push", (*string)(nil), (*int)(nil)).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 	mock.ExpectExec("INSERT INTO canvas_layouts").
@@ -1749,7 +1838,7 @@ runtime_config:
 	mock.ExpectExec("INSERT INTO workspaces").
 		WithArgs(
 			sqlmock.AnyArg(), "Custom Hermes", nil, 3, "hermes",
-			sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push").
+			sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push", (*string)(nil), (*int)(nil)).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 	mock.ExpectExec("INSERT INTO canvas_layouts").
@@ -1855,7 +1944,7 @@ func TestWorkspaceCreate_188_NoTemplateNoRuntime_StillDefaultsLanggraph(t *testi

 	mock.ExpectBegin()
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "Plain Default", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push").
+		WithArgs(sqlmock.AnyArg(), "Plain Default", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push", (*string)(nil), (*int)(nil)).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 	mock.ExpectExec("INSERT INTO canvas_layouts").
@@ -1890,7 +1979,7 @@ func TestWorkspaceCreate_188_ExplicitRuntimeNoTemplate_OK(t *testing.T) {

 	mock.ExpectBegin()
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "Explicit Codex", nil, 3, "codex", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push").
+		WithArgs(sqlmock.AnyArg(), "Explicit Codex", nil, 3, "codex", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks, "push", (*string)(nil), (*int)(nil)).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 	mock.ExpectExec("INSERT INTO canvas_layouts").
@@ -3,6 +3,7 @@ package models
 import (
 	"database/sql"
 	"encoding/json"
+	"fmt"
 	"time"
 )

@@ -45,6 +46,10 @@ type Workspace struct {
 	// forced to route updates through a parent workspace. Default true
 	// (preserves existing behaviour for all workspaces).
 	TalkToUserEnabled  bool            `json:"talk_to_user_enabled" db:"talk_to_user_enabled"`
+	// Compute overrides (nullable — omitted = platform-managed default).
+	// Issue #1686 Phase 1.
+	ComputeInstanceType *string `json:"compute_instance_type,omitempty" db:"compute_instance_type"`
+	ComputeVolumeRootGB *int    `json:"compute_volume_root_gb,omitempty" db:"compute_volume_root_gb"`
 	// Canvas layout fields (from JOIN)
 	X         float64 `json:"x"`
 	Y         float64 `json:"y"`
@@ -154,6 +159,40 @@ type MemorySeed struct {
 	Scope   string `json:"scope" yaml:"scope"` // LOCAL, TEAM, GLOBAL
 }

+// ComputeVolume holds per-workspace disk configuration.
+type ComputeVolume struct {
+	// RootGB is the requested root volume size (GB, per the field name).
+	// ValidateComputeConfig treats 0 as "not set" and otherwise requires a
+	// value between 32 and 2048 inclusive.
+	RootGB int `json:"root_gb"`
+}
+
+// ComputeConfig holds per-workspace EC2 compute overrides.
+// Omitted at create time means "use platform-managed defaults".
+type ComputeConfig struct {
+	// InstanceType is the requested instance type. ValidateComputeConfig
+	// treats "" as "not set" and rejects values longer than 64 bytes.
+	InstanceType string        `json:"instance_type"`
+	// Volume carries the disk overrides; its zero value means no override.
+	Volume       ComputeVolume `json:"volume"`
+}
+
+// ValidateComputeConfig checks the compute overrides supplied at create time.
+//
+// A nil cfg, an empty InstanceType and a zero Volume.RootGB all count as
+// "not set" and are accepted. Otherwise InstanceType may be at most 64 bytes
+// long and Volume.RootGB must lie in [32, 2048]. The first violation found is
+// returned as an error; nil means the config is acceptable.
+func ValidateComputeConfig(cfg *ComputeConfig) error {
+	if cfg == nil {
+		return nil
+	}
+	// An empty string has length 0, so no separate "unset" check is needed.
+	if len(cfg.InstanceType) > 64 {
+		return fmt.Errorf("compute.instance_type too long (max 64 chars)")
+	}
+	switch rootGB := cfg.Volume.RootGB; {
+	case rootGB == 0:
+		// Unset: the platform-managed default applies.
+		return nil
+	case rootGB < 32:
+		return fmt.Errorf("compute.volume.root_gb must be at least 32")
+	case rootGB > 2048:
+		return fmt.Errorf("compute.volume.root_gb exceeds maximum 2048")
+	}
+	return nil
+}
+
 type CreateWorkspacePayload struct {
 	Name     string  `json:"name" binding:"required"`
 	Role     string  `json:"role"`
@@ -180,6 +219,9 @@ type CreateWorkspacePayload struct {
 	// MaxConcurrentTasks caps parallel A2A + cron dispatch. 0 means use
 	// DefaultMaxConcurrentTasks. Leaders typically set 3.
 	MaxConcurrentTasks int `json:"max_concurrent_tasks"`
+	// Compute is an optional per-workspace EC2 shape override.
+	// Omitted = platform-managed default (current behaviour).
+	Compute *ComputeConfig `json:"compute,omitempty"`
 	Canvas   struct {
 		X float64 `json:"x"`
 		Y float64 `json:"y"`
@@ -0,0 +1,90 @@
+package models
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestValidateComputeConfig_NilIsValid checks that a nil config (compute
+// omitted entirely) passes validation.
+func TestValidateComputeConfig_NilIsValid(t *testing.T) {
+	err := ValidateComputeConfig(nil)
+	if err == nil {
+		return
+	}
+	t.Errorf("nil compute config should be valid, got: %v", err)
+}
+
+func TestValidateComputeConfig_EmptyIsValid(t *testing.T) {
+	cfg := &ComputeConfig{}
+	if err := ValidateComputeConfig(cfg); err != nil {
+		t.Errorf("empty compute config should be valid, got: %v", err)
+	}
+}
+
+func TestValidateComputeConfig_ValidOverrides(t *testing.T) {
+	cfg := &ComputeConfig{
+		InstanceType: "g4dn.xlarge",
+		Volume:       ComputeVolume{RootGB: 256},
+	}
+	if err := ValidateComputeConfig(cfg); err != nil {
+		t.Errorf("valid overrides should pass, got: %v", err)
+	}
+}
+
+// TestValidateComputeConfig_InstanceTypeTooLong checks that an instance type
+// one byte over the 64-byte limit is rejected with the exact error text.
+func TestValidateComputeConfig_InstanceTypeTooLong(t *testing.T) {
+	// strings.Repeat builds the 65-byte value directly instead of re-slicing
+	// a string once per byte.
+	cfg := &ComputeConfig{InstanceType: strings.Repeat("x", 65)}
+	if err := ValidateComputeConfig(cfg); err == nil {
+		t.Error("expected error for instance_type > 64 chars")
+	} else if err.Error() != "compute.instance_type too long (max 64 chars)" {
+		t.Errorf("unexpected error message: %q", err.Error())
+	}
+}
+
+func TestValidateComputeConfig_RootGBTooSmall(t *testing.T) {
+	cfg := &ComputeConfig{Volume: ComputeVolume{RootGB: 31}}
+	if err := ValidateComputeConfig(cfg); err == nil {
+		t.Error("expected error for root_gb < 32")
+	} else if err.Error() != "compute.volume.root_gb must be at least 32" {
+		t.Errorf("unexpected error message: %q", err.Error())
+	}
+}
+
+func TestValidateComputeConfig_RootGBTooLarge(t *testing.T) {
+	cfg := &ComputeConfig{Volume: ComputeVolume{RootGB: 2049}}
+	if err := ValidateComputeConfig(cfg); err == nil {
+		t.Error("expected error for root_gb > 2048")
+	} else if err.Error() != "compute.volume.root_gb exceeds maximum 2048" {
+		t.Errorf("unexpected error message: %q", err.Error())
+	}
+}
+
+// TestValidateComputeConfig_BoundaryValues exercises the values sitting right
+// on and right next to each validation limit: root_gb at 31/32/2048/2049 and
+// instance_type at 64/65 bytes.
+func TestValidateComputeConfig_BoundaryValues(t *testing.T) {
+	cases := []struct {
+		name string
+		cfg  ComputeConfig
+		ok   bool
+	}{
+		{"min root_gb", ComputeConfig{Volume: ComputeVolume{RootGB: 32}}, true},
+		{"max root_gb", ComputeConfig{Volume: ComputeVolume{RootGB: 2048}}, true},
+		{"just under min", ComputeConfig{Volume: ComputeVolume{RootGB: 31}}, false},
+		{"just over max", ComputeConfig{Volume: ComputeVolume{RootGB: 2049}}, false},
+		// Build the final value in the table rather than patching a NUL-byte
+		// placeholder inside the loop.
+		{"exactly 64 char type", ComputeConfig{InstanceType: strings.Repeat("x", 64)}, true},
+		{"just over 64 char type", ComputeConfig{InstanceType: strings.Repeat("x", 65)}, false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			err := ValidateComputeConfig(&tc.cfg)
+			if tc.ok && err != nil {
+				t.Errorf("expected valid, got: %v", err)
+			}
+			if !tc.ok && err == nil {
+				t.Error("expected invalid, got nil")
+			}
+		})
+	}
+}
@@ -163,6 +163,10 @@ type cpProvisionRequest struct {
 	// collectCPConfigFiles which rejects symlinks and non-regular files
 	// before including them. Serialised as base64 to avoid JSON escaping.
 	ConfigFiles map[string]string `json:"config_files,omitempty"`
+	// Compute overrides (nullable — omitted = platform-managed default).
+	// Issue #1686 Phase 1.
+	InstanceType *string `json:"instance_type,omitempty"`
+	VolumeRootGB *int    `json:"volume_root_gb,omitempty"`
 }

 type cpProvisionResponse struct {
@@ -206,13 +210,15 @@ func (p *CPProvisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string,
 	}

 	req := cpProvisionRequest{
-		OrgID:       p.orgID,
-		WorkspaceID: cfg.WorkspaceID,
-		Runtime:     cfg.Runtime,
-		Tier:        cfg.Tier,
-		PlatformURL: cfg.PlatformURL,
-		Env:         env,
-		ConfigFiles: configFiles,
+		OrgID:        p.orgID,
+		WorkspaceID:  cfg.WorkspaceID,
+		Runtime:      cfg.Runtime,
+		Tier:         cfg.Tier,
+		PlatformURL:  cfg.PlatformURL,
+		Env:          env,
+		ConfigFiles:  configFiles,
+		InstanceType: cfg.InstanceType,
+		VolumeRootGB: cfg.VolumeRootGB,
 	}

 	body, err := json.Marshal(req)
@@ -1062,3 +1062,75 @@ func TestCollectCPConfigFiles_RejectsRootSymlink(t *testing.T) {
 		t.Errorf("expected symlink-related error, got: %v", err)
 	}
 }
+
+// TestStart_ComputeOverrides — when WorkspaceConfig carries InstanceType and
+// VolumeRootGB, they must be forwarded in the cpProvisionRequest body so the
+// CP can pass them to EC2 RunInstances. Regression guard for #1686 Phase 1.
+func TestStart_ComputeOverrides(t *testing.T) {
+	const (
+		wantInstanceType = "g4dn.xlarge"
+		wantRootGB       = 256
+	)
+
+	// The stub server records the decoded request body for the checks below.
+	var gotBody cpProvisionRequest
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if err := json.NewDecoder(r.Body).Decode(&gotBody); err != nil {
+			t.Errorf("decode request: %v", err)
+		}
+		w.WriteHeader(http.StatusCreated)
+		_, _ = io.WriteString(w, `{"instance_id":"i-compute","state":"pending"}`)
+	}))
+	defer srv.Close()
+
+	p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()}
+	instanceType := wantInstanceType
+	volumeRootGB := wantRootGB
+	_, err := p.Start(context.Background(), WorkspaceConfig{
+		WorkspaceID:  "ws-1",
+		Runtime:      "python",
+		Tier:         2,
+		PlatformURL:  "http://tenant",
+		InstanceType: &instanceType,
+		VolumeRootGB: &volumeRootGB,
+	})
+	if err != nil {
+		t.Fatalf("Start: %v", err)
+	}
+
+	// Dereference before formatting: %v on a *string / *int prints the
+	// pointer address, which tells the reader nothing about the actual value.
+	switch {
+	case gotBody.InstanceType == nil:
+		t.Errorf("instance_type = nil, want %q", wantInstanceType)
+	case *gotBody.InstanceType != wantInstanceType:
+		t.Errorf("instance_type = %q, want %q", *gotBody.InstanceType, wantInstanceType)
+	}
+	switch {
+	case gotBody.VolumeRootGB == nil:
+		t.Errorf("volume_root_gb = nil, want %d", wantRootGB)
+	case *gotBody.VolumeRootGB != wantRootGB:
+		t.Errorf("volume_root_gb = %d, want %d", *gotBody.VolumeRootGB, wantRootGB)
+	}
+}
+
+// TestStart_ComputeOmittedWhenNil — a WorkspaceConfig without compute
+// overrides must produce a request body in which the instance_type and
+// volume_root_gb keys are absent altogether (omitempty), so that CP falls
+// back to its own defaults instead of receiving empty/zero values.
+func TestStart_ComputeOmittedWhenNil(t *testing.T) {
+	// The stub server captures the request body verbatim for inspection.
+	var captured json.RawMessage
+	stub := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if err := json.NewDecoder(r.Body).Decode(&captured); err != nil {
+			t.Errorf("decode request: %v", err)
+		}
+		w.WriteHeader(http.StatusCreated)
+		_, _ = io.WriteString(w, `{"instance_id":"i-default","state":"pending"}`)
+	})
+	srv := httptest.NewServer(stub)
+	defer srv.Close()
+
+	p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()}
+	wsCfg := WorkspaceConfig{
+		WorkspaceID: "ws-1",
+		Runtime:     "python",
+		Tier:        1,
+		PlatformURL: "http://tenant",
+	}
+	if _, err := p.Start(context.Background(), wsCfg); err != nil {
+		t.Fatalf("Start: %v", err)
+	}
+
+	var fields map[string]any
+	if err := json.Unmarshal(captured, &fields); err != nil {
+		t.Fatalf("unmarshal raw body: %v", err)
+	}
+	for _, key := range []string{"instance_type", "volume_root_gb"} {
+		if _, present := fields[key]; present {
+			t.Errorf("%s should be omitted when nil", key)
+		}
+	}
+}
@@ -105,6 +105,11 @@ type WorkspaceConfig struct {
 	WorkspaceAccess    string // #65: "none" (default), "read_only", or "read_write"
 	ResetClaudeSession bool   // #12: if true, discard the claude-sessions volume before start (fresh session dir)

+	// Compute overrides (nullable — omitted = platform-managed default).
+	// Issue #1686 Phase 1.
+	InstanceType *string `json:"instance_type,omitempty"`
+	VolumeRootGB *int    `json:"volume_root_gb,omitempty"`
+
 	// Image, when non-empty, overrides the runtime→image lookup. CP
 	// (molecule-controlplane) is the single SSOT for runtime image digest
 	// pins via its migrations/027_runtime_image_pins table — the pin is
@@ -726,6 +731,16 @@ func buildContainerEnv(cfg WorkspaceConfig) []string {
 		}
 		env = append(env, fmt.Sprintf("%s=%s", k, v))
 	}
+	// #1687: alias GH_PAT → GH_TOKEN / GITHUB_TOKEN on the READ side
+	// (container env assembly). gh CLI and git credential helpers look
+	// for these standard names; by aliasing here we avoid writing the
+	// forbidden keys into tenant-writer surfaces (workspace_secrets,
+	// envVars map, etc.). GH_PAT itself is not an SCM-write credential
+	// and passes through cfg.EnvVars untouched.
+	if pat, hasPAT := cfg.EnvVars["GH_PAT"]; hasPAT && pat != "" {
+		env = append(env, fmt.Sprintf("GH_TOKEN=%s", pat))
+		env = append(env, fmt.Sprintf("GITHUB_TOKEN=%s", pat))
+	}
 	// Inject ADMIN_TOKEN from the platform server's environment so workspace
 	// containers can call /admin/liveness and other admin-gated endpoints
 	// (core#831). cp_provisioner.go handles this separately for SaaS tenants.
@@ -0,0 +1,5 @@
+-- Rollback for the per-workspace compute override columns (#1686 Phase 1).
+-- IF EXISTS makes each drop a no-op when the column is already gone.
+ALTER TABLE workspaces
+    DROP COLUMN IF EXISTS compute_instance_type,
+    DROP COLUMN IF EXISTS compute_volume_root_gb;
@@ -0,0 +1,10 @@
+-- Per-workspace EC2 compute overrides (#1686 Phase 1).
+--
+-- compute_instance_type and compute_volume_root_gb let a caller pick the
+-- instance type and root volume size when the workspace is created.
+-- Both columns are nullable with no default: NULL (or an omitted value)
+-- keeps the platform-managed default, which is the current behaviour,
+-- so the change is fully backwards-compatible.
+ALTER TABLE workspaces
+    ADD COLUMN IF NOT EXISTS compute_instance_type TEXT,
+    ADD COLUMN IF NOT EXISTS compute_volume_root_gb INTEGER;