fix(canvas): consolidate platform-auth headers via shared helper (#178) #54
2
.github/scripts/lint_secret_pattern_drift.py
vendored
2
.github/scripts/lint_secret_pattern_drift.py
vendored
@ -37,7 +37,7 @@ CANONICAL_FILE = Path(".github/workflows/secret-scan.yml")
|
||||
CONSUMERS: list[tuple[str, str]] = [
|
||||
(
|
||||
"molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh",
|
||||
"https://raw.githubusercontent.com/Molecule-AI/molecule-ai-workspace-runtime/main/molecule_runtime/scripts/pre-commit-checks.sh",
|
||||
"https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-runtime/raw/branch/main/molecule_runtime/scripts/pre-commit-checks.sh",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
2
.github/workflows/canary-verify.yml
vendored
2
.github/workflows/canary-verify.yml
vendored
@ -108,7 +108,7 @@ jobs:
|
||||
echo
|
||||
echo "One or more canary secrets are unset (\`CANARY_TENANT_URLS\`, \`CANARY_ADMIN_TOKENS\`, \`CANARY_CP_SHARED_SECRET\`)."
|
||||
echo "Phase 2 canary fleet has not been stood up yet —"
|
||||
echo "see [canary-tenants.md](https://github.com/molecule-ai/molecule-controlplane/blob/main/docs/canary-tenants.md)."
|
||||
echo "see [canary-tenants.md](https://git.moleculesai.app/molecule-ai/molecule-controlplane/blob/main/docs/canary-tenants.md)."
|
||||
echo
|
||||
echo "**Skipped — promote-to-latest will NOT auto-fire.** Dispatch \`promote-latest.yml\` manually when ready."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
130
.github/workflows/e2e-api.yml
vendored
130
.github/workflows/e2e-api.yml
vendored
@ -12,6 +12,59 @@ name: E2E API Smoke Test
|
||||
# spending CI cycles. See the in-job comment on the `e2e-api` job for
|
||||
# why this is one job (not two-jobs-sharing-name) and the 2026-04-29
|
||||
# PR #2264 incident that drove the consolidation.
|
||||
#
|
||||
# Parallel-safety (Class B Hongming-owned CICD red sweep, 2026-05-08)
|
||||
# -------------------------------------------------------------------
|
||||
# Same substrate hazard as PR #98 (handlers-postgres-integration). Our
|
||||
# Gitea act_runner runs with `container.network: host` (operator host
|
||||
# `/opt/molecule/runners/config.yaml`), which means:
|
||||
#
|
||||
# * Two concurrent runs both try to bind their `-p 15432:5432` /
|
||||
# `-p 16379:6379` host ports — the second postgres/redis FATALs
|
||||
# with `Address in use` and `docker run` returns exit 125 with
|
||||
# `Conflict. The container name "/molecule-ci-postgres" is already
|
||||
# in use by container ...`. Verified in run a7/2727 on 2026-05-07.
|
||||
# * The fixed container names `molecule-ci-postgres` / `-redis` (the
|
||||
# pre-fix shape) collide on name AS WELL AS port. The cleanup-with-
|
||||
# `docker rm -f` at the start of the second job KILLS the first
|
||||
# job's still-running postgres/redis.
|
||||
#
|
||||
# Fix shape (mirrors PR #98's bridge-net pattern, adapted because
|
||||
# platform-server is a Go binary on the host, not a containerised
|
||||
# step):
|
||||
#
|
||||
# 1. Unique container names per run:
|
||||
# pg-e2e-api-${RUN_ID}-${RUN_ATTEMPT}
|
||||
# redis-e2e-api-${RUN_ID}-${RUN_ATTEMPT}
|
||||
# `${RUN_ID}-${RUN_ATTEMPT}` is unique even across reruns of the
|
||||
# same run_id.
|
||||
# 2. Ephemeral host port per run (`-p 0:5432`), then read the actual
|
||||
# bound port via `docker port` and export DATABASE_URL/REDIS_URL
|
||||
# pointing at it. No fixed host-port → no port collision.
|
||||
# 3. `127.0.0.1` (NOT `localhost`) in URLs — IPv6 first-resolve was
|
||||
# the original flake fixed in #92 and the script's still IPv6-
|
||||
# enabled.
|
||||
# 4. `if: always()` cleanup so containers don't leak when test steps
|
||||
# fail.
|
||||
#
|
||||
# Issue #94 items #2 + #3 (also fixed here):
|
||||
# * Pre-pull `alpine:latest` so the platform-server's provisioner
|
||||
# (`internal/handlers/container_files.go`) can stand up its
|
||||
# ephemeral token-write helper without a daemon.io round-trip.
|
||||
# * Create `molecule-monorepo-net` bridge network if missing so the
|
||||
# provisioner's container.HostConfig {NetworkMode: ...} attach
|
||||
# succeeds.
|
||||
# Item #1 (timeouts) — evidence on recent runs (77/3191, ae/4270, 0e/
|
||||
# 2318) shows Postgres ready in 3s, Redis in 1s, Platform in 1s when
|
||||
# they DO come up. Timeouts are not the bottleneck; not bumped.
|
||||
#
|
||||
# Item explicitly NOT fixed here: failing test `Status back online`
|
||||
# fails because the platform's langgraph workspace template image
|
||||
# (ghcr.io/molecule-ai/workspace-template-langgraph:latest) returns
|
||||
# 403 Forbidden post-2026-05-06 GitHub org suspension. That is a
|
||||
# template-registry resolution issue (ADR-002 / local-build mode) and
|
||||
# belongs in a separate change that touches workspace-server, not
|
||||
# this workflow file.
|
||||
|
||||
on:
|
||||
push:
|
||||
@ -78,11 +131,14 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 15
|
||||
env:
|
||||
DATABASE_URL: postgres://dev:dev@localhost:15432/molecule?sslmode=disable
|
||||
REDIS_URL: redis://localhost:16379
|
||||
# Unique per-run container names so concurrent runs on the host-
|
||||
# network act_runner don't collide on name OR port.
|
||||
# `${RUN_ID}-${RUN_ATTEMPT}` stays unique across reruns of the
|
||||
# same run_id. PORT is set later (after docker port lookup) since
|
||||
# we let Docker assign an ephemeral host port.
|
||||
PG_CONTAINER: pg-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
REDIS_CONTAINER: redis-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
PORT: "8080"
|
||||
PG_CONTAINER: molecule-ci-postgres
|
||||
REDIS_CONTAINER: molecule-ci-redis
|
||||
steps:
|
||||
- name: No-op pass (paths filter excluded this commit)
|
||||
if: needs.detect-changes.outputs.api != 'true'
|
||||
@ -97,11 +153,53 @@ jobs:
|
||||
go-version: 'stable'
|
||||
cache: true
|
||||
cache-dependency-path: workspace-server/go.sum
|
||||
- name: Pre-pull alpine + ensure provisioner network (Issue #94 items #2 + #3)
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
# Provisioner uses alpine:latest for ephemeral token-write
|
||||
# containers (workspace-server/internal/handlers/container_files.go).
|
||||
# Pre-pull so the first provision in test_api.sh doesn't race
|
||||
# the daemon's pull cache. Idempotent — `docker pull` is a no-op
|
||||
# when the image is already present.
|
||||
docker pull alpine:latest >/dev/null
|
||||
# Provisioner attaches workspace containers to
|
||||
# molecule-monorepo-net (workspace-server/internal/provisioner/
|
||||
# provisioner.go::DefaultNetwork). The bridge already exists on
|
||||
# the operator host's docker daemon — `network create` is
|
||||
# idempotent via `|| true`.
|
||||
docker network create molecule-monorepo-net >/dev/null 2>&1 || true
|
||||
echo "alpine:latest pre-pulled; molecule-monorepo-net ensured."
|
||||
- name: Start Postgres (docker)
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
# Defensive cleanup — only matches THIS run's container name,
|
||||
# so it cannot kill a sibling run's postgres. (Pre-fix the
|
||||
# name was static and this rm hit other runs' containers.)
|
||||
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
|
||||
docker run -d --name "$PG_CONTAINER" -e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule -p 15432:5432 postgres:16
|
||||
# `-p 0:5432` requests an ephemeral host port; we read it back
|
||||
# below and export DATABASE_URL.
|
||||
docker run -d --name "$PG_CONTAINER" \
|
||||
-e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule \
|
||||
-p 0:5432 postgres:16 >/dev/null
|
||||
# Resolve the host-side port assignment. `docker port` prints
|
||||
# `0.0.0.0:NNNN` (and on host-net runners may also print an
|
||||
# IPv6 line — take the first IPv4 line).
|
||||
PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
|
||||
if [ -z "$PG_PORT" ]; then
|
||||
# Fallback: any first line. Some Docker versions print only
|
||||
# one line.
|
||||
PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | head -1 | awk -F: '{print $NF}')
|
||||
fi
|
||||
if [ -z "$PG_PORT" ]; then
|
||||
echo "::error::Could not resolve host port for $PG_CONTAINER"
|
||||
docker port "$PG_CONTAINER" 5432/tcp || true
|
||||
docker logs "$PG_CONTAINER" || true
|
||||
exit 1
|
||||
fi
|
||||
# 127.0.0.1 (NOT localhost) — IPv6 first-resolve flake (#92).
|
||||
echo "PG_PORT=${PG_PORT}" >> "$GITHUB_ENV"
|
||||
echo "DATABASE_URL=postgres://dev:dev@127.0.0.1:${PG_PORT}/molecule?sslmode=disable" >> "$GITHUB_ENV"
|
||||
echo "Postgres host port: ${PG_PORT}"
|
||||
for i in $(seq 1 30); do
|
||||
if docker exec "$PG_CONTAINER" pg_isready -U dev >/dev/null 2>&1; then
|
||||
echo "Postgres ready after ${i}s"
|
||||
@ -116,7 +214,20 @@ jobs:
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
|
||||
docker run -d --name "$REDIS_CONTAINER" -p 16379:6379 redis:7
|
||||
docker run -d --name "$REDIS_CONTAINER" -p 0:6379 redis:7 >/dev/null
|
||||
REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
|
||||
if [ -z "$REDIS_PORT" ]; then
|
||||
REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | head -1 | awk -F: '{print $NF}')
|
||||
fi
|
||||
if [ -z "$REDIS_PORT" ]; then
|
||||
echo "::error::Could not resolve host port for $REDIS_CONTAINER"
|
||||
docker port "$REDIS_CONTAINER" 6379/tcp || true
|
||||
docker logs "$REDIS_CONTAINER" || true
|
||||
exit 1
|
||||
fi
|
||||
echo "REDIS_PORT=${REDIS_PORT}" >> "$GITHUB_ENV"
|
||||
echo "REDIS_URL=redis://127.0.0.1:${REDIS_PORT}" >> "$GITHUB_ENV"
|
||||
echo "Redis host port: ${REDIS_PORT}"
|
||||
for i in $(seq 1 15); do
|
||||
if docker exec "$REDIS_CONTAINER" redis-cli ping 2>/dev/null | grep -q PONG; then
|
||||
echo "Redis ready after ${i}s"
|
||||
@ -135,13 +246,15 @@ jobs:
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
working-directory: workspace-server
|
||||
run: |
|
||||
# DATABASE_URL + REDIS_URL exported by the start-postgres /
|
||||
# start-redis steps point at this run's per-run host ports.
|
||||
./platform-server > platform.log 2>&1 &
|
||||
echo $! > platform.pid
|
||||
- name: Wait for /health
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
for i in $(seq 1 30); do
|
||||
if curl -sf http://localhost:8080/health > /dev/null; then
|
||||
if curl -sf http://127.0.0.1:8080/health > /dev/null; then
|
||||
echo "Platform up after ${i}s"
|
||||
exit 0
|
||||
fi
|
||||
@ -185,6 +298,9 @@ jobs:
|
||||
kill "$(cat workspace-server/platform.pid)" 2>/dev/null || true
|
||||
fi
|
||||
- name: Stop service containers
|
||||
# always() so containers don't leak when test steps fail. The
|
||||
# cleanup is best-effort: if the container is already gone
|
||||
# (e.g. concurrent rerun race), don't fail the job.
|
||||
if: always() && needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
|
||||
|
||||
141
.github/workflows/handlers-postgres-integration.yml
vendored
141
.github/workflows/handlers-postgres-integration.yml
vendored
@ -14,12 +14,42 @@ name: Handlers Postgres Integration
|
||||
# self-review caught it took 2 minutes to set up and would have caught
|
||||
# the bug at PR-time.
|
||||
#
|
||||
# This job spins a Postgres service container, applies the migration,
|
||||
# and runs `go test -tags=integration` against a live DB. Required
|
||||
# check on staging branch protection — backend handler PRs cannot
|
||||
# merge without a real-DB regression gate.
|
||||
# Why this workflow does NOT use `services: postgres:` (Class B fix)
|
||||
# ------------------------------------------------------------------
|
||||
# Our act_runner config has `container.network: host` (operator host
|
||||
# /opt/molecule/runners/config.yaml), which act_runner applies to BOTH
|
||||
# the job container AND every service container. With host-net, two
|
||||
# concurrent runs of this workflow both try to bind 0.0.0.0:5432 — the
|
||||
# second postgres FATALs with `could not create any TCP/IP sockets:
|
||||
# Address in use`, and Docker auto-removes it (act_runner sets
|
||||
# AutoRemove:true on service containers). By the time the migrations
|
||||
# step runs `psql`, the postgres container is gone, hence
|
||||
# `Connection refused` then `failed to remove container: No such
|
||||
# container` at cleanup time.
|
||||
#
|
||||
# Cost: ~30s job (postgres pull from GH cache + go build + 4 tests).
|
||||
# Per-job `container.network` override is silently ignored by
|
||||
# act_runner — `--network and --net in the options will be ignored.`
|
||||
# appears in the runner log. Documented constraint.
|
||||
#
|
||||
# So we sidestep `services:` entirely. The job container still uses
|
||||
# host-net (inherited from runner config; required for cache server
|
||||
# discovery on the bridge IP 172.18.0.17:42631). We launch a sibling
|
||||
# postgres on the existing `molecule-monorepo-net` bridge with a
|
||||
# UNIQUE name per run — `pg-handlers-${RUN_ID}-${RUN_ATTEMPT}` — and
|
||||
# read its bridge IP via `docker inspect`. A host-net job container
|
||||
# can reach a bridge-net container directly via the bridge IP (verified
|
||||
# manually on operator host 2026-05-08).
|
||||
#
|
||||
# Trade-offs vs. the original `services:` shape:
|
||||
# + No host-port collision; N parallel runs share the bridge cleanly
|
||||
# + `if: always()` cleanup runs even on test-step failure
|
||||
# - One more step in the workflow (+~3 lines)
|
||||
# - Requires `molecule-monorepo-net` to exist on the operator host
|
||||
# (it does; declared in docker-compose.yml + docker-compose.infra.yml)
|
||||
#
|
||||
# Class B Hongming-owned CICD red sweep, 2026-05-08.
|
||||
#
|
||||
# Cost: ~30s job (postgres pull from cache + go build + 4 tests).
|
||||
|
||||
on:
|
||||
push:
|
||||
@ -59,20 +89,14 @@ jobs:
|
||||
name: Handlers Postgres Integration
|
||||
needs: detect-changes
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
postgres:
|
||||
image: postgres:15-alpine
|
||||
env:
|
||||
POSTGRES_PASSWORD: test
|
||||
POSTGRES_DB: molecule
|
||||
ports:
|
||||
- 5432:5432
|
||||
# GHA spins this with --health-cmd built in for postgres images.
|
||||
options: >-
|
||||
--health-cmd pg_isready
|
||||
--health-interval 5s
|
||||
--health-timeout 5s
|
||||
--health-retries 10
|
||||
env:
|
||||
# Unique name per run so concurrent jobs don't collide on the
|
||||
# bridge network. ${RUN_ID}-${RUN_ATTEMPT} is unique even across
|
||||
# workflow_dispatch reruns of the same run_id.
|
||||
PG_NAME: pg-handlers-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
# Bridge network already exists on the operator host (declared
|
||||
# in docker-compose.yml + docker-compose.infra.yml).
|
||||
PG_NETWORK: molecule-monorepo-net
|
||||
defaults:
|
||||
run:
|
||||
working-directory: workspace-server
|
||||
@ -89,16 +113,57 @@ jobs:
|
||||
with:
|
||||
go-version: 'stable'
|
||||
|
||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Start sibling Postgres on bridge network
|
||||
working-directory: .
|
||||
run: |
|
||||
# Sanity: the bridge network must exist on the operator host.
|
||||
# Hard-fail loud if it doesn't — easier to spot than a silent
|
||||
# auto-create that diverges from the rest of the stack.
|
||||
if ! docker network inspect "${PG_NETWORK}" >/dev/null 2>&1; then
|
||||
echo "::error::Bridge network '${PG_NETWORK}' missing on operator host. Re-run docker-compose.infra.yml or check ops handbook."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# If a stale container with the same name exists (rerun on
|
||||
# the same run_id), wipe it first.
|
||||
docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
|
||||
|
||||
docker run -d \
|
||||
--name "${PG_NAME}" \
|
||||
--network "${PG_NETWORK}" \
|
||||
--health-cmd "pg_isready -U postgres" \
|
||||
--health-interval 5s \
|
||||
--health-timeout 5s \
|
||||
--health-retries 10 \
|
||||
-e POSTGRES_PASSWORD=test \
|
||||
-e POSTGRES_DB=molecule \
|
||||
postgres:15-alpine >/dev/null
|
||||
|
||||
# Read back the bridge IP. Always present immediately after
|
||||
# `docker run -d` for bridge networks.
|
||||
PG_HOST=$(docker inspect "${PG_NAME}" \
|
||||
--format "{{(index .NetworkSettings.Networks \"${PG_NETWORK}\").IPAddress}}")
|
||||
if [ -z "${PG_HOST}" ]; then
|
||||
echo "::error::Could not resolve PG_HOST for ${PG_NAME} on ${PG_NETWORK}"
|
||||
docker logs "${PG_NAME}" || true
|
||||
exit 1
|
||||
fi
|
||||
echo "PG_HOST=${PG_HOST}" >> "$GITHUB_ENV"
|
||||
echo "INTEGRATION_DB_URL=postgres://postgres:test@${PG_HOST}:5432/molecule?sslmode=disable" >> "$GITHUB_ENV"
|
||||
echo "Started ${PG_NAME} at ${PG_HOST}:5432"
|
||||
|
||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Apply migrations to Postgres service
|
||||
env:
|
||||
PGPASSWORD: test
|
||||
run: |
|
||||
# Wait for postgres to actually accept connections (the
|
||||
# GHA --health-cmd is best-effort but psql can still race).
|
||||
# Wait for postgres to actually accept connections. Docker's
|
||||
# health-cmd handles container-side readiness, but the wire
|
||||
# to the bridge IP is best-tested with pg_isready directly.
|
||||
for i in {1..15}; do
|
||||
if pg_isready -h localhost -p 5432 -U postgres -q; then break; fi
|
||||
echo "waiting for postgres..."; sleep 2
|
||||
if pg_isready -h "${PG_HOST}" -p 5432 -U postgres -q; then break; fi
|
||||
echo "waiting for postgres at ${PG_HOST}:5432..."; sleep 2
|
||||
done
|
||||
|
||||
# Apply every .up.sql in lexicographic order with
|
||||
@ -131,7 +196,7 @@ jobs:
|
||||
# not fine once a cross-table atomicity test came in.
|
||||
set +e
|
||||
for migration in $(ls migrations/*.sql 2>/dev/null | grep -v '\.down\.sql$' | sort); do
|
||||
if psql -h localhost -U postgres -d molecule -v ON_ERROR_STOP=1 \
|
||||
if psql -h "${PG_HOST}" -U postgres -d molecule -v ON_ERROR_STOP=1 \
|
||||
-f "$migration" >/dev/null 2>&1; then
|
||||
echo "✓ $(basename "$migration")"
|
||||
else
|
||||
@ -145,7 +210,7 @@ jobs:
|
||||
# fail if any didn't land — that would be a real regression we
|
||||
# want loud.
|
||||
for tbl in delegations workspaces activity_logs pending_uploads; do
|
||||
if ! psql -h localhost -U postgres -d molecule -tA \
|
||||
if ! psql -h "${PG_HOST}" -U postgres -d molecule -tA \
|
||||
-c "SELECT 1 FROM information_schema.tables WHERE table_name = '$tbl'" \
|
||||
| grep -q 1; then
|
||||
echo "::error::$tbl table missing after migration replay — handler integration tests would be meaningless"
|
||||
@ -156,16 +221,32 @@ jobs:
|
||||
|
||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Run integration tests
|
||||
env:
|
||||
INTEGRATION_DB_URL: postgres://postgres:test@localhost:5432/molecule?sslmode=disable
|
||||
run: |
|
||||
# INTEGRATION_DB_URL is exported by the start-postgres step;
|
||||
# points at the per-run bridge IP, not 127.0.0.1, so concurrent
|
||||
# workflow runs don't fight over a host-net 5432 port.
|
||||
go test -tags=integration -timeout 5m -v ./internal/handlers/ -run "^TestIntegration_"
|
||||
|
||||
- if: needs.detect-changes.outputs.handlers == 'true' && failure()
|
||||
- if: failure() && needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Diagnostic dump on failure
|
||||
env:
|
||||
PGPASSWORD: test
|
||||
run: |
|
||||
echo "::group::delegations table state"
|
||||
psql -h localhost -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true
|
||||
echo "::group::postgres container status"
|
||||
docker ps -a --filter "name=${PG_NAME}" --format '{{.Status}} {{.Names}}' || true
|
||||
docker logs "${PG_NAME}" 2>&1 | tail -50 || true
|
||||
echo "::endgroup::"
|
||||
echo "::group::delegations table state"
|
||||
psql -h "${PG_HOST}" -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true
|
||||
echo "::endgroup::"
|
||||
|
||||
- if: always() && needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Stop sibling Postgres
|
||||
working-directory: .
|
||||
run: |
|
||||
# always() so containers don't leak when migrations or tests
|
||||
# fail. The cleanup is best-effort: if the container is
|
||||
# already gone (e.g. concurrent rerun race), don't fail the job.
|
||||
docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
|
||||
echo "Cleaned up ${PG_NAME}"
|
||||
|
||||
|
||||
59
.github/workflows/pr-guards.yml
vendored
59
.github/workflows/pr-guards.yml
vendored
@ -1,14 +1,25 @@
|
||||
name: pr-guards
|
||||
|
||||
# Thin caller that delegates to the molecule-ci reusable guard. Today
|
||||
# the guard is just "disable auto-merge when a new commit is pushed
|
||||
# after auto-merge was enabled" — added 2026-04-27 after PR #2174
|
||||
# auto-merged with only its first commit because the second commit
|
||||
# was pushed after the merge queue had locked the PR's SHA.
|
||||
# PR-time guards. Today the only guard is "disable auto-merge when a
|
||||
# new commit is pushed after auto-merge was enabled" — added 2026-04-27
|
||||
# after PR #2174 auto-merged with only its first commit because the
|
||||
# second commit was pushed after the merge queue had locked the PR's
|
||||
# SHA.
|
||||
#
|
||||
# When more PR-time guards land in molecule-ci, add them here as
|
||||
# additional jobs that share the same pull_request:synchronize
|
||||
# trigger.
|
||||
# Why this is inlined (not delegated to molecule-ci's reusable
|
||||
# workflow): the reusable workflow uses `gh pr merge --disable-auto`,
|
||||
# which calls GitHub's GraphQL API. Gitea has no GraphQL endpoint and
|
||||
# returns HTTP 405 on /api/graphql, so the job failed on every Gitea
|
||||
# PR push since the 2026-05-06 migration. Gitea also has no `--auto`
|
||||
# merge primitive that this job could be acting on, so the right
|
||||
# behaviour on Gitea is "no-op + green status" — not a 405.
|
||||
#
|
||||
# Inlining (vs. an `if:` on the `uses:` line) keeps the job ALWAYS
|
||||
# running, which matters for branch protection: required-check names
|
||||
# need a job that emits SUCCESS terminal state, not SKIPPED. See
|
||||
# `feedback_branch_protection_check_name_parity` and `feedback_pr_merge_safety_guards`.
|
||||
#
|
||||
# Issue #88 item 1.
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
@ -19,4 +30,34 @@ permissions:
|
||||
|
||||
jobs:
|
||||
disable-auto-merge-on-push:
|
||||
uses: molecule-ai/molecule-ci/.github/workflows/disable-auto-merge-on-push.yml@main
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
# Detect Gitea Actions. act_runner sets GITEA_ACTIONS=true in the
|
||||
# step env on every job. Belt-and-suspenders: also check the repo
|
||||
# url's host, which is independent of any runner-side env config
|
||||
# (covers a future Gitea host where the env var is forgotten).
|
||||
- name: Detect runner host
|
||||
id: host
|
||||
run: |
|
||||
if [[ "${GITEA_ACTIONS:-}" == "true" ]] || [[ "${{ github.server_url }}" == *moleculesai.app* ]] || [[ "${{ github.event.repository.html_url }}" == *moleculesai.app* ]]; then
|
||||
echo "is_gitea=true" >> "$GITHUB_OUTPUT"
|
||||
echo "::notice::Gitea Actions detected — auto-merge gating is not applicable here (Gitea has no --auto merge primitive). Job will no-op."
|
||||
else
|
||||
echo "is_gitea=false" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
- name: Disable auto-merge (GitHub only)
|
||||
if: steps.host.outputs.is_gitea != 'true'
|
||||
env:
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
PR: ${{ github.event.pull_request.number }}
|
||||
REPO: ${{ github.repository }}
|
||||
NEW_SHA: ${{ github.sha }}
|
||||
run: |
|
||||
set -eu
|
||||
gh pr merge "$PR" --disable-auto -R "$REPO" || true
|
||||
gh pr comment "$PR" -R "$REPO" --body "🔒 Auto-merge disabled — new commit (\`${NEW_SHA:0:7}\`) pushed after auto-merge was enabled. The merge queue locks SHAs at entry, so subsequent pushes can race. Verify the new commit and re-enable with \`gh pr merge --auto\`."
|
||||
|
||||
- name: Gitea no-op
|
||||
if: steps.host.outputs.is_gitea == 'true'
|
||||
run: echo "Gitea Actions — auto-merge gating not applicable; no-op (job intentionally green so branch protection's required-check name lands SUCCESS)."
|
||||
|
||||
177
.github/workflows/publish-runtime.yml
vendored
177
.github/workflows/publish-runtime.yml
vendored
@ -282,42 +282,33 @@ jobs:
|
||||
echo "::error::Refusing to fan out cascade against stale or corrupt PyPI surfaces."
|
||||
exit 1
|
||||
|
||||
- name: Fan out repository_dispatch
|
||||
- name: Fan out via push to .runtime-version
|
||||
env:
|
||||
# Fine-grained PAT with `actions:write` on the 8 template repos.
|
||||
# GITHUB_TOKEN can't fire dispatches across repos — needs an explicit
|
||||
# token. Stored as a repo secret; rotate per the standard schedule.
|
||||
DISPATCH_TOKEN: ${{ secrets.TEMPLATE_DISPATCH_TOKEN }}
|
||||
# Single source of truth: the publish job's output, which handles
|
||||
# tag/manual-input/auto-bump uniformly. The previous fallback
|
||||
# (`steps.version.outputs.version` from inside the cascade job)
|
||||
# was a dead reference — different job, no shared step scope.
|
||||
# Gitea PAT with write:repository scope on the 8 cascade-active
|
||||
# template repos. Used here for `git push` (NOT for an API
|
||||
# dispatch — Gitea 1.22.6 has no repository_dispatch endpoint;
|
||||
# empirically verified across 6 candidate paths in molecule-
|
||||
# core#20 issuecomment-913). The push trips each template's
|
||||
# existing `on: push: branches: [main]` trigger on
|
||||
# publish-image.yml, which then reads the updated
|
||||
# .runtime-version via its resolve-version job.
|
||||
DISPATCH_TOKEN: ${{ secrets.DISPATCH_TOKEN }}
|
||||
RUNTIME_VERSION: ${{ needs.publish.outputs.version }}
|
||||
run: |
|
||||
set +e # don't abort on a single repo failure — collect them all
|
||||
# Schedule-vs-dispatch behaviour split (hardened 2026-04-28
|
||||
# after the sweep-cf-orphans soft-skip incident — same class
|
||||
# of bug):
|
||||
#
|
||||
# The earlier "skipping cascade. templates will pick up the
|
||||
# new version on their own next rebuild" message was wrong —
|
||||
# templates only build on this dispatch trigger; without it
|
||||
# they stay pinned to whatever runtime version they last saw.
|
||||
# A silent skip here means "PyPI is current, templates are
|
||||
# not" and the gap is invisible until someone notices a
|
||||
# template still on the old version weeks later.
|
||||
#
|
||||
# - push → exit 1 (red CI surfaces the gap)
|
||||
# - workflow_dispatch → exit 0 with a warning (operator
|
||||
# ran this ad-hoc; let them rerun
|
||||
# after fixing the secret)
|
||||
|
||||
# Soft-skip on workflow_dispatch when the token is missing
|
||||
# (operator ad-hoc test); hard-fail on push so unattended
|
||||
# publishes can't silently skip the cascade. Same shape as
|
||||
# the original v1, intentional split per the schedule-vs-
|
||||
# dispatch hardening 2026-04-28.
|
||||
if [ -z "$DISPATCH_TOKEN" ]; then
|
||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||
echo "::warning::TEMPLATE_DISPATCH_TOKEN secret not set — skipping cascade."
|
||||
echo "::warning::DISPATCH_TOKEN secret not set — skipping cascade."
|
||||
echo "::warning::set it at Settings → Secrets and Variables → Actions, then rerun. Templates will stay on the prior runtime version until either this token is set or each template is rebuilt manually."
|
||||
exit 0
|
||||
fi
|
||||
echo "::error::TEMPLATE_DISPATCH_TOKEN secret missing — cascade cannot fan out."
|
||||
echo "::error::DISPATCH_TOKEN secret missing — cascade cannot fan out."
|
||||
echo "::error::PyPI was published, but the 8 template repos will NOT pick up the new version until this token is restored and a republish dispatches the cascade."
|
||||
echo "::error::set it at Settings → Secrets and Variables → Actions; then re-trigger publish-runtime via workflow_dispatch."
|
||||
exit 1
|
||||
@ -327,37 +318,119 @@ jobs:
|
||||
echo "::error::publish job did not expose a version output — cascade cannot fan out"
|
||||
exit 1
|
||||
fi
|
||||
# All 9 active workspace template repos. The PR #2536 pruning
|
||||
# ("deprecated, no shipping images") was empirically wrong:
|
||||
# continuous-synth-e2e.yml defaults to langgraph as its primary
|
||||
# canary (line 44), and every excluded template had successful
|
||||
# publish-image runs as of 2026-05-03 — none were dormant.
|
||||
# Symptom of the prune: today's a2a-sdk strict-mode fix
|
||||
# (#2566 / commit e1628c4) cascaded to 4 templates but never
|
||||
# reached langgraph, so the synth-E2E correctly canary'd a fix
|
||||
# that had landed but not deployed. Re-added the 5 templates.
|
||||
# Long-term: derive this list from manifest.json so cascade
|
||||
# scope can't drift from E2E scope — tracked in RFC #388 as a
|
||||
# Phase-1 invariant.
|
||||
|
||||
# All 9 workspace templates declared in manifest.json. The list
|
||||
# MUST stay aligned with manifest.json's workspace_templates —
|
||||
# cascade-list-drift-gate.yml enforces this in CI per the
|
||||
# codex-stuck-on-stale-runtime invariant from PR #2556.
|
||||
# Long-term goal: derive this list from manifest.json so it
|
||||
# can't drift even on a manifest edit (RFC #388 Phase-1).
|
||||
#
|
||||
# Per-template publish-image.yml presence is checked at
|
||||
# cascade-time below: codex doesn't ship one today, so the
|
||||
# cascade soft-skips it with an informational message rather
|
||||
# than dropping it from this list (which would re-introduce
|
||||
# the drift the gate exists to catch).
|
||||
GITEA_URL="${GITEA_URL:-https://git.moleculesai.app}"
|
||||
TEMPLATES="claude-code hermes openclaw codex langgraph crewai autogen deepagents gemini-cli"
|
||||
FAILED=""
|
||||
SKIPPED=""
|
||||
|
||||
# Configure git identity once. The persona owning DISPATCH_TOKEN
|
||||
# is the same identity that authored this commit on each
|
||||
# template; using a generic "publish-runtime cascade" co-author
|
||||
# trailer in the message keeps the audit trail honest about the
|
||||
# workflow-driven origin.
|
||||
git config --global user.name "publish-runtime cascade"
|
||||
git config --global user.email "publish-runtime@moleculesai.app"
|
||||
|
||||
WORKDIR="$(mktemp -d)"
|
||||
for tpl in $TEMPLATES; do
|
||||
REPO="molecule-ai/molecule-ai-workspace-template-$tpl"
|
||||
STATUS=$(curl -sS -o /tmp/dispatch.out -w "%{http_code}" \
|
||||
-X POST "https://api.github.com/repos/$REPO/dispatches" \
|
||||
-H "Authorization: Bearer $DISPATCH_TOKEN" \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
-d "{\"event_type\":\"runtime-published\",\"client_payload\":{\"runtime_version\":\"$VERSION\"}}")
|
||||
if [ "$STATUS" = "204" ]; then
|
||||
echo "✓ dispatched $tpl ($VERSION)"
|
||||
else
|
||||
echo "::warning::✗ failed to dispatch $tpl: HTTP $STATUS — $(cat /tmp/dispatch.out)"
|
||||
CLONE="$WORKDIR/$tpl"
|
||||
|
||||
# Pre-check: skip templates without a publish-image.yml.
|
||||
# The cascade's job is to trip the template's on-push
|
||||
# rebuild — if there's no rebuild workflow, pushing a
|
||||
# .runtime-version commit is just noise on the target
|
||||
# repo. Use the Gitea contents API (no clone required for
|
||||
# the probe). 200 = present; 404 = absent.
|
||||
HTTP=$(curl -sS -o /dev/null -w "%{http_code}" \
|
||||
-H "Authorization: token $DISPATCH_TOKEN" \
|
||||
"$GITEA_URL/api/v1/repos/$REPO/contents/.github/workflows/publish-image.yml")
|
||||
if [ "$HTTP" = "404" ]; then
|
||||
echo "↷ $tpl has no publish-image.yml — soft-skip (informational; manifest still tracks it)"
|
||||
SKIPPED="$SKIPPED $tpl"
|
||||
continue
|
||||
fi
|
||||
if [ "$HTTP" != "200" ]; then
|
||||
echo "::warning::$tpl publish-image.yml probe returned HTTP $HTTP — proceeding anyway, push will surface the real failure if any"
|
||||
fi
|
||||
|
||||
# Use a per-template attempt loop so a transient race (e.g.
|
||||
# human pushing to the same template at the same instant)
|
||||
# doesn't lose the cascade. Bounded retries (3) — beyond
|
||||
# that we surface the failure and let the operator retry.
|
||||
attempt=0
|
||||
success=false
|
||||
while [ $attempt -lt 3 ]; do
|
||||
attempt=$((attempt + 1))
|
||||
rm -rf "$CLONE"
|
||||
if ! git clone --depth=1 \
|
||||
"https://x-access-token:${DISPATCH_TOKEN}@${GITEA_URL#https://}/$REPO.git" \
|
||||
"$CLONE" >/tmp/clone.log 2>&1; then
|
||||
echo "::warning::clone $tpl attempt $attempt failed: $(tail -n3 /tmp/clone.log)"
|
||||
sleep 2
|
||||
continue
|
||||
fi
|
||||
|
||||
cd "$CLONE"
|
||||
echo "$VERSION" > .runtime-version
|
||||
|
||||
# Idempotency guard: if the file already matches, this
|
||||
# publish is a re-run for a version already cascaded.
|
||||
# Don't push a no-op commit (would spuriously re-trip the
|
||||
# template's on-push and rebuild for nothing).
|
||||
if git diff --quiet -- .runtime-version; then
|
||||
echo "✓ $tpl already at $VERSION — no commit needed (idempotent)"
|
||||
success=true
|
||||
cd - >/dev/null
|
||||
break
|
||||
fi
|
||||
|
||||
git add .runtime-version
|
||||
git commit -m "chore: pin runtime to $VERSION (publish-runtime cascade)" \
|
||||
-m "Co-Authored-By: publish-runtime cascade <publish-runtime@moleculesai.app>" \
|
||||
>/dev/null
|
||||
|
||||
if git push origin HEAD:main >/tmp/push.log 2>&1; then
|
||||
echo "✓ $tpl pushed $VERSION on attempt $attempt"
|
||||
success=true
|
||||
cd - >/dev/null
|
||||
break
|
||||
fi
|
||||
|
||||
# Likely a non-fast-forward — pull-rebase and retry.
|
||||
# Don't force-push: that would silently overwrite a racing
|
||||
# human/cascade commit.
|
||||
echo "::warning::push $tpl attempt $attempt failed, pull-rebasing: $(tail -n3 /tmp/push.log)"
|
||||
git pull --rebase origin main >/tmp/rebase.log 2>&1 || true
|
||||
cd - >/dev/null
|
||||
done
|
||||
|
||||
if [ "$success" != "true" ]; then
|
||||
FAILED="$FAILED $tpl"
|
||||
fi
|
||||
done
|
||||
rm -rf "$WORKDIR"
|
||||
|
||||
if [ -n "$FAILED" ]; then
|
||||
echo "::warning::Cascade incomplete. Failed templates:$FAILED"
|
||||
# Don't fail the whole job — PyPI publish already succeeded;
|
||||
# operators can retry the failed templates manually.
|
||||
echo "::error::Cascade incomplete after 3 retries each. Failed templates:$FAILED"
|
||||
echo "::error::PyPI publish succeeded; failed templates lag the new version. Re-run this workflow_dispatch with the same version to retry only the laggers (idempotent — already-cascaded templates skip)."
|
||||
exit 1
|
||||
fi
|
||||
if [ -n "$SKIPPED" ]; then
|
||||
echo "Cascade complete: pinned $VERSION on cascade-active templates. Soft-skipped (no publish-image.yml):$SKIPPED"
|
||||
else
|
||||
echo "Cascade complete: $VERSION pinned across all manifest workspace_templates."
|
||||
fi
|
||||
|
||||
@ -22,7 +22,7 @@ development workflow, conventions, and how to get your changes merged.
|
||||
|
||||
```bash
|
||||
# Clone the repo
|
||||
git clone https://github.com/Molecule-AI/molecule-core.git
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-core.git
|
||||
cd molecule-core
|
||||
|
||||
# Install git hooks
|
||||
@ -57,7 +57,7 @@ See `CLAUDE.md` for a full list of environment variables and their purposes.
|
||||
|
||||
This repo is scoped to **code** (canvas, workspace, workspace-server, related
|
||||
infra). Public content (blog posts, marketing copy, OG images, SEO briefs,
|
||||
DevRel demos) lives in [`Molecule-AI/docs`](https://github.com/Molecule-AI/docs).
|
||||
DevRel demos) lives in [`Molecule-AI/docs`](https://git.moleculesai.app/molecule-ai/docs).
|
||||
The `Block forbidden paths` CI gate fails any PR that writes to `marketing/`
|
||||
or other removed paths — open against `Molecule-AI/docs` instead.
|
||||
|
||||
@ -110,7 +110,7 @@ causing a render loop when any node position changed.
|
||||
|
||||
1. **Repo-wide:** "Automatically delete head branches" is on. Once a PR merges, the branch is deleted server-side. Any subsequent `git push` to that branch fails with `remote rejected — no such branch`.
|
||||
|
||||
2. **CI:** the `pr-guards` workflow (calling [molecule-ci `disable-auto-merge-on-push`](https://github.com/Molecule-AI/molecule-ci/blob/main/.github/workflows/disable-auto-merge-on-push.yml)) fires on every push to an open PR. If auto-merge was already enabled, it's disabled and a comment is posted. You must explicitly re-enable after verifying the new commit.
|
||||
2. **CI:** the `pr-guards` workflow (calling [molecule-ci `disable-auto-merge-on-push`](https://git.moleculesai.app/molecule-ai/molecule-ci/src/branch/main/.github/workflows/disable-auto-merge-on-push.yml)) fires on every push to an open PR. If auto-merge was already enabled, it's disabled and a comment is posted. You must explicitly re-enable after verifying the new commit.
|
||||
|
||||
**Workflow rules that follow from the guards:**
|
||||
- Push **all** commits before running `gh pr merge --auto`.
|
||||
@ -180,9 +180,9 @@ and run CI manually.
|
||||
Code in this repo lands in molecule-core. Some related runtime artifacts
|
||||
live in their own repos:
|
||||
|
||||
- [`Molecule-AI/molecule-ai-workspace-runtime`](https://github.com/Molecule-AI/molecule-ai-workspace-runtime) — Python adapter SDK (`molecule_runtime`) that runs inside containerized Molecule workspaces. Bridges Claude Code SDK / hermes / langgraph / etc. → A2A queue.
|
||||
- [`Molecule-AI/molecule-sdk-python`](https://github.com/Molecule-AI/molecule-sdk-python) — `A2AServer` + `RemoteAgentClient` for external agents that register over the public `/registry/register` flow.
|
||||
- [`Molecule-AI/molecule-mcp-claude-channel`](https://github.com/Molecule-AI/molecule-mcp-claude-channel) — Claude Code channel plugin. Bridges A2A traffic into a running Claude Code session via MCP `notifications/claude/channel`. Polling-based (no tunnel required); install with `claude --channels plugin:molecule@Molecule-AI/molecule-mcp-claude-channel`.
|
||||
- [`Molecule-AI/molecule-ai-workspace-runtime`](https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-runtime) — Python adapter SDK (`molecule_runtime`) that runs inside containerized Molecule workspaces. Bridges Claude Code SDK / hermes / langgraph / etc. → A2A queue.
|
||||
- [`Molecule-AI/molecule-sdk-python`](https://git.moleculesai.app/molecule-ai/molecule-sdk-python) — `A2AServer` + `RemoteAgentClient` for external agents that register over the public `/registry/register` flow.
|
||||
- [`Molecule-AI/molecule-mcp-claude-channel`](https://git.moleculesai.app/molecule-ai/molecule-mcp-claude-channel) — Claude Code channel plugin. Bridges A2A traffic into a running Claude Code session via MCP `notifications/claude/channel`. Polling-based (no tunnel required); install with `claude --channels plugin:molecule@Molecule-AI/molecule-mcp-claude-channel`.
|
||||
|
||||
When extending the **A2A surface** in molecule-core (`workspace-server/internal/handlers/a2a_proxy.go` etc.), consider whether the change has a downstream impact on the runtime SDK or the channel plugin — they're versioned independently but share the wire shape.
|
||||
|
||||
|
||||
75
README.md
75
README.md
@ -1,7 +1,7 @@
|
||||
<div align="center">
|
||||
|
||||
<p>
|
||||
<img src="./docs/assets/branding/molecule-icon.png" alt="Molecule AI Icon Logo" width="160" />
|
||||
<img src="./docs/assets/branding/molecule-icon.svg" alt="Molecule AI" width="160" />
|
||||
</p>
|
||||
|
||||
<p>
|
||||
@ -39,8 +39,8 @@
|
||||
<a href="./docs/agent-runtime/workspace-runtime.md"><strong>Workspace Runtime</strong></a>
|
||||
</p>
|
||||
|
||||
[](https://railway.app/new/template?template=https://github.com/Molecule-AI/molecule-monorepo)
|
||||
[](https://render.com/deploy?repo=https://github.com/Molecule-AI/molecule-monorepo)
|
||||
[](https://railway.app/new/template?template=https://git.moleculesai.app/molecule-ai/molecule-core)
|
||||
[](https://render.com/deploy?repo=https://git.moleculesai.app/molecule-ai/molecule-core)
|
||||
|
||||
</div>
|
||||
|
||||
@ -53,8 +53,8 @@ Molecule AI is the most powerful way to govern an AI agent organization in produ
|
||||
It combines the parts that are usually scattered across demos, internal glue code, and framework-specific tooling into one product:
|
||||
|
||||
- one org-native control plane for teams, roles, hierarchy, and lifecycle
|
||||
- one runtime layer that lets LangGraph, DeepAgents, Claude Code, CrewAI, AutoGen, and OpenClaw run side by side
|
||||
- one memory model that keeps recall, sharing, and skill evolution aligned with organizational boundaries
|
||||
- one runtime layer that lets **eight** agent runtimes — LangGraph, DeepAgents, Claude Code, CrewAI, AutoGen, **Hermes**, **Gemini CLI**, and OpenClaw — run side by side behind one workspace contract
|
||||
- one memory model that keeps recall, sharing, and skill evolution aligned with organizational boundaries (Memory v2 backed by pgvector for semantic recall)
|
||||
- one operational surface for observing, pausing, restarting, inspecting, and improving live workspaces
|
||||
|
||||
Most teams can build a workflow, a strong single agent, a coding agent, or a custom multi-agent graph.
|
||||
@ -75,7 +75,7 @@ You do not wire collaboration paths by hand. Hierarchy defines the default commu
|
||||
|
||||
### 3. Runtime choice stops being a dead-end decision
|
||||
|
||||
LangGraph, DeepAgents, Claude Code, CrewAI, AutoGen, and OpenClaw can all plug into the same workspace abstraction. Teams can standardize governance without forcing every group onto one runtime.
|
||||
LangGraph, DeepAgents, Claude Code, CrewAI, AutoGen, Hermes, Gemini CLI, and OpenClaw can all plug into the same workspace abstraction. Teams can standardize governance without forcing every group onto one runtime.
|
||||
|
||||
### 4. Memory is treated like infrastructure
|
||||
|
||||
@ -117,6 +117,8 @@ Molecule AI is not trying to replace the frameworks below. It is the system that
|
||||
| **Claude Code** | Shipping on `main` | Real coding workflows, CLI-native continuity | Secure workspace abstraction, A2A delegation, org boundaries, shared control plane |
|
||||
| **CrewAI** | Shipping on `main` | Role-based crews | Persistent workspace identity, policy consistency, shared canvas and registry |
|
||||
| **AutoGen** | Shipping on `main` | Assistant/tool orchestration | Standardized deployment, hierarchy-aware collaboration, shared ops plane |
|
||||
| **Hermes 4** | Shipping on `main` | Hybrid reasoning, native tools, json_schema (NousResearch/hermes-agent) | Option B upstream hook, A2A bridge to OpenAI-compat API, multi-provider provider derivation |
|
||||
| **Gemini CLI** | Shipping on `main` | Google Gemini CLI continuity | Workspace lifecycle, A2A, hierarchy-aware collaboration, shared ops plane |
|
||||
| **OpenClaw** | Shipping on `main` | CLI-native runtime with its own session model | Workspace lifecycle, templates, activity logs, topology-aware collaboration |
|
||||
| **NemoClaw** | WIP on `feat/nemoclaw-t4-docker` | NVIDIA-oriented runtime path | Planned to join the same abstraction once merged; not yet part of `main` |
|
||||
|
||||
@ -182,9 +184,10 @@ The result is not just “an agent that learns.” It is **an organization that
|
||||
|
||||
## What Ships In `main`
|
||||
|
||||
### Canvas
|
||||
### Canvas (v4)
|
||||
|
||||
- Next.js 15 + React Flow + Zustand
|
||||
- **warm-paper theme system** — light / dark / follow-system, SSR cookie + nonce'd boot script + ThemeProvider; terminal + code surfaces stay dark unconditionally
|
||||
- drag-to-nest team building
|
||||
- empty-state deployment + onboarding wizard
|
||||
- template palette
|
||||
@ -193,8 +196,9 @@ The result is not just “an agent that learns.” It is **an organization that
|
||||
|
||||
### Platform
|
||||
|
||||
- Go/Gin control plane
|
||||
- workspace CRUD and provisioning
|
||||
- Go 1.25 / Gin control plane (80+ HTTP endpoints + Gorilla WebSocket fanout)
|
||||
- workspace CRUD and provisioning (pluggable Provisioner — Docker locally, EC2 + SSM in production)
|
||||
- **A2A response path is a typed discriminated union (RFC #2967)** — frozen dataclasses + total parser; 100% unit + adversarial fuzz coverage
|
||||
- registry and heartbeats
|
||||
- browser-safe A2A proxy
|
||||
- team expansion/collapse
|
||||
@ -204,10 +208,10 @@ The result is not just “an agent that learns.” It is **an organization that
|
||||
|
||||
### Runtime
|
||||
|
||||
- unified `workspace/` image
|
||||
- adapter-driven execution
|
||||
- unified `workspace/` image; thin AMI in production (us-east-2)
|
||||
- adapter-driven execution across **8 runtimes** (Claude Code, Hermes, Gemini CLI, LangGraph, DeepAgents, CrewAI, AutoGen, OpenClaw)
|
||||
- Agent Card registration
|
||||
- awareness-backed memory integration
|
||||
- awareness-backed memory integration; **Memory v2 backed by pgvector** for semantic recall
|
||||
- plugin-mounted shared rules/skills
|
||||
- hot-reloadable local skills
|
||||
- coordinator-only delegation path
|
||||
@ -221,6 +225,21 @@ The result is not just “an agent that learns.” It is **an organization that
|
||||
- runtime tiers
|
||||
- direct workspace inspection through terminal and files
|
||||
|
||||
### SaaS (via [`molecule-controlplane`](https://git.moleculesai.app/molecule-ai/molecule-controlplane))
|
||||
|
||||
- multi-tenant on AWS EC2 + Neon (per-tenant Postgres branch) + Cloudflare Tunnels (per-tenant, no public ports)
|
||||
- WorkOS AuthKit + Stripe Checkout + Customer Portal
|
||||
- AWS KMS envelope encryption (DB / Redis connection strings); AWS Secrets Manager for tenant bootstrap
|
||||
- `tenant_resources` audit table + 30-min boot-event-aware reconciler — every CF / AWS lifecycle event recorded, claim vs live state diffed
|
||||
|
||||
### Bring your own Claude Code session (via [`molecule-mcp-claude-channel`](https://git.moleculesai.app/molecule-ai/molecule-mcp-claude-channel))
|
||||
|
||||
- Claude Code plugin that bridges Molecule A2A traffic into a local Claude Code session via MCP
|
||||
- subscribe to one or more workspaces; peer messages surface as conversation turns; replies route back through Molecule's A2A
|
||||
- no tunnel, no public endpoint — the plugin self-registers each watched workspace as `delivery_mode=poll` and long-polls `/activity?since_id=…`
|
||||
- multi-tenant friendly: one plugin install can watch workspaces across multiple Molecule tenants (`MOLECULE_PLATFORM_URLS` per-workspace)
|
||||
- install via the standard marketplace flow: `/plugin marketplace add Molecule-AI/molecule-mcp-claude-channel` → `/plugin install molecule-channel@molecule-mcp-claude-channel`
|
||||
|
||||
## Built For Teams That Need More Than A Demo
|
||||
|
||||
Molecule AI is especially strong when you need to run:
|
||||
@ -233,24 +252,30 @@ Molecule AI is especially strong when you need to run:
|
||||
## Architecture
|
||||
|
||||
```text
|
||||
Canvas (Next.js :3000) <--HTTP / WS--> Platform (Go :8080) <---> Postgres + Redis
|
||||
| |
|
||||
| +--> Docker provisioner / bundles / templates / secrets
|
||||
Canvas (Next.js 15, warm-paper :3000) <--HTTP / WS--> Platform (Go 1.25 :8080) <---> Postgres + Redis
|
||||
| |
|
||||
| +--> Provisioner: Docker (local) / EC2 + SSM (prod)
|
||||
| +--> bundles · templates · secrets · KMS
|
||||
|
|
||||
+-------------------- shows --------------------> workspaces, teams, tasks, traces, events
|
||||
+------------------------- shows ------------------------> workspaces, teams, tasks, traces, events
|
||||
|
||||
Workspace Runtime (Python image with adapters)
|
||||
- LangGraph / DeepAgents / Claude Code / CrewAI / AutoGen / OpenClaw
|
||||
- Agent Card + A2A server
|
||||
- heartbeat + activity + awareness-backed memory
|
||||
Workspace Runtime (Python ≥3.11, image with adapters)
|
||||
- 8 adapters: LangGraph / DeepAgents / Claude Code / CrewAI / AutoGen / Hermes / Gemini CLI / OpenClaw
|
||||
- Agent Card + A2A server (typed-SSOT response path, RFC #2967)
|
||||
- heartbeat + activity + awareness-backed memory (Memory v2 — pgvector semantic recall)
|
||||
- skills + plugins + hot reload
|
||||
|
||||
SaaS Control Plane (molecule-controlplane, private)
|
||||
- per-tenant EC2 + Neon (Postgres branch) + Cloudflare Tunnel
|
||||
- WorkOS · Stripe · KMS · AWS Secrets Manager
|
||||
- tenant_resources audit + 30-min reconciler
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
git clone https://github.com/Molecule-AI/molecule-monorepo.git
|
||||
cd molecule-monorepo
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-core.git
|
||||
cd molecule-core
|
||||
|
||||
cp .env.example .env
|
||||
# Defaults boot the stack locally out of the box. See .env.example for
|
||||
@ -303,7 +328,11 @@ Then open `http://localhost:3000`:
|
||||
|
||||
## Current Scope
|
||||
|
||||
The current `main` branch already includes the core platform, canvas, memory model, six production adapters, skill lifecycle, and operational surfaces. Adjacent runtime work such as **NemoClaw** remains branch-level until merged, and this README keeps that distinction explicit on purpose.
|
||||
The current `main` branch ships the core platform, Canvas v4 (warm-paper themed), Memory v2 (pgvector semantic recall), the typed-SSOT A2A response path (RFC #2967), **eight production adapters** (Claude Code, Hermes, Gemini CLI, LangGraph, DeepAgents, CrewAI, AutoGen, OpenClaw), skill lifecycle, and operational surfaces.
|
||||
|
||||
The companion private repo [`molecule-controlplane`](https://git.moleculesai.app/molecule-ai/molecule-controlplane) provides the SaaS surface — multi-tenant orchestration on EC2 + Neon + Cloudflare Tunnels, KMS envelope encryption, WorkOS auth, Stripe billing, and a `tenant_resources` audit table with a 30-min reconciler.
|
||||
|
||||
Adjacent runtime work such as **NemoClaw** remains branch-level until merged, and this README keeps that distinction explicit on purpose.
|
||||
|
||||
## License
|
||||
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
<div align="center">
|
||||
|
||||
<p>
|
||||
<img src="./docs/assets/branding/molecule-icon.png" alt="Molecule AI 图案 Logo" width="160" />
|
||||
<img src="./docs/assets/branding/molecule-icon.svg" alt="Molecule AI" width="160" />
|
||||
</p>
|
||||
|
||||
<p>
|
||||
@ -38,8 +38,8 @@
|
||||
<a href="./docs/agent-runtime/workspace-runtime.md"><strong>Workspace Runtime</strong></a>
|
||||
</p>
|
||||
|
||||
[](https://railway.app/new/template?template=https://github.com/Molecule-AI/molecule-core)
|
||||
[](https://render.com/deploy?repo=https://github.com/Molecule-AI/molecule-core)
|
||||
[](https://railway.app/new/template?template=https://git.moleculesai.app/molecule-ai/molecule-core)
|
||||
[](https://render.com/deploy?repo=https://git.moleculesai.app/molecule-ai/molecule-core)
|
||||
|
||||
</div>
|
||||
|
||||
@ -52,8 +52,8 @@ Molecule AI 是目前最强的 AI Agent 组织治理方案之一,用来把 age
|
||||
它把过去分散在 demo、内部胶水代码和各类 framework 私有工具里的关键能力,收敛成一个产品:
|
||||
|
||||
- 一套组织原生 control plane,管理团队、角色、层级和生命周期
|
||||
- 一套 runtime abstraction,让 LangGraph、DeepAgents、Claude Code、CrewAI、AutoGen、OpenClaw 并存运行
|
||||
- 一套与组织边界对齐的 memory 模型,把 recall、sharing 和 skill evolution 放进同一体系
|
||||
- 一套 runtime abstraction,让 **8 个** agent runtime —— LangGraph、DeepAgents、Claude Code、CrewAI、AutoGen、**Hermes**、**Gemini CLI**、OpenClaw —— 共用一套 workspace 契约
|
||||
- 一套与组织边界对齐的 memory 模型,把 recall、sharing 和 skill evolution 放进同一体系(Memory v2 由 pgvector 支撑语义召回)
|
||||
- 一套面向线上 workspace 的运维面,统一完成观测、暂停、重启、检查和持续改进
|
||||
|
||||
今天很多团队能做好 workflow、单 agent、coding agent,或者自定义 multi-agent graph 中的一种。
|
||||
@ -74,7 +74,7 @@ Molecule AI 填的就是这个空白。
|
||||
|
||||
### 3. Runtime 选择不再是死路
|
||||
|
||||
LangGraph、DeepAgents、Claude Code、CrewAI、AutoGen、OpenClaw 都可以挂到同一个 workspace abstraction 下。团队可以统一治理方式,而不必统一到底层 runtime。
|
||||
LangGraph、DeepAgents、Claude Code、CrewAI、AutoGen、Hermes、Gemini CLI、OpenClaw 都可以挂到同一个 workspace abstraction 下。团队可以统一治理方式,而不必统一到底层 runtime。
|
||||
|
||||
### 4. Memory 被当成基础设施来做
|
||||
|
||||
@ -116,6 +116,8 @@ Molecule AI 并不是要替代下面这些 framework,而是把它们纳入更
|
||||
| **Claude Code** | `main` 已支持 | 真实编码工作流、CLI-native continuity | 安全 workspace 抽象、A2A delegation、组织边界、共享 control plane |
|
||||
| **CrewAI** | `main` 已支持 | 角色型 crew 模式清晰 | 持久 workspace 身份、统一策略、共享 Canvas 和 registry |
|
||||
| **AutoGen** | `main` 已支持 | assistant/tool orchestration | 统一部署、层级协作、共享运维平面 |
|
||||
| **Hermes 4** | `main` 已支持 | 混合推理、原生工具调用、json_schema 输出(NousResearch/hermes-agent) | Option B 上游 hook、A2A 桥接 OpenAI 兼容 API、多 provider 自动派生 |
|
||||
| **Gemini CLI** | `main` 已支持 | Google Gemini CLI 持续会话 | workspace 生命周期、A2A、层级感知协作、共享运维平面 |
|
||||
| **OpenClaw** | `main` 已支持 | CLI-native runtime,自有 session 模型 | workspace 生命周期、templates、activity logs、拓扑感知协作 |
|
||||
| **NemoClaw** | `feat/nemoclaw-t4-docker` 分支 WIP | NVIDIA 方向 runtime 路线 | 计划并入同一抽象层,但当前还不是 `main` 已合并能力 |
|
||||
|
||||
@ -181,9 +183,10 @@ Molecule AI 并不是要替代下面这些 framework,而是把它们纳入更
|
||||
|
||||
## `main` 分支已经具备什么
|
||||
|
||||
### Canvas
|
||||
### Canvas(v4)
|
||||
|
||||
- Next.js 15 + React Flow + Zustand
|
||||
- **warm-paper 主题系统** —— light / dark / 跟随系统;SSR cookie + nonce'd boot 脚本 + ThemeProvider;终端与代码面板始终保持深色
|
||||
- drag-to-nest 团队构建
|
||||
- empty state + onboarding wizard
|
||||
- template palette
|
||||
@ -192,8 +195,9 @@ Molecule AI 并不是要替代下面这些 framework,而是把它们纳入更
|
||||
|
||||
### Platform
|
||||
|
||||
- Go/Gin control plane
|
||||
- workspace CRUD 和 provisioning
|
||||
- Go 1.25 / Gin control plane(80+ HTTP 端点 + Gorilla WebSocket fanout)
|
||||
- workspace CRUD 和 provisioning(可插拔 Provisioner —— 本地 Docker、生产 EC2 + SSM)
|
||||
- **A2A 响应路径已收敛为类型化的判别联合(RFC #2967)** —— 冻结 dataclass + 全量 parser;100% 单元测试 + 对抗性 fuzz 覆盖
|
||||
- registry 与 heartbeat
|
||||
- 浏览器安全的 A2A proxy
|
||||
- team expansion/collapse
|
||||
@ -203,10 +207,10 @@ Molecule AI 并不是要替代下面这些 framework,而是把它们纳入更
|
||||
|
||||
### Runtime
|
||||
|
||||
- 统一 `workspace/` 镜像
|
||||
- adapter 驱动执行
|
||||
- 统一 `workspace/` 镜像;生产环境采用 thin AMI(us-east-2)
|
||||
- adapter 驱动执行,覆盖 **8 个 runtime**(Claude Code、Hermes、Gemini CLI、LangGraph、DeepAgents、CrewAI、AutoGen、OpenClaw)
|
||||
- Agent Card 注册
|
||||
- awareness-backed memory
|
||||
- awareness-backed memory;**Memory v2 由 pgvector 支撑**语义召回
|
||||
- plugin 挂载共享 rules/skills
|
||||
- 本地 skills 热加载
|
||||
- coordinator-only delegation 路径
|
||||
@ -220,6 +224,21 @@ Molecule AI 并不是要替代下面这些 framework,而是把它们纳入更
|
||||
- runtime tiers
|
||||
- 终端与文件层面的 workspace 直接排障
|
||||
|
||||
### SaaS(由 [`molecule-controlplane`](https://git.moleculesai.app/molecule-ai/molecule-controlplane) 提供)
|
||||
|
||||
- 多租户运行在 AWS EC2 + Neon(每租户一个 Postgres branch)+ Cloudflare Tunnels(每租户一条隧道,对外不开任何端口)
|
||||
- WorkOS AuthKit + Stripe Checkout + Customer Portal
|
||||
- AWS KMS 信封加密(DB / Redis 连接串);AWS Secrets Manager 负责租户 bootstrap
|
||||
- `tenant_resources` 审计表 + 30 分钟 boot-event-aware reconciler —— 每个 CF / AWS lifecycle 事件都有记录,每 30 分钟比对 claim 与实际状态
|
||||
|
||||
### 在 Claude Code 里直接接入(由 [`molecule-mcp-claude-channel`](https://git.moleculesai.app/molecule-ai/molecule-mcp-claude-channel) 提供)
|
||||
|
||||
- 把 Molecule A2A 流量桥接到本地 Claude Code 会话的 MCP 插件
|
||||
- 订阅一个或多个 workspace;peer 的消息会以 user-turn 出现,回复会经 Molecule A2A 路由出去
|
||||
- 无需公网隧道、无需公开端点 —— 插件启动时自动把每个 watched workspace 注册成 `delivery_mode=poll`,长轮询 `/activity?since_id=…`
|
||||
- 多租户友好:单次安装即可同时 watch 跨多个 Molecule 租户的 workspace(`MOLECULE_PLATFORM_URLS` 按 workspace 配置)
|
||||
- 通过标准 marketplace 流程安装:`/plugin marketplace add Molecule-AI/molecule-mcp-claude-channel` → `/plugin install molecule-channel@molecule-mcp-claude-channel`
|
||||
|
||||
## 适合什么团队
|
||||
|
||||
Molecule AI 特别适合下面这些场景:
|
||||
@ -232,23 +251,29 @@ Molecule AI 特别适合下面这些场景:
|
||||
## 架构总览
|
||||
|
||||
```text
|
||||
Canvas (Next.js :3000) <--HTTP / WS--> Platform (Go :8080) <---> Postgres + Redis
|
||||
| |
|
||||
| +--> Docker provisioner / bundles / templates / secrets
|
||||
Canvas (Next.js 15, warm-paper :3000) <--HTTP / WS--> Platform (Go 1.25 :8080) <---> Postgres + Redis
|
||||
| |
|
||||
| +--> Provisioner: Docker (本地) / EC2 + SSM (生产)
|
||||
| +--> bundles · templates · secrets · KMS
|
||||
|
|
||||
+-------------------- 展示 --------------------> workspaces, teams, tasks, traces, events
|
||||
+------------------------- 展示 ------------------------> workspaces, teams, tasks, traces, events
|
||||
|
||||
Workspace Runtime (Python image with adapters)
|
||||
- LangGraph / DeepAgents / Claude Code / CrewAI / AutoGen / OpenClaw
|
||||
- Agent Card + A2A server
|
||||
- heartbeat + activity + awareness-backed memory
|
||||
Workspace Runtime (Python ≥3.11,含 adapter 集合的镜像)
|
||||
- 8 adapters: LangGraph / DeepAgents / Claude Code / CrewAI / AutoGen / Hermes / Gemini CLI / OpenClaw
|
||||
- Agent Card + A2A server (typed-SSOT response path, RFC #2967)
|
||||
- heartbeat + activity + awareness-backed memory (Memory v2: pgvector semantic recall)
|
||||
- skills + plugins + hot reload
|
||||
|
||||
SaaS Control Plane (molecule-controlplane, private)
|
||||
- per-tenant EC2 + Neon (Postgres branch) + Cloudflare Tunnel
|
||||
- WorkOS · Stripe · KMS · AWS Secrets Manager
|
||||
- tenant_resources audit + 30-minute reconciler
|
||||
```
|
||||
|
||||
## Quick start
|
||||
|
||||
```bash
|
||||
git clone https://github.com/Molecule-AI/molecule-core.git
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-core.git
|
||||
cd molecule-core
|
||||
|
||||
cp .env.example .env
|
||||
@ -296,7 +321,11 @@ npm run dev
|
||||
|
||||
## Notes on current scope
|
||||
|
||||
The current `main` already includes the core platform, Canvas, the memory model, 6 official adapters, the skill lifecycle, and the main operational surfaces. Adjacent runtime tracks such as **NemoClaw** are still branch-level work; they only join the officially supported list once merged, and that distinction is made explicit here.
|
||||
The current `main` already includes the core platform, Canvas v4 (warm-paper theme), Memory v2 (pgvector semantic recall), the typed-SSOT A2A response path (RFC #2967), **8 official adapters** (Claude Code, Hermes, Gemini CLI, LangGraph, DeepAgents, CrewAI, AutoGen, OpenClaw), the skill lifecycle, and the main operational surfaces.
|
||||
|
||||
The companion private repo [`molecule-controlplane`](https://git.moleculesai.app/molecule-ai/molecule-controlplane) provides the SaaS layer: multi-tenant orchestration (EC2 + Neon + Cloudflare Tunnels), KMS envelope encryption, WorkOS auth, Stripe billing, and the `tenant_resources` audit table plus the 30-minute reconciler.
|
||||
|
||||
Adjacent runtime tracks such as **NemoClaw** remain branch-level work; they only join the officially supported list once merged, and that distinction is made explicit here.
|
||||
|
||||
## License
|
||||
|
||||
|
||||
@ -41,7 +41,7 @@ export default function PricingPage() {
|
||||
<p className="mt-2 text-ink-mid">
|
||||
We publish the{" "}
|
||||
<a
|
||||
href="https://github.com/Molecule-AI/molecule-monorepo"
|
||||
href="https://git.moleculesai.app/molecule-ai/molecule-monorepo"
|
||||
className="text-accent underline hover:text-accent"
|
||||
>
|
||||
full source on GitHub
|
||||
|
||||
@ -13,7 +13,6 @@ import { AttachmentPreview } from "./chat/AttachmentPreview";
|
||||
import { extractFilesFromTask } from "./chat/message-parser";
|
||||
import { AgentCommsPanel } from "./chat/AgentCommsPanel";
|
||||
import { appendActivityLine } from "./chat/activityLog";
|
||||
import { activityRowToMessages, type ActivityRowForHydration } from "./chat/historyHydration";
|
||||
import { runtimeDisplayName } from "@/lib/runtime-names";
|
||||
import { ConfirmDialog } from "@/components/ConfirmDialog";
|
||||
|
||||
@ -50,38 +49,12 @@ interface A2AResponse {
|
||||
};
|
||||
}
|
||||
|
||||
/** Detect activity-log rows that the workspace's own runtime fired
|
||||
* against itself but were misclassified as canvas-source. The proper
|
||||
* fix is the X-Workspace-ID header from `self_source_headers()` in
|
||||
* workspace/platform_auth.py, which makes the platform record
|
||||
* source_id = workspace_id. But three failure modes still leak a
|
||||
* self-message into "My Chat":
|
||||
*
|
||||
* 1. Historical rows already in the DB with source_id=NULL.
|
||||
* 2. Workspace containers running pre-fix heartbeat.py / main.py
|
||||
* (the fix only takes effect after an image rebuild + redeploy).
|
||||
* 3. Future internal triggers added without the helper.
|
||||
*
|
||||
* This client-side filter recognises the heartbeat trigger by its
|
||||
* exact prefix — the heartbeat assembles
|
||||
*
|
||||
* "Delegation results are ready. Review them and take appropriate
|
||||
* action:\n" + summary_lines + report_instruction
|
||||
*
|
||||
* in workspace/heartbeat.py. The prefix is template-fixed so a
|
||||
* string match is reliable. If the heartbeat copy ever changes,
|
||||
* update this constant in the same commit.
|
||||
*
|
||||
* This is a backstop, not the primary defence — the X-Workspace-ID
|
||||
* header is. Filtering content is fragile to copy edits, so keep
|
||||
* the list narrow. */
|
||||
const INTERNAL_SELF_MESSAGE_PREFIXES = [
|
||||
"Delegation results are ready. Review them and take appropriate action",
|
||||
];
|
||||
|
||||
function isInternalSelfMessage(text: string): boolean {
|
||||
return INTERNAL_SELF_MESSAGE_PREFIXES.some((p) => text.startsWith(p));
|
||||
}
|
||||
// Internal-self-message filtering moved server-side in RFC #2945
|
||||
// PR-C/D — the platform's /chat-history endpoint applies the
|
||||
// IsInternalSelfMessage predicate before returning rows, so the
|
||||
// client no longer needs the local backstop on the history path.
|
||||
// The proper fix is still X-Workspace-ID header (source_id=workspace_id);
|
||||
// the platform-side prefix filter handles the residual cases.
|
||||
|
||||
// extractReplyText pulls the agent's text reply out of an A2A response.
|
||||
// Concatenates ALL text parts (joined with "\n") rather than returning
|
||||
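The hunk cuts the function body off here. As a non-authoritative sketch of the join-all-text-parts behaviour the comment describes (the part shape is inferred from the nearby test fixtures, not confirmed):

```ts
// Sketch only; the real extractReplyText lives in ChatTab.tsx below this
// comment. The part shape ({ kind: "text", text }) is an assumption.
interface A2ATextPart { kind: "text"; text: string }

function extractReplyTextSketch(parts: Array<A2ATextPart | { kind: string }>): string {
  return parts
    .filter((p): p is A2ATextPart => p.kind === "text")
    .map((p) => p.text)
    .join("\n");
}
```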
@ -134,8 +107,19 @@ const INITIAL_HISTORY_LIMIT = 10;
|
||||
const OLDER_HISTORY_BATCH = 20;
|
||||
|
||||
/**
|
||||
* Load chat history from the activity_logs database via the platform API.
|
||||
* Uses source=canvas to only get user-initiated messages (not agent-to-agent).
|
||||
* Load chat history from the platform's typed /chat-history endpoint.
|
||||
*
|
||||
* Server-side rendering of activity_logs rows into ChatMessage shape
|
||||
* lives in workspace-server/internal/messagestore/postgres_store.go
|
||||
* (RFC #2945 PR-C/D). The server already applies the canvas-source
|
||||
* filter, the internal-self-message predicate, the role decision
|
||||
* (status=error vs agent-error prefix → system), and the v0/v1
|
||||
* file-shape extraction. Canvas just renders what it receives.
|
||||
*
|
||||
* Wire shape (mirrors ChatMessage exactly, no per-row mapping needed):
|
||||
*
|
||||
* GET /workspaces/:id/chat-history?limit=N&before_ts=T
|
||||
* 200 → {"messages": ChatMessage[], "reached_end": boolean}
|
||||
*
|
||||
* Pagination:
|
||||
* - Pass `limit` to bound the page size (newest-first from server).
|
||||
@ -143,10 +127,10 @@ const OLDER_HISTORY_BATCH = 20;
|
||||
* timestamp. Combined with limit, this yields the next-older page
|
||||
* when scrolling backward through history.
|
||||
*
|
||||
* `reachedEnd` is true when the server returned fewer rows than asked
|
||||
* for — caller uses this to disable further older-batch fetches.
|
||||
* (Counts row-level returns, not chat-bubble count: each row may
|
||||
* produce 1-2 bubbles.)
|
||||
* `reachedEnd` is propagated from the server. The server computes it
|
||||
* by comparing rowCount vs limit so a partial last page is correctly
|
||||
* detected even when the row→bubble fan-out is non-1:1 (each row
|
||||
* produces 1-2 bubbles).
|
||||
*/
|
||||
async function loadMessagesFromDB(
|
||||
workspaceId: string,
|
||||
@ -154,25 +138,23 @@ async function loadMessagesFromDB(
|
||||
beforeTs?: string,
|
||||
): Promise<{ messages: ChatMessage[]; error: string | null; reachedEnd: boolean }> {
|
||||
try {
|
||||
const params = new URLSearchParams({
|
||||
type: "a2a_receive",
|
||||
source: "canvas",
|
||||
limit: String(limit),
|
||||
});
|
||||
const params = new URLSearchParams({ limit: String(limit) });
|
||||
if (beforeTs) params.set("before_ts", beforeTs);
|
||||
const activities = await api.get<ActivityRowForHydration[]>(
|
||||
`/workspaces/${workspaceId}/activity?${params.toString()}`,
|
||||
const resp = await api.get<{ messages: ChatMessage[]; reached_end: boolean }>(
|
||||
`/workspaces/${workspaceId}/chat-history?${params.toString()}`,
|
||||
);
|
||||
|
||||
const messages: ChatMessage[] = [];
|
||||
// Activities are newest-first, reverse for chronological order.
|
||||
// Per-row mapping lives in chat/historyHydration.ts so it can be
|
||||
// unit-tested without spinning up the full ChatTab component
|
||||
// (regression cover for the timestamp-collapse bug).
|
||||
for (const a of [...activities].reverse()) {
|
||||
messages.push(...activityRowToMessages(a, isInternalSelfMessage));
|
||||
}
|
||||
return { messages, error: null, reachedEnd: activities.length < limit };
|
||||
// Server emits oldest-first within the page (RFC #2945 PR-C-2
|
||||
// post-fix: server reverses row-aware before returning so the
|
||||
// wire is display-ready). Canvas appends/prepends without
|
||||
// reordering — this avoids the pair-flip bug a naive flat
|
||||
// reverse causes when each row produces a (user, agent) pair
|
||||
// with the same timestamp.
|
||||
return {
|
||||
messages: resp.messages ?? [],
|
||||
error: null,
|
||||
reachedEnd: resp.reached_end,
|
||||
};
|
||||
} catch (err) {
|
||||
return {
|
||||
messages: [],
|
||||
|
||||
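For reference, a caller walking backward through the /chat-history contract documented above would look roughly like the sketch below; it reuses loadMessagesFromDB and the limit constants defined earlier in this file, and the helper name is illustrative.

```ts
// Illustrative backward-paging loop. The cursor is always the timestamp of
// the oldest message currently held, and reachedEnd stops the walk.
async function loadAllHistory(workspaceId: string): Promise<ChatMessage[]> {
  let page = await loadMessagesFromDB(workspaceId, INITIAL_HISTORY_LIMIT);
  const all: ChatMessage[] = [...page.messages];
  while (!page.reachedEnd && all.length > 0) {
    const cursor = all[0].timestamp; // pages arrive oldest-first
    page = await loadMessagesFromDB(workspaceId, OLDER_HISTORY_BATCH, cursor);
    all.unshift(...page.messages);
  }
  return all;
}
```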
@ -21,20 +21,39 @@ interface Props {
|
||||
// --- Agent Card Section ---
|
||||
|
||||
function AgentCardSection({ workspaceId }: { workspaceId: string }) {
|
||||
const [card, setCard] = useState<Record<string, unknown> | null>(null);
|
||||
const [loading, setLoading] = useState(true);
|
||||
// Initial card value comes from the canvas store — node.data.agentCard
|
||||
// is hydrated by the platform stream when the workspace appears in the
|
||||
// graph, so reading it here avoids a duplicate `GET /workspaces/${id}`
|
||||
// (the parent ConfigTab.loadConfig already fetches workspace metadata,
|
||||
// and refetching here adds a serialised RTT to the panel-open path —
|
||||
// which contributed to the ~20s detail-panel load reported in core#11).
|
||||
// Local state still tracks the edited/saved value so the editor flow
|
||||
// is unchanged.
|
||||
const storeCard = useCanvasStore((s) => {
|
||||
// Defensive against test mocks that omit `nodes` (some test files
|
||||
// stub the store with a minimal shape). In production `nodes` is
|
||||
// always an array — empty or not — so the optional chaining only
|
||||
// matters for the test path.
|
||||
const node = s.nodes?.find?.((n) => n.id === workspaceId);
|
||||
return (node?.data.agentCard as
|
||||
| Record<string, unknown>
|
||||
| null
|
||||
| undefined) ?? null;
|
||||
});
|
||||
const [card, setCard] = useState<Record<string, unknown> | null>(storeCard);
|
||||
const [editing, setEditing] = useState(false);
|
||||
const [draft, setDraft] = useState("");
|
||||
const [saving, setSaving] = useState(false);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
const [success, setSuccess] = useState(false);
|
||||
|
||||
// If the store updates while this section is mounted (another tab
|
||||
// pushed an update via the platform event stream), reflect that —
|
||||
// unless the user is mid-edit, in which case we don't clobber their
|
||||
// unsaved draft.
|
||||
useEffect(() => {
|
||||
api.get<Record<string, unknown>>(`/workspaces/${workspaceId}`)
|
||||
.then((ws) => setCard((ws.agent_card as Record<string, unknown>) || null))
|
||||
.catch(() => {})
|
||||
.finally(() => setLoading(false));
|
||||
}, [workspaceId]);
|
||||
if (!editing) setCard(storeCard);
|
||||
}, [storeCard, editing]);
|
||||
|
||||
const handleSave = async () => {
|
||||
setError(null);
|
||||
@ -53,9 +72,7 @@ function AgentCardSection({ workspaceId }: { workspaceId: string }) {
|
||||
|
||||
return (
|
||||
<Section title="Agent Card" defaultOpen={false}>
|
||||
{loading ? (
|
||||
<div className="text-[10px] text-ink-soft">Loading...</div>
|
||||
) : editing ? (
|
||||
{editing ? (
|
||||
<div className="space-y-2">
|
||||
<textarea
|
||||
aria-label="Agent card JSON editor"
|
||||
@ -221,47 +238,51 @@ export function ConfigTab({ workspaceId }: Props) {
|
||||
setLoading(true);
|
||||
setError(null);
|
||||
|
||||
// ALWAYS load workspace metadata first (runtime + model). These are the
|
||||
// source of truth regardless of whether the runtime uses our config.yaml
|
||||
// template. Without this the form falls back to empty/default values on
|
||||
// a hermes workspace (which doesn't use our template), creating the
|
||||
// appearance that the saved runtime is unset — and worse, clicking Save
|
||||
// would silently flip `runtime` from `hermes` back to the dropdown
|
||||
// default `LangGraph`. See GH #1894.
|
||||
let wsMetadataRuntime = "";
|
||||
let wsMetadataModel = "";
|
||||
let wsMetadataTier: number | null = null;
|
||||
try {
|
||||
const ws = await api.get<{ runtime?: string; tier?: number }>(`/workspaces/${workspaceId}`);
|
||||
wsMetadataRuntime = (ws.runtime || "").trim();
|
||||
if (typeof ws.tier === "number") wsMetadataTier = ws.tier;
|
||||
} catch { /* fall back to config.yaml */ }
|
||||
try {
|
||||
const m = await api.get<{ model?: string }>(`/workspaces/${workspaceId}/model`);
|
||||
wsMetadataModel = (m.model || "").trim();
|
||||
} catch { /* non-fatal */ }
|
||||
// Load workspace metadata (runtime + model + provider) in parallel.
|
||||
// These are independent GETs against three workspace-server endpoints
|
||||
// and used to be awaited serially — for SaaS workspaces each call
|
||||
// round-trips through an EIC SSH tunnel, so the previous serial
|
||||
// pattern stacked 3-5s of tunnel-setup latency per call (core#11).
|
||||
// Promise.all overlaps them; the per-call cost stays the same but
|
||||
// wall time drops to max() instead of sum().
|
||||
//
|
||||
// Each leg has its own .catch handler that yields a sentinel value,
|
||||
// matching the previous semantics:
|
||||
// - /workspaces/${id}: required source-of-truth for runtime+tier;
|
||||
// fall back to YAML if the GET fails (rare, network-class only).
|
||||
// - /workspaces/${id}/model: non-fatal; empty model lets the form
|
||||
// fall through to YAML runtime_config.model.
|
||||
// - /workspaces/${id}/provider: non-fatal; old workspace-servers
|
||||
// return 404, in which case provider="" and Save skips the PUT.
|
||||
//
|
||||
// See GH #1894 for the workspace-row-as-source-of-truth rationale
|
||||
// that motivated splitting from a single config.yaml read.
|
||||
const [wsRes, modelRes, providerRes] = await Promise.all([
|
||||
api.get<{ runtime?: string; tier?: number }>(`/workspaces/${workspaceId}`)
|
||||
.catch(() => ({} as { runtime?: string; tier?: number })),
|
||||
api.get<{ model?: string }>(`/workspaces/${workspaceId}/model`)
|
||||
.catch(() => ({} as { model?: string })),
|
||||
api.get<{ provider?: string }>(`/workspaces/${workspaceId}/provider`)
|
||||
.catch(() => null),
|
||||
]);
|
||||
const wsMetadataRuntime = (wsRes.runtime || "").trim();
|
||||
const wsMetadataModel = (modelRes.model || "").trim();
|
||||
const wsMetadataTier: number | null =
|
||||
typeof wsRes.tier === "number" ? wsRes.tier : null;
|
||||
if (providerRes !== null) {
|
||||
const loadedProvider = (providerRes.provider || "").trim();
|
||||
setProvider(loadedProvider);
|
||||
setOriginalProvider(loadedProvider);
|
||||
} else {
|
||||
setProvider("");
|
||||
setOriginalProvider("");
|
||||
}
|
||||
// originalModel is set further down once the YAML has been parsed —
|
||||
// we want it to reflect what the form ACTUALLY rendered, which may
|
||||
// be the YAML's runtime_config.model fallback when MODEL_PROVIDER
|
||||
// is empty. Setting it here from wsMetadataModel alone would be
|
||||
// wrong for hermes/pre-#240 workspaces.
|
||||
|
||||
// Load explicit provider override (Option B PR-5). Endpoint returns
|
||||
// {provider: "", source: "default"} when no override is set, so the
|
||||
// empty string is the legitimate "auto-derive" signal — don't treat
|
||||
// it as a load error. Non-fatal: an older workspace-server that
|
||||
// predates PR-2 returns 404 here; the form falls back to "" and
|
||||
// Save just won't PUT the provider field.
|
||||
try {
|
||||
const p = await api.get<{ provider?: string }>(`/workspaces/${workspaceId}/provider`);
|
||||
const loadedProvider = (p.provider || "").trim();
|
||||
setProvider(loadedProvider);
|
||||
setOriginalProvider(loadedProvider);
|
||||
} catch {
|
||||
setProvider("");
|
||||
setOriginalProvider("");
|
||||
}
|
||||
|
||||
// Skip the config.yaml fetch entirely for runtimes that manage
|
||||
// their own config (external, hermes, etc.) — they don't have a
|
||||
// platform-side template, so the GET would 404. The catch block
|
||||
|
||||
@ -1,13 +1,11 @@
|
||||
// @vitest-environment jsdom
|
||||
//
|
||||
// Pins the lazy-loading chat-history pagination added 2026-05-05.
|
||||
// Pins the lazy-loading chat-history pagination.
|
||||
//
|
||||
// Pre-fix: ChatTab fetched the newest 50 messages on every mount and
|
||||
// scrolled to bottom, paying full DOM cost up-front even when the user
|
||||
// only wanted to read the last few bubbles. Post-fix: initial load is
|
||||
// bounded to 10 newest, and an IntersectionObserver on a top sentinel
|
||||
// triggers loadOlder() (batch of 20 with `before_ts` cursor) when the
|
||||
// user scrolls up.
|
||||
// PR-C-2 (RFC #2945): canvas was migrated from /activity?type=a2a_receive
|
||||
// to /chat-history. Server now returns typed ChatMessage[] in
|
||||
// display-ready oldest-first order. These tests guard the canvas-side
|
||||
// pagination invariants against the new endpoint surface.
|
||||
//
|
||||
// Pinned branches:
|
||||
// 1. Initial fetch carries `limit=10` and NO before_ts (newest-first
|
||||
@ -20,11 +18,10 @@
|
||||
// asserting the rendered bubble count matches the full page).
|
||||
// 4. The retry button after a failed initial load uses the same
|
||||
// INITIAL_HISTORY_LIMIT (10), not the legacy 50.
|
||||
//
|
||||
// IntersectionObserver / scroll-anchor restoration is exercised by the
|
||||
// E2E synth-canary suite — pinning it in jsdom would require mocking
|
||||
// the observer and faking layout, which is more brittle than trusting a
|
||||
// live-DOM canary against the staging tenant.
|
||||
// 5. before_ts cursor is the OLDEST timestamp from the current page,
|
||||
// passed verbatim to walk backward.
|
||||
// 6. Inflight guard rejects duplicate IO triggers while a loadOlder
|
||||
// fetch is in flight.
|
||||
|
||||
import { describe, it, expect, vi, afterEach, beforeEach } from "vitest";
|
||||
import { render, screen, cleanup, waitFor, fireEvent } from "@testing-library/react";
|
||||
@ -33,24 +30,31 @@ import React from "react";
|
||||
afterEach(cleanup);
|
||||
|
||||
// Both ChatTab sub-panels (MyChat + AgentComms) mount simultaneously so
|
||||
// keyboard tab order and aria-controls land on a real DOM. Both fire
|
||||
// /activity GETs on mount: MyChat's hits `type=a2a_receive&source=canvas`,
|
||||
// AgentComms's hits a different filter. Route the mock by URL so each
|
||||
// gets a sensible default and only MyChat's call is what the assertions
|
||||
// scrutinise.
|
||||
const myChatActivityCalls: string[] = [];
|
||||
let myChatNextResponse: { ok: true; rows: unknown[] } | { ok: false; err: Error } = {
|
||||
ok: true,
|
||||
rows: [],
|
||||
};
|
||||
// keyboard tab order and aria-controls land on a real DOM. MyChat's
|
||||
// loadMessagesFromDB hits /chat-history; AgentComms's polling hits a
|
||||
// different URL. Route the mock by URL so each gets a sensible default
|
||||
// and only MyChat's calls land in the assertion array.
|
||||
const myChatHistoryCalls: string[] = [];
|
||||
let myChatNextResponse:
|
||||
| { ok: true; messages: unknown[]; reachedEnd?: boolean }
|
||||
| { ok: false; err: Error } = { ok: true, messages: [] };
|
||||
|
||||
const apiGet = vi.fn((path: string): Promise<unknown> => {
|
||||
if (path.includes("type=a2a_receive") && path.includes("source=canvas")) {
|
||||
myChatActivityCalls.push(path);
|
||||
if (myChatNextResponse.ok) return Promise.resolve(myChatNextResponse.rows);
|
||||
if (path.includes("/chat-history")) {
|
||||
myChatHistoryCalls.push(path);
|
||||
if (myChatNextResponse.ok) {
|
||||
const reached_end =
|
||||
myChatNextResponse.reachedEnd !== undefined
|
||||
? myChatNextResponse.reachedEnd
|
||||
: myChatNextResponse.messages.length < 10;
|
||||
return Promise.resolve({
|
||||
messages: myChatNextResponse.messages,
|
||||
reached_end,
|
||||
});
|
||||
}
|
||||
return Promise.reject(myChatNextResponse.err);
|
||||
}
|
||||
// AgentComms / heartbeat / anything else — empty array is a safe
|
||||
// default that won't blow up the corresponding component's .then().
|
||||
// AgentComms / heartbeat / anything else — empty array safe default.
|
||||
return Promise.resolve([]);
|
||||
});
|
||||
const apiPost = vi.fn();
|
||||
@ -84,8 +88,8 @@ const ioInstances: IOInstance[] = [];
|
||||
beforeEach(() => {
|
||||
apiGet.mockClear();
|
||||
apiPost.mockReset();
|
||||
myChatActivityCalls.length = 0;
|
||||
myChatNextResponse = { ok: true, rows: [] };
|
||||
myChatHistoryCalls.length = 0;
|
||||
myChatNextResponse = { ok: true, messages: [] };
|
||||
ioInstances.length = 0;
|
||||
class FakeIO {
|
||||
private inst: IOInstance;
|
||||
@ -101,20 +105,12 @@ beforeEach(() => {
|
||||
this.inst.disconnected = true;
|
||||
}
|
||||
}
|
||||
// Install on every reachable global — different bundlers / module
|
||||
// graphs can resolve `IntersectionObserver` via `window`, `globalThis`,
|
||||
// or the bare global. Without all three, jsdom's own (pre-existing)
|
||||
// stub silently wins and ioInstances stays empty.
|
||||
(window as unknown as { IntersectionObserver: unknown }).IntersectionObserver = FakeIO;
|
||||
(globalThis as unknown as { IntersectionObserver: unknown }).IntersectionObserver = FakeIO;
|
||||
// jsdom doesn't implement scrollIntoView; ChatTab calls it after every
|
||||
// messages update.
|
||||
Element.prototype.scrollIntoView = vi.fn();
|
||||
});
|
||||
|
||||
function triggerIntersection(instanceIdx = -1) {
|
||||
// -1 → the latest observer (the live one). Tests targeting an old
|
||||
// (disconnected) instance pass a positive index.
|
||||
const inst = ioInstances.at(instanceIdx);
|
||||
if (!inst) throw new Error(`no IO instance at ${instanceIdx}`);
|
||||
inst.callback(
|
||||
@ -125,25 +121,30 @@ function triggerIntersection(instanceIdx = -1) {
|
||||
|
||||
import { ChatTab } from "../ChatTab";
|
||||
|
||||
function makeActivityRow(seq: number): Record<string, unknown> {
|
||||
// Zero-pad seq into the minute slot so "seq=10" doesn't produce
|
||||
// the invalid timestamp "00:010:00Z" (caught by the loadOlder URL
|
||||
// assertion below — first version of the helper used `0${seq}` and
|
||||
// the test failed on `before_ts` having an extra digit).
|
||||
// makeMessagePair returns a (user, agent) pair sharing a timestamp,
|
||||
// matching the wire shape /chat-history emits per activity_logs row.
|
||||
// Server-side reverseRowChunks ensures the wire is oldest-first across
|
||||
// rows but [user, agent] within each row.
|
||||
function makeMessagePair(seq: number): unknown[] {
|
||||
// Zero-pad seq into the minute slot so seq=10 produces a valid
|
||||
// timestamp (00:10:00Z, not 00:010:00Z).
|
||||
const mm = String(seq).padStart(2, "0");
|
||||
return {
|
||||
activity_type: "a2a_receive",
|
||||
status: "ok",
|
||||
created_at: `2026-05-05T00:${mm}:00Z`,
|
||||
request_body: { params: { message: { parts: [{ kind: "text", text: `user msg ${seq}` }] } } },
|
||||
response_body: { result: `agent reply ${seq}` },
|
||||
};
|
||||
const ts = `2026-05-05T00:${mm}:00Z`;
|
||||
return [
|
||||
{ id: `u-${seq}`, role: "user", content: `user msg ${seq}`, timestamp: ts },
|
||||
{ id: `a-${seq}`, role: "agent", content: `agent reply ${seq}`, timestamp: ts },
|
||||
];
|
||||
}
|
||||
|
||||
// Server returns newest-first; the helper builds a server-shape page
|
||||
// so the order in the rendered messages array matches production.
|
||||
function newestFirstPage(start: number, count: number): unknown[] {
|
||||
return Array.from({ length: count }, (_, i) => makeActivityRow(start + count - 1 - i));
|
||||
// pageOldestFirst builds a wire-shape page (oldest-first within page)
|
||||
// of `count` row-pairs starting at seq=`start`. Mirrors the server's
|
||||
// post-reverseRowChunks emission order.
|
||||
function pageOldestFirst(start: number, count: number): unknown[] {
|
||||
const out: unknown[] = [];
|
||||
for (let i = 0; i < count; i++) {
|
||||
out.push(...makeMessagePair(start + i));
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
const minimalData = {
|
||||
@ -153,28 +154,30 @@ const minimalData = {
|
||||
} as unknown as Parameters<typeof ChatTab>[0]["data"];
|
||||
|
||||
describe("ChatTab lazy history pagination", () => {
|
||||
it("initial fetch carries limit=10 (not the legacy 50)", async () => {
|
||||
myChatNextResponse = { ok: true, rows: [makeActivityRow(1)] };
|
||||
it("initial fetch carries limit=10 (not the legacy 50) and hits /chat-history", async () => {
|
||||
myChatNextResponse = { ok: true, messages: makeMessagePair(1) };
|
||||
render(<ChatTab workspaceId="ws-1" data={minimalData} />);
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
|
||||
const url = myChatActivityCalls[0];
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
|
||||
const url = myChatHistoryCalls[0];
|
||||
expect(url).toContain("/chat-history");
|
||||
expect(url).toContain("limit=10");
|
||||
expect(url).not.toContain("limit=50");
|
||||
// before_ts should NOT be set on the initial fetch — that's the
|
||||
// newest-first slice the user lands on.
|
||||
expect(url).not.toContain("before_ts");
|
||||
// /chat-history filters source-canvas server-side; client should
|
||||
// NOT pass type/source params (they belonged to /activity).
|
||||
expect(url).not.toContain("type=a2a_receive");
|
||||
expect(url).not.toContain("source=canvas");
|
||||
});
|
||||
|
||||
it("hides the top sentinel when initial fetch returns fewer than the limit", async () => {
|
||||
// 3 < 10 → server says "no more older history exists"; sentinel
|
||||
// should NOT mount and the "Loading older messages…" line should
|
||||
// never appear (it can't, since the sentinel is what triggers it).
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
rows: [makeActivityRow(1), makeActivityRow(2), makeActivityRow(3)],
|
||||
};
|
||||
// never appear.
|
||||
myChatNextResponse = { ok: true, messages: pageOldestFirst(1, 3) };
|
||||
render(<ChatTab workspaceId="ws-2" data={minimalData} />);
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
|
||||
await waitFor(() => {
|
||||
expect(screen.queryByText(/Loading chat history/i)).toBeNull();
|
||||
});
|
||||
@ -182,15 +185,15 @@ describe("ChatTab lazy history pagination", () => {
|
||||
});
|
||||
|
||||
it("renders all messages when initial fetch returns exactly the limit", async () => {
|
||||
// 10 == limit → server might have more older rows; sentinel SHOULD
|
||||
// mount so the IO observer can fire loadOlder() on scroll-up. We
|
||||
// verify by checking the rendered bubble count — if hasMore stayed
|
||||
// true the sentinel render path doesn't crash and all 10 rows
|
||||
// produced their pair of bubbles.
|
||||
const fullPage = Array.from({ length: 10 }, (_, i) => makeActivityRow(i + 1));
|
||||
myChatNextResponse = { ok: true, rows: fullPage };
|
||||
// limit=10 row-pairs → 20 ChatMessages. reachedEnd should be FALSE
|
||||
// so the sentinel mounts. Verified by bubble counts.
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
messages: pageOldestFirst(1, 10),
|
||||
reachedEnd: false,
|
||||
};
|
||||
render(<ChatTab workspaceId="ws-3" data={minimalData} />);
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
|
||||
await waitFor(() => {
|
||||
expect(screen.queryByText(/Loading chat history/i)).toBeNull();
|
||||
});
|
||||
@ -202,54 +205,67 @@ describe("ChatTab lazy history pagination", () => {
|
||||
myChatNextResponse = { ok: false, err: new Error("network down") };
|
||||
render(<ChatTab workspaceId="ws-4" data={minimalData} />);
|
||||
const retry = await screen.findByText(/Retry/);
|
||||
myChatNextResponse = { ok: true, rows: [makeActivityRow(1)] };
|
||||
myChatNextResponse = { ok: true, messages: makeMessagePair(1) };
|
||||
fireEvent.click(retry);
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(2));
|
||||
const retryUrl = myChatActivityCalls[1];
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(2));
|
||||
const retryUrl = myChatHistoryCalls[1];
|
||||
expect(retryUrl).toContain("/chat-history");
|
||||
expect(retryUrl).toContain("limit=10");
|
||||
expect(retryUrl).not.toContain("limit=50");
|
||||
});
|
||||
|
||||
it("loadOlder fetches limit=20 with before_ts=oldest.timestamp", async () => {
|
||||
// Initial page = 10 rows in newest-first order (seq 10..1). After
|
||||
// the component reverses to oldest-first for display, messages[0]
|
||||
// is built from seq=1 — the oldest — and its timestamp is what
|
||||
// before_ts should carry.
|
||||
myChatNextResponse = { ok: true, rows: newestFirstPage(1, 10) };
|
||||
// Initial page = 10 row-pairs in oldest-first order (seq 1..10).
|
||||
// The oldest (and so the cursor for loadOlder) is seq=1's
|
||||
// timestamp 2026-05-05T00:01:00Z.
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
messages: pageOldestFirst(1, 10),
|
||||
reachedEnd: false,
|
||||
};
|
||||
render(<ChatTab workspaceId="ws-load-older" data={minimalData} />);
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
|
||||
await waitFor(() => expect(ioInstances.length).toBeGreaterThan(0));
|
||||
|
||||
// Stage the older-batch response, then fire the IO callback.
|
||||
myChatNextResponse = { ok: true, rows: newestFirstPage(0, 1) };
|
||||
// Stage older-batch response, then fire IO callback.
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
messages: pageOldestFirst(0, 1),
|
||||
reachedEnd: true,
|
||||
};
|
||||
triggerIntersection();
|
||||
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(2));
|
||||
const olderUrl = myChatActivityCalls[1];
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(2));
|
||||
const olderUrl = myChatHistoryCalls[1];
|
||||
expect(olderUrl).toContain("/chat-history");
|
||||
expect(olderUrl).toContain("limit=20");
|
||||
expect(olderUrl).toContain("before_ts=");
|
||||
expect(decodeURIComponent(olderUrl)).toContain("before_ts=2026-05-05T00:01:00Z");
|
||||
});
|
||||
|
||||
it("inflight guard rejects a second IO trigger while first loadOlder is in flight", async () => {
|
||||
myChatNextResponse = { ok: true, rows: newestFirstPage(1, 10) };
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
messages: pageOldestFirst(1, 10),
|
||||
reachedEnd: false,
|
||||
};
|
||||
render(<ChatTab workspaceId="ws-inflight" data={minimalData} />);
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
|
||||
await waitFor(() => expect(ioInstances.length).toBeGreaterThan(0));
|
||||
|
||||
// Hold the next loadOlder fetch open with a manual deferred so we
|
||||
// can fire the second trigger while the first is in-flight.
|
||||
let release!: (rows: unknown[]) => void;
|
||||
const deferred = new Promise<unknown[]>((res) => {
|
||||
let release!: (resp: unknown) => void;
|
||||
const deferred = new Promise<unknown>((res) => {
|
||||
release = res;
|
||||
});
|
||||
apiGet.mockImplementationOnce((path: string): Promise<unknown> => {
|
||||
myChatActivityCalls.push(path);
|
||||
myChatHistoryCalls.push(path);
|
||||
return deferred;
|
||||
});
|
||||
|
||||
triggerIntersection(); // start loadOlder #1
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(2));
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(2));
|
||||
|
||||
// Second IO trigger lands while #1 is still pending.
|
||||
triggerIntersection();
|
||||
@ -258,79 +274,62 @@ describe("ChatTab lazy history pagination", () => {
|
||||
// Without the inflight guard, each of these would have started a
|
||||
// new fetch. With the guard, none of them do — call count stays 2.
|
||||
await new Promise((r) => setTimeout(r, 10));
|
||||
expect(myChatActivityCalls.length).toBe(2);
|
||||
expect(myChatHistoryCalls.length).toBe(2);
|
||||
|
||||
// Release the first fetch. Inflight clears in the finally block;
|
||||
// a subsequent IO trigger is permitted again (verified by checking
|
||||
// we can fire a follow-up after release without hanging the test).
|
||||
release([]);
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(2));
|
||||
// Release the first fetch with a valid wire response shape.
|
||||
release({ messages: [], reached_end: true });
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(2));
|
||||
});
|
||||
|
||||
it("empty older response clears the scroll anchor and unmounts the sentinel", async () => {
|
||||
// The bug we're pinning: if loadOlder returns 0 rows, the
|
||||
// scrollAnchorRef must be cleared so the next paint doesn't try to
|
||||
// restore against a no-op prepend (which would fight the natural
|
||||
// bottom-pin for any subsequent live message). hasMore flipping to
|
||||
// false is the same flag-flip path; sentinel disappearing is the
|
||||
// observable proxy.
|
||||
myChatNextResponse = { ok: true, rows: newestFirstPage(1, 10) };
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
messages: pageOldestFirst(1, 10),
|
||||
reachedEnd: false,
|
||||
};
|
||||
render(<ChatTab workspaceId="ws-anchor" data={minimalData} />);
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
|
||||
await waitFor(() => expect(ioInstances.length).toBeGreaterThan(0));
|
||||
|
||||
myChatNextResponse = { ok: true, rows: [] }; // empty → reachedEnd
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
messages: [],
|
||||
reachedEnd: true,
|
||||
};
|
||||
triggerIntersection();
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(2));
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(2));
|
||||
|
||||
// After reachedEnd the sentinel unmounts (hasMore=false). We can't
|
||||
// peek scrollAnchorRef directly, but we can assert the consequence:
|
||||
// scrollIntoView (the bottom-pin for live appends) is not blocked
|
||||
// by a stale anchor. Rather than triggering a re-render via an
|
||||
// unrelated state change, the safest assertion here is that the
|
||||
// sentinel disappeared (proving the empty response propagated to
|
||||
// hasMore correctly, which is the same flag-flip path as anchor
|
||||
// clearing).
|
||||
await waitFor(() => {
|
||||
expect(screen.queryByText(/Loading older messages/i)).toBeNull();
|
||||
});
|
||||
});
|
||||
|
||||
it("IntersectionObserver does not churn when older messages prepend", async () => {
|
||||
// Whole-PR perf invariant: prepending older history (the load-bearing
|
||||
// user gesture) must NOT tear down + re-arm the IO observer.
|
||||
// Triggering loadOlder is the cleanest way to drive a messages
|
||||
// mutation from inside the test, since live agent push goes through
|
||||
// a Zustand store that's harder to drive reliably from jsdom.
|
||||
//
|
||||
// Pre-fix, loadOlder depended on `messages`, so every prepend
|
||||
// recreated loadOlder → re-ran the IO effect → new observer. Each
|
||||
// call to triggerIntersection() produced a fresh disconnected
|
||||
// observer + a new live one. Post-fix, the observer survives.
|
||||
myChatNextResponse = { ok: true, rows: newestFirstPage(1, 10) };
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
messages: pageOldestFirst(1, 10),
|
||||
reachedEnd: false,
|
||||
};
|
||||
render(<ChatTab workspaceId="ws-stable-io" data={minimalData} />);
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
|
||||
await waitFor(() => expect(ioInstances.length).toBeGreaterThan(0));
|
||||
|
||||
// Snapshot the observer instance after first paint stabilises.
|
||||
const observerBefore = ioInstances.at(-1);
|
||||
expect(observerBefore).toBeDefined();
|
||||
expect(observerBefore!.disconnected).toBe(false);
|
||||
|
||||
// Trigger three older-batch prepends. Each batch returns the full
|
||||
// OLDER_HISTORY_BATCH (20 rows) so reachedEnd stays false and the
|
||||
// sentinel keeps mounting. Pre-fix, each prepend mutated `messages`
|
||||
// → recreated loadOlder → re-ran the IO effect → new observer.
|
||||
// OLDER_HISTORY_BATCH (20 row-pairs = 40 messages) so reachedEnd
|
||||
// stays false and the sentinel keeps mounting.
|
||||
for (let batch = 0; batch < 3; batch++) {
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
rows: newestFirstPage(-(batch + 1) * 20, 20),
|
||||
messages: pageOldestFirst(-(batch + 1) * 20, 20),
|
||||
reachedEnd: false,
|
||||
};
|
||||
const callsBefore = myChatActivityCalls.length;
|
||||
const callsBefore = myChatHistoryCalls.length;
|
||||
triggerIntersection();
|
||||
await waitFor(() =>
|
||||
expect(myChatActivityCalls.length).toBe(callsBefore + 1),
|
||||
);
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(callsBefore + 1));
|
||||
}
|
||||
|
||||
// The original observer is still the live one — no churn.
|
||||
|
||||
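The no-churn invariant pinned above is the observable half of a hooks fix: loadOlder must not close over `messages`, so the effect that arms the observer never re-runs on prepend. A minimal sketch of that pattern follows; it borrows ChatMessage, loadMessagesFromDB, and OLDER_HISTORY_BATCH from the component file, and every other name is illustrative rather than the actual ChatTab code.

```tsx
// Illustrative prepend-safe pagination hook: loadOlder is referentially
// stable, so the IntersectionObserver effect survives history prepends.
import { useCallback, useEffect, useRef, useState } from "react";

function useOlderHistory(workspaceId: string) {
  const [messages, setMessages] = useState<ChatMessage[]>([]);
  const [hasMore, setHasMore] = useState(true);
  const oldestTsRef = useRef<string | undefined>(undefined);
  const inflightRef = useRef(false);
  const sentinelRef = useRef<HTMLDivElement | null>(null);

  const loadOlder = useCallback(async () => {
    if (inflightRef.current) return; // inflight guard
    inflightRef.current = true;
    try {
      const page = await loadMessagesFromDB(
        workspaceId,
        OLDER_HISTORY_BATCH,
        oldestTsRef.current,
      );
      if (page.messages.length > 0) oldestTsRef.current = page.messages[0].timestamp;
      setHasMore(!page.reachedEnd);
      // Functional update: no dependency on the current messages array.
      setMessages((prev) => [...page.messages, ...prev]);
    } finally {
      inflightRef.current = false;
    }
  }, [workspaceId]); // stable across prepends, so no observer churn

  useEffect(() => {
    const el = sentinelRef.current;
    if (!el || !hasMore) return;
    const io = new IntersectionObserver((entries) => {
      if (entries.some((e) => e.isIntersecting)) void loadOlder();
    });
    io.observe(el);
    return () => io.disconnect();
  }, [loadOlder, hasMore]);

  return { messages, hasMore, sentinelRef };
}
```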
@ -7,6 +7,32 @@ export default defineConfig({
|
||||
test: {
|
||||
environment: 'node',
|
||||
exclude: ['e2e/**', 'node_modules/**', '**/dist/**'],
|
||||
// CI-conditional test timeout (issue #96).
|
||||
//
|
||||
// Vitest's 5000ms default is too tight for the first test in any
|
||||
// file under our CI shape: `npx vitest run --coverage` on the
|
||||
// self-hosted Gitea Actions Docker runner. The cold-start cost
|
||||
// (v8 coverage instrumentation init + JSDOM bootstrap + module-
|
||||
// graph import for @/components/* and @/lib/* + first React
|
||||
// render) consistently consumes 5-7 seconds for the first
|
||||
// synchronous test in heavyweight component files
|
||||
// (ActivityTab.test.tsx, CreateWorkspaceDialog.test.tsx,
|
||||
// ConfigTab.provider.test.tsx) — even though every subsequent
|
||||
// test in the same file completes in 100-1500ms.
|
||||
//
|
||||
// Empirically the worst observed first-test was 6453ms in a
|
||||
// single file (CreateWorkspaceDialog). 30000ms gives ~5x
|
||||
// headroom over that on CI; we still keep 5000ms locally so
|
||||
// genuine waitFor races / hung promises stay sensitive in dev.
|
||||
//
|
||||
// Same vitest pattern documented at:
|
||||
// https://vitest.dev/config/testtimeout
|
||||
// https://vitest.dev/guide/coverage#profiling-test-performance
|
||||
//
|
||||
// Per-test duration is still emitted to the CI log; if a test
|
||||
// ever silently approaches 25-30s under this raised ceiling that
|
||||
// will surface as a duration regression and we revisit.
|
||||
testTimeout: process.env.CI ? 30000 : 5000,
|
||||
// Coverage is instrumented but NOT yet a CI gate — first land
|
||||
// observability so we can see the baseline, then dial in
|
||||
// thresholds + a hard gate in a follow-up PR (#1815). Today's
|
||||
|
||||
@ -212,8 +212,8 @@ services:
|
||||
# docker compose pull canvas && docker compose up -d canvas
|
||||
# First-time local setup or testing unreleased changes — build from source:
|
||||
# docker compose build canvas && docker compose up -d canvas
|
||||
# Note: GHCR images are private — `docker login ghcr.io` required before pull.
|
||||
image: ghcr.io/molecule-ai/canvas:latest
|
||||
# Note: ECR images require AWS auth — `aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 153263036946.dkr.ecr.us-east-2.amazonaws.com` before pull.
|
||||
image: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/canvas:latest
|
||||
build:
|
||||
context: ./canvas
|
||||
dockerfile: Dockerfile
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
**Status:** living document — update when you ship a feature that touches one backend.
|
||||
**Owner:** workspace-server + controlplane teams.
|
||||
**Last audit:** 2026-05-05 (Claude agent — `provisionWorkspaceAuto` / `StopWorkspaceAuto` / `HasProvisioner` SoT pattern landed in PRs #2811 + #2824).
|
||||
**Last audit:** 2026-05-07 (plugin install/uninstall closed for EC2 backend via EIC SSH push to the bind-mounted `/configs/plugins/<name>/`, mirroring the Files API PR #1702 pattern).
|
||||
|
||||
## Why this exists
|
||||
|
||||
@ -54,7 +54,7 @@ For "do we have any backend?", use `HasProvisioner()`, never bare `h.provisioner
|
||||
| **Files API** | | | | |
|
||||
| List / Read / Write / Replace / Delete | `container_files.go`, `template_import.go` | `docker exec` + tar `CopyToContainer` | SSH via EIC tunnel (PR #1702) | ✅ parity as of 2026-04-22 (previously docker-only) |
|
||||
| **Plugins** | | | | |
|
||||
| Install / uninstall / list | `plugins_install.go` | `deliverToContainer()` + volume rm | **gap — no live plugin delivery** | 🔴 **docker-only** |
|
||||
| Install / uninstall / list | `plugins_install.go` + `plugins_install_eic.go` | `deliverToContainer()` → exec+`CopyToContainer` on local container | `instance_id` set → EIC SSH push of the staged tarball into the EC2's bind-mounted `/configs/plugins/<name>/` (per `workspaceFilePathPrefix`), `chown 1000:1000`, restart | ✅ parity |
|
||||
| **Terminal (WebSocket)** | | | | |
|
||||
| Dispatch | `terminal.go:90-105` | `instance_id=""` → `handleLocalConnect` → `docker attach` | `instance_id` set → `handleRemoteConnect` → EIC SSH + `docker exec` | ✅ parity (different implementations, same UX) |
|
||||
| **A2A proxy** | | | | |
|
||||
|
||||
@ -4,7 +4,7 @@ How a workspace-server code change reaches the prod tenant fleet — and how to
|
||||
|
||||
> **⚠️ State note (2026-04-22):** this doc describes the **intended design**. As of this write, the canary fleet described below is **not actually running** — no canary tenants are provisioned, `CANARY_TENANT_URLS` / `CANARY_ADMIN_TOKENS` / `CANARY_CP_SHARED_SECRET` are empty in repo secrets, and `canary-verify.yml` fails every run.
|
||||
>
|
||||
> Current merges gate on manual `promote-latest.yml` dispatches, not canary. See [molecule-controlplane/docs/canary-tenants.md](https://github.com/Molecule-AI/molecule-controlplane/blob/main/docs/canary-tenants.md) for the Phase 1 code work that's already shipped + the Phase 2 plan for actually standing up the fleet + a "should we even do this now?" decision framework.
|
||||
> Current merges gate on manual `promote-latest.yml` dispatches, not canary. See [molecule-controlplane/docs/canary-tenants.md](https://git.moleculesai.app/molecule-ai/molecule-controlplane/src/branch/main/docs/canary-tenants.md) for the Phase 1 code work that's already shipped + the Phase 2 plan for actually standing up the fleet + a "should we even do this now?" decision framework.
|
||||
>
|
||||
> **Account-specific identifiers (AWS account ID, IAM role name) referenced below in the original design have been redacted from this public doc.** The actual values — if they exist — are in `Molecule-AI/internal/runbooks/canary-fleet.md`. If you're implementing Phase 2, start there.
|
||||
>
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# Molecule AI — Comprehensive Technical Documentation
|
||||
|
||||
> Definitive technical reference for the Molecule AI Agent Team platform.
|
||||
> Based on a full non-invasive scan of the [molecule-monorepo](https://github.com/Molecule-AI/molecule-monorepo) repository.
|
||||
> Based on a full non-invasive scan of the [molecule-monorepo](https://git.moleculesai.app/molecule-ai/molecule-monorepo) repository.
|
||||
|
||||
---
|
||||
|
||||
@ -1149,11 +1149,11 @@ Molecule AI's workspace abstraction is **runtime-agnostic by design**. A workspa
|
||||
|
||||
## Links
|
||||
|
||||
- **GitHub**: https://github.com/Molecule-AI/molecule-monorepo
|
||||
- **Architecture Docs**: https://github.com/Molecule-AI/molecule-monorepo/tree/main/docs/architecture
|
||||
- **API Protocol**: https://github.com/Molecule-AI/molecule-monorepo/tree/main/docs/api-protocol
|
||||
- **Agent Runtime**: https://github.com/Molecule-AI/molecule-monorepo/tree/main/docs/agent-runtime
|
||||
- **Product Docs**: https://github.com/Molecule-AI/molecule-monorepo/tree/main/docs/product
|
||||
- **GitHub**: https://git.moleculesai.app/molecule-ai/molecule-monorepo
|
||||
- **Architecture Docs**: https://git.moleculesai.app/molecule-ai/molecule-monorepo/src/branch/main/docs/architecture
|
||||
- **API Protocol**: https://git.moleculesai.app/molecule-ai/molecule-monorepo/src/branch/main/docs/api-protocol
|
||||
- **Agent Runtime**: https://git.moleculesai.app/molecule-ai/molecule-monorepo/src/branch/main/docs/agent-runtime
|
||||
- **Product Docs**: https://git.moleculesai.app/molecule-ai/molecule-monorepo/src/branch/main/docs/product
|
||||
|
||||
---
|
||||
|
||||
|
||||
@ -79,7 +79,7 @@ For SOC2 / ISO 27001 / customer security questionnaires:
|
||||
|
||||
## Pointers
|
||||
|
||||
- KMS envelope code: [`molecule-controlplane/internal/crypto/kms.go`](https://github.com/Molecule-AI/molecule-controlplane/blob/main/internal/crypto/kms.go)
|
||||
- Static-key fallback: [`molecule-controlplane/internal/crypto/aes.go`](https://github.com/Molecule-AI/molecule-controlplane/blob/main/internal/crypto/aes.go)
|
||||
- KMS envelope code: [`molecule-controlplane/internal/crypto/kms.go`](https://git.moleculesai.app/molecule-ai/molecule-controlplane/src/branch/main/internal/crypto/kms.go) (a minimal sketch of the envelope pattern follows this list)
|
||||
- Static-key fallback: [`molecule-controlplane/internal/crypto/aes.go`](https://git.moleculesai.app/molecule-ai/molecule-controlplane/src/branch/main/internal/crypto/aes.go)
|
||||
- Tenant secrets handler: [`workspace-server/internal/crypto/aes.go`](../../workspace-server/internal/crypto/aes.go)
|
||||
- Tenant secrets schema: [database-schema.md](./database-schema.md#workspace_secrets)
|
||||
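For readers who want the shape without opening kms.go: envelope encryption asks KMS for a per-secret data key, encrypts locally, and persists only the ciphertext plus the KMS-wrapped key. A minimal TypeScript sketch under those assumptions (the authoritative implementation is the Go code linked above):

```ts
// Illustrative envelope-encryption sketch, not the kms.go implementation.
import { KMSClient, GenerateDataKeyCommand } from "@aws-sdk/client-kms";
import { createCipheriv, randomBytes } from "node:crypto";

const kms = new KMSClient({});

export async function envelopeEncrypt(keyId: string, plaintext: string) {
  // 1. Fresh data key from KMS: a plaintext copy and a KMS-encrypted copy.
  const dk = await kms.send(new GenerateDataKeyCommand({ KeyId: keyId, KeySpec: "AES_256" }));
  const key = Buffer.from(dk.Plaintext!);

  // 2. Encrypt the secret locally with AES-256-GCM.
  const iv = randomBytes(12);
  const cipher = createCipheriv("aes-256-gcm", key, iv);
  const ciphertext = Buffer.concat([cipher.update(plaintext, "utf8"), cipher.final()]);
  const tag = cipher.getAuthTag();

  // 3. Persist ciphertext + iv + tag + wrapped key; never the plaintext key.
  return {
    ciphertext: ciphertext.toString("base64"),
    iv: iv.toString("base64"),
    tag: tag.toString("base64"),
    encryptedDataKey: Buffer.from(dk.CiphertextBlob!).toString("base64"),
  };
}
```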
|
||||
28
docs/assets/branding/molecule-icon.svg
Normal file
@ -0,0 +1,28 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 64 64">
|
||||
<style>
|
||||
.bg { fill: #0a1120; }
|
||||
.accent { fill: #7fe8d6; }
|
||||
.accent-stroke { stroke: #7fe8d6; }
|
||||
@media (prefers-color-scheme: light) {
|
||||
.bg { fill: #f5f7fa; }
|
||||
.accent { fill: #1a8a72; }
|
||||
.accent-stroke { stroke: #1a8a72; }
|
||||
}
|
||||
</style>
|
||||
<rect class="bg" width="64" height="64" rx="14"/>
|
||||
<g class="accent-stroke" stroke-width="2.4" stroke-linecap="round" fill="none">
|
||||
<line x1="32" y1="32" x2="12" y2="14"/>
|
||||
<line x1="32" y1="32" x2="52" y2="18"/>
|
||||
<line x1="32" y1="32" x2="10" y2="40"/>
|
||||
<line x1="32" y1="32" x2="54" y2="44"/>
|
||||
<line x1="32" y1="32" x2="32" y2="56"/>
|
||||
</g>
|
||||
<g class="accent">
|
||||
<circle cx="32" cy="32" r="6.5"/>
|
||||
<circle cx="12" cy="14" r="3.5"/>
|
||||
<circle cx="52" cy="18" r="3.5"/>
|
||||
<circle cx="10" cy="40" r="3.5"/>
|
||||
<circle cx="54" cy="44" r="3.5"/>
|
||||
<circle cx="32" cy="56" r="3.5"/>
|
||||
</g>
|
||||
</svg>
|
||||
|
17
docs/assets/branding/molecule-logo.svg
Normal file
@ -0,0 +1,17 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 64 64" role="img" aria-label="Molecule AI">
|
||||
<g stroke="#7fe8d6" stroke-width="2.6" stroke-linecap="round" fill="none">
|
||||
<line x1="32" y1="32" x2="12" y2="14"/>
|
||||
<line x1="32" y1="32" x2="52" y2="18"/>
|
||||
<line x1="32" y1="32" x2="10" y2="40"/>
|
||||
<line x1="32" y1="32" x2="54" y2="44"/>
|
||||
<line x1="32" y1="32" x2="32" y2="56"/>
|
||||
</g>
|
||||
<g fill="#7fe8d6">
|
||||
<circle cx="32" cy="32" r="7"/>
|
||||
<circle cx="12" cy="14" r="3.6"/>
|
||||
<circle cx="52" cy="18" r="3.6"/>
|
||||
<circle cx="10" cy="40" r="3.6"/>
|
||||
<circle cx="54" cy="44" r="3.6"/>
|
||||
<circle cx="32" cy="56" r="3.6"/>
|
||||
</g>
|
||||
</svg>
|
||||
|
@ -10,7 +10,7 @@ tags: [platform, fly.io, deployment, infrastructure]
|
||||
|
||||
Your infrastructure choice just got decoupled from your agent platform choice. Molecule AI now ships three production-ready workspace backends — `docker`, `flyio`, and `controlplane` — and switching between them takes a single environment variable. Your agent code, model choices, and workspace topology stay exactly the same.
|
||||
|
||||
This post covers what shipped in [PR #501](https://github.com/Molecule-AI/molecule-core/pull/501) (Fly Machines provisioner) and [PR #503](https://github.com/Molecule-AI/molecule-core/pull/503) (control plane provisioner), and which backend fits your situation.
|
||||
This post covers what shipped in [PR #501](https://git.moleculesai.app/molecule-ai/molecule-core/pull/501) (Fly Machines provisioner) and [PR #503](https://git.moleculesai.app/molecule-ai/molecule-core/pull/503) (control plane provisioner), and which backend fits your situation.
|
||||
|
||||
## Before: One Deployment Model for Every Use Case
|
||||
|
||||
@ -107,4 +107,4 @@ No changes to agent code, tool definitions, or orchestration logic. Swap `CONTAI
|
||||
|
||||
---
|
||||
|
||||
*[PR #501](https://github.com/Molecule-AI/molecule-core/pull/501) (Fly Machines provisioner) and [PR #503](https://github.com/Molecule-AI/molecule-core/pull/503) (control plane provisioner) are both merged to `main`. Molecule AI is open source — contributions welcome.*
|
||||
*[PR #501](https://git.moleculesai.app/molecule-ai/molecule-core/pull/501) (Fly Machines provisioner) and [PR #503](https://git.moleculesai.app/molecule-ai/molecule-core/pull/503) (control plane provisioner) are both merged to `main`. Molecule AI is open source — contributions welcome.*
|
||||
|
||||
@ -299,8 +299,8 @@ Or use the Canvas UI: Workspace → Config → MCP Servers → Add browser MCP s
|
||||
|
||||
**Try it free** — Molecule AI is open source and self-hostable. Get a workspace running in under 5 minutes.
|
||||
|
||||
→ [Get started on GitHub →](https://github.com/Molecule-AI/molecule-core)
|
||||
→ [Get started on GitHub →](https://git.moleculesai.app/molecule-ai/molecule-core)
|
||||
|
||||
---
|
||||
|
||||
*Have a browser automation use case you want to see covered? Open a discussion on [GitHub Discussions](https://github.com/Molecule-AI/molecule-core/discussions) — or file an issue with the `enhancement` label.*
|
||||
*Have a browser automation use case you want to see covered? File an issue with the `enhancement` label on the [molecule-core issue tracker](https://git.moleculesai.app/molecule-ai/molecule-core/issues).*
|
||||
|
||||
@ -148,7 +148,7 @@ Then follow the [quick-start guide](/docs/guides/remote-workspaces.md).
|
||||
Or run the annotated example directly:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/Molecule-AI/molecule-sdk-python
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-sdk-python
|
||||
cd molecule-sdk-python/examples/remote-agent
|
||||
# Create workspace with runtime:external, grab the ID, then:
|
||||
WORKSPACE_ID=<your-id> PLATFORM_URL=https://acme.moleculesai.app python3 run.py
|
||||
@ -160,6 +160,6 @@ The agent appears on the canvas within seconds.
|
||||
|
||||
→ [Remote Workspaces Guide →](/docs/guides/remote-workspaces.md)
|
||||
→ [External Agent Registration Reference →](/docs/guides/external-agent-registration.md)
|
||||
→ [molecule-sdk-python →](https://github.com/Molecule-AI/molecule-sdk-python)
|
||||
→ [molecule-sdk-python →](https://git.moleculesai.app/molecule-ai/molecule-sdk-python)
|
||||
|
||||
*Phase 30 shipped in PRs #1075–#1083 and #1085–#1100 on `molecule-core`.*
|
||||
|
||||
@ -27,7 +27,7 @@ The biggest user-facing change: every Molecule AI org can now mint named, revoca
|
||||
|
||||
→ [User guide: Organization API Keys](/docs/guides/org-api-keys.md)
|
||||
→ [Architecture: Org API Keys](/docs/architecture/org-api-keys.md)
|
||||
→ PRs: [#1105](https://github.com/Molecule-AI/molecule-core/pull/1105), [#1107](https://github.com/Molecule-AI/molecule-core/pull/1107), [#1109](https://github.com/Molecule-AI/molecule-core/pull/1109), [#1110](https://github.com/Molecule-AI/molecule-core/pull/1110)
|
||||
→ PRs: [#1105](https://git.moleculesai.app/molecule-ai/molecule-core/pull/1105), [#1107](https://git.moleculesai.app/molecule-ai/molecule-core/pull/1107), [#1109](https://git.moleculesai.app/molecule-ai/molecule-core/pull/1109), [#1110](https://git.moleculesai.app/molecule-ai/molecule-core/pull/1110)
|
||||
|
||||
---
|
||||
|
||||
@ -48,7 +48,7 @@ AdminAuth now accepts a session-verification tier that runs **before** the beare
|
||||
**Self-hosted / local dev:** `CP_UPSTREAM_URL` is unset → this feature is disabled, behaviour is unchanged.
|
||||
|
||||
→ [Guide: Same-Origin Canvas Fetches & Session Auth](/docs/guides/same-origin-canvas-fetches.md)
|
||||
→ PRs: [#1099](https://github.com/Molecule-AI/molecule-core/pull/1099), [#1100](https://github.com/Molecule-AI/molecule-core/pull/1100)
|
||||
→ PRs: [#1099](https://git.moleculesai.app/molecule-ai/molecule-core/pull/1099), [#1100](https://git.moleculesai.app/molecule-ai/molecule-core/pull/1100)
|
||||
|
||||
---
|
||||
|
||||
@ -87,7 +87,7 @@ The proxy is **fail-closed**: only an explicit allowlist of paths (`/cp/auth/`,
|
||||
This is also the structural fix for the lateral-movement risk that session auth introduced: without the allowlist, a tenant-authed browser user could have proxied `/cp/admin/*` requests upstream and exploited the fact that those endpoints accept WorkOS session cookies. The allowlist makes that impossible by construction.
|
||||
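As an illustration of that fail-closed shape, a Next.js-style proxy handler might look like the sketch below. Only the `/cp/auth/` prefix and `CP_UPSTREAM_URL` are taken from the post; the handler name and the rest of the allowlist are assumptions.

```ts
// Illustrative fail-closed proxy: unknown paths (including /cp/admin/*) are
// rejected outright; only allowlisted prefixes are ever forwarded upstream.
const ALLOWED_PREFIXES = ["/cp/auth/"]; // extend deliberately, never wildcard

export async function proxyToControlPlane(req: Request): Promise<Response> {
  const upstream = process.env.CP_UPSTREAM_URL;
  if (!upstream) return new Response("proxy disabled", { status: 404 });

  const { pathname, search } = new URL(req.url);
  if (!ALLOWED_PREFIXES.some((p) => pathname.startsWith(p))) {
    return new Response("not found", { status: 404 }); // fail closed
  }

  const body =
    req.method === "GET" || req.method === "HEAD" ? undefined : await req.arrayBuffer();
  return fetch(`${upstream}${pathname}${search}`, {
    method: req.method,
    headers: req.headers,
    body,
  });
}
```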
|
||||
→ [Guide: Same-Origin Canvas Fetches & Session Auth](/docs/guides/same-origin-canvas-fetches.md)
|
||||
→ PR: [#1095](https://github.com/Molecule-AI/molecule-core/pull/1095)
|
||||
→ PR: [#1095](https://git.moleculesai.app/molecule-ai/molecule-core/pull/1095)
|
||||
|
||||
---
|
||||
|
||||
@ -99,7 +99,7 @@ The waitlist itself is a Canvas-administered list with email hashing in audit lo
|
||||
|
||||
This is the operational surface that makes the above security work matter: the beta is invitation-only, credentials are scoped, and every admin action is auditable.
|
||||
|
||||
→ Control plane PRs [#145](https://github.com/Molecule-AI/molecule-controlplane/pull/145), [#148](https://github.com/Molecule-AI/molecule-controlplane/pull/148), [#150](https://github.com/Molecule-AI/molecule-controlplane/pull/150)
|
||||
→ Control plane PRs [#145](https://git.moleculesai.app/molecule-ai/molecule-controlplane/pull/145), [#148](https://git.moleculesai.app/molecule-ai/molecule-controlplane/pull/148), [#150](https://git.moleculesai.app/molecule-ai/molecule-controlplane/pull/150)
|
||||
|
||||
---
|
||||
|
||||
|
||||
@ -12,7 +12,7 @@ Your team is in Discord. Your AI agents are in Molecule AI. Until today, those t
|
||||
|
||||
That's now one webhook URL.
|
||||
|
||||
Molecule AI workspaces can now connect to Discord. Here's what shipped in [PR #656](https://github.com/Molecule-AI/molecule-core/pull/656).
|
||||
Molecule AI workspaces can now connect to Discord. Here's what shipped in [PR #656](https://git.moleculesai.app/molecule-ai/molecule-core/pull/656).
|
||||
|
||||
---
|
||||
|
||||
@ -70,7 +70,7 @@ For inbound slash commands, point your Discord app's **Interactions Endpoint URL
|
||||
|
||||
## Security: Webhook Tokens Don't Appear in Logs
|
||||
|
||||
Webhook URLs contain a token (`/webhooks/{id}/{token}`). If that token leaks into server logs, it's a rotation event. The Discord adapter is explicit about this: HTTP request errors are logged without the URL, and the adapter returns a generic error message. This was hardened in [PR #659](https://github.com/Molecule-AI/molecule-core/pull/659).
|
||||
Webhook URLs contain a token (`/webhooks/{id}/{token}`). If that token leaks into server logs, it's a rotation event. The Discord adapter is explicit about this: HTTP request errors are logged without the URL, and the adapter returns a generic error message. This was hardened in [PR #659](https://git.moleculesai.app/molecule-ai/molecule-core/pull/659).
|
||||
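The adapter itself lives on the runtime side; purely to illustrate the logging rule, a redaction helper in TypeScript could look like this (regex and function names are illustrative):

```ts
// Strip the /webhooks/{id}/{token} tail before anything reaches a log line,
// so a failed POST can be reported without leaking the secret.
function redactWebhookUrl(url: string): string {
  return url.replace(/(\/webhooks\/\d+)\/[^/?\s]+/, "$1/<redacted>");
}

async function postToDiscord(webhookUrl: string, content: string): Promise<void> {
  const res = await fetch(webhookUrl, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ content }),
  });
  if (!res.ok) {
    // Generic error: status plus redacted target, never the raw URL.
    throw new Error(`Discord webhook POST failed (${res.status}) for ${redactWebhookUrl(webhookUrl)}`);
  }
}
```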
|
||||
---
|
||||
|
||||
@ -97,4 +97,4 @@ Documentation: [Social Channels guide](/docs/agent-runtime/social-channels#disco
|
||||
|
||||
---
|
||||
|
||||
*Discord adapter shipped in [PR #656](https://github.com/Molecule-AI/molecule-core/pull/656). Security hardening in [PR #659](https://github.com/Molecule-AI/molecule-core/pull/659). Molecule AI is open source — contributions welcome.*
|
||||
*Discord adapter shipped in [PR #656](https://git.moleculesai.app/molecule-ai/molecule-core/pull/656). Security hardening in [PR #659](https://git.moleculesai.app/molecule-ai/molecule-core/pull/659). Molecule AI is open source — contributions welcome.*
|
||||
|
||||
@ -133,4 +133,4 @@ With protocol-native A2A, you get:
|
||||
|
||||
Molecule AI's external agent registration is production-ready. Documentation is live at [External Agent Registration Guide](https://docs.molecule.ai/docs/guides/external-agent-registration). The npm package for the MCP server is available at [`@molecule-ai/mcp-server`](https://www.npmjs.com/package/@molecule-ai/mcp-server).
|
||||
|
||||
Read the full [A2A v1.0 protocol spec](https://github.com/Molecule-AI/molecule-core/blob/main/docs/api-protocol/a2a-protocol.md) on GitHub.
|
||||
Read the full [A2A v1.0 protocol spec](https://git.moleculesai.app/molecule-ai/molecule-core/src/branch/main/docs/api-protocol/a2a-protocol.md) in the molecule-core repo.
|
||||
@ -45,7 +45,7 @@ canonicalUrl: "https://docs.molecule.ai/blog/remote-workspaces"
|
||||
" proficiencyLevel": "Expert",
|
||||
"genre": ["technical documentation", "product announcement"],
|
||||
"sameAs": [
|
||||
"https://github.com/Molecule-AI/molecule-core",
|
||||
"https://git.moleculesai.app/molecule-ai/molecule-core",
|
||||
"https://molecule.ai"
|
||||
]
|
||||
}
|
||||
@ -270,7 +270,7 @@ Configure it in your project's `.mcp.json` and any AI agent (Claude Code, Cursor
|
||||
|
||||
→ [External Agent Registration Guide](/docs/guides/external-agent-registration) — full step-by-step with Python and Node.js reference implementations
|
||||
|
||||
→ [GitHub: molecule-core](https://github.com/Molecule-AI/molecule-core) — source and issues
|
||||
→ [GitHub: molecule-core](https://git.moleculesai.app/molecule-ai/molecule-core) — source and issues
|
||||
|
||||
→ [Phase 30 Launch Thread on X](https://x.com) — follow for updates
|
||||
|
||||
|
||||
@ -170,4 +170,4 @@ The `staging` branch is now on `a2a-sdk` 1.0.0. The `main` branch still carries
|
||||
|
||||
If you're running `a2a-sdk` 0.3.x and planning the 1.0.0 migration, this post is the reference. The four breaking changes are well-contained, the migration is a single PR, and the eight smoke scenarios above will tell you whether the upgrade is clean before you merge.
|
||||
|
||||
Questions? The [A2A protocol spec](https://github.com/google-a2a/a2a-specification) is the authoritative source. For Molecule AI's production A2A implementation, see [External Agent Registration](https://docs.molecule.ai/docs/guides/external-agent-registration) or open an issue in the [molecule-core](https://github.com/Molecule-AI/molecule-core) repo.
|
||||
Questions? The [A2A protocol spec](https://github.com/google-a2a/a2a-specification) is the authoritative source. For Molecule AI's production A2A implementation, see [External Agent Registration](https://docs.molecule.ai/docs/guides/external-agent-registration) or open an issue in the [molecule-core](https://git.moleculesai.app/molecule-ai/molecule-core) repo.
|
||||
|
||||
@ -3,8 +3,8 @@
|
||||
**Date:** 2026-04-23
|
||||
**Severity:** High — every new SaaS tenant blocked
|
||||
**Detection path:** E2E Staging SaaS run 24848425822 failed at "tenant provisioning"; investigation of CP Railway logs surfaced the auth mismatch.
|
||||
**Status:** Fix pushed on [molecule-controlplane#238](https://github.com/Molecule-AI/molecule-controlplane/pull/238).
|
||||
**Related:** [issue #239](https://github.com/Molecule-AI/molecule-controlplane/issues/239) (Cloudflare DNS record quota), [testing-strategy.md](../engineering/testing-strategy.md)
|
||||
**Status:** Fix pushed on [molecule-controlplane#238](https://git.moleculesai.app/molecule-ai/molecule-controlplane/pull/238).
|
||||
**Related:** [issue #239](https://git.moleculesai.app/molecule-ai/molecule-controlplane/issues/239) (Cloudflare DNS record quota), [testing-strategy.md](../engineering/testing-strategy.md)
|
||||
|
||||
## Summary
|
||||
|
||||
@ -35,7 +35,7 @@ The flow was:
|
||||
|
||||
### The commit that introduced the bug
|
||||
|
||||
[molecule-controlplane#235](https://github.com/Molecule-AI/molecule-controlplane/pull/235) — "fix(provision): wait for tenant boot-event before falling back to canary". Merged 2026-04-22.
|
||||
[molecule-controlplane#235](https://git.moleculesai.app/molecule-ai/molecule-controlplane/pull/235) — "fix(provision): wait for tenant boot-event before falling back to canary". Merged 2026-04-22.
|
||||
|
||||
Before #235, readiness was determined via a canary probe through Cloudflare's edge — which didn't need CP-side auth, so the INSERT ordering didn't matter. #235 made boot-events the primary readiness signal but didn't move the INSERT earlier. The race was latent before #235 and became load-bearing after it.
|
||||
|
||||
@ -90,7 +90,7 @@ bootReady, _ := provisioner.WaitForTenantReady(ctx, h.db, org.ID, 4*time.Minute)
|
||||
h.db.ExecContext(ctx, `UPDATE org_instances SET status = 'running' WHERE org_id = $1`, org.ID)
|
||||
```
|
||||
|
||||
See [molecule-controlplane#238](https://github.com/Molecule-AI/molecule-controlplane/pull/238) for the full diff.
|
||||
See [molecule-controlplane#238](https://git.moleculesai.app/molecule-ai/molecule-controlplane/pull/238) for the full diff.
|
||||
|
||||
## Lessons
|
||||
|
||||
@ -122,9 +122,9 @@ Early investigation blamed the hermes provider 401 bug (a separate, known issue
|
||||
|
||||
## Follow-ups
|
||||
|
||||
- [ ] Land [molecule-controlplane#238](https://github.com/Molecule-AI/molecule-controlplane/pull/238)
|
||||
- [ ] Land [molecule-controlplane#238](https://git.moleculesai.app/molecule-ai/molecule-controlplane/pull/238)
|
||||
- [ ] Redeploy staging-api, verify E2E goes green
|
||||
- [ ] Add CP integration test suite (see lesson #2)
|
||||
- [ ] Wire E2E failure → notification (see lesson #3)
|
||||
- [ ] Add invariant comment in `provisionTenant` (see lesson #4)
|
||||
- [ ] Cloudflare DNS quota cleanup — [molecule-controlplane#239](https://github.com/Molecule-AI/molecule-controlplane/issues/239)
|
||||
- [ ] Cloudflare DNS quota cleanup — [molecule-controlplane#239](https://git.moleculesai.app/molecule-ai/molecule-controlplane/issues/239)
|
||||
|
||||
@ -138,5 +138,5 @@ If you see any of these, don't try to "clean it up in place" — **cherry-pick o
|
||||
|
||||
## Related
|
||||
|
||||
- [Issue #1822](https://github.com/Molecule-AI/molecule-core/issues/1822) — backend parity drift tracker (example of docs that have to stay current)
|
||||
- [Issue #1822](https://git.moleculesai.app/molecule-ai/molecule-core/issues/1822) — backend parity drift tracker (example of docs that have to stay current)
|
||||
- [Postmortem: CP boot-event 401](./postmortem-2026-04-23-boot-event-401.md) — caught before shipping because a reviewer could read the diff
|
||||
|
||||
@ -103,9 +103,9 @@ A bad test:
|
||||
|
||||
## Related
|
||||
|
||||
- [Issue #1821](https://github.com/Molecule-AI/molecule-core/issues/1821) — policy tracking issue
|
||||
- [Issue #1815](https://github.com/Molecule-AI/molecule-core/issues/1815) — Canvas coverage instrumentation
|
||||
- [Issue #1818](https://github.com/Molecule-AI/molecule-core/issues/1818) — Python pytest-cov
|
||||
- [Issue #1814](https://github.com/Molecule-AI/molecule-core/issues/1814) — workspace_provision_test.go unblock
|
||||
- [Issue #1816](https://github.com/Molecule-AI/molecule-core/issues/1816) — tokens.go coverage
|
||||
- [Issue #1819](https://github.com/Molecule-AI/molecule-core/issues/1819) — wsauth_middleware coverage
|
||||
- [Issue #1821](https://git.moleculesai.app/molecule-ai/molecule-core/issues/1821) — policy tracking issue
|
||||
- [Issue #1815](https://git.moleculesai.app/molecule-ai/molecule-core/issues/1815) — Canvas coverage instrumentation
|
||||
- [Issue #1818](https://git.moleculesai.app/molecule-ai/molecule-core/issues/1818) — Python pytest-cov
|
||||
- [Issue #1814](https://git.moleculesai.app/molecule-ai/molecule-core/issues/1814) — workspace_provision_test.go unblock
|
||||
- [Issue #1816](https://git.moleculesai.app/molecule-ai/molecule-core/issues/1816) — tokens.go coverage
|
||||
- [Issue #1819](https://git.moleculesai.app/molecule-ai/molecule-core/issues/1819) — wsauth_middleware coverage
|
||||
|
||||
@ -153,7 +153,7 @@ The `id` field is your workspace ID — remember it.
|
||||
|---|---|
|
||||
| "Failed to send message — agent may be unreachable" | The tenant couldn't POST to your URL. Verify `curl https://<your-tunnel>/health` returns 200 from another machine. |
|
||||
| Response takes > 30s | Canvas times out around 30s. Keep initial implementations simple. For long-running work, return a placeholder and use [polling mode](#next-step-polling-mode-preview) (once available). |
|
||||
| Agent duplicated in chat | Known canvas bug where WebSocket + HTTP responses both render. Fixed in [PR #1517](https://github.com/Molecule-AI/molecule-core/pull/1517). |
|
||||
| Agent duplicated in chat | Known canvas bug where WebSocket + HTTP responses both render. Fixed in [PR #1517](https://git.moleculesai.app/molecule-ai/molecule-core/pull/1517). |
|
||||
| Agent replies but canvas shows "Agent unreachable" | Check the tenant can reach your URL. Cloudflare quick tunnels rotate — the URL in your canvas may point at a dead tunnel after restart. |
|
||||
| Getting 404 when POSTing to tenant | Add `X-Molecule-Org-Id` header. The tenant's security layer 404s unmatched origin requests by design. |
|
||||
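For reference when working through the first two rows, a minimal push-mode endpoint can be this small. This is a hedged Go sketch: the `/health` route and the roughly 30-second response budget come from the table above, while the message route and payload fields are placeholders standing in for the schema documented earlier in this guide.

```go
package main

import (
	"encoding/json"
	"log"
	"net/http"
)

func main() {
	// /health must answer 200 quickly; the tenant (and your own curl
	// check from another machine) uses it to confirm reachability.
	http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
	})

	// Placeholder message route: reply well under the ~30s canvas
	// timeout. Field names are illustrative, not the documented schema.
	http.HandleFunc("/message", func(w http.ResponseWriter, r *http.Request) {
		var in struct {
			Text string `json:"text"`
		}
		_ = json.NewDecoder(r.Body).Decode(&in)
		_ = json.NewEncoder(w).Encode(map[string]string{"text": "ack: " + in.Text})
	})

	log.Fatal(http.ListenAndServe(":8000", nil))
}
```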
|
||||
@ -215,7 +215,7 @@ Push mode (this guide) works today but requires an inbound-reachable URL — whi
|
||||
|
||||
Your agent makes only outbound HTTPS calls to the platform, pulling messages from an inbox queue and posting replies back. Works behind any NAT/firewall, tolerates offline laptops, no tunnel needed.
|
||||
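To make the pull model concrete, here is a hedged Go sketch of what a polling agent could look like. Everything here is an assumption: polling mode has not shipped, so the inbox and outbox URLs, the poll interval, and the payload shape are placeholders, not a documented API.

```go
package main

import (
	"bytes"
	"log"
	"net/http"
	"time"
)

// Placeholder endpoints; the real paths will come from the design doc.
const (
	inboxURL  = "https://tenant.example.com/api/workspaces/ws-123/inbox/pop"
	outboxURL = "https://tenant.example.com/api/workspaces/ws-123/outbox"
)

func pollOnce(client *http.Client) error {
	// Outbound-only: the agent initiates every connection, so this works
	// behind NAT and firewalls with no inbound tunnel at all.
	resp, err := client.Post(inboxURL, "application/json", nil)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode == http.StatusNoContent {
		return nil // nothing queued
	}
	// ...handle the popped message, then post the reply back...
	reply := bytes.NewBufferString(`{"text":"placeholder reply"}`)
	out, err := client.Post(outboxURL, "application/json", reply)
	if err != nil {
		return err
	}
	return out.Body.Close()
}

func main() {
	client := &http.Client{Timeout: 20 * time.Second}
	for {
		if err := pollOnce(client); err != nil {
			log.Println("poll error:", err)
		}
		time.Sleep(5 * time.Second)
	}
}
```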
|
||||
See the [design doc](https://github.com/Molecule-AI/internal/blob/main/product/external-workspaces-polling.md) (internal) and [implementation tracking issue](https://github.com/Molecule-AI/molecule-core/issues?q=polling+mode) once opened.
|
||||
See the [design doc](https://git.moleculesai.app/molecule-ai/internal/src/branch/main/product/external-workspaces-polling.md) (internal) and the implementation tracking issue (search for `polling mode` on the [molecule-core issue tracker](https://git.moleculesai.app/molecule-ai/molecule-core/issues)).
|
||||
|
||||
---
|
||||
|
||||
@ -255,7 +255,7 @@ If all four pass and canvas still shows your agent as unreachable, see the [remo
|
||||
## Feedback
|
||||
|
||||
This is a new path. Tell us what broke:
|
||||
- Open an issue: https://github.com/Molecule-AI/molecule-core/issues/new?labels=external-workspace
|
||||
- Open an issue: https://git.moleculesai.app/molecule-ai/molecule-core/issues/new?labels=external-workspace
|
||||
- Join #external-workspaces on our Slack
|
||||
- Submit a PR improving this doc if something tripped you up — the faster we can make the quickstart, the more developers we bring in
|
||||
|
||||
|
||||
@ -143,5 +143,5 @@ The agent appears on the canvas with a **purple REMOTE badge** within seconds. F
|
||||
## Next Steps
|
||||
|
||||
- **[External Agent Registration Guide →](/docs/guides/external-agent-registration)** — full endpoint reference, Python + Node.js examples, troubleshooting
|
||||
- **[molecule-sdk-python →](https://github.com/Molecule-AI/molecule-sdk-python)** — SDK source, `RemoteAgentClient` API docs
|
||||
- **[SDK Examples →](https://github.com/Molecule-AI/molecule-sdk-python/tree/main/examples/remote-agent)** — `run.py` demo script, annotated walkthrough
|
||||
- **[molecule-sdk-python →](https://git.moleculesai.app/molecule-ai/molecule-sdk-python)** — SDK source, `RemoteAgentClient` API docs
|
||||
- **[SDK Examples →](https://git.moleculesai.app/molecule-ai/molecule-sdk-python/src/branch/main/examples/remote-agent)** — `run.py` demo script, annotated walkthrough
|
||||
|
||||
@ -61,7 +61,7 @@ molecule skills install arxiv-research --from community
|
||||
|
||||
Community skills are reviewed by the Molecule AI team before being
|
||||
listed. Submit a skill for review by opening a PR against
|
||||
[`molecule-ai/skills`](https://github.com/Molecule-AI/skills).
|
||||
[`molecule-ai/skills`](https://git.moleculesai.app/molecule-ai/skills).
|
||||
|
||||
## Installing via config.yaml
|
||||
|
||||
@ -151,7 +151,7 @@ molecule skills bundle my-custom-skill --output ./org-templates/my-role/
|
||||
```
|
||||
|
||||
**Publishing to the community:** Open a PR against
|
||||
[`molecule-ai/skills`](https://github.com/Molecule-AI/skills) with a
|
||||
[`molecule-ai/skills`](https://git.moleculesai.app/molecule-ai/skills) with a
|
||||
complete skill package. Community skills are reviewed for security and
|
||||
correctness before listing.
|
||||
|
||||
|
||||
@ -99,10 +99,10 @@ fork needed in production.
|
||||
`resolve_platform_id` for plugin-platform-safe deserialization, and
|
||||
`self.adapters[adapter.platform]` keying fix (caught by real-subprocess
|
||||
test before merge — see below).
|
||||
- **Plugin package**: [Molecule-AI/hermes-platform-molecule-a2a](https://github.com/Molecule-AI/hermes-platform-molecule-a2a)
|
||||
- **Plugin package**: [Molecule-AI/hermes-platform-molecule-a2a](https://git.moleculesai.app/molecule-ai/hermes-platform-molecule-a2a)
|
||||
v0.1.0 — public, MIT-licensed. 11 unit tests + 8 in-process E2E
|
||||
+ 4 real-subprocess E2E checkpoints all green.
|
||||
- **Workspace template patch**: [Molecule-AI/molecule-ai-workspace-template-hermes#32](https://github.com/Molecule-AI/molecule-ai-workspace-template-hermes/pull/32)
|
||||
- **Workspace template patch**: [Molecule-AI/molecule-ai-workspace-template-hermes#32](https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-hermes/pull/32)
|
||||
— Dockerfile installs the patched fork + plugin into the hermes
|
||||
installer's venv; start.sh seeds `platforms.molecule-a2a` config
|
||||
stanza. Pre-demo deliberately install-only; adapter.py rewrite to
|
||||
@ -157,9 +157,9 @@ intermediate shim earns its complexity.
|
||||
## Codex (OpenAI Codex CLI)
|
||||
|
||||
**Status:** Template SHIPPED. Repo live at
|
||||
[`Molecule-AI/molecule-ai-workspace-template-codex`](https://github.com/Molecule-AI/molecule-ai-workspace-template-codex)
|
||||
[`Molecule-AI/molecule-ai-workspace-template-codex`](https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-codex)
|
||||
(14 files, 1411 LOC, 12/12 tests). molecule-core registration in
|
||||
[PR #2512](https://github.com/Molecule-AI/molecule-core/pull/2512).
|
||||
[PR #2512](https://git.moleculesai.app/molecule-ai/molecule-core/pull/2512).
|
||||
E2E with real A2A traffic remains.
|
||||
|
||||
**Path:** Persistent `codex app-server` stdio JSON-RPC client
|
||||
|
||||
@ -101,7 +101,7 @@ incident-shaped.
|
||||
## [v1.0.0] — initial release (RFC #2728, PRs #2729-#2742)
|
||||
|
||||
Initial plugin contract + 11-PR rollout. See
|
||||
[issue #2728](https://github.com/Molecule-AI/molecule-core/issues/2728)
|
||||
[issue #2728](https://git.moleculesai.app/molecule-ai/molecule-core/issues/2728)
|
||||
for the full RFC.
|
||||
|
||||
Endpoints: `/v1/health`, `/v1/namespaces/{name}` (PUT/PATCH/DELETE),
|
||||
|
||||
@ -160,11 +160,11 @@ not expose.
|
||||
| `molecule-skill-update-docs` | `[claude_code]` | `[claude_code, hermes]` |
|
||||
|
||||
Companion PRs:
|
||||
- [molecule-ai-plugin-ecc#2](https://github.com/Molecule-AI/molecule-ai-plugin-ecc/pull/2)
|
||||
- [molecule-ai-plugin-superpowers#2](https://github.com/Molecule-AI/molecule-ai-plugin-superpowers/pull/2)
|
||||
- [molecule-ai-plugin-molecule-dev#2](https://github.com/Molecule-AI/molecule-ai-plugin-molecule-dev/pull/2)
|
||||
- [molecule-ai-plugin-molecule-skill-cron-learnings#2](https://github.com/Molecule-AI/molecule-ai-plugin-molecule-skill-cron-learnings/pull/2)
|
||||
- [molecule-ai-plugin-molecule-skill-update-docs#2](https://github.com/Molecule-AI/molecule-ai-plugin-molecule-skill-update-docs/pull/2)
|
||||
- [molecule-ai-plugin-ecc#2](https://git.moleculesai.app/molecule-ai/molecule-ai-plugin-ecc/pull/2)
|
||||
- [molecule-ai-plugin-superpowers#2](https://git.moleculesai.app/molecule-ai/molecule-ai-plugin-superpowers/pull/2)
|
||||
- [molecule-ai-plugin-molecule-dev#2](https://git.moleculesai.app/molecule-ai/molecule-ai-plugin-molecule-dev/pull/2)
|
||||
- [molecule-ai-plugin-molecule-skill-cron-learnings#2](https://git.moleculesai.app/molecule-ai/molecule-ai-plugin-molecule-skill-cron-learnings/pull/2)
|
||||
- [molecule-ai-plugin-molecule-skill-update-docs#2](https://git.moleculesai.app/molecule-ai/molecule-ai-plugin-molecule-skill-update-docs/pull/2)
|
||||
|
||||
Security note: Security Auditor was offline at time of change. Self-assessed
|
||||
as non-security-impacting — adding `hermes` to a string list in `plugin.yaml`
|
||||
|
||||
@ -17,7 +17,7 @@ This path is aligned to the current repository and current UI. It gets you from
|
||||
## The one-command path
|
||||
|
||||
```bash
|
||||
git clone https://github.com/Molecule-AI/molecule-monorepo.git
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-monorepo.git
|
||||
cd molecule-monorepo
|
||||
./scripts/dev-start.sh
|
||||
```
|
||||
@ -42,7 +42,7 @@ If you'd rather run each component yourself — useful when you're iterating on
|
||||
### Step 1: Clone the repository
|
||||
|
||||
```bash
|
||||
git clone https://github.com/Molecule-AI/molecule-monorepo.git
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-monorepo.git
|
||||
cd molecule-monorepo
|
||||
```
|
||||
|
||||
|
||||
137
docs/runbooks/handlers-postgres-integration-port-collision.md
Normal file
@ -0,0 +1,137 @@
|
||||
# Runbook — Handlers Postgres Integration port-collision substrate
|
||||
|
||||
**Status:** Resolved 2026-05-08 (PR for class B Hongming-owned CICD red sweep).
|
||||
|
||||
## Symptom
|
||||
|
||||
`Handlers Postgres Integration` workflow fails on staging push and PRs.
|
||||
Step `Apply migrations to Postgres service` shows:
|
||||
|
||||
```
|
||||
psql: error: connection to server at "127.0.0.1", port 5432 failed: Connection refused
|
||||
```
|
||||
|
||||
Job-cleanup step further down logs:
|
||||
|
||||
```
|
||||
Cleaning up services for job Handlers Postgres Integration
|
||||
failed to remove container: Error response from daemon: No such container: <id>
|
||||
```
|
||||
|
||||
…confirming the postgres service container was already gone before
|
||||
cleanup ran.
|
||||
|
||||
## Root cause
|
||||
|
||||
Our Gitea act_runner (operator host `5.78.80.188`,
|
||||
`/opt/molecule/runners/config.yaml`) sets:
|
||||
|
||||
```yaml
|
||||
container:
|
||||
network: host
|
||||
```
|
||||
|
||||
…which act_runner applies to BOTH the job container AND every
|
||||
`services:` container in a workflow. Multiple workflow instances
|
||||
running concurrently across the 16 parallel runners each try to bind
|
||||
postgres on `0.0.0.0:5432`. The first wins; subsequent instances exit
|
||||
immediately with:
|
||||
|
||||
```
|
||||
LOG: could not bind IPv4 address "0.0.0.0": Address in use
|
||||
HINT: Is another postmaster already running on port 5432?
|
||||
FATAL: could not create any TCP/IP sockets
|
||||
```
|
||||
|
||||
act_runner sets `AutoRemove:true` on service containers, so Docker
|
||||
garbage-collects them as soon as they exit. By the time the migrations
|
||||
step runs `pg_isready` / `psql`, the container is gone and connection
|
||||
refused.
|
||||
|
||||
Reproduction (operator host):
|
||||
|
||||
```bash
|
||||
docker run --rm -d --name pg-A --network host \
|
||||
-e POSTGRES_PASSWORD=test postgres:15-alpine
|
||||
docker run -d --name pg-B --network host \
|
||||
-e POSTGRES_PASSWORD=test postgres:15-alpine
|
||||
docker logs pg-B # FATAL: could not create any TCP/IP sockets
|
||||
```
|
||||
|
||||
## Why per-job override doesn't work
|
||||
|
||||
The natural fix — per-job `container.network` override — is silently
|
||||
ignored by act_runner. The runner log emits:
|
||||
|
||||
```
|
||||
--network and --net in the options will be ignored.
|
||||
```
|
||||
|
||||
This is a documented act_runner constraint: container network is a
|
||||
runner-wide setting, not per-job. Source: gitea/act_runner config docs
|
||||
+ vegardit/docker-gitea-act-runner issue #7.
|
||||
|
||||
Flipping the global `container.network` to `bridge` would break every
|
||||
other workflow in the repo (cache server discovery,
|
||||
`molecule-monorepo-net` peer access during integration tests, etc.) —
|
||||
unacceptable blast radius for a per-test bug.
|
||||
|
||||
## Fix shape
|
||||
|
||||
`handlers-postgres-integration.yml` no longer uses `services: postgres:`.
|
||||
It launches a sibling postgres container manually on the existing
|
||||
`molecule-monorepo-net` bridge network with a per-run unique name:
|
||||
|
||||
```yaml
|
||||
env:
|
||||
PG_NAME: pg-handlers-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
PG_NETWORK: molecule-monorepo-net
|
||||
|
||||
steps:
|
||||
- name: Start sibling Postgres on bridge network
|
||||
run: |
|
||||
docker run -d --name "${PG_NAME}" --network "${PG_NETWORK}" \
|
||||
...
|
||||
postgres:15-alpine
|
||||
PG_HOST=$(docker inspect "${PG_NAME}" \
|
||||
--format "{{(index .NetworkSettings.Networks \"${PG_NETWORK}\").IPAddress}}")
|
||||
echo "PG_HOST=${PG_HOST}" >> "$GITHUB_ENV"
|
||||
|
||||
# … migrations + tests use ${PG_HOST}, not 127.0.0.1 …
|
||||
|
||||
- if: always() && …
|
||||
name: Stop sibling Postgres
|
||||
run: docker rm -f "${PG_NAME}" || true
|
||||
```
|
||||
|
||||
The host-net job container can reach a bridge-net container via the
|
||||
bridge IP directly (verified manually, 2026-05-08). Two parallel runs
|
||||
use different names + different bridge IPs — no collision.
|
||||
|
||||
## Future-proofing
|
||||
|
||||
Other workflows that hit the same shape (any `services:` with a
|
||||
fixed-port image) will exhibit the same failure mode under
|
||||
host-network runner config. Migrate them using the same pattern:
|
||||
|
||||
1. Drop the `services:` block.
|
||||
2. Use `${{ github.run_id }}-${{ github.run_attempt }}` for unique
|
||||
container name.
|
||||
3. Launch on `molecule-monorepo-net` (already trusted bridge in
|
||||
`docker-compose.infra.yml`).
|
||||
4. Read back the bridge IP via `docker inspect` and export as a step env.
|
||||
5. `if: always()` cleanup step at the end.
|
||||
|
||||
If the count of such workflows grows, factor into a composite action
|
||||
(`./.github/actions/sibling-postgres`) so the substrate logic lives
|
||||
in one place.
|
||||
|
||||
## Related
|
||||
|
||||
- Issue #88 (closed by #92): localhost → 127.0.0.1 fix that unmasked
|
||||
this collision; the IPv6 fix is correct, port collision is the new
|
||||
layer.
|
||||
- Issue #94 created `molecule-monorepo-net` + `alpine:latest` as
|
||||
prereqs.
|
||||
- Saved memory `feedback_act_runner_github_server_url` documents
|
||||
another act_runner-vs-GHA divergence (server URL).
|
||||
@ -198,7 +198,7 @@ Lighthouse audit against staging.yourapp.com:
|
||||
FCP: 2.4s | LCP: 5.2s | CLS: 0.18 | TBT: 620ms
|
||||
|
||||
Performance regression detected — opening GitHub issue.
|
||||
Issue: https://github.com/Molecule-AI/molecule-core/issues/1527
|
||||
Issue: https://git.moleculesai.app/molecule-ai/molecule-core/issues/1527
|
||||
Label: performance-regression | Assignees: @your-team
|
||||
```
|
||||
|
||||
|
||||
@ -85,8 +85,8 @@ Fly Machines start in milliseconds and run in 35+ regions. Provisioning agent wo
|
||||
|
||||
## Related
|
||||
|
||||
- PR #501: [feat(platform): Fly Machines provisioner](https://github.com/Molecule-AI/molecule-core/pull/501)
|
||||
- PR #481: [feat(ci): deploy to Fly after image push](https://github.com/Molecule-AI/molecule-core/pull/481)
|
||||
- PR #501: [feat(platform): Fly Machines provisioner](https://git.moleculesai.app/molecule-ai/molecule-core/pull/501)
|
||||
- PR #481: [feat(ci): deploy to Fly after image push](https://git.moleculesai.app/molecule-ai/molecule-core/pull/481)
|
||||
- [Fly Machines API docs](https://fly.io/docs/machines/api/)
|
||||
- [Platform API reference](../api-reference.md)
|
||||
- Issue [#525](https://github.com/Molecule-AI/molecule-core/issues/525)
|
||||
- Issue [#525](https://git.moleculesai.app/molecule-ai/molecule-core/issues/525)
|
||||
|
||||
@ -61,6 +61,6 @@ The real power surfaces when you mix runtimes on the same Molecule AI tenant. Yo
|
||||
|
||||
## Related
|
||||
|
||||
- PR #379: [feat(adapters): add gemini-cli runtime adapter](https://github.com/Molecule-AI/molecule-core/pull/379)
|
||||
- PR #379: [feat(adapters): add gemini-cli runtime adapter](https://git.moleculesai.app/molecule-ai/molecule-core/pull/379)
|
||||
- [Multi-provider Hermes docs](../architecture/hermes.md)
|
||||
- [Workspace runtimes reference](../reference/runtimes.md)
|
||||
|
||||
@ -68,7 +68,7 @@ ADK workspaces participate in the same A2A network as Claude Code, Gemini CLI, H
|
||||
|
||||
## Related
|
||||
|
||||
- PR #550: [feat(adapters): add google-adk runtime adapter](https://github.com/Molecule-AI/molecule-core/pull/550)
|
||||
- PR #550: [feat(adapters): add google-adk runtime adapter](https://git.moleculesai.app/molecule-ai/molecule-core/pull/550)
|
||||
- [Google ADK (adk-python)](https://github.com/google/adk-python)
|
||||
- [Gemini CLI runtime tutorial](./gemini-cli-runtime.md)
|
||||
- [Platform API reference](../api-reference.md)
|
||||
|
||||
@ -176,9 +176,9 @@ What is on the roadmap for Phase 2d (not yet shipped):
|
||||
|
||||
## Related
|
||||
|
||||
- PR #240: [Phase 2a — native Anthropic dispatch](https://github.com/Molecule-AI/molecule-core/pull/240)
|
||||
- PR #255: [Phase 2b — native Gemini dispatch](https://github.com/Molecule-AI/molecule-core/pull/255)
|
||||
- PR #267: [Phase 2c — multi-turn history on all paths](https://github.com/Molecule-AI/molecule-core/pull/267)
|
||||
- PR #240: [Phase 2a — native Anthropic dispatch](https://git.moleculesai.app/molecule-ai/molecule-core/pull/240)
|
||||
- PR #255: [Phase 2b — native Gemini dispatch](https://git.moleculesai.app/molecule-ai/molecule-core/pull/255)
|
||||
- PR #267: [Phase 2c — multi-turn history on all paths](https://git.moleculesai.app/molecule-ai/molecule-core/pull/267)
|
||||
- [Hermes adapter design](../adapters/hermes-adapter-design.md)
|
||||
- [Platform API reference](../api-reference.md)
|
||||
- Issue [#513](https://github.com/Molecule-AI/molecule-core/issues/513)
|
||||
- Issue [#513](https://git.moleculesai.app/molecule-ai/molecule-core/issues/513)
|
||||
|
||||
@ -90,6 +90,6 @@ Molecule AI canvas without code changes.
|
||||
|
||||
## Related
|
||||
|
||||
- PR #480: [feat(channels): Lark / Feishu channel adapter](https://github.com/Molecule-AI/molecule-core/pull/480)
|
||||
- PR #480: [feat(channels): Lark / Feishu channel adapter](https://git.moleculesai.app/molecule-ai/molecule-core/pull/480)
|
||||
- [Social channels architecture](../agent-runtime/social-channels.md)
|
||||
- [Channel adapter reference](../api-reference.md#channels)
|
||||
@ -98,14 +98,14 @@ Each of the 8 adapter template repos contains:
|
||||
|
||||
| Adapter | Repo |
|
||||
|---------|------|
|
||||
| claude-code | https://github.com/Molecule-AI/molecule-ai-workspace-template-claude-code |
|
||||
| langgraph | https://github.com/Molecule-AI/molecule-ai-workspace-template-langgraph |
|
||||
| crewai | https://github.com/Molecule-AI/molecule-ai-workspace-template-crewai |
|
||||
| autogen | https://github.com/Molecule-AI/molecule-ai-workspace-template-autogen |
|
||||
| deepagents | https://github.com/Molecule-AI/molecule-ai-workspace-template-deepagents |
|
||||
| hermes | https://github.com/Molecule-AI/molecule-ai-workspace-template-hermes |
|
||||
| gemini-cli | https://github.com/Molecule-AI/molecule-ai-workspace-template-gemini-cli |
|
||||
| openclaw | https://github.com/Molecule-AI/molecule-ai-workspace-template-openclaw |
|
||||
| claude-code | https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-claude-code |
|
||||
| langgraph | https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-langgraph |
|
||||
| crewai | https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-crewai |
|
||||
| autogen | https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-autogen |
|
||||
| deepagents | https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-deepagents |
|
||||
| hermes | https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-hermes |
|
||||
| gemini-cli | https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-gemini-cli |
|
||||
| openclaw | https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-openclaw |
|
||||
|
||||
## Adapter discovery (ADAPTER_MODULE)
|
||||
|
||||
@ -244,7 +244,7 @@ correctness before pushing a `runtime-v*` tag.
|
||||
## Writing a new adapter
|
||||
|
||||
Use the GitHub template repo
|
||||
[`Molecule-AI/molecule-ai-workspace-template-starter`](https://github.com/Molecule-AI/molecule-ai-workspace-template-starter)
|
||||
[`molecule-ai/molecule-ai-workspace-template-starter`](https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-starter) (note: the starter repo did not survive the 2026-05-06 GitHub-org-suspension migration; recreation tracked at internal#41)
|
||||
— it ships with the canonical Dockerfile + adapter.py skeleton + config.yaml
|
||||
schema + the `repository_dispatch: [runtime-published]` cascade receiver
|
||||
already wired up. No follow-up setup PR required.
|
||||
@ -256,7 +256,7 @@ gh repo create Molecule-AI/molecule-ai-workspace-template-<runtime> \
|
||||
--public \
|
||||
--description "Molecule AI workspace template: <runtime>"
|
||||
|
||||
git clone https://github.com/Molecule-AI/molecule-ai-workspace-template-<runtime>
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-<runtime>.git
|
||||
cd molecule-ai-workspace-template-<runtime>
|
||||
```
|
||||
|
||||
@ -286,7 +286,7 @@ After `git push`:
|
||||
If the canonical shape changes (e.g. `config.yaml` schema gets a new field,
|
||||
the `BaseAdapter` interface adds a method, the reusable CI workflow
|
||||
signature changes), update the
|
||||
[starter](https://github.com/Molecule-AI/molecule-ai-workspace-template-starter)
|
||||
[starter](https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-starter) (recreation pending — see note above)
|
||||
**first**. Existing templates can either migrate at their own pace or be
|
||||
touched in a coordinated cleanup PR. Either way, future templates pick up
|
||||
the new shape from day one.
|
||||
|
||||
@ -2,46 +2,46 @@
|
||||
"_comment": "Pin refs to release tags for reproducible builds. 'main' is OK while all repos are internal.",
|
||||
"version": 1,
|
||||
"plugins": [
|
||||
{"name": "browser-automation", "repo": "Molecule-AI/molecule-ai-plugin-browser-automation", "ref": "main"},
|
||||
{"name": "ecc", "repo": "Molecule-AI/molecule-ai-plugin-ecc", "ref": "main"},
|
||||
{"name": "gh-identity", "repo": "Molecule-AI/molecule-ai-plugin-gh-identity", "ref": "main"},
|
||||
{"name": "molecule-audit", "repo": "Molecule-AI/molecule-ai-plugin-molecule-audit", "ref": "main"},
|
||||
{"name": "molecule-audit-trail", "repo": "Molecule-AI/molecule-ai-plugin-molecule-audit-trail", "ref": "main"},
|
||||
{"name": "molecule-careful-bash", "repo": "Molecule-AI/molecule-ai-plugin-molecule-careful-bash", "ref": "main"},
|
||||
{"name": "molecule-compliance", "repo": "Molecule-AI/molecule-ai-plugin-molecule-compliance", "ref": "main"},
|
||||
{"name": "molecule-dev", "repo": "Molecule-AI/molecule-ai-plugin-molecule-dev", "ref": "main"},
|
||||
{"name": "molecule-freeze-scope", "repo": "Molecule-AI/molecule-ai-plugin-molecule-freeze-scope", "ref": "main"},
|
||||
{"name": "molecule-hitl", "repo": "Molecule-AI/molecule-ai-plugin-molecule-hitl", "ref": "main"},
|
||||
{"name": "molecule-prompt-watchdog", "repo": "Molecule-AI/molecule-ai-plugin-molecule-prompt-watchdog", "ref": "main"},
|
||||
{"name": "molecule-security-scan", "repo": "Molecule-AI/molecule-ai-plugin-molecule-security-scan", "ref": "main"},
|
||||
{"name": "molecule-session-context", "repo": "Molecule-AI/molecule-ai-plugin-molecule-session-context", "ref": "main"},
|
||||
{"name": "molecule-skill-code-review", "repo": "Molecule-AI/molecule-ai-plugin-molecule-skill-code-review", "ref": "main"},
|
||||
{"name": "molecule-skill-cron-learnings", "repo": "Molecule-AI/molecule-ai-plugin-molecule-skill-cron-learnings", "ref": "main"},
|
||||
{"name": "molecule-skill-cross-vendor-review", "repo": "Molecule-AI/molecule-ai-plugin-molecule-skill-cross-vendor-review", "ref": "main"},
|
||||
{"name": "molecule-skill-llm-judge", "repo": "Molecule-AI/molecule-ai-plugin-molecule-skill-llm-judge", "ref": "main"},
|
||||
{"name": "molecule-skill-update-docs", "repo": "Molecule-AI/molecule-ai-plugin-molecule-skill-update-docs", "ref": "main"},
|
||||
{"name": "molecule-workflow-retro", "repo": "Molecule-AI/molecule-ai-plugin-molecule-workflow-retro", "ref": "main"},
|
||||
{"name": "molecule-workflow-triage", "repo": "Molecule-AI/molecule-ai-plugin-molecule-workflow-triage", "ref": "main"},
|
||||
{"name": "superpowers", "repo": "Molecule-AI/molecule-ai-plugin-superpowers", "ref": "main"}
|
||||
{"name": "browser-automation", "repo": "molecule-ai/molecule-ai-plugin-browser-automation", "ref": "main"},
|
||||
{"name": "ecc", "repo": "molecule-ai/molecule-ai-plugin-ecc", "ref": "main"},
|
||||
{"name": "gh-identity", "repo": "molecule-ai/molecule-ai-plugin-gh-identity", "ref": "main"},
|
||||
{"name": "molecule-audit", "repo": "molecule-ai/molecule-ai-plugin-molecule-audit", "ref": "main"},
|
||||
{"name": "molecule-audit-trail", "repo": "molecule-ai/molecule-ai-plugin-molecule-audit-trail", "ref": "main"},
|
||||
{"name": "molecule-careful-bash", "repo": "molecule-ai/molecule-ai-plugin-molecule-careful-bash", "ref": "main"},
|
||||
{"name": "molecule-compliance", "repo": "molecule-ai/molecule-ai-plugin-molecule-compliance", "ref": "main"},
|
||||
{"name": "molecule-dev", "repo": "molecule-ai/molecule-ai-plugin-molecule-dev", "ref": "main"},
|
||||
{"name": "molecule-freeze-scope", "repo": "molecule-ai/molecule-ai-plugin-molecule-freeze-scope", "ref": "main"},
|
||||
{"name": "molecule-hitl", "repo": "molecule-ai/molecule-ai-plugin-molecule-hitl", "ref": "main"},
|
||||
{"name": "molecule-prompt-watchdog", "repo": "molecule-ai/molecule-ai-plugin-molecule-prompt-watchdog", "ref": "main"},
|
||||
{"name": "molecule-security-scan", "repo": "molecule-ai/molecule-ai-plugin-molecule-security-scan", "ref": "main"},
|
||||
{"name": "molecule-session-context", "repo": "molecule-ai/molecule-ai-plugin-molecule-session-context", "ref": "main"},
|
||||
{"name": "molecule-skill-code-review", "repo": "molecule-ai/molecule-ai-plugin-molecule-skill-code-review", "ref": "main"},
|
||||
{"name": "molecule-skill-cron-learnings", "repo": "molecule-ai/molecule-ai-plugin-molecule-skill-cron-learnings", "ref": "main"},
|
||||
{"name": "molecule-skill-cross-vendor-review", "repo": "molecule-ai/molecule-ai-plugin-molecule-skill-cross-vendor-review", "ref": "main"},
|
||||
{"name": "molecule-skill-llm-judge", "repo": "molecule-ai/molecule-ai-plugin-molecule-skill-llm-judge", "ref": "main"},
|
||||
{"name": "molecule-skill-update-docs", "repo": "molecule-ai/molecule-ai-plugin-molecule-skill-update-docs", "ref": "main"},
|
||||
{"name": "molecule-workflow-retro", "repo": "molecule-ai/molecule-ai-plugin-molecule-workflow-retro", "ref": "main"},
|
||||
{"name": "molecule-workflow-triage", "repo": "molecule-ai/molecule-ai-plugin-molecule-workflow-triage", "ref": "main"},
|
||||
{"name": "superpowers", "repo": "molecule-ai/molecule-ai-plugin-superpowers", "ref": "main"}
|
||||
],
|
||||
"workspace_templates": [
|
||||
{"name": "claude-code-default", "repo": "Molecule-AI/molecule-ai-workspace-template-claude-code", "ref": "main"},
|
||||
{"name": "hermes", "repo": "Molecule-AI/molecule-ai-workspace-template-hermes", "ref": "main"},
|
||||
{"name": "openclaw", "repo": "Molecule-AI/molecule-ai-workspace-template-openclaw", "ref": "main"},
|
||||
{"name": "codex", "repo": "Molecule-AI/molecule-ai-workspace-template-codex", "ref": "main"},
|
||||
{"name": "langgraph", "repo": "Molecule-AI/molecule-ai-workspace-template-langgraph", "ref": "main"},
|
||||
{"name": "crewai", "repo": "Molecule-AI/molecule-ai-workspace-template-crewai", "ref": "main"},
|
||||
{"name": "autogen", "repo": "Molecule-AI/molecule-ai-workspace-template-autogen", "ref": "main"},
|
||||
{"name": "deepagents", "repo": "Molecule-AI/molecule-ai-workspace-template-deepagents", "ref": "main"},
|
||||
{"name": "gemini-cli", "repo": "Molecule-AI/molecule-ai-workspace-template-gemini-cli", "ref": "main"}
|
||||
{"name": "claude-code-default", "repo": "molecule-ai/molecule-ai-workspace-template-claude-code", "ref": "main"},
|
||||
{"name": "hermes", "repo": "molecule-ai/molecule-ai-workspace-template-hermes", "ref": "main"},
|
||||
{"name": "openclaw", "repo": "molecule-ai/molecule-ai-workspace-template-openclaw", "ref": "main"},
|
||||
{"name": "codex", "repo": "molecule-ai/molecule-ai-workspace-template-codex", "ref": "main"},
|
||||
{"name": "langgraph", "repo": "molecule-ai/molecule-ai-workspace-template-langgraph", "ref": "main"},
|
||||
{"name": "crewai", "repo": "molecule-ai/molecule-ai-workspace-template-crewai", "ref": "main"},
|
||||
{"name": "autogen", "repo": "molecule-ai/molecule-ai-workspace-template-autogen", "ref": "main"},
|
||||
{"name": "deepagents", "repo": "molecule-ai/molecule-ai-workspace-template-deepagents", "ref": "main"},
|
||||
{"name": "gemini-cli", "repo": "molecule-ai/molecule-ai-workspace-template-gemini-cli", "ref": "main"}
|
||||
],
|
||||
"org_templates": [
|
||||
{"name": "molecule-dev", "repo": "Molecule-AI/molecule-ai-org-template-molecule-dev", "ref": "main"},
|
||||
{"name": "free-beats-all", "repo": "Molecule-AI/molecule-ai-org-template-free-beats-all", "ref": "main"},
|
||||
{"name": "medo-smoke", "repo": "Molecule-AI/molecule-ai-org-template-medo-smoke", "ref": "main"},
|
||||
{"name": "molecule-worker-gemini", "repo": "Molecule-AI/molecule-ai-org-template-molecule-worker-gemini", "ref": "main"},
|
||||
{"name": "reno-stars", "repo": "Molecule-AI/molecule-ai-org-template-reno-stars", "ref": "main"},
|
||||
{"name": "ux-ab-lab", "repo": "Molecule-AI/molecule-ai-org-template-ux-ab-lab", "ref": "main"},
|
||||
{"name": "mock-bigorg", "repo": "Molecule-AI/molecule-ai-org-template-mock-bigorg", "ref": "main"}
|
||||
{"name": "molecule-dev", "repo": "molecule-ai/molecule-ai-org-template-molecule-dev", "ref": "main"},
|
||||
{"name": "free-beats-all", "repo": "molecule-ai/molecule-ai-org-template-free-beats-all", "ref": "main"},
|
||||
{"name": "medo-smoke", "repo": "molecule-ai/molecule-ai-org-template-medo-smoke", "ref": "main"},
|
||||
{"name": "molecule-worker-gemini", "repo": "molecule-ai/molecule-ai-org-template-molecule-worker-gemini", "ref": "main"},
|
||||
{"name": "reno-stars", "repo": "molecule-ai/molecule-ai-org-template-reno-stars", "ref": "main"},
|
||||
{"name": "ux-ab-lab", "repo": "molecule-ai/molecule-ai-org-template-ux-ab-lab", "ref": "main"},
|
||||
{"name": "mock-bigorg", "repo": "molecule-ai/molecule-ai-org-template-mock-bigorg", "ref": "main"}
|
||||
]
|
||||
}
|
||||
|
||||
@ -11,7 +11,7 @@ There are three related scripts; pick the right one:
|
||||
|---|---|---|
|
||||
| `measure-coordinator-task-bounds.sh` | **Canonical** v1 harness for the RFC #2251 / Issue 4 reproduction. Provisions a PM coordinator + Researcher child via `claude-code-default` + `langgraph` templates, sends a synthesis-heavy A2A kickoff, observes elapsed time + activity trace. | OSS-shape platform — localhost or any `/workspaces`-shaped endpoint. Has tenant/admin-token guards for non-localhost runs. |
|
||||
| `measure-coordinator-task-bounds-runner.sh` | Generalised runner for the same measurement contract but with **arbitrary template + secret + model combinations** (Hermes/MiniMax, etc.). Useful for cross-runtime variants without modifying the canonical harness. | Same as above (local or SaaS via `MODE=saas`). |
|
||||
| `measure-coordinator-task-bounds.sh` (in [molecule-controlplane](https://github.com/Molecule-AI/molecule-controlplane)) | **Production-shape** variant that bootstraps a real staging tenant via `POST /cp/admin/orgs`, then runs the same measurement against `<slug>.staging.moleculesai.app`. | Staging controlplane only — refuses to run against production. |
|
||||
| `measure-coordinator-task-bounds.sh` (in [molecule-controlplane](https://git.moleculesai.app/molecule-ai/molecule-controlplane)) | **Production-shape** variant that bootstraps a real staging tenant via `POST /cp/admin/orgs`, then runs the same measurement against `<slug>.staging.moleculesai.app`. | Staging controlplane only — refuses to run against production. |
|
||||
|
||||
See `reference_harness_pair_pattern` (auto-memory) for when to use which
|
||||
and the cross-repo design rationale.
|
||||
|
||||
@ -278,7 +278,7 @@ include = ["molecule_runtime*"]
|
||||
README_TEMPLATE = """\
|
||||
# molecule-ai-workspace-runtime
|
||||
|
||||
Shared workspace runtime for [Molecule AI](https://github.com/Molecule-AI/molecule-core)
|
||||
Shared workspace runtime for [Molecule AI](https://git.moleculesai.app/molecule-ai/molecule-core)
|
||||
agent adapters. Installed by every workspace template image
|
||||
(`workspace-template-claude-code`, `-langgraph`, `-hermes`, etc.) to provide
|
||||
A2A delegation, heartbeat, memory, plugin loading, and skill management.
|
||||
@ -376,7 +376,7 @@ hold:
|
||||
non-plugin-sourced server, which Claude Code rejects with
|
||||
`channel_enable requires a marketplace plugin`. Until the
|
||||
official `moleculesai/claude-code-plugin` marketplace lands
|
||||
(tracking [#2936](https://github.com/Molecule-AI/molecule-core/issues/2936)),
|
||||
(tracking [#2936](https://git.moleculesai.app/molecule-ai/molecule-core/issues/2936)),
|
||||
operators who want push must scaffold their own local marketplace
|
||||
under
|
||||
`~/.claude/marketplaces/molecule-local/` containing a
|
||||
@ -389,14 +389,14 @@ hold:
|
||||
Symptom of any condition failing: messages arrive but only via the
|
||||
poll path (every ~1–60s), not real-time. There's currently no
|
||||
diagnostic surfaced — `molecule-mcp doctor` (tracking
|
||||
[#2937](https://github.com/Molecule-AI/molecule-core/issues/2937)) is
|
||||
[#2937](https://git.moleculesai.app/molecule-ai/molecule-core/issues/2937)) is
|
||||
planned.
|
||||
|
||||
If you don't need real-time push, the default poll path works
|
||||
universally with no extra setup; both modes converge on the same
|
||||
`inbox_pop` ack so messages never duplicate.
|
||||
|
||||
See [`docs/workspace-runtime-package.md`](https://github.com/Molecule-AI/molecule-core/blob/main/docs/workspace-runtime-package.md)
|
||||
See [`docs/workspace-runtime-package.md`](https://git.moleculesai.app/molecule-ai/molecule-core/src/branch/main/docs/workspace-runtime-package.md)
|
||||
for the publish flow and architecture.
|
||||
"""
|
||||
|
||||
|
||||
@ -68,22 +68,19 @@ clone_category() {
|
||||
continue
|
||||
fi
|
||||
|
||||
# Post-2026-05-06 GitHub-org-suspension: clone from Gitea instead.
|
||||
# manifest.json paths still read "Molecule-AI/..." (the historic
|
||||
# github.com slug); Gitea lowercases the org part to "molecule-ai/".
|
||||
# Lowercase the org segment on the fly so we don't need to rewrite
|
||||
# every manifest entry.
|
||||
repo_gitea="$(echo "$repo" | awk -F/ '{ printf "%s", tolower($1); for (i=2; i<=NF; i++) printf "/%s", $i; print "" }')"
|
||||
|
||||
# Build the clone URL. When MOLECULE_GITEA_TOKEN is set (CI path)
|
||||
# embed it as basic-auth so private repos succeed. The username
|
||||
# part ("oauth2") is conventional and ignored by Gitea — only the
|
||||
# token-as-password is verified.
|
||||
#
|
||||
# manifest.json was migrated to lowercase org slugs on
|
||||
# 2026-05-07 (post-suspension reconciliation), so we use $repo
|
||||
# verbatim — no on-the-fly tolower transform needed.
|
||||
if [ -n "${MOLECULE_GITEA_TOKEN:-}" ]; then
|
||||
clone_url="https://oauth2:${MOLECULE_GITEA_TOKEN}@git.moleculesai.app/${repo_gitea}.git"
|
||||
display_url="https://oauth2:***@git.moleculesai.app/${repo_gitea}.git"
|
||||
clone_url="https://oauth2:${MOLECULE_GITEA_TOKEN}@git.moleculesai.app/${repo}.git"
|
||||
display_url="https://oauth2:***@git.moleculesai.app/${repo}.git"
|
||||
else
|
||||
clone_url="https://git.moleculesai.app/${repo_gitea}.git"
|
||||
clone_url="https://git.moleculesai.app/${repo}.git"
|
||||
display_url="$clone_url"
|
||||
fi
|
||||
|
||||
|
||||
@ -10,11 +10,11 @@
|
||||
# → PyPI auto-bumps molecule-ai-workspace-runtime patch version
|
||||
# → repository_dispatch fans out to 8 workspace-template-* repos
|
||||
# → each template repo rebuilds and re-tags
|
||||
# ghcr.io/molecule-ai/workspace-template-<runtime>:latest
|
||||
# 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/workspace-template-<runtime>:latest
|
||||
#
|
||||
# PATH 2: any merge to a workspace-template-* repo's main branch
|
||||
# → that repo's publish-image.yml fires
|
||||
# → ghcr.io/molecule-ai/workspace-template-<runtime>:latest
|
||||
# → 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/workspace-template-<runtime>:latest
|
||||
# gets re-tagged
|
||||
#
|
||||
# provisioner.go:296 RuntimeImages[runtime] reads `:latest` at every
|
||||
|
||||
@ -51,7 +51,7 @@ log "pulling latest images for: ${RUNTIMES[*]}"
|
||||
PULLED=()
|
||||
FAILED=()
|
||||
for rt in "${RUNTIMES[@]}"; do
|
||||
IMG="ghcr.io/molecule-ai/workspace-template-$rt:latest"
|
||||
IMG="153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/workspace-template-$rt:latest"
|
||||
if docker pull "$IMG" >/dev/null 2>&1; then
|
||||
log " ✓ $rt"
|
||||
PULLED+=("$rt")
|
||||
|
||||
@ -1,9 +1,10 @@
|
||||
#!/bin/bash
|
||||
# rollback-latest.sh — moves the :latest tag on ghcr.io/molecule-ai/platform
|
||||
# (and the matching tenant image) back to a prior :staging-<sha> digest
|
||||
# without rebuilding anything. Prod tenants auto-pull :latest every 5
|
||||
# min, so this is the fast path when a canary-verified image turns out
|
||||
# to have a runtime regression that canary didn't catch.
|
||||
# rollback-latest.sh — moves the :latest tag on the platform image
|
||||
# (and the matching tenant image) on AWS ECR back to a prior
|
||||
# :staging-<sha> digest without rebuilding anything. Prod tenants
|
||||
# auto-pull :latest every 5 min, so this is the fast path when a
|
||||
# canary-verified image turns out to have a runtime regression that
|
||||
# canary didn't catch.
|
||||
#
|
||||
# Usage:
|
||||
# scripts/rollback-latest.sh <sha>
|
||||
@ -12,12 +13,14 @@
|
||||
# Prereqs:
|
||||
# - crane on $PATH (brew install crane OR download from
|
||||
# https://github.com/google/go-containerregistry/releases)
|
||||
# - GHCR token exported as GITHUB_TOKEN with write:packages scope
|
||||
# - aws CLI authenticated for region us-east-2 with ECR pull/push
|
||||
# access to the molecule-ai/platform + platform-tenant repositories.
|
||||
# `aws sts get-caller-identity` should succeed.
|
||||
#
|
||||
# What it does (per image — platform + tenant):
|
||||
# crane digest ghcr.io/…:<sha> # verify the target sha exists
|
||||
# crane tag ghcr.io/…:<sha> latest # retag remotely, single API call
|
||||
# crane digest ghcr.io/…:latest # confirm the move
|
||||
# crane digest <ecr>:<sha> # verify the target sha exists
|
||||
# crane tag <ecr>:<sha> latest # retag remotely, single API call
|
||||
# crane digest <ecr>:latest # confirm the move
|
||||
#
|
||||
# Exit codes: 0 = both retagged, 1 = tag missing / crane error, 2 = bad args.
|
||||
|
||||
@ -30,21 +33,23 @@ if [ "${1:-}" = "" ]; then
|
||||
fi
|
||||
|
||||
TARGET_SHA="$1"
|
||||
PLATFORM=ghcr.io/molecule-ai/platform
|
||||
TENANT=ghcr.io/molecule-ai/platform-tenant
|
||||
ECR_HOST=153263036946.dkr.ecr.us-east-2.amazonaws.com
|
||||
PLATFORM=$ECR_HOST/molecule-ai/platform
|
||||
TENANT=$ECR_HOST/molecule-ai/platform-tenant
|
||||
|
||||
if ! command -v crane >/dev/null; then
|
||||
echo "ERROR: crane not installed. brew install crane" >&2
|
||||
exit 1
|
||||
fi
|
||||
if [ -z "${GITHUB_TOKEN:-}" ]; then
|
||||
echo "ERROR: GITHUB_TOKEN unset. export it with write:packages scope." >&2
|
||||
if ! command -v aws >/dev/null; then
|
||||
echo "ERROR: aws CLI not installed. brew install awscli" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Log in once. crane stores creds in a config file keyed by registry;
|
||||
# re-running is cheap.
|
||||
printf '%s\n' "$GITHUB_TOKEN" | crane auth login ghcr.io -u "${GITHUB_ACTOR:-$(whoami)}" --password-stdin >/dev/null
|
||||
# Log in once. ECR auth is via short-lived password from `aws ecr
|
||||
# get-login-password`. crane stores creds in a config file keyed by
|
||||
# registry; re-running is cheap.
|
||||
aws ecr get-login-password --region us-east-2 | crane auth login "$ECR_HOST" -u AWS --password-stdin >/dev/null
|
||||
|
||||
roll() {
|
||||
local image="$1"
|
||||
|
||||
@ -105,5 +105,5 @@ Hard per-workflow timeouts (15–40 min) cap runaway cost. Three teardown layers
|
||||
|
||||
## Known gaps (tracked elsewhere)
|
||||
|
||||
- [#1369](https://github.com/Molecule-AI/molecule-core/issues/1369): SaaS canvas Files / Terminal / Peers tabs — architecturally broken; whitelisted in the spec
|
||||
- [#1369](https://git.moleculesai.app/molecule-ai/molecule-core/issues/1369): SaaS canvas Files / Terminal / Peers tabs — architecturally broken; whitelisted in the spec
|
||||
- LLM-driven delegation (autonomous `delegate_task` tool use) — probabilistic, not in v1; proxy mechanics covered
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
# Production-shape local harness
|
||||
|
||||
<!-- Retrigger Harness Replays after Class G #168 + clone-manifest fix (#42). -->
|
||||
|
||||
The harness brings up the SaaS tenant topology on localhost using the
|
||||
same `Dockerfile.tenant` image that ships to production. Tests target
|
||||
the cf-proxy on `http://localhost:8080` and pass the tenant identity
|
||||
|
||||
14
tests/harness/cf-proxy/Dockerfile
Normal file
@ -0,0 +1,14 @@
|
||||
# cf-proxy harness image — nginx + the harness's tenant-routing config baked
|
||||
# in at build time.
|
||||
#
|
||||
# Why bake (not bind-mount): on Gitea Actions / act_runner, the runner is a
|
||||
# container talking to the OUTER docker daemon over the host socket; runc
|
||||
# resolves bind-mount source paths on the outer host filesystem, where the
|
||||
# repo at `/workspace/.../tests/harness/cf-proxy/nginx.conf` is invisible.
|
||||
# Compose `configs:` (with `file:`) falls back to bind mounts when swarm is
|
||||
# not active, so it hits the same gap. A build-time COPY uploads the file
|
||||
# as part of the docker build context — the daemon receives the tarball
|
||||
# directly and never bind-mounts. See issue #88 item 2.
|
||||
FROM nginx:1.27-alpine
|
||||
|
||||
COPY nginx.conf /etc/nginx/nginx.conf
|
||||
@ -167,15 +167,26 @@ services:
|
||||
# Production shape: same single CF tunnel front-doors every tenant
|
||||
# subdomain — the Host header carries the tenant identity, not the
|
||||
# routing destination. Local cf-proxy mirrors this exactly.
|
||||
#
|
||||
# nginx.conf delivery: built into a custom image via cf-proxy/Dockerfile
|
||||
# (a thin nginx:1.27-alpine + COPY). NOT a bind mount and NOT a
|
||||
# compose `configs:` block, both of which break under Gitea's
|
||||
# act_runner: the runner talks to the OUTER docker daemon over the
|
||||
# host socket, and runc resolves bind sources on the outer host
|
||||
# filesystem, where `/workspace/.../tests/harness/cf-proxy/nginx.conf`
|
||||
# is invisible. Compose `configs:` falls back to bind mounts without
|
||||
# swarm, so it hits the same gap. A build context, by contrast, is
|
||||
# uploaded to the daemon as a tarball at build time — no bind. See
|
||||
# issue #88 item 2.
|
||||
cf-proxy:
|
||||
image: nginx:1.27-alpine
|
||||
build:
|
||||
context: ./cf-proxy
|
||||
dockerfile: Dockerfile
|
||||
depends_on:
|
||||
tenant-alpha:
|
||||
condition: service_healthy
|
||||
tenant-beta:
|
||||
condition: service_healthy
|
||||
volumes:
|
||||
- ./cf-proxy/nginx.conf:/etc/nginx/nginx.conf:ro
|
||||
# Bind to 127.0.0.1 only — hardcoded ADMIN_TOKENs make 0.0.0.0
|
||||
# exposure unsafe even on a local network.
|
||||
ports:
|
||||
|
||||
@ -18,7 +18,7 @@
|
||||
#
|
||||
# Or inline via curl:
|
||||
#
|
||||
# bash <(curl -fsSL https://raw.githubusercontent.com/Molecule-AI/molecule-core/main/tools/check-template-parity.sh) \
|
||||
# bash <(curl -fsSL https://git.moleculesai.app/molecule-ai/molecule-core/raw/branch/main/tools/check-template-parity.sh) \
|
||||
# install.sh start.sh
|
||||
#
|
||||
# Exit codes:
|
||||
|
||||
@ -249,6 +249,19 @@ func main() {
|
||||
})
|
||||
}
|
||||
|
||||
// CP-mode orphan sweeper — SaaS counterpart to the Docker sweeper
|
||||
// above. Re-issues cpProv.Stop for any workspace at status='removed'
|
||||
// with a non-NULL instance_id, healing the deprovision split-write
|
||||
// race documented in #2989: tenant marks status='removed' BEFORE
|
||||
// calling CP DELETE, so a transient CP failure leaves the EC2
|
||||
// running with no retry path. cpProv.Stop is idempotent against
|
||||
// already-terminated instances; on success we clear instance_id.
|
||||
if cpProv != nil {
|
||||
go supervised.RunWithRecover(ctx, "cp-orphan-sweeper", func(c context.Context) {
|
||||
registry.StartCPOrphanSweeper(c, cpProv)
|
||||
})
|
||||
}
|
||||
|
||||
// Pending-uploads GC sweep — deletes acked rows past their retention
|
||||
// window plus unacked rows past expires_at. Without this the
|
||||
// pending_uploads table grows unbounded; even with the 24h hard TTL,
|
||||
|
||||
457
workspace-server/internal/handlers/eic_tunnel_pool.go
Normal file
@ -0,0 +1,457 @@
|
||||
package handlers
|
||||
|
||||
// eic_tunnel_pool.go — refcounted pool for EIC SSH tunnels keyed on
|
||||
// instanceID. Reuses one tunnel across N file ops, amortising the
|
||||
// ssh-keygen + SendSSHPublicKey + open-tunnel + waitForPort cost
|
||||
// (~3-5s) over multiple cats/finds (~50-200ms each).
|
||||
//
|
||||
// Origin: core#11 — canvas detail-panel config + filesystem load
|
||||
// took ~20s. ConfigTab fans out 4 GETs serially; the slowest is
|
||||
// /files/config.yaml which dispatches to readFileViaEIC. Without a
|
||||
// pool, every readFileViaEIC + listFilesViaEIC + writeFileViaEIC +
|
||||
// deleteFileViaEIC pays the full setup cost even when fired
|
||||
// back-to-back on the same workspace EC2.
|
||||
//
|
||||
// The pool keeps one eicSSHSession alive per instanceID for up to
|
||||
// poolTTL. SendSSHPublicKey grants a 60s key validity, so poolTTL
|
||||
// must stay strictly below that to avoid serving requests on a
|
||||
// just-expired key. We default to 50s with a 10s safety margin.
|
||||
//
|
||||
// Concurrency model:
|
||||
//
|
||||
// - Single mutex guards the entries map.
|
||||
// - Slow path (tunnel setup) runs OUTSIDE the lock, gated by an
|
||||
// "intent" placeholder so concurrent acquires for the same
|
||||
// instanceID don't both build a tunnel — the loser drops its
|
||||
// setup and uses the winner's.
|
||||
// - Refcount on each entry; eviction blocked while refcount > 0.
|
||||
// - Janitor goroutine sweeps every poolJanitorInterval, drops
|
||||
// entries where refcount == 0 && expiresAt < now.
|
||||
//
|
||||
// Test injection:
|
||||
//
|
||||
// - poolSetupTunnel is a package-level var so tests can swap the
|
||||
// slow path for a counting stub. Production wires it to
|
||||
// realWithEICTunnel-style setup.
|
||||
// - withEICTunnel (the public, single-shot API) is also a var
|
||||
// (already, see template_files_eic.go). It's rebound here to
|
||||
// pooledWithEICTunnel which routes through globalEICTunnelPool.
|
||||
// - Tests that need single-shot behaviour can set poolTTL = 0,
|
||||
// which makes pooledWithEICTunnel fall through to the underlying
|
||||
// setup directly (no pool entry kept).
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// poolTTL is the maximum age of a pooled tunnel. Must be strictly
|
||||
// less than the SendSSHPublicKey grant window (60s) so we never
|
||||
// serve a request through a key that's about to expire mid-op.
|
||||
//
|
||||
// Configurable via init-time wiring (see initEICTunnelPool); not a
|
||||
// const so tests can pin TTL=0 (disable pooling) or TTL=50ms (drive
|
||||
// eviction tests).
|
||||
var poolTTL = 50 * time.Second
|
||||
|
||||
// poolJanitorInterval is how often the janitor goroutine sweeps for
|
||||
// expired idle entries. Tighter than poolTTL so eviction is timely;
|
||||
// loose enough that the goroutine doesn't burn CPU.
|
||||
var poolJanitorInterval = 10 * time.Second
|
||||
|
||||
// poolMaxEntries caps simultaneous instanceIDs the pool tracks.
|
||||
// Beyond this, new acquires evict the LRU entry. Prevents a
|
||||
// pathological caller (e.g. a sweep over hundreds of workspace
|
||||
// EC2s) from leaking unbounded tunnel processes. 32 is a generous
|
||||
// ceiling for the canvas use case (one human navigates ≤ ~5
|
||||
// workspaces at a time).
|
||||
var poolMaxEntries = 32
|
||||
|
||||
// poolSetupTunnel is the slow-path tunnel constructor. Wrapped in a
|
||||
// var so tests can inject a counter stub. Returns a session and a
|
||||
// cleanup function (closes the open-tunnel subprocess + scrubs the
|
||||
// ephemeral keydir). nil session + non-nil err means setup failed
|
||||
// and there is nothing to clean up.
|
||||
//
|
||||
// Production wiring lives in eic_tunnel_pool_setup.go (a thin shim
|
||||
// over the existing realWithEICTunnel logic).
|
||||
var poolSetupTunnel = func(ctx context.Context, instanceID string) (
|
||||
sess eicSSHSession, cleanup func(), err error) {
|
||||
return setupRealEICTunnel(ctx, instanceID)
|
||||
}
|
||||
|
||||
// pooledTunnel is one entry in the pool. session is shared by N
|
||||
// concurrent fn calls; cleanup runs once when refcount returns to
|
||||
// zero AND the entry is past expiresAt or evicted.
|
||||
//
|
||||
// lastUsed tracks the most recent acquire time for LRU bookkeeping
|
||||
// (overflow eviction). expiresAt is set at construction and not
|
||||
// extended on use — a tunnel cannot live past poolTTL even if it's
|
||||
// hot, because the underlying SendSSHPublicKey grant expires.
|
||||
type pooledTunnel struct {
|
||||
session eicSSHSession
|
||||
cleanup func()
|
||||
expiresAt time.Time
|
||||
lastUsed time.Time
|
||||
refcount int
|
||||
poisoned bool // true if a fn returned a tunnel-fatal error; do not reuse
|
||||
}
|
||||
|
||||
// eicTunnelPool is the package-level pool. Single instance lives
|
||||
// in globalEICTunnelPool; constructor runs lazily on first acquire.
|
||||
type eicTunnelPool struct {
|
||||
mu sync.Mutex
|
||||
entries map[string]*pooledTunnel
|
||||
// pendingSetups guards concurrent setup for the same instanceID.
|
||||
// First acquirer takes the slot; later ones wait on the channel.
|
||||
pendingSetups map[string]chan struct{}
|
||||
stopJanitor chan struct{}
|
||||
// janitorInterval is captured at pool construction from the
|
||||
// package-level poolJanitorInterval var. Captured (not re-read on
|
||||
// every tick) so a test that swaps the package var via t.Cleanup
|
||||
// after a global pool's janitor is already running can't race
|
||||
// with that goroutine's ticker read. The global pool is created
|
||||
// lazily once per process via sync.Once; before this capture
|
||||
// landed, every test that touched poolJanitorInterval after the
|
||||
// global pool's first-touch raced the janitor (caught by -race
|
||||
// on staging tip 249dbc6a — TestPooledWithEICTunnel_PanicPoisonsEntry).
|
||||
// Tests still get the new value on a freshPool() because they
|
||||
// set the package var BEFORE calling newEICTunnelPool().
|
||||
janitorInterval time.Duration
|
||||
}
|
||||
|
||||
var (
|
||||
globalEICTunnelPool *eicTunnelPool
|
||||
globalEICTunnelPoolOnce sync.Once
|
||||
)
|
||||
|
||||
// getEICTunnelPool returns the singleton pool, lazy-initialising on
|
||||
// first call. Idempotent.
|
||||
func getEICTunnelPool() *eicTunnelPool {
|
||||
globalEICTunnelPoolOnce.Do(func() {
|
||||
globalEICTunnelPool = newEICTunnelPool()
|
||||
go globalEICTunnelPool.janitor()
|
||||
})
|
||||
return globalEICTunnelPool
|
||||
}
|
||||
|
||||
// newEICTunnelPool constructs an empty pool. Exported so tests can
|
||||
// build isolated pools without sharing the singleton.
|
||||
//
|
||||
// Captures poolJanitorInterval at construction time so the janitor
|
||||
// goroutine doesn't race with t.Cleanup-driven swaps of the package
|
||||
// var. See the janitorInterval field comment for the failure mode.
|
||||
func newEICTunnelPool() *eicTunnelPool {
|
||||
return &eicTunnelPool{
|
||||
entries: map[string]*pooledTunnel{},
|
||||
pendingSetups: map[string]chan struct{}{},
|
||||
stopJanitor: make(chan struct{}),
|
||||
janitorInterval: poolJanitorInterval,
|
||||
}
|
||||
}
|
||||
|
||||
// acquire returns a usable session for instanceID. If a healthy entry
|
||||
// exists, refcount++ and return it. If a setup is in flight for the
|
||||
// same instanceID, wait for it. Otherwise build one (slow path).
|
||||
//
|
||||
// done() must be called by the caller when the op finishes. It
|
||||
// decrements refcount and triggers cleanup if the entry is past
|
||||
// TTL or poisoned and refcount==0.
|
||||
//
|
||||
// Errors from the slow path propagate; pool state is not modified
|
||||
// for failed setups (no poisoned entry created — that's only for
|
||||
// fn-returned errors on a previously-good session).
|
||||
func (p *eicTunnelPool) acquire(ctx context.Context, instanceID string) (
|
||||
sess eicSSHSession, done func(poisoned bool), err error) {
|
||||
|
||||
if poolTTL <= 0 {
|
||||
// Pool disabled (TTL=0 mode for tests / opt-out). Fall
|
||||
// through to a direct setup with caller-driven cleanup.
|
||||
s, cleanup, err := poolSetupTunnel(ctx, instanceID)
|
||||
if err != nil {
|
||||
return eicSSHSession{}, nil, err
|
||||
}
|
||||
return s, func(_ bool) { cleanup() }, nil
|
||||
}
|
||||
|
||||
for {
|
||||
p.mu.Lock()
|
||||
if pt, ok := p.entries[instanceID]; ok && !pt.poisoned && pt.expiresAt.After(time.Now()) {
|
||||
pt.refcount++
|
||||
pt.lastUsed = time.Now()
|
||||
p.mu.Unlock()
|
||||
return pt.session, p.releaser(instanceID, pt), nil
|
||||
}
|
||||
// Either no entry, expired entry, or poisoned entry. If a
|
||||
// setup is already in flight, wait and retry.
|
||||
if pending, ok := p.pendingSetups[instanceID]; ok {
|
||||
p.mu.Unlock()
|
||||
select {
|
||||
case <-pending:
|
||||
continue // re-check the entries map
|
||||
case <-ctx.Done():
|
||||
return eicSSHSession{}, nil, ctx.Err()
|
||||
}
|
||||
}
|
||||
// Drop expired/poisoned entry now (we'll clean up outside
|
||||
// the lock — the entry is unreferenced or we'd not be here).
|
||||
var oldCleanup func()
|
||||
if pt, ok := p.entries[instanceID]; ok {
|
||||
if pt.refcount == 0 {
|
||||
oldCleanup = pt.cleanup
|
||||
delete(p.entries, instanceID)
|
||||
}
|
||||
}
|
||||
// Reserve the setup slot.
|
||||
signal := make(chan struct{})
|
||||
p.pendingSetups[instanceID] = signal
|
||||
p.mu.Unlock()
|
||||
|
||||
if oldCleanup != nil {
|
||||
go oldCleanup()
|
||||
}
|
||||
|
||||
// Slow path: build a new tunnel. Anything that goes wrong
|
||||
// here cleans up the pendingSetups slot and propagates to
|
||||
// the caller without leaving the pool in a state where the
|
||||
// next acquire blocks waiting on a signal that never fires.
|
||||
newSess, cleanup, setupErr := poolSetupTunnel(ctx, instanceID)
|
||||
|
||||
p.mu.Lock()
|
||||
delete(p.pendingSetups, instanceID)
|
||||
close(signal)
|
||||
|
||||
if setupErr != nil {
|
||||
p.mu.Unlock()
|
||||
return eicSSHSession{}, nil, fmt.Errorf("eic tunnel setup: %w", setupErr)
|
||||
}
|
||||
|
||||
// Enforce LRU bound BEFORE inserting so we don't briefly
|
||||
// exceed the cap even by one entry.
|
||||
p.evictLRUIfFullLocked(instanceID)
|
||||
|
||||
pt := &pooledTunnel{
|
||||
session: newSess,
|
||||
cleanup: cleanup,
|
||||
expiresAt: time.Now().Add(poolTTL),
|
||||
lastUsed: time.Now(),
|
||||
refcount: 1,
|
||||
}
|
||||
p.entries[instanceID] = pt
|
||||
p.mu.Unlock()
|
||||
return pt.session, p.releaser(instanceID, pt), nil
|
||||
}
|
||||
}
|
||||
|
||||
// releaser returns a closure that decrements refcount and triggers
|
||||
// cleanup if (a) the entry is past TTL or (b) the caller signalled
|
||||
// poison. Idempotent against double-release (decrements once via the
|
||||
// captured pt; pool entry may have been replaced by then).
|
||||
func (p *eicTunnelPool) releaser(instanceID string, pt *pooledTunnel) func(poisoned bool) {
|
||||
released := false
|
||||
return func(poisoned bool) {
|
||||
p.mu.Lock()
|
||||
defer p.mu.Unlock()
|
||||
if released {
|
||||
return
|
||||
}
|
||||
released = true
|
||||
pt.refcount--
|
||||
if poisoned {
|
||||
pt.poisoned = true
|
||||
}
|
||||
// Evict immediately if poisoned-and-idle OR expired-and-idle.
|
||||
// Hot entries (refcount > 0) defer eviction to the last release.
|
||||
if pt.refcount == 0 && (pt.poisoned || pt.expiresAt.Before(time.Now())) {
|
||||
// If the entry in the map is still us, remove it.
|
||||
if cur, ok := p.entries[instanceID]; ok && cur == pt {
|
||||
delete(p.entries, instanceID)
|
||||
}
|
||||
go pt.cleanup()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// evictLRUIfFullLocked drops the least-recently-used IDLE entry
|
||||
// when the pool is at capacity. Caller must hold p.mu. The new
|
||||
// instanceID about to be inserted is excluded so we don't evict
|
||||
// ourselves. If no idle entries exist, no eviction happens — the
|
||||
// new entry will push us above the soft cap until something releases.
|
||||
func (p *eicTunnelPool) evictLRUIfFullLocked(skipInstance string) {
|
||||
if len(p.entries) < poolMaxEntries {
|
||||
return
|
||||
}
|
||||
var oldestKey string
|
||||
var oldest *pooledTunnel
|
||||
for k, pt := range p.entries {
|
||||
if k == skipInstance {
|
||||
continue
|
||||
}
|
||||
if pt.refcount > 0 {
|
||||
continue
|
||||
}
|
||||
if oldest == nil || pt.lastUsed.Before(oldest.lastUsed) {
|
||||
oldestKey = k
|
||||
oldest = pt
|
||||
}
|
||||
}
|
||||
if oldest == nil {
|
||||
return // every entry is in use; no eviction possible
|
||||
}
|
||||
delete(p.entries, oldestKey)
|
||||
go oldest.cleanup()
|
||||
}
|
||||
|
||||
// janitor periodically scans for entries that are idle AND expired,
|
||||
// closing their tunnels. Runs forever (per pool lifetime); cancelled
|
||||
// by close(p.stopJanitor) for tests that build short-lived pools.
|
||||
//
|
||||
// Reads p.janitorInterval (captured at construction) instead of the
|
||||
// package-level poolJanitorInterval — see janitorInterval field comment.
|
||||
func (p *eicTunnelPool) janitor() {
|
||||
t := time.NewTicker(p.janitorInterval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-t.C:
|
||||
p.sweep()
|
||||
case <-p.stopJanitor:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sweep is one janitor pass. Drops idle expired entries.
|
||||
func (p *eicTunnelPool) sweep() {
|
||||
p.mu.Lock()
|
||||
now := time.Now()
|
||||
var toClose []func()
|
||||
for k, pt := range p.entries {
|
||||
if pt.refcount == 0 && pt.expiresAt.Before(now) {
|
||||
toClose = append(toClose, pt.cleanup)
|
||||
delete(p.entries, k)
|
||||
}
|
||||
}
|
||||
p.mu.Unlock()
|
||||
for _, c := range toClose {
|
||||
go c()
|
||||
}
|
||||
}
|
||||
|
||||
// stop terminates the janitor and closes all idle entries. Hot
|
||||
// (refcount > 0) entries are NOT force-closed — callers running
|
||||
// against them would see a use-after-free. In practice stop is only
|
||||
// called by tests that have already drained their callers.
|
||||
func (p *eicTunnelPool) stop() {
|
||||
close(p.stopJanitor)
|
||||
p.mu.Lock()
|
||||
defer p.mu.Unlock()
|
||||
for k, pt := range p.entries {
|
||||
if pt.refcount == 0 {
|
||||
go pt.cleanup()
|
||||
delete(p.entries, k)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// pooledWithEICTunnel is the pool-backed replacement for
|
||||
// realWithEICTunnel. The signature matches `var withEICTunnel`
|
||||
// exactly so the rebind (in initEICTunnelPool) is a drop-in.
|
||||
//
|
||||
// Errors from `fn` itself are forwarded to the caller AND mark the
|
||||
// pool entry as poisoned, so the next acquire builds a fresh
|
||||
// tunnel. This catches the case where the workspace EC2 was
|
||||
// restarted out-of-band (tunnel still appears alive locally but
|
||||
// every cat/find errors out).
|
||||
func pooledWithEICTunnel(ctx context.Context, instanceID string,
|
||||
fn func(s eicSSHSession) error) error {
|
||||
pool := getEICTunnelPool()
|
||||
sess, done, err := pool.acquire(ctx, instanceID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// poisoned defaults to true so a panic from fn poisons the
|
||||
// entry on the way through the deferred release. Without the
|
||||
// defer, a panicking fn would leak refcount=1 forever and
|
||||
// permanently block eviction of this entry. The fn-error path
|
||||
// resets poisoned to its real classification before return.
|
||||
poisoned := true
|
||||
defer func() { done(poisoned) }()
|
||||
fnErr := fn(sess)
|
||||
poisoned = fnErrIndicatesTunnelFault(fnErr)
|
||||
return fnErr
|
||||
}
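// Illustrative sketch (not part of this change): what a file-op caller
// looks like once withEICTunnel is rebound to pooledWithEICTunnel. The
// real helpers live in template_files_eic.go; runRemoteCat here is a
// hypothetical stand-in for the ssh invocation they perform.
func exampleReadViaPooledTunnel(ctx context.Context, instanceID, path string) ([]byte, error) {
	var out []byte
	err := withEICTunnel(ctx, instanceID, func(s eicSSHSession) error {
		// Back-to-back ops on the same instanceID within poolTTL reuse
		// one tunnel; only the first pays the ~3-5s setup cost.
		b, err := runRemoteCat(ctx, s, path) // hypothetical helper
		out = b
		return err
	})
	return out, err
}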
|
||||
|
||||
// fnErrIndicatesTunnelFault returns true for fn errors whose nature
|
||||
// suggests the underlying tunnel is no longer reusable (auth gone,
|
||||
// network gone, ssh process dead). Returning true poisons the pool
|
||||
// entry so the next acquire builds fresh.
|
||||
//
|
||||
// Conservative: only marks tunnel-faulty for clearly tunnel-level
|
||||
// failures (connection refused, broken pipe, ssh exit-status from
|
||||
// fatal-channel signals). A `cat` returning os.ErrNotExist on a
|
||||
// missing file is NOT a tunnel fault — that's the file path being
|
||||
// wrong; the tunnel is fine.
|
||||
func fnErrIndicatesTunnelFault(err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
msg := err.Error()
|
||||
// stderr substrings produced by ssh when the tunnel is broken.
|
||||
for _, marker := range []string{
|
||||
"connection refused",
|
||||
"connection closed",
|
||||
"broken pipe",
|
||||
"Connection reset by peer",
|
||||
"kex_exchange_identification",
|
||||
"port forwarding failed",
|
||||
"Permission denied",
|
||||
"Authentication failed",
|
||||
} {
|
||||
if containsCaseInsensitive(msg, marker) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// containsCaseInsensitive avoids importing strings just for this
|
||||
// (the file already needs ssh stderr matching elsewhere — this
|
||||
// keeps the helper local to avoid a cross-file dependency).
|
||||
func containsCaseInsensitive(s, substr string) bool {
|
||||
if len(substr) > len(s) {
|
||||
return false
|
||||
}
|
||||
// Manual lowercase compare loop; ssh error markers are ASCII so
|
||||
// no need for unicode-aware folding.
|
||||
low := func(b byte) byte {
|
||||
if b >= 'A' && b <= 'Z' {
|
||||
return b + 32
|
||||
}
|
||||
return b
|
||||
}
|
||||
for i := 0; i+len(substr) <= len(s); i++ {
|
||||
match := true
|
||||
for j := 0; j < len(substr); j++ {
|
||||
if low(s[i+j]) != low(substr[j]) {
|
||||
match = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if match {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
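// Illustrative alternative (not part of this change): the same check via
// the standard library, which would require importing "strings" and
// allocates two lowered copies per call, both of which the hand-rolled
// loop above avoids on the error-classification path.
func containsCaseInsensitiveStdlib(s, substr string) bool {
	return strings.Contains(strings.ToLower(s), strings.ToLower(substr))
}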
|
||||
|
||||
// initEICTunnelPool rebinds the package-level withEICTunnel var to
|
||||
// the pooled implementation. Called once at package init via the
|
||||
// init() in eic_tunnel_pool_setup.go (split file so the rebind
|
||||
// itself is testable without dragging in the production setup
|
||||
// shim's exec/aws dependencies).
|
||||
func initEICTunnelPool() {
|
||||
withEICTunnel = pooledWithEICTunnel
|
||||
}
|
||||
136
workspace-server/internal/handlers/eic_tunnel_pool_setup.go
Normal file
@ -0,0 +1,136 @@
|
||||
package handlers
|
||||
|
||||
// eic_tunnel_pool_setup.go — production setup shim.
|
||||
//
|
||||
// setupRealEICTunnel decomposes the existing realWithEICTunnel into
|
||||
// its slow half (build the tunnel) and its caller half (run fn). The
|
||||
// pool calls the slow half once and shares the resulting session
|
||||
// across N callers, holding cleanup until the last release.
|
||||
//
|
||||
// Why decompose instead of refactoring realWithEICTunnel: the
|
||||
// existing function and its test stub-vars (withEICTunnel,
|
||||
// sendSSHPublicKey, openTunnelCmd) are load-bearing for the
|
||||
// dispatch tests. Extracting a sibling setup function preserves the
|
||||
// existing single-shot path verbatim — the pool wraps it by calling
|
||||
// realWithEICTunnel through a thin adapter, leaving the tested
|
||||
// surface unchanged.
|
||||
//
|
||||
// The pool's acquire() invokes poolSetupTunnel, which is a `var`
|
||||
// pointing to setupRealEICTunnel for production and a counting stub
|
||||
// for tests.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// setupRealEICTunnel is the slow path that the pool consumes when
|
||||
// no warm entry exists. Mirrors realWithEICTunnel's setup half but
|
||||
// returns the session + cleanup instead of running fn inline.
|
||||
//
|
||||
// The cleanup func owns the tunnel subprocess, ephemeral key dir,
|
||||
// and a one-time wait. Idempotent — calling it twice is safe; the
|
||||
// pool guarantees one call per session, but defence-in-depth helps
|
||||
// when tests run pools in parallel and racy sweeps re-trigger.
|
||||
func setupRealEICTunnel(ctx context.Context, instanceID string) (
|
||||
eicSSHSession, func(), error) {
|
||||
|
||||
if instanceID == "" {
|
||||
return eicSSHSession{}, nil,
|
||||
fmt.Errorf("workspace has no instance_id — not a SaaS EC2 workspace")
|
||||
}
|
||||
osUser := os.Getenv("WORKSPACE_EC2_OS_USER")
|
||||
if osUser == "" {
|
||||
osUser = "ubuntu"
|
||||
}
|
||||
region := os.Getenv("AWS_REGION")
|
||||
if region == "" {
|
||||
region = "us-east-2"
|
||||
}
|
||||
|
||||
keyDir, err := os.MkdirTemp("", "molecule-eic-pool-*")
|
||||
if err != nil {
|
||||
return eicSSHSession{}, nil, fmt.Errorf("keydir mkdir: %w", err)
|
||||
}
|
||||
keyPath := keyDir + "/id"
|
||||
if out, kerr := exec.CommandContext(ctx, "ssh-keygen",
|
||||
"-t", "ed25519", "-f", keyPath, "-N", "", "-q",
|
||||
"-C", "molecule-eic-pool",
|
||||
).CombinedOutput(); kerr != nil {
|
||||
_ = os.RemoveAll(keyDir)
|
||||
return eicSSHSession{}, nil,
|
||||
fmt.Errorf("ssh-keygen: %w (%s)", kerr, strings.TrimSpace(string(out)))
|
||||
}
|
||||
pubKey, err := os.ReadFile(keyPath + ".pub")
|
||||
if err != nil {
|
||||
_ = os.RemoveAll(keyDir)
|
||||
return eicSSHSession{}, nil, fmt.Errorf("read pubkey: %w", err)
|
||||
}
|
||||
|
||||
if err := sendSSHPublicKey(ctx, region, instanceID, osUser,
|
||||
strings.TrimSpace(string(pubKey))); err != nil {
|
||||
_ = os.RemoveAll(keyDir)
|
||||
return eicSSHSession{}, nil, fmt.Errorf("send-ssh-public-key: %w", err)
|
||||
}
|
||||
|
||||
localPort, err := pickFreePort()
|
||||
if err != nil {
|
||||
_ = os.RemoveAll(keyDir)
|
||||
return eicSSHSession{}, nil, fmt.Errorf("pick free port: %w", err)
|
||||
}
|
||||
|
||||
tunnel := openTunnelCmd(eicSSHOptions{
|
||||
InstanceID: instanceID,
|
||||
OSUser: osUser,
|
||||
Region: region,
|
||||
LocalPort: localPort,
|
||||
PrivateKeyPath: keyPath,
|
||||
})
|
||||
tunnel.Env = os.Environ()
|
||||
if err := tunnel.Start(); err != nil {
|
||||
_ = os.RemoveAll(keyDir)
|
||||
return eicSSHSession{}, nil, fmt.Errorf("open-tunnel start: %w", err)
|
||||
}
|
||||
|
||||
if err := waitForPort(ctx, "127.0.0.1", localPort, 10*time.Second); err != nil {
|
||||
if tunnel.Process != nil {
|
||||
_ = tunnel.Process.Kill()
|
||||
}
|
||||
_ = tunnel.Wait()
|
||||
_ = os.RemoveAll(keyDir)
|
||||
return eicSSHSession{}, nil, fmt.Errorf("tunnel never listened: %w", err)
|
||||
}
|
||||
|
||||
cleanedUp := false
|
||||
cleanup := func() {
|
||||
if cleanedUp {
|
||||
return
|
||||
}
|
||||
cleanedUp = true
|
||||
if tunnel.Process != nil {
|
||||
_ = tunnel.Process.Kill()
|
||||
}
|
||||
_ = tunnel.Wait()
|
||||
_ = os.RemoveAll(keyDir)
|
||||
}
|
||||
|
||||
return eicSSHSession{
|
||||
keyPath: keyPath,
|
||||
localPort: localPort,
|
||||
osUser: osUser,
|
||||
instanceID: instanceID,
|
||||
}, cleanup, nil
|
||||
}
|
||||
|
||||
// init wires the pool into the package-level withEICTunnel var so
|
||||
// every read/write/list/delete EIC op uses pooled tunnels by default.
|
||||
// Test files that need single-shot behaviour can swap withEICTunnel
|
||||
// back via the existing stubWithEICTunnel pattern, OR set poolTTL=0
|
||||
// to disable pooling without rebinding the var.
|
||||
func init() {
|
||||
initEICTunnelPool()
|
||||
}
|
||||
467
workspace-server/internal/handlers/eic_tunnel_pool_test.go
Normal file
@ -0,0 +1,467 @@
|
||||
package handlers
|
||||
|
||||
// eic_tunnel_pool_test.go — tests for the refcounted EIC tunnel pool
|
||||
// added in core#11. Stubs poolSetupTunnel with a counter so the
|
||||
// tests don't fork ssh-keygen / aws subprocesses.
|
||||
//
|
||||
// Per memory feedback_assert_exact_not_substring: each test pins
|
||||
// exact expected counts (not "at least N") so a regression that
|
||||
// silently double-sets-up surfaces here.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// withPoolSetupStub swaps poolSetupTunnel for a counting fake that
|
||||
// returns a sentinel session and a cleanup func that records its
|
||||
// invocation. Restores on test cleanup.
|
||||
//
|
||||
// setupSignal blocks each setup until released — for concurrent-
|
||||
// acquire tests where we want to gate setup completion.
|
||||
func withPoolSetupStub(t *testing.T) (
|
||||
setupCount *int64, cleanupCount *int64, restore func(), unblock func()) {
|
||||
t.Helper()
|
||||
prev := poolSetupTunnel
|
||||
prevTTL := poolTTL
|
||||
prevJanitor := poolJanitorInterval
|
||||
|
||||
var sc, cc int64
|
||||
setupCount, cleanupCount = &sc, &cc
|
||||
|
||||
gate := make(chan struct{}, 1)
|
||||
gate <- struct{}{} // allow the first setup through immediately
|
||||
unblock = func() { gate <- struct{}{} }
|
||||
|
||||
poolSetupTunnel = func(ctx context.Context, instanceID string) (
|
||||
eicSSHSession, func(), error) {
|
||||
select {
|
||||
case <-gate:
|
||||
case <-ctx.Done():
|
||||
return eicSSHSession{}, nil, ctx.Err()
|
||||
}
|
||||
atomic.AddInt64(&sc, 1)
|
||||
sess := eicSSHSession{
|
||||
instanceID: instanceID,
|
||||
osUser: "ubuntu",
|
||||
localPort: 10000 + int(atomic.LoadInt64(&sc)),
|
||||
keyPath: "/tmp/molecule-eic-test-" + instanceID,
|
||||
}
|
||||
cleanup := func() { atomic.AddInt64(&cc, 1) }
|
||||
return sess, cleanup, nil
|
||||
}
|
||||
|
||||
restore = func() {
|
||||
poolSetupTunnel = prev
|
||||
poolTTL = prevTTL
|
||||
poolJanitorInterval = prevJanitor
|
||||
}
|
||||
t.Cleanup(restore)
|
||||
return
|
||||
}
|
||||
|
||||
// freshPool returns an isolated pool (NOT the global) so tests run
|
||||
// independently. Stops the janitor on cleanup.
|
||||
func freshPool(t *testing.T) *eicTunnelPool {
|
||||
t.Helper()
|
||||
p := newEICTunnelPool()
go p.janitor() // start the sweeper the comment above promises; p.stop (via Cleanup) shuts it down
t.Cleanup(p.stop)
|
||||
return p
|
||||
}
|
||||
|
||||
// TestEICTunnelPool_FourOpsAmortise pins the core invariant: four
|
||||
// sequential acquire/release cycles on the same instanceID share
|
||||
// ONE underlying tunnel setup. Mutation: delete the cache hit branch
|
||||
// in acquire() → setupCount goes 1 → 4 → test fails.
|
||||
func TestEICTunnelPool_FourOpsAmortise(t *testing.T) {
|
||||
setupCount, cleanupCount, _, _ := withPoolSetupStub(t)
|
||||
// This test performs a single setup, which consumes the initial gate
// slot provided by withPoolSetupStub, so no gate refill is needed.
|
||||
poolTTL = 50 * time.Second
|
||||
pool := freshPool(t)
|
||||
ctx := context.Background()
|
||||
|
||||
for i := 0; i < 4; i++ {
|
||||
sess, done, err := pool.acquire(ctx, "i-test-1")
|
||||
if err != nil {
|
||||
t.Fatalf("op %d: acquire: %v", i, err)
|
||||
}
|
||||
if sess.instanceID != "i-test-1" {
|
||||
t.Fatalf("op %d: session has wrong instanceID: %q", i, sess.instanceID)
|
||||
}
|
||||
done(false)
|
||||
}
|
||||
|
||||
if got := atomic.LoadInt64(setupCount); got != 1 {
|
||||
t.Errorf("expected exactly 1 tunnel setup across 4 ops, got %d", got)
|
||||
}
|
||||
if got := atomic.LoadInt64(cleanupCount); got != 0 {
|
||||
t.Errorf("expected 0 cleanups while entry is hot (TTL=50s), got %d", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestEICTunnelPool_DifferentInstancesDoNotShare pins that two
|
||||
// different instanceIDs each get their own tunnel — the pool is
|
||||
// keyed on instanceID, not a single global slot.
|
||||
func TestEICTunnelPool_DifferentInstancesDoNotShare(t *testing.T) {
|
||||
setupCount, _, _, unblock := withPoolSetupStub(t)
|
||||
poolTTL = 50 * time.Second
|
||||
pool := freshPool(t)
|
||||
ctx := context.Background()
|
||||
|
||||
// First instance setup uses the initial gate slot.
|
||||
_, doneA, err := pool.acquire(ctx, "i-a")
|
||||
if err != nil {
|
||||
t.Fatalf("acquire A: %v", err)
|
||||
}
|
||||
doneA(false)
|
||||
|
||||
// Second instance needs a new slot through the gate.
|
||||
unblock()
|
||||
_, doneB, err := pool.acquire(ctx, "i-b")
|
||||
if err != nil {
|
||||
t.Fatalf("acquire B: %v", err)
|
||||
}
|
||||
doneB(false)
|
||||
|
||||
if got := atomic.LoadInt64(setupCount); got != 2 {
|
||||
t.Errorf("expected 2 setups (one per instance), got %d", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestEICTunnelPool_TTLEviction: a short TTL forces the second op
|
||||
// to build a fresh tunnel after the first expires.
|
||||
func TestEICTunnelPool_TTLEviction(t *testing.T) {
|
||||
setupCount, cleanupCount, _, unblock := withPoolSetupStub(t)
|
||||
poolTTL = 50 * time.Millisecond
|
||||
poolJanitorInterval = 1 * time.Second // keep janitor away
|
||||
pool := freshPool(t)
|
||||
ctx := context.Background()
|
||||
|
||||
_, done, err := pool.acquire(ctx, "i-ttl")
|
||||
if err != nil {
|
||||
t.Fatalf("acquire 1: %v", err)
|
||||
}
|
||||
done(false)
|
||||
|
||||
time.Sleep(80 * time.Millisecond) // past TTL
|
||||
|
||||
unblock() // allow next setup
|
||||
_, done, err = pool.acquire(ctx, "i-ttl")
|
||||
if err != nil {
|
||||
t.Fatalf("acquire 2: %v", err)
|
||||
}
|
||||
done(false)
|
||||
|
||||
if got := atomic.LoadInt64(setupCount); got != 2 {
|
||||
t.Errorf("expected 2 setups (TTL eviction between), got %d", got)
|
||||
}
|
||||
// First entry should have been cleaned up when the second
|
||||
// acquire evicted it on the slow path. Cleanup runs in a
|
||||
// goroutine; poll briefly for it to land.
|
||||
deadline := time.Now().Add(500 * time.Millisecond)
|
||||
for atomic.LoadInt64(cleanupCount) < 1 && time.Now().Before(deadline) {
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
}
|
||||
if got := atomic.LoadInt64(cleanupCount); got < 1 {
|
||||
t.Errorf("expected ≥1 cleanup (first entry evicted), got %d", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestEICTunnelPool_FailureInvalidates pins the poison-on-fault
|
||||
// behavior — fn returning a tunnel-fatal error marks the entry
|
||||
// unusable so the next acquire builds fresh.
|
||||
func TestEICTunnelPool_FailureInvalidates(t *testing.T) {
|
||||
setupCount, _, _, unblock := withPoolSetupStub(t)
|
||||
poolTTL = 50 * time.Second
|
||||
pool := freshPool(t)
|
||||
ctx := context.Background()
|
||||
|
||||
_, done, err := pool.acquire(ctx, "i-fault")
|
||||
if err != nil {
|
||||
t.Fatalf("acquire 1: %v", err)
|
||||
}
|
||||
done(true) // signal poison
|
||||
|
||||
unblock() // let the next setup through
|
||||
_, done, err = pool.acquire(ctx, "i-fault")
|
||||
if err != nil {
|
||||
t.Fatalf("acquire 2: %v", err)
|
||||
}
|
||||
done(false)
|
||||
|
||||
if got := atomic.LoadInt64(setupCount); got != 2 {
|
||||
t.Errorf("expected 2 setups (poison forced rebuild), got %d", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestEICTunnelPool_ConcurrentAcquireSingleSetup pins that N
|
||||
// concurrent acquires for the same instanceID before any release
|
||||
// only trigger ONE tunnel setup — the rest wait via pendingSetups.
|
||||
//
|
||||
// Without this guard each concurrent acquire would spawn its own
|
||||
// tunnel and the loser-cleanup would still leak refcount. Mutation:
|
||||
// delete the pendingSetups gate → setupCount goes 1 → N → fails.
|
||||
func TestEICTunnelPool_ConcurrentAcquireSingleSetup(t *testing.T) {
|
||||
setupCount, _, _, _ := withPoolSetupStub(t)
|
||||
// Pause setup so all goroutines pile into the pending slot.
|
||||
prev := poolSetupTunnel
|
||||
gate := make(chan struct{})
|
||||
poolSetupTunnel = func(ctx context.Context, instanceID string) (
|
||||
eicSSHSession, func(), error) {
|
||||
<-gate
|
||||
atomic.AddInt64(setupCount, 1)
|
||||
return eicSSHSession{instanceID: instanceID}, func() {}, nil
|
||||
}
|
||||
t.Cleanup(func() { poolSetupTunnel = prev })
|
||||
|
||||
poolTTL = 50 * time.Second
|
||||
pool := freshPool(t)
|
||||
ctx := context.Background()
|
||||
|
||||
const N = 8
|
||||
type result struct {
|
||||
done func(bool)
|
||||
err error
|
||||
}
|
||||
results := make(chan result, N)
|
||||
var startWg sync.WaitGroup
|
||||
startWg.Add(N)
|
||||
for i := 0; i < N; i++ {
|
||||
go func() {
|
||||
startWg.Done()
|
||||
_, done, err := pool.acquire(ctx, "i-concurrent")
|
||||
results <- result{done, err}
|
||||
}()
|
||||
}
|
||||
startWg.Wait()
|
||||
// give all N goroutines time to enter pool.acquire
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
close(gate)
|
||||
|
||||
for i := 0; i < N; i++ {
|
||||
r := <-results
|
||||
if r.err != nil {
|
||||
t.Fatalf("acquire %d: %v", i, r.err)
|
||||
}
|
||||
r.done(false)
|
||||
}
|
||||
|
||||
if got := atomic.LoadInt64(setupCount); got != 1 {
|
||||
t.Errorf("expected 1 setup across %d concurrent acquires, got %d", N, got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestEICTunnelPool_TTLZeroDisablesPooling pins the escape hatch:
|
||||
// poolTTL=0 means every acquire goes straight through to setup +
|
||||
// cleanup, no entry kept. Useful for tests / opt-out.
|
||||
func TestEICTunnelPool_TTLZeroDisablesPooling(t *testing.T) {
|
||||
setupCount, cleanupCount, _, unblock := withPoolSetupStub(t)
|
||||
poolTTL = 0
|
||||
pool := freshPool(t)
|
||||
ctx := context.Background()
|
||||
|
||||
_, done, err := pool.acquire(ctx, "i-ttlzero")
|
||||
if err != nil {
|
||||
t.Fatalf("acquire 1: %v", err)
|
||||
}
|
||||
done(false)
|
||||
|
||||
unblock()
|
||||
_, done, err = pool.acquire(ctx, "i-ttlzero")
|
||||
if err != nil {
|
||||
t.Fatalf("acquire 2: %v", err)
|
||||
}
|
||||
done(false)
|
||||
|
||||
if got := atomic.LoadInt64(setupCount); got != 2 {
|
||||
t.Errorf("expected 2 setups with TTL=0 (pool disabled), got %d", got)
|
||||
}
|
||||
if got := atomic.LoadInt64(cleanupCount); got != 2 {
|
||||
t.Errorf("expected 2 cleanups with TTL=0 (each release closes), got %d", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestEICTunnelPool_LRUEvictionAtCap pins the LRU defence: when the
|
||||
// pool reaches poolMaxEntries, a new acquire for an unseen
|
||||
// instanceID evicts the LRU idle entry instead of growing unbounded.
|
||||
func TestEICTunnelPool_LRUEvictionAtCap(t *testing.T) {
|
||||
setupCount, cleanupCount, _, _ := withPoolSetupStub(t)
|
||||
prev := poolMaxEntries
|
||||
poolMaxEntries = 2
|
||||
t.Cleanup(func() { poolMaxEntries = prev })
|
||||
poolTTL = 50 * time.Second
|
||||
|
||||
// Replace stub with one that doesn't gate so we can fill quickly.
|
||||
poolSetupTunnel = func(ctx context.Context, instanceID string) (
|
||||
eicSSHSession, func(), error) {
|
||||
atomic.AddInt64(setupCount, 1)
|
||||
return eicSSHSession{instanceID: instanceID}, func() {
|
||||
atomic.AddInt64(cleanupCount, 1)
|
||||
}, nil
|
||||
}
|
||||
|
||||
pool := freshPool(t)
|
||||
ctx := context.Background()
|
||||
|
||||
for _, id := range []string{"i-1", "i-2"} {
|
||||
_, done, err := pool.acquire(ctx, id)
|
||||
if err != nil {
|
||||
t.Fatalf("acquire %s: %v", id, err)
|
||||
}
|
||||
done(false)
|
||||
}
|
||||
// Both entries idle, pool at cap.
|
||||
_, done, err := pool.acquire(ctx, "i-3")
|
||||
if err != nil {
|
||||
t.Fatalf("acquire i-3: %v", err)
|
||||
}
|
||||
done(false)
|
||||
|
||||
// Wait for the goroutine'd cleanup of the evicted entry.
|
||||
deadline := time.Now().Add(500 * time.Millisecond)
|
||||
for atomic.LoadInt64(cleanupCount) < 1 && time.Now().Before(deadline) {
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
|
||||
if got := atomic.LoadInt64(setupCount); got != 3 {
|
||||
t.Errorf("expected 3 setups (one per unique instance), got %d", got)
|
||||
}
|
||||
if got := atomic.LoadInt64(cleanupCount); got < 1 {
|
||||
t.Errorf("expected ≥1 cleanup (LRU eviction), got %d", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestEICTunnelPool_PoisonedClassification pins the heuristic that
|
||||
// distinguishes tunnel-fatal errors (poison the entry) from
|
||||
// app-level errors (file not found, validation) that should NOT
|
||||
// invalidate the tunnel.
|
||||
func TestEICTunnelPool_PoisonedClassification(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
err error
|
||||
want bool
|
||||
}{
|
||||
{"nil", nil, false},
|
||||
{"file not found", errors.New("os: file does not exist"), false},
|
||||
{"validation", errors.New("invalid path: must be relative"), false},
|
||||
{"connection refused", errors.New("ssh: connect to host: connection refused"), true},
|
||||
{"connection refused upper", errors.New("Connection Refused"), true},
|
||||
{"broken pipe", errors.New("write tunnel: broken pipe"), true},
|
||||
{"permission denied", errors.New("Permission denied (publickey)"), true},
|
||||
{"auth failed", errors.New("Authentication failed"), true},
|
||||
{"connection reset", errors.New("Connection reset by peer"), true},
|
||||
{"port forward", errors.New("port forwarding failed"), true},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := fnErrIndicatesTunnelFault(tc.err)
|
||||
if got != tc.want {
|
||||
t.Errorf("fnErrIndicatesTunnelFault(%v) = %v, want %v",
|
||||
tc.err, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestEICTunnelPool_RefcountBlocksEviction pins that an entry past
|
||||
// TTL is NOT evicted while a caller still holds it — preventing
|
||||
// use-after-free in the holder.
|
||||
func TestEICTunnelPool_RefcountBlocksEviction(t *testing.T) {
|
||||
setupCount, cleanupCount, _, _ := withPoolSetupStub(t)
|
||||
poolTTL = 30 * time.Millisecond
|
||||
poolJanitorInterval = 5 * time.Millisecond
|
||||
pool := freshPool(t)
|
||||
ctx := context.Background()
|
||||
|
||||
_, done, err := pool.acquire(ctx, "i-hold")
|
||||
if err != nil {
|
||||
t.Fatalf("acquire: %v", err)
|
||||
}
|
||||
|
||||
// Sleep past TTL while holding the session. Janitor sweeps
|
||||
// every 5ms but must skip our entry because refcount=1.
|
||||
time.Sleep(80 * time.Millisecond)
|
||||
|
||||
if got := atomic.LoadInt64(cleanupCount); got != 0 {
|
||||
t.Errorf("expected 0 cleanups while holder is active, got %d", got)
|
||||
}
|
||||
|
||||
done(false)
|
||||
// Now refcount=0 and entry is past TTL; releaser triggers cleanup.
|
||||
deadline := time.Now().Add(200 * time.Millisecond)
|
||||
for atomic.LoadInt64(cleanupCount) < 1 && time.Now().Before(deadline) {
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
}
|
||||
if got := atomic.LoadInt64(cleanupCount); got != 1 {
|
||||
t.Errorf("expected 1 cleanup after release of expired entry, got %d", got)
|
||||
}
|
||||
if got := atomic.LoadInt64(setupCount); got != 1 {
|
||||
t.Errorf("setupCount tracking: got %d, want 1", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestPooledWithEICTunnel_PanicPoisonsEntry pins that a panic
|
||||
// from fn poisons the pool entry on the way out — refcount goes
|
||||
// back to zero (no leak) and the entry is marked unusable so the
|
||||
// next acquire builds fresh. Without the defer-release pattern, a
|
||||
// panic would leave refcount=1 forever and the entry would never
|
||||
// evict.
|
||||
func TestPooledWithEICTunnel_PanicPoisonsEntry(t *testing.T) {
|
||||
setupCount, _, _, _ := withPoolSetupStub(t)
|
||||
poolTTL = 50 * time.Second
|
||||
globalEICTunnelPool = newEICTunnelPool()
|
||||
t.Cleanup(globalEICTunnelPool.stop)
|
||||
|
||||
func() {
|
||||
defer func() {
|
||||
if r := recover(); r == nil {
|
||||
t.Errorf("expected panic to bubble up, got nil")
|
||||
}
|
||||
}()
|
||||
_ = pooledWithEICTunnel(context.Background(), "i-panic",
|
||||
func(s eicSSHSession) error { panic("boom") })
|
||||
}()
|
||||
|
||||
// Swap in an ungated stub so the next setup can run.
|
||||
prev := poolSetupTunnel
|
||||
poolSetupTunnel = func(ctx context.Context, instanceID string) (
|
||||
eicSSHSession, func(), error) {
|
||||
atomic.AddInt64(setupCount, 1)
|
||||
return eicSSHSession{instanceID: instanceID}, func() {}, nil
|
||||
}
|
||||
t.Cleanup(func() { poolSetupTunnel = prev })
|
||||
|
||||
// Next acquire must build fresh — entry was poisoned by panic.
|
||||
if err := pooledWithEICTunnel(context.Background(), "i-panic",
|
||||
func(s eicSSHSession) error { return nil }); err != nil {
|
||||
t.Fatalf("post-panic acquire: %v", err)
|
||||
}
|
||||
if got := atomic.LoadInt64(setupCount); got != 2 {
|
||||
t.Errorf("expected 2 setups (panic poisoned, rebuild), got %d", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestPooledWithEICTunnel_PreservesFnErr pins that errors from the
|
||||
// inner fn pass through to the caller verbatim — pool wrapping
|
||||
// should not swallow or transform error semantics for app code.
|
||||
func TestPooledWithEICTunnel_PreservesFnErr(t *testing.T) {
|
||||
withPoolSetupStub(t)
|
||||
poolTTL = 50 * time.Second
|
||||
|
||||
// Reset the global pool so this test is isolated from any prior
|
||||
// test that may have populated it.
|
||||
globalEICTunnelPool = newEICTunnelPool()
|
||||
|
||||
want := errors.New("file does not exist")
|
||||
got := pooledWithEICTunnel(context.Background(), "i-fn-err",
|
||||
func(s eicSSHSession) error { return want })
|
||||
if !errors.Is(got, want) {
|
||||
t.Errorf("pooledWithEICTunnel returned %v, want %v", got, want)
|
||||
}
|
||||
}
|
||||
@ -23,6 +23,16 @@ import (
|
||||
// workspace-scoped filtering (handler falls back to unfiltered list).
|
||||
type RuntimeLookup func(workspaceID string) (string, error)
|
||||
|
||||
// InstanceIDLookup resolves a workspace's EC2 instance_id by ID. Empty
|
||||
// string means the workspace is not on the SaaS (EC2-per-workspace)
|
||||
// backend — i.e. either local-Docker or pre-provision. The handler uses
|
||||
// this to dispatch plugin install/uninstall to the EIC SSH path
|
||||
// (template_files_eic.go primitive) when a workspace runs on its own EC2
|
||||
// and there's no local Docker container to exec into. A nil lookup keeps
|
||||
// the handler on the local-Docker code path only — same shape as the
|
||||
// pre-fix behaviour.
|
||||
type InstanceIDLookup func(workspaceID string) (string, error)
|
||||
|
||||
// pluginSources is the contract PluginsHandler uses to talk to the
|
||||
// plugin source registry. Extracted as an interface (#1814) so tests can
|
||||
// substitute a stub without standing up the real *plugins.Registry +
|
||||
@ -46,10 +56,11 @@ var _ pluginSources = (*plugins.Registry)(nil)
|
||||
|
||||
// PluginsHandler manages the plugin registry and per-workspace plugin installation.
|
||||
type PluginsHandler struct {
|
||||
pluginsDir string // host path to plugins/ registry
|
||||
docker *client.Client // Docker client for container operations
|
||||
restartFunc func(string) // auto-restart workspace after install/uninstall
|
||||
runtimeLookup RuntimeLookup // workspace_id → runtime (optional)
|
||||
pluginsDir string // host path to plugins/ registry
|
||||
docker *client.Client // Docker client for container operations
|
||||
restartFunc func(string) // auto-restart workspace after install/uninstall
|
||||
runtimeLookup RuntimeLookup // workspace_id → runtime (optional)
|
||||
instanceIDLookup InstanceIDLookup // workspace_id → EC2 instance_id (optional)
|
||||
// sources narrowed from `*plugins.Registry` to the pluginSources
|
||||
// interface (#1814) so tests can substitute a stub. Production
|
||||
// callers still pass *plugins.Registry, which satisfies the
|
||||
@ -90,6 +101,15 @@ func (h *PluginsHandler) WithRuntimeLookup(lookup RuntimeLookup) *PluginsHandler
|
||||
return h
|
||||
}
|
||||
|
||||
// WithInstanceIDLookup installs a workspace → EC2 instance_id resolver.
|
||||
// Wired by the router so production hits a real DB; tests stub it. The
|
||||
// install/uninstall pipeline uses this to dispatch to the EIC SSH path
|
||||
// for SaaS workspaces (no local Docker container to exec into).
|
||||
func (h *PluginsHandler) WithInstanceIDLookup(lookup InstanceIDLookup) *PluginsHandler {
|
||||
h.instanceIDLookup = lookup
|
||||
return h
|
||||
}
|
||||
|
||||
// pluginInfo is the API response for a plugin.
|
||||
type pluginInfo struct {
|
||||
Name string `json:"name"`
|
||||
|
||||
@ -100,6 +100,13 @@ func (h *PluginsHandler) Install(c *gin.Context) {
|
||||
}
|
||||
|
||||
// Uninstall handles DELETE /workspaces/:id/plugins/:name — removes a plugin.
|
||||
//
|
||||
// Dispatch order mirrors Install's deliverToContainer:
|
||||
//
|
||||
// 1. Local Docker container up → exec rm -rf via existing helpers.
|
||||
// 2. SaaS workspace (instance_id set) → ssh sudo rm -rf via EIC.
|
||||
// 3. external runtime → 422 (caller manages its own plugin dir).
|
||||
// 4. Neither → 503.
|
||||
func (h *PluginsHandler) Uninstall(c *gin.Context) {
|
||||
workspaceID := c.Param("id")
|
||||
pluginName := c.Param("name")
|
||||
@ -120,12 +127,24 @@ func (h *PluginsHandler) Uninstall(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
containerName := h.findRunningContainer(ctx, workspaceID)
|
||||
if containerName == "" {
|
||||
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"})
|
||||
if containerName := h.findRunningContainer(ctx, workspaceID); containerName != "" {
|
||||
h.uninstallViaDocker(ctx, c, workspaceID, pluginName, containerName)
|
||||
return
|
||||
}
|
||||
|
||||
if instanceID, runtime := h.lookupSaaSDispatch(workspaceID); instanceID != "" {
|
||||
h.uninstallViaEIC(ctx, c, workspaceID, pluginName, instanceID, runtime)
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"})
|
||||
}
|
||||
|
||||
// uninstallViaDocker holds the historical Docker-exec uninstall flow.
|
||||
// Extracted out of Uninstall so the new SaaS dispatch reads cleanly and
|
||||
// the two backend bodies are visibly symmetric (same steps, different
|
||||
// transport).
|
||||
func (h *PluginsHandler) uninstallViaDocker(ctx context.Context, c *gin.Context, workspaceID, pluginName, containerName string) {
|
||||
// Read the plugin's manifest BEFORE deletion to learn which skill dirs
|
||||
// it owns, so we can clean them out of /configs/skills/ and avoid the
|
||||
// auto-restart re-mounting them. Issue #106.
|
||||
@ -177,6 +196,61 @@ func (h *PluginsHandler) Uninstall(c *gin.Context) {
|
||||
})
|
||||
}
|
||||
|
||||
// uninstallViaEIC removes a plugin from a SaaS workspace EC2 over SSH.
|
||||
// Symmetric with uninstallViaDocker:
|
||||
//
|
||||
// - Read manifest (best-effort, missing plugin.yaml = no skills to clean).
|
||||
// - Skip CLAUDE.md awk-strip for now: that file lives at
|
||||
// <runtime-config-prefix>/CLAUDE.md on the host and the same awk script
|
||||
// would work over ssh, but the file is rewritten on workspace restart
|
||||
// by the runtime adapter anyway, so the marker either stays harmless
|
||||
// or gets dropped on the next install/restart cycle. Tracked as
|
||||
// follow-up; not a regression vs the docker path's semantics here.
|
||||
// - rm -rf the plugin dir.
|
||||
// - Trigger restart.
|
||||
//
|
||||
// We intentionally don't try to remove /configs/skills/<skill> entries
|
||||
// over ssh because the same /configs is bind-mounted into the runtime
|
||||
// container; the agent's own start-up adapter rewrites that tree from
|
||||
// the live plugin set, so a stale skill dir for an uninstalled plugin
|
||||
// is cleaned up at restart. The docker path removes them eagerly only
|
||||
// because docker-exec is cheap. We can mirror that later if a real bug
|
||||
// surfaces, but adding two extra ssh round-trips per uninstall today
|
||||
// would be churn for no behavioural win.
|
||||
func (h *PluginsHandler) uninstallViaEIC(ctx context.Context, c *gin.Context, workspaceID, pluginName, instanceID, runtime string) {
|
||||
// Read manifest first (best-effort) — we don't currently use the
|
||||
// skills list on the SaaS path (see comment above), but reading it
|
||||
// keeps the parsing path warm and lets log lines distinguish "we
|
||||
// deleted a real plugin" from "user asked us to delete something
|
||||
// that wasn't there." Errors here are swallowed: missing manifest
|
||||
// must not block uninstall.
|
||||
if data, err := readPluginManifestViaEIC(ctx, instanceID, runtime, pluginName); err == nil && len(data) > 0 {
|
||||
info := parseManifestYAML(pluginName, data)
|
||||
if len(info.Skills) > 0 {
|
||||
log.Printf("Plugin uninstall: %s declared skills=%v (left to runtime restart to clean)", pluginName, info.Skills)
|
||||
}
|
||||
}
|
||||
|
||||
if err := uninstallPluginViaEIC(ctx, instanceID, runtime, pluginName); err != nil {
|
||||
log.Printf("Plugin uninstall: EIC rm failed for %s on %s: %v", pluginName, workspaceID, err)
|
||||
c.JSON(http.StatusBadGateway, gin.H{"error": "failed to remove plugin from workspace EC2"})
|
||||
return
|
||||
}
|
||||
|
||||
if h.restartFunc != nil {
|
||||
go func() {
|
||||
time.Sleep(2 * time.Second)
|
||||
h.restartFunc(workspaceID)
|
||||
}()
|
||||
}
|
||||
|
||||
log.Printf("Plugin uninstall: %s from workspace %s (restarting via SaaS path)", pluginName, workspaceID)
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"status": "uninstalled",
|
||||
"plugin": pluginName,
|
||||
})
|
||||
}
|
||||
|
||||
// Download handles GET /workspaces/:id/plugins/:name/download?source=<scheme://spec>
|
||||
//
|
||||
// Phase 30.3 — stream the named plugin as a gzipped tarball so remote
|
||||
|
||||
249
workspace-server/internal/handlers/plugins_install_eic.go
Normal file
@ -0,0 +1,249 @@
|
||||
package handlers
|
||||
|
||||
// plugins_install_eic.go — SaaS (EC2-per-workspace) plugin install + uninstall
|
||||
// over the EIC SSH primitive that template_files_eic.go already plumbs. Pairs
|
||||
// with the local-Docker path in plugins_install.go / plugins_install_pipeline.go,
|
||||
// closing the 🔴 docker-only row in docs/architecture/backends.md.
|
||||
//
|
||||
// Architecture note: every operation goes through `withEICTunnel` (ephemeral
|
||||
// keypair → AWS push → tunnel → ssh). This file owns the plugin-shaped
|
||||
// remote commands; the tunnel mechanics live in template_files_eic.go so a
|
||||
// fix to the dance lands in one place.
|
||||
//
|
||||
// Why direct host write (not docker cp via SSH): on the workspace EC2, the
|
||||
// runtime's managed-config dir (/configs for claude-code, /home/ubuntu/.hermes
|
||||
// for hermes — see workspaceFilePathPrefix) is bind-mounted into the
|
||||
// runtime's container by cloud-init. Writing into <prefix>/plugins/<name>/
|
||||
// on the host is exactly what the runtime sees on the next start. No
|
||||
// docker-cp needed, and we avoid coupling to any specific container layout
|
||||
// inside the workspace EC2.
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// eicPluginOpTimeout bounds the whole EIC-tunnel + ssh + tar-pipe dance
|
||||
// for a plugin install or uninstall. Larger than eicFileOpTimeout (30s)
|
||||
// because plugin trees can carry skill markdown, MCP server binaries,
|
||||
// and config files — easily a few MB through ssh + sudo on a fresh
|
||||
// tunnel. 2 min gives headroom on a cold tunnel; the install pipeline's
|
||||
// PLUGIN_INSTALL_FETCH_TIMEOUT (5 min default) still bounds the outer
|
||||
// request.
|
||||
const eicPluginOpTimeout = 2 * time.Minute
|
||||
|
||||
// hostPluginPath returns the absolute directory on the workspace EC2
|
||||
// where /configs/plugins/<name>/ lives for a given runtime. Keeps the
|
||||
// per-runtime indirection in one place (mirrors resolveWorkspaceRootPath
|
||||
// in template_files_eic.go) so future runtimes only edit
|
||||
// workspaceFilePathPrefix.
|
||||
//
|
||||
// The plugin name is shellQuote-wrapped at the call site, not here,
|
||||
// because a couple of callers want the unquoted form for log lines.
|
||||
func hostPluginPath(runtime, pluginName string) string {
|
||||
base := resolveWorkspaceRootPath(runtime, "/configs")
|
||||
return filepath.Join(base, "plugins", pluginName)
|
||||
}
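// Illustrative values (not part of this change), assuming
// resolveWorkspaceRootPath maps runtimes as described in the header
// comment above; "github-mcp" is a hypothetical plugin name:
//
//	hostPluginPath("claude-code", "github-mcp") // → /configs/plugins/github-mcp
//	hostPluginPath("hermes", "github-mcp")      // → /home/ubuntu/.hermes/plugins/github-mcp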
|
||||
|
||||
// buildPluginInstallShell returns the remote command for receiving a tar.gz
|
||||
// stream on stdin and unpacking it into <hostPluginDir>/, owned by the agent
|
||||
// user (uid 1000 — matches the local-Docker path's chown 1000:1000).
|
||||
//
|
||||
// The script is a single `sudo sh -c '...'` so the tar-receive + chown run
|
||||
// under one privileged invocation; ssh-as-ubuntu has passwordless sudo on
|
||||
// the standard tenant AMI.
|
||||
//
|
||||
// - rm -rf clears any prior install of the same plugin (idempotent
|
||||
// reinstall — the user re-clicked Install or version-bumped the source).
|
||||
// - mkdir -p makes the parent dir (host /configs is root-owned + always
|
||||
// present; the per-plugin dir is what we're creating).
|
||||
// - tar -xzf - reads stdin (the gzipped tar). --no-same-owner keeps the
|
||||
// archive's tar-recorded uid/gid out of the picture; the chown -R
|
||||
// after is the canonical owner.
|
||||
// - chown -R 1000:1000 matches the local-Docker handler's exec at
|
||||
// plugins_install_pipeline.go:273 — agent user inside the runtime
|
||||
// container is uid 1000 on every workspace-template image we ship.
|
||||
//
|
||||
// shellQuote on the path is defence-in-depth: the path is composed from
|
||||
// a runtime allowlist (workspaceFilePathPrefix) + validated plugin name,
|
||||
// so traversal is already blocked.
|
||||
func buildPluginInstallShell(hostPluginDir string) string {
|
||||
q := shellQuote(hostPluginDir)
|
||||
return fmt.Sprintf(
|
||||
"sudo -n sh -c 'rm -rf %s && mkdir -p %s && tar -xzf - --no-same-owner -C %s && chown -R 1000:1000 %s'",
|
||||
q, q, q, q,
|
||||
)
|
||||
}
|
||||
|
||||
// buildPluginUninstallShell returns the remote command for `sudo -n rm -rf
|
||||
// <hostPluginDir>`. -rf (vs -f) is intentional here, unlike buildRmShell:
|
||||
// uninstall really does need to remove the plugin's whole subtree.
|
||||
func buildPluginUninstallShell(hostPluginDir string) string {
|
||||
return fmt.Sprintf("sudo -n rm -rf %s", shellQuote(hostPluginDir))
|
||||
}
|
||||
|
||||
// buildPluginManifestReadShell returns the remote command for reading the
|
||||
// plugin's manifest (plugin.yaml). Mirrors buildCatShell — swallows the
|
||||
// missing-file stderr so the missing-manifest case lands as empty stdout
|
||||
// + non-zero exit, which uninstall translates to "no skills to clean".
|
||||
func buildPluginManifestReadShell(hostPluginDir string) string {
|
||||
return fmt.Sprintf("sudo -n cat %s/plugin.yaml 2>/dev/null", shellQuote(hostPluginDir))
|
||||
}
|
||||
|
||||
// installPluginViaEIC pushes a staged plugin directory to a SaaS workspace
|
||||
// EC2 via the EIC SSH tunnel. On success the plugin lives at
|
||||
// <runtime-config-prefix>/plugins/<name>/ on the host, owned by 1000:1000,
|
||||
// ready for the next workspace restart to pick up.
|
||||
//
|
||||
// The caller (deliverToContainer SaaS branch) owns:
|
||||
// - the staged dir (created + cleaned up by resolveAndStage)
|
||||
// - the workspace restart trigger after install
|
||||
//
|
||||
// Errors here are wrapped with the instance + runtime so triage can tell
|
||||
// "tunnel failed" from "tar payload corrupt" without grep-ing the EC2's
|
||||
// auth.log.
|
||||
var installPluginViaEIC = realInstallPluginViaEIC
|
||||
|
||||
func realInstallPluginViaEIC(ctx context.Context, instanceID, runtime, pluginName, stagedDir string) error {
|
||||
if instanceID == "" {
|
||||
return fmt.Errorf("installPluginViaEIC: empty instance_id")
|
||||
}
|
||||
if err := validatePluginName(pluginName); err != nil {
|
||||
return fmt.Errorf("installPluginViaEIC: %w", err)
|
||||
}
|
||||
|
||||
// Build the tar.gz payload up-front so a tar-walk failure is surfaced
|
||||
// before we open the EIC tunnel — saves a 1-2s tunnel setup on every
|
||||
// "broken plugin tree" case.
|
||||
var payload bytes.Buffer
|
||||
gz := gzip.NewWriter(&payload)
|
||||
tw := tar.NewWriter(gz)
|
||||
if err := streamDirAsTar(stagedDir, tw); err != nil {
|
||||
return fmt.Errorf("installPluginViaEIC: tar pack: %w", err)
|
||||
}
|
||||
if err := tw.Close(); err != nil {
|
||||
return fmt.Errorf("installPluginViaEIC: tar close: %w", err)
|
||||
}
|
||||
if err := gz.Close(); err != nil {
|
||||
return fmt.Errorf("installPluginViaEIC: gzip close: %w", err)
|
||||
}
|
||||
|
||||
hostDir := hostPluginPath(runtime, pluginName)
|
||||
cmd := buildPluginInstallShell(hostDir)
|
||||
|
||||
ctx, cancel := context.WithTimeout(ctx, eicPluginOpTimeout)
|
||||
defer cancel()
|
||||
|
||||
return withEICTunnel(ctx, instanceID, func(s eicSSHSession) error {
|
||||
sshCmd := exec.CommandContext(ctx, "ssh", s.sshArgs(cmd)...)
|
||||
sshCmd.Env = os.Environ()
|
||||
sshCmd.Stdin = bytes.NewReader(payload.Bytes())
|
||||
var stderr bytes.Buffer
|
||||
sshCmd.Stderr = &stderr
|
||||
if err := sshCmd.Run(); err != nil {
|
||||
return fmt.Errorf(
|
||||
"ssh install: %w (instance=%s runtime=%s plugin=%s payload=%dB stderr=%s)",
|
||||
err, instanceID, runtime, pluginName, payload.Len(),
|
||||
strings.TrimSpace(stderr.String()),
|
||||
)
|
||||
}
|
||||
log.Printf(
|
||||
"installPluginViaEIC: ws instance=%s runtime=%s plugin=%s payload=%dB → %s",
|
||||
instanceID, runtime, pluginName, payload.Len(), hostDir,
|
||||
)
|
||||
return nil
|
||||
})
|
||||
}
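// Illustrative sketch (not part of this change): because installPluginViaEIC
// is a package-level var, a handler test can swap it for a recording stub
// instead of opening a real tunnel. Would live in a _test.go file; the
// helper name is hypothetical.
func stubInstallPluginViaEIC(t *testing.T) *[]string {
	t.Helper()
	var calls []string
	prev := installPluginViaEIC
	installPluginViaEIC = func(ctx context.Context, instanceID, runtime, pluginName, stagedDir string) error {
		calls = append(calls, instanceID+"/"+pluginName)
		return nil
	}
	t.Cleanup(func() { installPluginViaEIC = prev })
	return &calls
}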
|
||||
|
||||
// uninstallPluginViaEIC removes the plugin's directory from the workspace
|
||||
// EC2 via SSH. Symmetric with installPluginViaEIC but no payload — the
|
||||
// remote command is a single `rm -rf`.
|
||||
//
|
||||
// Best-effort by design: the local-Docker path also doesn't fail
|
||||
// uninstall on a missing directory (the pre-existing exec returns 0 when
|
||||
// the dir is absent), so we mirror that here. Real ssh-layer failures
|
||||
// (tunnel down, sudo denied) still propagate.
|
||||
var uninstallPluginViaEIC = realUninstallPluginViaEIC
|
||||
|
||||
func realUninstallPluginViaEIC(ctx context.Context, instanceID, runtime, pluginName string) error {
|
||||
if instanceID == "" {
|
||||
return fmt.Errorf("uninstallPluginViaEIC: empty instance_id")
|
||||
}
|
||||
if err := validatePluginName(pluginName); err != nil {
|
||||
return fmt.Errorf("uninstallPluginViaEIC: %w", err)
|
||||
}
|
||||
|
||||
hostDir := hostPluginPath(runtime, pluginName)
|
||||
cmd := buildPluginUninstallShell(hostDir)
|
||||
|
||||
ctx, cancel := context.WithTimeout(ctx, eicPluginOpTimeout)
|
||||
defer cancel()
|
||||
|
||||
return withEICTunnel(ctx, instanceID, func(s eicSSHSession) error {
|
||||
sshCmd := exec.CommandContext(ctx, "ssh", s.sshArgs(cmd)...)
|
||||
sshCmd.Env = os.Environ()
|
||||
var stderr bytes.Buffer
|
||||
sshCmd.Stderr = &stderr
|
||||
if err := sshCmd.Run(); err != nil {
|
||||
return fmt.Errorf(
|
||||
"ssh rm: %w (instance=%s runtime=%s plugin=%s stderr=%s)",
|
||||
err, instanceID, runtime, pluginName,
|
||||
strings.TrimSpace(stderr.String()),
|
||||
)
|
||||
}
|
||||
log.Printf(
|
||||
"uninstallPluginViaEIC: ws instance=%s runtime=%s plugin=%s → removed %s",
|
||||
instanceID, runtime, pluginName, hostDir,
|
||||
)
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
// readPluginManifestViaEIC reads the plugin's plugin.yaml from the
|
||||
// workspace EC2 so uninstall can learn the skills list to clean up.
|
||||
// Returns ("", nil) when the manifest doesn't exist (best-effort: the
|
||||
// local-Docker path treats a missing manifest as "no skills to remove",
|
||||
// not a failure).
|
||||
var readPluginManifestViaEIC = realReadPluginManifestViaEIC
|
||||
|
||||
func realReadPluginManifestViaEIC(ctx context.Context, instanceID, runtime, pluginName string) ([]byte, error) {
|
||||
if instanceID == "" {
|
||||
return nil, fmt.Errorf("readPluginManifestViaEIC: empty instance_id")
|
||||
}
|
||||
if err := validatePluginName(pluginName); err != nil {
|
||||
return nil, fmt.Errorf("readPluginManifestViaEIC: %w", err)
|
||||
}
|
||||
|
||||
hostDir := hostPluginPath(runtime, pluginName)
|
||||
cmd := buildPluginManifestReadShell(hostDir)
|
||||
|
||||
ctx, cancel := context.WithTimeout(ctx, eicPluginOpTimeout)
|
||||
defer cancel()
|
||||
|
||||
var out []byte
|
||||
runErr := withEICTunnel(ctx, instanceID, func(s eicSSHSession) error {
|
||||
sshCmd := exec.CommandContext(ctx, "ssh", s.sshArgs(cmd)...)
|
||||
sshCmd.Env = os.Environ()
|
||||
var stdout, stderr bytes.Buffer
|
||||
sshCmd.Stdout = &stdout
|
||||
sshCmd.Stderr = &stderr
|
||||
// Don't fail on non-zero exit: missing-manifest case returns 1
|
||||
// from cat with empty stdout, which is the "no skills" signal.
|
||||
_ = sshCmd.Run()
|
||||
out = stdout.Bytes()
|
||||
return nil
|
||||
})
|
||||
if runErr != nil {
|
||||
return nil, runErr
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
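For orientation, here is a minimal sketch of how these helpers are meant to compose on the SaaS uninstall path. The handler-side glue is not part of this hunk, so the surrounding function, the yaml import, and the manifest struct below are illustrative assumptions, not the shipped code:

// Sketch only — assumes "gopkg.in/yaml.v3" is imported as yaml and that the
// manifest carries a top-level `skills:` list. Mirrors the documented flow:
// read the manifest best-effort, learn the skills to clean up, then rm -rf.
func uninstallViaEICSketch(ctx context.Context, instanceID, runtime, plugin string) ([]string, error) {
    raw, err := readPluginManifestViaEIC(ctx, instanceID, runtime, plugin)
    if err != nil {
        return nil, err // tunnel-level failure: propagate, matching the real helpers
    }
    var manifest struct {
        Skills []string `yaml:"skills"`
    }
    // Empty raw (missing manifest) simply yields zero skills — the
    // best-effort shape the comments above describe.
    _ = yaml.Unmarshal(raw, &manifest)
    if err := uninstallPluginViaEIC(ctx, instanceID, runtime, plugin); err != nil {
        return nil, err
    }
    return manifest.Skills, nil
}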
505
workspace-server/internal/handlers/plugins_install_eic_test.go
Normal file
@ -0,0 +1,505 @@
package handlers

import (
    "archive/tar"
    "bytes"
    "compress/gzip"
    "context"
    "errors"
    "io"
    "net/http"
    "net/http/httptest"
    "os"
    "path/filepath"
    "strings"
    "testing"

    "github.com/DATA-DOG/go-sqlmock"
    "github.com/gin-gonic/gin"
)
// expectAllowlistAllowAll programs the package-shared withMockDB sqlmock
// so the org-allowlist gate (org_plugin_allowlist.go) returns "allow-all"
// for the duration of one Install call. The gate fires three queries —
// resolveOrgID, allowlist EXISTS, allowlist COUNT — and we satisfy each
// with the empty/zero shape that means "no allowlist configured."
//
// Without this, tests that exercise the full Install flow panic on a
// nil DB. The handlers package already ships withMockDB in
// tokens_sqlmock_test.go; we just layer the allowlist-specific
// expectations on top.
func expectAllowlistAllowAll(mock sqlmock.Sqlmock) {
    mock.MatchExpectationsInOrder(false)
    mock.ExpectQuery(`SELECT parent_id FROM workspaces WHERE id`).
        WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(nil))
    mock.ExpectQuery(`SELECT EXISTS`).
        WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(false))
    mock.ExpectQuery(`SELECT COUNT\(\*\) FROM org_plugin_allowlist`).
        WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
}

// stagePluginRegistry creates a single-plugin registry under dir so the
// install handler's local resolver can find it. Returns the path to the
// plugin dir for any caller that wants to assert tar contents.
//
// Centralised so a future tweak to the registry shape (e.g. plugin.yaml
// schema bump) only updates one place. Tests use the source spec
// `local://<name>` which the local resolver maps to <dir>/<name>/.
func stagePluginRegistry(t *testing.T, dir, name string) string {
    t.Helper()
    pluginDir := filepath.Join(dir, name)
    if err := os.Mkdir(pluginDir, 0755); err != nil {
        t.Fatalf("mkdir plugin dir: %v", err)
    }
    manifest := "name: " + name + "\nversion: \"1.0.0\"\ndescription: SaaS dispatch test plugin\n"
    if err := os.WriteFile(filepath.Join(pluginDir, "plugin.yaml"), []byte(manifest), 0644); err != nil {
        t.Fatalf("write plugin.yaml: %v", err)
    }
    if err := os.WriteFile(filepath.Join(pluginDir, "rule.md"), []byte("# rule\n"), 0644); err != nil {
        t.Fatalf("write rule.md: %v", err)
    }
    return pluginDir
}
// stubInstallPluginViaEIC swaps the package-level installPluginViaEIC for
// the duration of the test; restored by t.Cleanup. Mirrors the existing
// withEICTunnel stub pattern (template_files_eic_dispatch_test.go).
func stubInstallPluginViaEIC(t *testing.T, fn func(ctx context.Context, instanceID, runtime, pluginName, stagedDir string) error) {
    t.Helper()
    prev := installPluginViaEIC
    installPluginViaEIC = fn
    t.Cleanup(func() { installPluginViaEIC = prev })
}

func stubUninstallPluginViaEIC(t *testing.T, fn func(ctx context.Context, instanceID, runtime, pluginName string) error) {
    t.Helper()
    prev := uninstallPluginViaEIC
    uninstallPluginViaEIC = fn
    t.Cleanup(func() { uninstallPluginViaEIC = prev })
}

func stubReadPluginManifestViaEIC(t *testing.T, fn func(ctx context.Context, instanceID, runtime, pluginName string) ([]byte, error)) {
    t.Helper()
    prev := readPluginManifestViaEIC
    readPluginManifestViaEIC = fn
    t.Cleanup(func() { readPluginManifestViaEIC = prev })
}
// ---------- pure-function shell shape ----------

func TestBuildPluginInstallShell_QuotesPath(t *testing.T) {
    got := buildPluginInstallShell("/configs/plugins/my-plugin")
    want := "sudo -n sh -c 'rm -rf '/configs/plugins/my-plugin' && mkdir -p '/configs/plugins/my-plugin' && tar -xzf - --no-same-owner -C '/configs/plugins/my-plugin' && chown -R 1000:1000 '/configs/plugins/my-plugin''"
    if got != want {
        t.Errorf("buildPluginInstallShell mismatch:\n got %q\nwant %q", got, want)
    }
}

func TestBuildPluginUninstallShell_QuotesPath(t *testing.T) {
    got := buildPluginUninstallShell("/configs/plugins/my-plugin")
    want := "sudo -n rm -rf '/configs/plugins/my-plugin'"
    if got != want {
        t.Errorf("buildPluginUninstallShell mismatch:\n got %q\nwant %q", got, want)
    }
}

func TestBuildPluginManifestReadShell_QuotesPath(t *testing.T) {
    got := buildPluginManifestReadShell("/configs/plugins/my-plugin")
    want := "sudo -n cat '/configs/plugins/my-plugin'/plugin.yaml 2>/dev/null"
    if got != want {
        t.Errorf("buildPluginManifestReadShell mismatch:\n got %q\nwant %q", got, want)
    }
}

func TestHostPluginPath_PerRuntime(t *testing.T) {
    cases := []struct {
        runtime string
        plugin  string
        want    string
    }{
        {"claude-code", "browser-automation", "/configs/plugins/browser-automation"},
        {"hermes", "browser-automation", "/home/ubuntu/.hermes/plugins/browser-automation"},
        {"langgraph", "browser-automation", "/opt/configs/plugins/browser-automation"},
        // Unknown / empty runtime falls back to /configs (containerized
        // user-data layout) so a future runtime added to workspaces table
        // without a workspaceFilePathPrefix entry doesn't blow up the
        // install path silently.
        {"", "browser-automation", "/configs/plugins/browser-automation"},
        {"some-future-runtime", "x", "/configs/plugins/x"},
    }
    for _, c := range cases {
        t.Run(c.runtime+"/"+c.plugin, func(t *testing.T) {
            got := hostPluginPath(c.runtime, c.plugin)
            if got != c.want {
                t.Errorf("hostPluginPath(%q, %q) = %q, want %q", c.runtime, c.plugin, got, c.want)
            }
        })
    }
}
// ---------- dispatch: install ----------

// TestPluginInstall_SaaS_DispatchesToEIC — the most-load-bearing test in
// this file. With h.docker == nil and instanceIDLookup returning a real
// instance_id, Install MUST push the staged plugin to the EC2 over EIC
// (not 503). Asserts the EIC stub is called with the right (instance,
// runtime, plugin) tuple AND that the staged dir has the manifest +
// rule files we put there — proves the staging side wasn't bypassed.
func TestPluginInstall_SaaS_DispatchesToEIC(t *testing.T) {
    registry := t.TempDir()
    stagePluginRegistry(t, registry, "browser-automation")

    type capture struct {
        called      bool
        instanceID  string
        runtime     string
        pluginName  string
        stagedFiles []string
    }
    var got capture

    stubInstallPluginViaEIC(t, func(ctx context.Context, instanceID, runtime, pluginName, stagedDir string) error {
        got.called = true
        got.instanceID = instanceID
        got.runtime = runtime
        got.pluginName = pluginName
        entries, err := os.ReadDir(stagedDir)
        if err != nil {
            t.Fatalf("read staged dir: %v", err)
        }
        for _, e := range entries {
            got.stagedFiles = append(got.stagedFiles, e.Name())
        }
        return nil
    })

    mock, cleanup := withMockDB(t)
    defer cleanup()
    expectAllowlistAllowAll(mock)

    h := NewPluginsHandler(registry, nil, nil).
        WithRuntimeLookup(func(string) (string, error) { return "claude-code", nil }).
        WithInstanceIDLookup(func(string) (string, error) { return "i-0e0951a3cfd9bbf75", nil })

    w := httptest.NewRecorder()
    c, _ := gin.CreateTestContext(w)
    c.Params = gin.Params{{Key: "id", Value: "c7244ed9-f623-4cba-8873-020e5c9fe104"}}
    c.Request = httptest.NewRequest(
        "POST",
        "/workspaces/c7244ed9-f623-4cba-8873-020e5c9fe104/plugins",
        bytes.NewBufferString(`{"source":"local://browser-automation"}`),
    )
    c.Request.Header.Set("Content-Type", "application/json")

    h.Install(c)

    if w.Code != http.StatusOK {
        t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
    }
    if !got.called {
        t.Fatalf("installPluginViaEIC was not called")
    }
    if got.instanceID != "i-0e0951a3cfd9bbf75" {
        t.Errorf("instanceID = %q, want i-0e0951a3cfd9bbf75", got.instanceID)
    }
    if got.runtime != "claude-code" {
        t.Errorf("runtime = %q, want claude-code", got.runtime)
    }
    if got.pluginName != "browser-automation" {
        t.Errorf("pluginName = %q, want browser-automation", got.pluginName)
    }
    // Staged dir must carry the resolver's actual fetch — manifest + rule.
    // Anything missing here means the stage step was bypassed.
    hasManifest, hasRule := false, false
    for _, f := range got.stagedFiles {
        if f == "plugin.yaml" {
            hasManifest = true
        }
        if f == "rule.md" {
            hasRule = true
        }
    }
    if !hasManifest || !hasRule {
        t.Errorf("staged dir missing files: %v (want plugin.yaml + rule.md)", got.stagedFiles)
    }
}
// TestPluginInstall_SaaS_PropagatesEICError — when the EIC push fails
// (tunnel down, sudo denied), Install MUST surface 502 rather than swallow
// the error and report 200. 502 is the right status for "we tried, the
// remote side wasn't there" — distinct from 503 ("nothing wired") and
// 500 ("our bug"). The body deliberately doesn't echo the underlying
// error string (would leak ssh stderr / instance metadata).
func TestPluginInstall_SaaS_PropagatesEICError(t *testing.T) {
    registry := t.TempDir()
    stagePluginRegistry(t, registry, "browser-automation")

    mock, cleanup := withMockDB(t)
    defer cleanup()
    expectAllowlistAllowAll(mock)

    stubInstallPluginViaEIC(t, func(ctx context.Context, instanceID, runtime, pluginName, stagedDir string) error {
        return errors.New("ssh: tunnel exited 255")
    })

    h := NewPluginsHandler(registry, nil, nil).
        WithRuntimeLookup(func(string) (string, error) { return "claude-code", nil }).
        WithInstanceIDLookup(func(string) (string, error) { return "i-aaaa", nil })

    w := httptest.NewRecorder()
    c, _ := gin.CreateTestContext(w)
    c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
    c.Request = httptest.NewRequest(
        "POST",
        "/workspaces/ws-1/plugins",
        bytes.NewBufferString(`{"source":"local://browser-automation"}`),
    )
    c.Request.Header.Set("Content-Type", "application/json")

    h.Install(c)

    if w.Code != http.StatusBadGateway {
        t.Errorf("expected 502 for EIC failure, got %d: %s", w.Code, w.Body.String())
    }
    if strings.Contains(w.Body.String(), "tunnel exited") {
        t.Errorf("response body must not echo raw EIC error: %s", w.Body.String())
    }
}
// TestPluginInstall_NoBackends_Returns503 — lookup is wired but returns
// empty instance_id (e.g. workspace pre-provision, or local-Docker
// deploy without a running container). The handler MUST 503, not silently
// dispatch to EIC with an empty instance_id.
func TestPluginInstall_NoBackends_Returns503(t *testing.T) {
    registry := t.TempDir()
    stagePluginRegistry(t, registry, "browser-automation")

    mock, cleanup := withMockDB(t)
    defer cleanup()
    expectAllowlistAllowAll(mock)

    stubInstallPluginViaEIC(t, func(ctx context.Context, instanceID, runtime, pluginName, stagedDir string) error {
        t.Errorf("EIC must not be called when instance_id is empty")
        return nil
    })

    h := NewPluginsHandler(registry, nil, nil).
        WithRuntimeLookup(func(string) (string, error) { return "claude-code", nil }).
        WithInstanceIDLookup(func(string) (string, error) { return "", nil }) // empty

    w := httptest.NewRecorder()
    c, _ := gin.CreateTestContext(w)
    c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
    c.Request = httptest.NewRequest(
        "POST",
        "/workspaces/ws-1/plugins",
        bytes.NewBufferString(`{"source":"local://browser-automation"}`),
    )
    c.Request.Header.Set("Content-Type", "application/json")

    h.Install(c)

    if w.Code != http.StatusServiceUnavailable {
        t.Errorf("expected 503, got %d: %s", w.Code, w.Body.String())
    }
}
// TestPluginInstall_InstanceLookupError_Returns503 — a DB hiccup on the
// instance_id lookup must NOT crash or 502; the handler logs and falls
// through to 503. Same fail-open shape h.runtimeLookup uses (see
// TestPluginInstall_NoRuntimeLookup_FailsOpen). Pinning this prevents a
// future "tighten error handling" refactor from quietly converting a DB
// blip into a five-minute outage on the install endpoint.
func TestPluginInstall_InstanceLookupError_Returns503(t *testing.T) {
    registry := t.TempDir()
    stagePluginRegistry(t, registry, "browser-automation")

    mock, cleanup := withMockDB(t)
    defer cleanup()
    expectAllowlistAllowAll(mock)

    h := NewPluginsHandler(registry, nil, nil).
        WithRuntimeLookup(func(string) (string, error) { return "claude-code", nil }).
        WithInstanceIDLookup(func(string) (string, error) { return "", errors.New("db: connection refused") })

    w := httptest.NewRecorder()
    c, _ := gin.CreateTestContext(w)
    c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
    c.Request = httptest.NewRequest(
        "POST",
        "/workspaces/ws-1/plugins",
        bytes.NewBufferString(`{"source":"local://browser-automation"}`),
    )
    c.Request.Header.Set("Content-Type", "application/json")

    h.Install(c)

    if w.Code != http.StatusServiceUnavailable {
        t.Errorf("expected 503 on instance-id lookup error, got %d: %s", w.Code, w.Body.String())
    }
}
// ---------- dispatch: uninstall ----------

func TestPluginUninstall_SaaS_DispatchesToEIC(t *testing.T) {
    stubReadPluginManifestViaEIC(t, func(ctx context.Context, instanceID, runtime, pluginName string) ([]byte, error) {
        return []byte("name: browser-automation\nskills:\n - browse\n"), nil
    })

    type capture struct {
        called     bool
        instanceID string
        runtime    string
        pluginName string
    }
    var got capture
    stubUninstallPluginViaEIC(t, func(ctx context.Context, instanceID, runtime, pluginName string) error {
        got.called = true
        got.instanceID = instanceID
        got.runtime = runtime
        got.pluginName = pluginName
        return nil
    })

    h := NewPluginsHandler(t.TempDir(), nil, nil).
        WithRuntimeLookup(func(string) (string, error) { return "claude-code", nil }).
        WithInstanceIDLookup(func(string) (string, error) { return "i-bbbb", nil })

    w := httptest.NewRecorder()
    c, _ := gin.CreateTestContext(w)
    c.Params = gin.Params{
        {Key: "id", Value: "ws-1"},
        {Key: "name", Value: "browser-automation"},
    }
    c.Request = httptest.NewRequest("DELETE", "/workspaces/ws-1/plugins/browser-automation", nil)

    h.Uninstall(c)

    if w.Code != http.StatusOK {
        t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
    }
    if !got.called {
        t.Fatalf("uninstallPluginViaEIC was not called")
    }
    if got.instanceID != "i-bbbb" || got.runtime != "claude-code" || got.pluginName != "browser-automation" {
        t.Errorf("dispatch args wrong: %+v", got)
    }
}
func TestPluginUninstall_SaaS_PropagatesEICError(t *testing.T) {
    stubReadPluginManifestViaEIC(t, func(ctx context.Context, instanceID, runtime, pluginName string) ([]byte, error) {
        return nil, nil
    })
    stubUninstallPluginViaEIC(t, func(ctx context.Context, instanceID, runtime, pluginName string) error {
        return errors.New("ssh: connection refused")
    })

    h := NewPluginsHandler(t.TempDir(), nil, nil).
        WithRuntimeLookup(func(string) (string, error) { return "claude-code", nil }).
        WithInstanceIDLookup(func(string) (string, error) { return "i-cccc", nil })

    w := httptest.NewRecorder()
    c, _ := gin.CreateTestContext(w)
    c.Params = gin.Params{
        {Key: "id", Value: "ws-1"},
        {Key: "name", Value: "browser-automation"},
    }
    c.Request = httptest.NewRequest("DELETE", "/workspaces/ws-1/plugins/browser-automation", nil)

    h.Uninstall(c)

    if w.Code != http.StatusBadGateway {
        t.Errorf("expected 502, got %d: %s", w.Code, w.Body.String())
    }
}
func TestPluginUninstall_NoBackends_Returns503(t *testing.T) {
    stubUninstallPluginViaEIC(t, func(ctx context.Context, instanceID, runtime, pluginName string) error {
        t.Errorf("EIC uninstall must not be called with empty instance_id")
        return nil
    })

    h := NewPluginsHandler(t.TempDir(), nil, nil).
        WithRuntimeLookup(func(string) (string, error) { return "claude-code", nil }).
        WithInstanceIDLookup(func(string) (string, error) { return "", nil })

    w := httptest.NewRecorder()
    c, _ := gin.CreateTestContext(w)
    c.Params = gin.Params{
        {Key: "id", Value: "ws-1"},
        {Key: "name", Value: "browser-automation"},
    }
    c.Request = httptest.NewRequest("DELETE", "/workspaces/ws-1/plugins/browser-automation", nil)

    h.Uninstall(c)

    if w.Code != http.StatusServiceUnavailable {
        t.Errorf("expected 503, got %d: %s", w.Code, w.Body.String())
    }
}
// ---------- tarball shape ----------

// TestRealInstallPluginViaEIC_TarPayloadShape — the production
// installPluginViaEIC packs the staged dir as gzipped tar. Stub
// withEICTunnel + run the real installPluginViaEIC body, capturing the
// ssh stdin via a fake exec.Command — except go's exec is hard to fake
// without hijacking $PATH. Instead we exercise the tar packer directly:
// streamDirAsTar's behaviour is what we actually depend on, and a
// regression in either streamDirAsTar OR the gzip wrapping will be
// visible here.
func TestRealInstallPluginViaEIC_TarPayloadShape(t *testing.T) {
    staged := t.TempDir()
    if err := os.WriteFile(filepath.Join(staged, "plugin.yaml"), []byte("name: x\n"), 0644); err != nil {
        t.Fatal(err)
    }
    if err := os.MkdirAll(filepath.Join(staged, "skills", "browse"), 0755); err != nil {
        t.Fatal(err)
    }
    if err := os.WriteFile(filepath.Join(staged, "skills", "browse", "instructions.md"), []byte("step 1\n"), 0644); err != nil {
        t.Fatal(err)
    }

    var buf bytes.Buffer
    gz := gzip.NewWriter(&buf)
    tw := tar.NewWriter(gz)
    if err := streamDirAsTar(staged, tw); err != nil {
        t.Fatalf("streamDirAsTar: %v", err)
    }
    if err := tw.Close(); err != nil {
        t.Fatalf("tw close: %v", err)
    }
    if err := gz.Close(); err != nil {
        t.Fatalf("gz close: %v", err)
    }

    // Round-trip: the same payload the production flow would pipe into
    // `tar -xzf -` on the remote should unpack to plugin.yaml +
    // skills/browse/instructions.md.
    gr, err := gzip.NewReader(&buf)
    if err != nil {
        t.Fatalf("gzip reader: %v", err)
    }
    tr := tar.NewReader(gr)
    seen := map[string]bool{}
    for {
        hdr, err := tr.Next()
        if err == io.EOF {
            break
        }
        if err != nil {
            t.Fatalf("tar next: %v", err)
        }
        seen[hdr.Name] = true
    }
    for _, want := range []string{"plugin.yaml", "skills/browse/instructions.md"} {
        // Tar entries on Linux normally use forward slashes regardless
        // of host separator; double-check both forms so a Windows test
        // runner doesn't go red on a path-sep difference. Production
        // always runs on Linux (CI + tenant EC2).
        alt := filepath.FromSlash(want)
        if !seen[want] && !seen[alt] {
            t.Errorf("tar payload missing %q (saw %v)", want, seen)
        }
    }
}
@ -261,22 +261,74 @@ func (h *PluginsHandler) resolveAndStage(ctx context.Context, req installRequest
// deliverToContainer copies the staged plugin dir into the workspace
// container, chowns it for the agent user, and triggers a restart.
// Returns a typed *httpErr on failure; nil on success.
//
// Dispatch order:
//
//  1. Local Docker container is up → tar+CopyToContainer (historical path).
//  2. SaaS workspace (instance_id set) → push via EIC SSH to the EC2's
//     bind-mounted /configs/plugins/<name>/. Closes the 🔴 docker-only
//     row in docs/architecture/backends.md by routing through the same
//     primitive Files API uses (template_files_eic.go).
//  3. Neither wired → 503. True "no backend" case (dev box without
//     Docker AND without an instance_id row).
//
// The SaaS branch is gated on h.instanceIDLookup so unit tests can keep
// using NewPluginsHandler without a DB; production wires it in router.go.
func (h *PluginsHandler) deliverToContainer(ctx context.Context, workspaceID string, r *stageResult) error {
    containerName := h.findRunningContainer(ctx, workspaceID)
    if containerName == "" {
        return newHTTPErr(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"})
    if containerName := h.findRunningContainer(ctx, workspaceID); containerName != "" {
        if err := h.copyPluginToContainer(ctx, containerName, r.StagedDir, r.PluginName); err != nil {
            log.Printf("Plugin install: failed to copy %s to %s: %v", r.PluginName, workspaceID, err)
            return newHTTPErr(http.StatusInternalServerError, gin.H{"error": "failed to copy plugin to container"})
        }
        h.execAsRoot(ctx, containerName, []string{
            "chown", "-R", "1000:1000", "/configs/plugins/" + r.PluginName,
        })
        if h.restartFunc != nil {
            go h.restartFunc(workspaceID)
        }
        return nil
    }
    if err := h.copyPluginToContainer(ctx, containerName, r.StagedDir, r.PluginName); err != nil {
        log.Printf("Plugin install: failed to copy %s to %s: %v", r.PluginName, workspaceID, err)
        return newHTTPErr(http.StatusInternalServerError, gin.H{"error": "failed to copy plugin to container"})

    if instanceID, runtime := h.lookupSaaSDispatch(workspaceID); instanceID != "" {
        if err := installPluginViaEIC(ctx, instanceID, runtime, r.PluginName, r.StagedDir); err != nil {
            log.Printf("Plugin install: EIC push failed for %s → %s: %v", r.PluginName, workspaceID, err)
            return newHTTPErr(http.StatusBadGateway, gin.H{
                "error": "failed to deliver plugin to workspace EC2",
            })
        }
        if h.restartFunc != nil {
            go h.restartFunc(workspaceID)
        }
        return nil
    }
    h.execAsRoot(ctx, containerName, []string{
        "chown", "-R", "1000:1000", "/configs/plugins/" + r.PluginName,
    })
    if h.restartFunc != nil {
        go h.restartFunc(workspaceID)

    return newHTTPErr(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"})
}

// lookupSaaSDispatch returns (instance_id, runtime) for SaaS dispatch, or
// ("", "") when the lookups aren't wired or the workspace isn't on the
// EC2 backend. Errors from the lookups are logged-and-swallowed: failing
// open here just means the caller falls through to the 503 path it would
// have returned without us, never to a wrong action against the wrong
// instance.
func (h *PluginsHandler) lookupSaaSDispatch(workspaceID string) (instanceID, runtime string) {
    if h.instanceIDLookup == nil {
        return "", ""
    }
    return nil
    id, err := h.instanceIDLookup(workspaceID)
    if err != nil {
        log.Printf("Plugin install: instance_id lookup failed for %s: %v", workspaceID, err)
        return "", ""
    }
    if id == "" {
        return "", ""
    }
    if h.runtimeLookup != nil {
        if rt, rterr := h.runtimeLookup(workspaceID); rterr == nil {
            runtime = rt
        }
    }
    return id, runtime
}

// readPluginSkillsFromContainer reads /configs/plugins/<name>/plugin.yaml
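Because the rendered hunk above interleaves removed and added lines, here is roughly what the post-change dispatch reads like once the diff is applied — reconstructed from the added lines only, as a reading aid rather than the authoritative file:

func (h *PluginsHandler) deliverToContainer(ctx context.Context, workspaceID string, r *stageResult) error {
    // 1. Historical path: a local Docker container is running.
    if containerName := h.findRunningContainer(ctx, workspaceID); containerName != "" {
        if err := h.copyPluginToContainer(ctx, containerName, r.StagedDir, r.PluginName); err != nil {
            log.Printf("Plugin install: failed to copy %s to %s: %v", r.PluginName, workspaceID, err)
            return newHTTPErr(http.StatusInternalServerError, gin.H{"error": "failed to copy plugin to container"})
        }
        h.execAsRoot(ctx, containerName, []string{"chown", "-R", "1000:1000", "/configs/plugins/" + r.PluginName})
        if h.restartFunc != nil {
            go h.restartFunc(workspaceID)
        }
        return nil
    }
    // 2. SaaS path: workspace lives on an EC2 (instance_id set) — push over EIC SSH.
    if instanceID, runtime := h.lookupSaaSDispatch(workspaceID); instanceID != "" {
        if err := installPluginViaEIC(ctx, instanceID, runtime, r.PluginName, r.StagedDir); err != nil {
            log.Printf("Plugin install: EIC push failed for %s → %s: %v", r.PluginName, workspaceID, err)
            return newHTTPErr(http.StatusBadGateway, gin.H{"error": "failed to deliver plugin to workspace EC2"})
        }
        if h.restartFunc != nil {
            go h.restartFunc(workspaceID)
        }
        return nil
    }
    // 3. Neither backend wired → 503.
    return newHTTPErr(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"})
}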
@ -2,7 +2,7 @@
# VENDORED COPY — DO NOT EDIT THIS FILE BY HAND.
#
# Source of truth:
#   github.com/Molecule-AI/molecule-ai-workspace-template-hermes
#   git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-hermes
#   scripts/derive-provider.sh
#
# This snapshot is read by derive_provider_drift_test.go so the AST
@ -110,10 +110,55 @@ func (s *PostgresMessageStore) List(ctx context.Context, workspaceID string, opt
        return nil, false, err
    }

    // Wire order: oldest-first within the page so canvas (and any
    // future client) can render chronologically without per-pair
    // reordering. The SQL is `ORDER BY created_at DESC LIMIT N` for
    // pagination correctness, and activityRowToChatMessages emits
    // [user, agent] within a row — so a naive client-side flat-reverse
    // would swap the pair (agent before user at the same timestamp).
    // Reversing ROW-AWARE here keeps the wire shape display-ready.
    //
    // Algorithm: group consecutive same-timestamp messages into row
    // chunks (1-2 messages each), reverse the chunk order, flatten.
    // Within-row [user, agent] order is preserved. Single-message
    // rows (no agent reply yet, or attachments-only) collapse to
    // 1-element chunks and still reverse correctly.
    messages = reverseRowChunks(messages)

    reachedEnd := rowCount < opts.Limit
    return messages, reachedEnd, nil
}

// reverseRowChunks groups msgs by adjacent same-Timestamp runs and
// reverses the run order, preserving within-run order. Pairs of
// (user, agent) emitted by activityRowToChatMessages share a
// timestamp, so this keeps each pair internally ordered while
// reversing the row sequence.
func reverseRowChunks(msgs []ChatMessage) []ChatMessage {
    if len(msgs) == 0 {
        return msgs
    }
    var chunks [][]ChatMessage
    cur := []ChatMessage{msgs[0]}
    for i := 1; i < len(msgs); i++ {
        if msgs[i].Timestamp == cur[len(cur)-1].Timestamp {
            cur = append(cur, msgs[i])
        } else {
            chunks = append(chunks, cur)
            cur = []ChatMessage{msgs[i]}
        }
    }
    chunks = append(chunks, cur)
    for i, j := 0, len(chunks)-1; i < j; i, j = i+1, j-1 {
        chunks[i], chunks[j] = chunks[j], chunks[i]
    }
    out := make([]ChatMessage, 0, len(msgs))
    for _, chunk := range chunks {
        out = append(out, chunk...)
    }
    return out
}

// queryActivityRows is split from List so unit tests can exercise the
// parser without spinning a real DB. Internal — alternative impls
// shouldn't depend on the SQL shape.
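As a small before/after illustration of why the reversal has to be row-aware (it duplicates what the dedicated tests below pin, laid out as a minimal contrast; assumes it sits in the messagestore package so ChatMessage and reverseRowChunks are in scope):

// Illustration only — DESC-ordered input, shaped the way List() collects it.
func TestReverseRowChunks_Illustration(t *testing.T) {
    in := []ChatMessage{
        {Role: "user", Content: "u2", Timestamp: "2026-05-05T00:02:00Z"},
        {Role: "agent", Content: "a2", Timestamp: "2026-05-05T00:02:00Z"},
        {Role: "user", Content: "u1", Timestamp: "2026-05-05T00:01:00Z"},
        {Role: "agent", Content: "a1", Timestamp: "2026-05-05T00:01:00Z"},
    }
    got := reverseRowChunks(in)
    // Row-aware: rows come back oldest-first, [user, agent] order kept.
    // A naive flat reverse of `in` would instead yield a1, u1, a2, u2 —
    // agent before user inside every pair.
    want := []string{"u1", "a1", "u2", "a2"}
    for i, w := range want {
        if got[i].Content != w {
            t.Fatalf("idx %d: got %q want %q", i, got[i].Content, w)
        }
    }
}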
@ -14,10 +14,13 @@ package messagestore
// legacy source the server replaces; divergence == regression.

import (
    "context"
    "encoding/json"
    "strings"
    "testing"
    "time"

    "github.com/DATA-DOG/go-sqlmock"
)

const fixedTimestamp = "2026-04-25T18:00:00Z"
@ -282,6 +285,145 @@ func TestChatHistory_NoAgentMessageWhenResponseHasNoTextNoFiles(t *testing.T) {
    }
}

// =====================================================================
// List() integration — sqlmock-backed end-to-end via the real handler
// =====================================================================

// TestList_WireOrderIsOldestFirstAcrossPagedRows pins the integration
// invariant: List() returns wire-display-ready messages even though
// the underlying SQL is `ORDER BY created_at DESC`. This is the
// load-bearing test for PR-C-2 — without the row-aware reversal,
// canvas would render every paired bubble in the wrong order on every
// chat reload (agent before user within each timestamp).
//
// Mutation-test cover: removing the `messages = reverseRowChunks(...)`
// call in List() must turn this test red. (The lower-level
// TestReverseRowChunks_PreservesPairOrderAcrossRows pins the helper
// itself; this test pins that List ACTUALLY CALLS the helper.)
func TestList_WireOrderIsOldestFirstAcrossPagedRows(t *testing.T) {
    db, mock, err := sqlmock.New()
    if err != nil {
        t.Fatalf("sqlmock.New: %v", err)
    }
    defer db.Close()

    // Server's SQL is ORDER BY created_at DESC. Build mock rows in
    // THAT order so the row-aware reversal has work to do.
    rows := sqlmock.NewRows([]string{"created_at", "status", "request_body", "response_body"}).
        AddRow(mustParseTime(t, "2026-05-05T00:03:00Z"), "ok",
            `{"params":{"message":{"parts":[{"kind":"text","text":"u3"}]}}}`,
            `{"result":"a3"}`).
        AddRow(mustParseTime(t, "2026-05-05T00:02:00Z"), "ok",
            `{"params":{"message":{"parts":[{"kind":"text","text":"u2"}]}}}`,
            `{"result":"a2"}`).
        AddRow(mustParseTime(t, "2026-05-05T00:01:00Z"), "ok",
            `{"params":{"message":{"parts":[{"kind":"text","text":"u1"}]}}}`,
            `{"result":"a1"}`)

    mock.ExpectQuery(`SELECT created_at, status, request_body::text, response_body::text`).
        WillReturnRows(rows)

    store := NewPostgresMessageStore(db)
    msgs, reachedEnd, err := store.List(context.Background(), "ws-1", ListOptions{Limit: 10})
    if err != nil {
        t.Fatalf("List: %v", err)
    }

    wantContents := []string{"u1", "a1", "u2", "a2", "u3", "a3"}
    if len(msgs) != len(wantContents) {
        t.Fatalf("len(msgs)=%d want %d; got=%v", len(msgs), len(wantContents), msgs)
    }
    for i, w := range wantContents {
        if msgs[i].Content != w {
            t.Errorf("idx %d: got %q want %q (full slice ordering broken; reverseRowChunks regressed?)", i, msgs[i].Content, w)
        }
    }
    if !reachedEnd {
        t.Errorf("3 rows < limit 10 should reach end, got reachedEnd=false")
    }
    if err := mock.ExpectationsWereMet(); err != nil {
        t.Errorf("sqlmock expectations: %v", err)
    }
}
// =====================================================================
// reverseRowChunks — wire-order helper added in PR-C-2
// =====================================================================

// TestReverseRowChunks_PreservesPairOrderAcrossRows pins the
// row-aware reversal that List() applies before returning. Server's
// SQL is `ORDER BY created_at DESC`, so messages come out
// newest-row-first; activityRowToChatMessages emits [user, agent]
// per row with same timestamp. A naive flat reversal of the messages
// slice would flip each pair (agent before user). reverseRowChunks
// reverses ROWS, preserving pair-internal order. Without this, canvas
// would render every paired bubble in the wrong order on every chat
// reload — the canvas-side reverse used to do the right thing because
// it reversed ROWS BEFORE flattening, but PR-C/D moved the flattening
// into the server, so the row-awareness has to live there too.
func TestReverseRowChunks_PreservesPairOrderAcrossRows(t *testing.T) {
    // Build messages newest-row-first as List() collects them. Each
    // row is a pair sharing a timestamp, with [user, agent] order.
    in := []ChatMessage{
        {Role: "user", Content: "user_3", Timestamp: "2026-05-05T00:03:00Z"},
        {Role: "agent", Content: "agent_3", Timestamp: "2026-05-05T00:03:00Z"},
        {Role: "user", Content: "user_2", Timestamp: "2026-05-05T00:02:00Z"},
        {Role: "agent", Content: "agent_2", Timestamp: "2026-05-05T00:02:00Z"},
        {Role: "user", Content: "user_1", Timestamp: "2026-05-05T00:01:00Z"},
        {Role: "agent", Content: "agent_1", Timestamp: "2026-05-05T00:01:00Z"},
    }
    got := reverseRowChunks(in)

    want := []struct {
        role, content string
    }{
        {"user", "user_1"}, {"agent", "agent_1"},
        {"user", "user_2"}, {"agent", "agent_2"},
        {"user", "user_3"}, {"agent", "agent_3"},
    }
    if len(got) != len(want) {
        t.Fatalf("len(got)=%d len(want)=%d", len(got), len(want))
    }
    for i, w := range want {
        if got[i].Role != w.role || got[i].Content != w.content {
            t.Errorf("idx %d: got role=%q content=%q want role=%q content=%q",
                i, got[i].Role, got[i].Content, w.role, w.content)
        }
    }
}
// TestReverseRowChunks_HandlesSingleMessageRows pins the case where
// a row has only a user OR only an agent message (e.g., agent reply
// not yet recorded, attachments-only user upload). Naive reversal
// still works for single-message chunks; the test guards against a
// future change that special-cases the 2-message-row path.
func TestReverseRowChunks_HandlesSingleMessageRows(t *testing.T) {
    in := []ChatMessage{
        {Role: "user", Content: "u3", Timestamp: "2026-05-05T00:03:00Z"},
        {Role: "user", Content: "u2", Timestamp: "2026-05-05T00:02:00Z"}, // single, no agent
        {Role: "agent", Content: "a2", Timestamp: "2026-05-05T00:02:00Z"},
        {Role: "user", Content: "u1", Timestamp: "2026-05-05T00:01:00Z"},
    }
    got := reverseRowChunks(in)
    wantContents := []string{"u1", "u2", "a2", "u3"}
    if len(got) != len(wantContents) {
        t.Fatalf("len got=%d want=%d", len(got), len(wantContents))
    }
    for i, w := range wantContents {
        if got[i].Content != w {
            t.Errorf("idx %d: got %q want %q", i, got[i].Content, w)
        }
    }
}

// TestReverseRowChunks_EmptyInput returns nil/empty without panic.
func TestReverseRowChunks_EmptyInput(t *testing.T) {
    got := reverseRowChunks(nil)
    if len(got) != 0 {
        t.Errorf("nil input should return empty, got %v", got)
    }
}

// =====================================================================
// end-to-end shape — paired user + agent with same timestamp
// =====================================================================
149
workspace-server/internal/registry/cp_orphan_sweeper.go
Normal file
@ -0,0 +1,149 @@
package registry

// cp_orphan_sweeper.go — SaaS-mode counterpart to orphan_sweeper.go.
//
// The Docker sweeper (StartOrphanSweeper) runs only when prov != nil
// (single-tenant Docker mode); SaaS tenants run cpProv != nil and prov
// == nil, so they get no sweep coverage from that path. This file fills
// the gap for the deprovision split-write race documented in #2989:
//
//  1. handlers/workspace_crud.go:365 marks workspaces.status = 'removed'.
//  2. workspace_crud.go:439 calls StopWorkspaceAuto → cpProv.Stop, which
//     issues DELETE /cp/workspaces/:id?instance_id=… to controlplane.
//  3. If step 2 fails (CP transient 5xx, network blip, AWS hiccup), the
//     inline path returns a 500 to the canvas — but the DB row is already
//     at status='removed' with instance_id still populated. There's no
//     retry, and the EC2 lives forever.
//
// This sweeper closes that gap by re-issuing cpProv.Stop on every cycle
// for any workspace at status='removed' with a non-NULL instance_id.
// Stop is idempotent: AWS TerminateInstance on an already-terminated
// instance is a no-op (per AWS docs), and CP's Deprovision handler
// (controlplane/internal/handlers/workspace_provision.go:289) handles
// the already-terminated and already-deleted-DNS cases via best-effort
// guards. On Stop success, the sweeper clears instance_id so the next
// cycle skips the row.
//
// Cadence + safety filters mirror the Docker sweeper:
//   - 60s tick (OrphanSweepInterval)
//   - 30s per-cycle deadline (orphanSweepDeadline)
//   - LIMIT 100 per cycle so a sustained CP outage that backs up many
//     orphans doesn't blow the request timeout; subsequent cycles drain.
//
// SSOT note: Stop's idempotency (no-op on empty instance_id, AWS
// terminate on already-terminated) is the load-bearing invariant. Any
// future change that adds non-idempotent side effects to cpProv.Stop
// must also gate this sweeper, or it will re-execute those side effects
// every 60s for every cleared-but-not-yet-NULL row.
import (
    "context"
    "log"
    "time"

    "github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
)

// CPOrphanReaper is the dependency the SaaS-mode sweeper takes from
// the CP provisioner. *provisioner.CPProvisioner satisfies this
// naturally; tests inject fakes.
type CPOrphanReaper interface {
    Stop(ctx context.Context, workspaceID string) error
}

// cpSweepLimit caps the per-cycle row count so a sustained CP outage
// can't make a single sweep cycle blow orphanSweepDeadline. With a
// 60s cadence and 100-row limit, drain rate is up to 100 orphans/min,
// which has never been approached even during the worst leak windows.
const cpSweepLimit = 100

// StartCPOrphanSweeper runs the SaaS-mode reconcile loop until ctx is
// cancelled. nil reaper makes the loop a no-op (matches the Docker
// sweeper's nil-tolerant pattern).
//
// Caller is expected to gate on `cpProv != nil` (matching how
// StartOrphanSweeper is gated on `prov != nil` at the call site in
// cmd/server/main.go). Note that passing a nil *CPProvisioner here would
// NOT short-circuit — an interface value wrapping a typed nil pointer is
// non-nil — which is one more reason the gate belongs at the wiring site,
// and it keeps the call shape symmetric across the two sweepers.
func StartCPOrphanSweeper(ctx context.Context, reaper CPOrphanReaper) {
    if reaper == nil {
        log.Println("CP orphan sweeper: reaper is nil — sweeper disabled")
        return
    }
    log.Printf("CP orphan sweeper started — reconciling every %s", OrphanSweepInterval)
    ticker := time.NewTicker(OrphanSweepInterval)
    defer ticker.Stop()
    cpSweepOnce(ctx, reaper)
    for {
        select {
        case <-ctx.Done():
            log.Println("CP orphan sweeper: shutdown")
            return
        case <-ticker.C:
            cpSweepOnce(ctx, reaper)
        }
    }
}
// cpSweepOnce executes one reconcile pass. Defensive against db.DB
// being nil so a misconfigured boot doesn't panic.
func cpSweepOnce(parent context.Context, reaper CPOrphanReaper) {
    if db.DB == nil {
        return
    }
    ctx, cancel := context.WithTimeout(parent, orphanSweepDeadline)
    defer cancel()

    rows, err := db.DB.QueryContext(ctx, `
        SELECT id::text
        FROM workspaces
        WHERE status = 'removed'
          AND instance_id IS NOT NULL
          AND instance_id != ''
        ORDER BY updated_at DESC
        LIMIT $1
    `, cpSweepLimit)
    if err != nil {
        log.Printf("CP orphan sweeper: DB query failed: %v", err)
        return
    }
    defer rows.Close()

    var orphanIDs []string
    for rows.Next() {
        var id string
        if scanErr := rows.Scan(&id); scanErr != nil {
            log.Printf("CP orphan sweeper: row scan failed: %v", scanErr)
            continue
        }
        orphanIDs = append(orphanIDs, id)
    }
    if iterErr := rows.Err(); iterErr != nil {
        log.Printf("CP orphan sweeper: rows iteration failed: %v", iterErr)
        return
    }

    for _, id := range orphanIDs {
        log.Printf("CP orphan sweeper: terminating leaked EC2 for removed workspace %s", id)
        if stopErr := reaper.Stop(ctx, id); stopErr != nil {
            // CP-side error — transient 5xx, network, AWS hiccup. Leave
            // instance_id populated so the next cycle retries. Loud-fail
            // only at the log layer; the user-visible 500 was already
            // returned by the inline path that triggered this orphan.
            log.Printf("CP orphan sweeper: Stop failed for %s: %v — retry next cycle", id, stopErr)
            continue
        }
        // Stop succeeded — clear instance_id so the next cycle skips this
        // row. We can't use a tombstone column (no schema change in this
        // PR); NULL'ing instance_id is the SSOT signal for "no live
        // EC2 attached." The matching SELECT predicate above stays in
        // sync with this UPDATE.
        if _, updErr := db.DB.ExecContext(ctx,
            `UPDATE workspaces SET instance_id = NULL, updated_at = now() WHERE id = $1`,
            id,
        ); updErr != nil {
            log.Printf("CP orphan sweeper: clear instance_id failed for %s: %v — next cycle will re-Stop (idempotent)", id, updErr)
        }
    }
}
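The StartCPOrphanSweeper comment expects the gate at the wiring site; a minimal sketch of what that call site could look like follows. The exact shape in cmd/server/main.go isn't part of this diff, so the cpProv / serverCtx names and the surrounding boot context are assumptions:

// Hypothetical boot wiring — not the shipped cmd/server/main.go. Gating on
// cpProv != nil here (rather than passing a possibly typed-nil pointer into
// the interface parameter) avoids the nil-interface footgun and mirrors how
// the Docker sweeper is gated on prov != nil.
if cpProv != nil {
    go registry.StartCPOrphanSweeper(serverCtx, cpProv)
}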
266
workspace-server/internal/registry/cp_orphan_sweeper_test.go
Normal file
@ -0,0 +1,266 @@
package registry

import (
    "context"
    "errors"
    "sync"
    "testing"
    "time"

    "github.com/DATA-DOG/go-sqlmock"

    "github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
)

// fakeCPReaper is a hand-rolled CPOrphanReaper for the SaaS-mode
// sweeper tests. Records every Stop call so tests can assert which
// workspace IDs were re-issued.
type fakeCPReaper struct {
    mu        sync.Mutex
    stopErr   map[string]error
    stopCalls []string
}

func (f *fakeCPReaper) Stop(_ context.Context, wsID string) error {
    f.mu.Lock()
    defer f.mu.Unlock()
    f.stopCalls = append(f.stopCalls, wsID)
    return f.stopErr[wsID]
}
// TestCPSweepOnce_StopSucceeds_ClearsInstanceID — happy path. Single
// removed-row with non-NULL instance_id; Stop succeeds; instance_id
// gets NULL'd so the next cycle won't re-sweep it.
func TestCPSweepOnce_StopSucceeds_ClearsInstanceID(t *testing.T) {
    mock := setupTestDB(t)
    reaper := &fakeCPReaper{}

    mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces\s+WHERE status = 'removed'\s+AND instance_id IS NOT NULL\s+AND instance_id != ''\s+ORDER BY updated_at DESC\s+LIMIT \$1`).
        WithArgs(cpSweepLimit).
        WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-uuid-1"))
    mock.ExpectExec(`UPDATE workspaces SET instance_id = NULL, updated_at = now\(\) WHERE id = \$1`).
        WithArgs("ws-uuid-1").
        WillReturnResult(sqlmock.NewResult(0, 1))

    cpSweepOnce(context.Background(), reaper)

    if len(reaper.stopCalls) != 1 || reaper.stopCalls[0] != "ws-uuid-1" {
        t.Fatalf("expected Stop(ws-uuid-1), got %v", reaper.stopCalls)
    }
    if err := mock.ExpectationsWereMet(); err != nil {
        t.Fatalf("unmet expectations: %v", err)
    }
}
// TestCPSweepOnce_StopFails_KeepsInstanceID — CP transient failure.
// Stop returns an error; instance_id MUST stay populated so the next
// cycle retries. UPDATE must NOT fire.
func TestCPSweepOnce_StopFails_KeepsInstanceID(t *testing.T) {
    mock := setupTestDB(t)
    reaper := &fakeCPReaper{
        stopErr: map[string]error{"ws-uuid-1": errors.New("CP returned 503")},
    }

    mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces`).
        WithArgs(cpSweepLimit).
        WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-uuid-1"))
    // No ExpectExec for the UPDATE — sqlmock fails the test if the
    // UPDATE fires.

    cpSweepOnce(context.Background(), reaper)

    if len(reaper.stopCalls) != 1 || reaper.stopCalls[0] != "ws-uuid-1" {
        t.Fatalf("expected Stop(ws-uuid-1), got %v", reaper.stopCalls)
    }
    if err := mock.ExpectationsWereMet(); err != nil {
        t.Fatalf("unmet expectations (UPDATE should NOT have fired): %v", err)
    }
}
// TestCPSweepOnce_NoOrphans — empty result set is the steady state in
// healthy operation. No Stop, no UPDATE.
func TestCPSweepOnce_NoOrphans(t *testing.T) {
    mock := setupTestDB(t)
    reaper := &fakeCPReaper{}

    mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces`).
        WithArgs(cpSweepLimit).
        WillReturnRows(sqlmock.NewRows([]string{"id"}))

    cpSweepOnce(context.Background(), reaper)

    if len(reaper.stopCalls) != 0 {
        t.Fatalf("expected zero Stop calls, got %v", reaper.stopCalls)
    }
    if err := mock.ExpectationsWereMet(); err != nil {
        t.Fatalf("unmet expectations: %v", err)
    }
}
// TestCPSweepOnce_MultipleOrphans — all rows in the batch get Stop'd
// independently; one failure doesn't block others.
func TestCPSweepOnce_MultipleOrphans(t *testing.T) {
    mock := setupTestDB(t)
    reaper := &fakeCPReaper{
        stopErr: map[string]error{"ws-uuid-2": errors.New("CP 503 on ws-uuid-2")},
    }

    mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces`).
        WithArgs(cpSweepLimit).
        WillReturnRows(sqlmock.NewRows([]string{"id"}).
            AddRow("ws-uuid-1").
            AddRow("ws-uuid-2").
            AddRow("ws-uuid-3"))
    // ws-uuid-1 succeeds → UPDATE fires.
    mock.ExpectExec(`UPDATE workspaces SET instance_id = NULL`).
        WithArgs("ws-uuid-1").
        WillReturnResult(sqlmock.NewResult(0, 1))
    // ws-uuid-2 fails → no UPDATE.
    // ws-uuid-3 succeeds → UPDATE fires.
    mock.ExpectExec(`UPDATE workspaces SET instance_id = NULL`).
        WithArgs("ws-uuid-3").
        WillReturnResult(sqlmock.NewResult(0, 1))

    cpSweepOnce(context.Background(), reaper)

    if len(reaper.stopCalls) != 3 {
        t.Fatalf("expected Stop on all 3 ids, got %v", reaper.stopCalls)
    }
    if err := mock.ExpectationsWereMet(); err != nil {
        t.Fatalf("unmet expectations: %v", err)
    }
}
// TestCPSweepOnce_QueryError — DB transient failure. Sweep returns
// without panicking. No Stop calls.
func TestCPSweepOnce_QueryError(t *testing.T) {
    mock := setupTestDB(t)
    reaper := &fakeCPReaper{}

    mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces`).
        WithArgs(cpSweepLimit).
        WillReturnError(errors.New("connection refused"))

    cpSweepOnce(context.Background(), reaper)

    if len(reaper.stopCalls) != 0 {
        t.Fatalf("expected zero Stop calls on query error, got %v", reaper.stopCalls)
    }
    if err := mock.ExpectationsWereMet(); err != nil {
        t.Fatalf("unmet expectations: %v", err)
    }
}
// TestCPSweepOnce_UpdateError_LogsButContinues — Stop succeeded but
// the UPDATE to clear instance_id failed. Subsequent rows in the batch
// must still process; comment in cpSweepOnce promises idempotent re-Stop
// next cycle.
func TestCPSweepOnce_UpdateError_LogsButContinues(t *testing.T) {
    mock := setupTestDB(t)
    reaper := &fakeCPReaper{}

    mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces`).
        WithArgs(cpSweepLimit).
        WillReturnRows(sqlmock.NewRows([]string{"id"}).
            AddRow("ws-uuid-1").
            AddRow("ws-uuid-2"))
    mock.ExpectExec(`UPDATE workspaces SET instance_id = NULL`).
        WithArgs("ws-uuid-1").
        WillReturnError(errors.New("UPDATE timeout"))
    mock.ExpectExec(`UPDATE workspaces SET instance_id = NULL`).
        WithArgs("ws-uuid-2").
        WillReturnResult(sqlmock.NewResult(0, 1))

    cpSweepOnce(context.Background(), reaper)

    if len(reaper.stopCalls) != 2 {
        t.Fatalf("expected Stop on both ids despite UPDATE error on first, got %v", reaper.stopCalls)
    }
    if err := mock.ExpectationsWereMet(); err != nil {
        t.Fatalf("unmet expectations: %v", err)
    }
}
// TestCPSweepOnce_NilDB — defensive against db.DB being nil. Must not
// panic; must not call Stop.
func TestCPSweepOnce_NilDB(t *testing.T) {
    saved := db.DB
    db.DB = nil
    t.Cleanup(func() { db.DB = saved })

    reaper := &fakeCPReaper{}
    cpSweepOnce(context.Background(), reaper)

    if len(reaper.stopCalls) != 0 {
        t.Fatalf("expected zero Stop calls when db.DB is nil, got %v", reaper.stopCalls)
    }
}
// TestStartCPOrphanSweeper_NilReaperDisabled — boot-safety: a SaaS CP
// without cpProv configured must not start the loop (immediate return,
// no goroutine leak).
func TestStartCPOrphanSweeper_NilReaperDisabled(t *testing.T) {
    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()

    done := make(chan struct{})
    go func() {
        StartCPOrphanSweeper(ctx, nil)
        close(done)
    }()
    select {
    case <-done:
        // expected — nil reaper short-circuits.
    case <-time.After(500 * time.Millisecond):
        t.Fatal("StartCPOrphanSweeper(nil) did not return immediately")
    }
}
// TestStartCPOrphanSweeper_RunsOnceImmediatelyAndOnTick — cadence
// contract: kick off one sweep at boot (so a platform restart starts
// healing immediately), then once per OrphanSweepInterval. Verifies
// the loop terminates on ctx cancel.
func TestStartCPOrphanSweeper_RunsOnceImmediatelyAndOnTick(t *testing.T) {
    mock := setupTestDB(t)
    reaper := &fakeCPReaper{}

    // Two sweeps within the test window: one immediate, one on the
    // first tick. We can't shrink OrphanSweepInterval (it's a const),
    // so assert "at least one immediate sweep" and let cancel close
    // the loop.
    mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces`).
        WithArgs(cpSweepLimit).
        WillReturnRows(sqlmock.NewRows([]string{"id"}))
    // The ticker may or may not fire in the test window depending on
    // scheduler; tolerate both shapes by registering a second optional
    // expectation. sqlmock fails on UNREGISTERED queries, so register
    // one more then accept either 1 or 2 fires.
    mock.ExpectQuery(`(?s)^\s*SELECT id::text\s+FROM workspaces`).
        WithArgs(cpSweepLimit).
        WillReturnRows(sqlmock.NewRows([]string{"id"}))

    ctx, cancel := context.WithCancel(context.Background())
    done := make(chan struct{})
    go func() {
        StartCPOrphanSweeper(ctx, reaper)
        close(done)
    }()
    // 100ms is well past the boot-sweep but well shy of the 60s
    // interval, so the second query expectation is intentionally
    // unmet — that's fine, sqlmock distinguishes "expected but not
    // received" (we don't enforce here) from "unexpected query"
    // (which would fail).
    time.Sleep(100 * time.Millisecond)
    cancel()
    select {
    case <-done:
        // expected
    case <-time.After(2 * time.Second):
        t.Fatal("StartCPOrphanSweeper did not exit on ctx cancel")
    }

    // Boot sweep must have happened — without it, an operator restart
    // after a CP outage would leave a 60s gap before the first heal.
    // We don't assert mock.ExpectationsWereMet() here because the
    // second query is intentionally optional.
}
@ -11,13 +11,13 @@ import (
    "github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo"
    "github.com/Molecule-AI/molecule-monorepo/platform/internal/channels"
    "github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
    "github.com/Molecule-AI/molecule-monorepo/platform/internal/messagestore"
    "github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
    "github.com/Molecule-AI/molecule-monorepo/platform/internal/handlers"
    "github.com/Molecule-AI/molecule-monorepo/platform/internal/pendinguploads"
    memwiring "github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/wiring"
    "github.com/Molecule-AI/molecule-monorepo/platform/internal/messagestore"
    "github.com/Molecule-AI/molecule-monorepo/platform/internal/metrics"
    "github.com/Molecule-AI/molecule-monorepo/platform/internal/middleware"
    "github.com/Molecule-AI/molecule-monorepo/platform/internal/pendinguploads"
    "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
    "github.com/Molecule-AI/molecule-monorepo/platform/internal/supervised"
    "github.com/Molecule-AI/molecule-monorepo/platform/internal/ws"
@ -109,8 +109,8 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
    now := time.Now()
    for name, last := range snap {
        out[name] = gin.H{
            "last_tick_at": last,
            "seconds_ago": int(now.Sub(last).Seconds()),
            "last_tick_at": last,
            "seconds_ago":  int(now.Sub(last).Seconds()),
        }
    }
    c.JSON(200, gin.H{"subsystems": out})
@ -599,8 +599,25 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
        ).Scan(&runtime)
        return runtime, err
    }
    // Instance-id lookup powers the SaaS dispatch in install/uninstall:
    // when a workspace is on the EC2-per-workspace backend (instance_id
    // non-NULL) and there's no local Docker container to exec into, the
    // pipeline pushes the staged plugin tarball to that EC2 over EIC SSH.
    // Empty result means the workspace lives on the local-Docker backend
    // (or hasn't been provisioned yet) and the handler falls back to its
    // original Docker path. Same pattern templates.go and terminal.go use.
    instanceIDLookup := func(workspaceID string) (string, error) {
        var instanceID string
        err := db.DB.QueryRowContext(
            context.Background(),
            `SELECT COALESCE(instance_id, '') FROM workspaces WHERE id = $1`,
            workspaceID,
        ).Scan(&instanceID)
        return instanceID, err
    }
    plgh := handlers.NewPluginsHandler(pluginsDir, dockerCli, wh.RestartByID).
        WithRuntimeLookup(runtimeLookup)
        WithRuntimeLookup(runtimeLookup).
        WithInstanceIDLookup(instanceIDLookup)
    r.GET("/plugins", plgh.ListRegistry)
    r.GET("/plugins/sources", plgh.ListSources)
    wsAuth.GET("/plugins", plgh.ListInstalled)
@ -2,7 +2,7 @@
# build-all.sh — Rebuild base image and optionally adapter images.
#
# NOTE: Adapters have been extracted to standalone template repos:
#   https://github.com/Molecule-AI/molecule-ai-workspace-template-<runtime>
#   https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-<runtime>
#
# This script now only builds the base image from workspace/Dockerfile.
# Each adapter repo has its own Dockerfile that installs molecule-ai-workspace-runtime