diff --git a/.github/scripts/lint_secret_pattern_drift.py b/.github/scripts/lint_secret_pattern_drift.py index 076d2719..c630094f 100644 --- a/.github/scripts/lint_secret_pattern_drift.py +++ b/.github/scripts/lint_secret_pattern_drift.py @@ -37,7 +37,7 @@ CANONICAL_FILE = Path(".github/workflows/secret-scan.yml") CONSUMERS: list[tuple[str, str]] = [ ( "molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh", - "https://raw.githubusercontent.com/Molecule-AI/molecule-ai-workspace-runtime/main/molecule_runtime/scripts/pre-commit-checks.sh", + "https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-runtime/raw/branch/main/molecule_runtime/scripts/pre-commit-checks.sh", ), ] diff --git a/.github/workflows/canary-verify.yml b/.github/workflows/canary-verify.yml index c26958ae..e19c1619 100644 --- a/.github/workflows/canary-verify.yml +++ b/.github/workflows/canary-verify.yml @@ -108,7 +108,7 @@ jobs: echo echo "One or more canary secrets are unset (\`CANARY_TENANT_URLS\`, \`CANARY_ADMIN_TOKENS\`, \`CANARY_CP_SHARED_SECRET\`)." echo "Phase 2 canary fleet has not been stood up yet —" - echo "see [canary-tenants.md](https://github.com/molecule-ai/molecule-controlplane/blob/main/docs/canary-tenants.md)." + echo "see [canary-tenants.md](https://git.moleculesai.app/molecule-ai/molecule-controlplane/blob/main/docs/canary-tenants.md)." echo echo "**Skipped — promote-to-latest will NOT auto-fire.** Dispatch \`promote-latest.yml\` manually when ready." } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/e2e-api.yml b/.github/workflows/e2e-api.yml index 782cbedc..da7dbcd3 100644 --- a/.github/workflows/e2e-api.yml +++ b/.github/workflows/e2e-api.yml @@ -12,6 +12,59 @@ name: E2E API Smoke Test # spending CI cycles. See the in-job comment on the `e2e-api` job for # why this is one job (not two-jobs-sharing-name) and the 2026-04-29 # PR #2264 incident that drove the consolidation. +# +# Parallel-safety (Class B Hongming-owned CICD red sweep, 2026-05-08) +# ------------------------------------------------------------------- +# Same substrate hazard as PR #98 (handlers-postgres-integration). Our +# Gitea act_runner runs with `container.network: host` (operator host +# `/opt/molecule/runners/config.yaml`), which means: +# +# * Two concurrent runs both try to bind their `-p 15432:5432` / +# `-p 16379:6379` host ports — the second postgres/redis FATALs +# with `Address in use` and `docker run` returns exit 125 with +# `Conflict. The container name "/molecule-ci-postgres" is already +# in use by container ...`. Verified in run a7/2727 on 2026-05-07. +# * The fixed container names `molecule-ci-postgres` / `-redis` (the +# pre-fix shape) collide on name AS WELL AS port. The cleanup-with- +# `docker rm -f` at the start of the second job KILLS the first +# job's still-running postgres/redis. +# +# Fix shape (mirrors PR #98's bridge-net pattern, adapted because +# platform-server is a Go binary on the host, not a containerised +# step): +# +# 1. Unique container names per run: +# pg-e2e-api-${RUN_ID}-${RUN_ATTEMPT} +# redis-e2e-api-${RUN_ID}-${RUN_ATTEMPT} +# `${RUN_ID}-${RUN_ATTEMPT}` is unique even across reruns of the +# same run_id. +# 2. Ephemeral host port per run (`-p 0:5432`), then read the actual +# bound port via `docker port` and export DATABASE_URL/REDIS_URL +# pointing at it. No fixed host-port → no port collision. +# 3. `127.0.0.1` (NOT `localhost`) in URLs — IPv6 first-resolve was +# the original flake fixed in #92 and the script's still IPv6- +# enabled. +# 4. 
`if: always()` cleanup so containers don't leak when test steps
+#    fail.
+#
+# Issue #94 items #2 + #3 (also fixed here):
+#   * Pre-pull `alpine:latest` so the platform-server's provisioner
+#     (`internal/handlers/container_files.go`) can stand up its
+#     ephemeral token-write helper without a docker.io round-trip.
+#   * Create `molecule-monorepo-net` bridge network if missing so the
+#     provisioner's container.HostConfig {NetworkMode: ...} attach
+#     succeeds.
+# Item #1 (timeouts) — evidence on recent runs (77/3191, ae/4270, 0e/
+# 2318) shows Postgres ready in 3s, Redis in 1s, Platform in 1s when
+# they DO come up. Timeouts are not the bottleneck; not bumped.
+#
+# Item explicitly NOT fixed here: the failing `Status back online`
+# test. It fails because the platform's langgraph workspace template
+# image (ghcr.io/molecule-ai/workspace-template-langgraph:latest)
+# returns 403 Forbidden post-2026-05-06 GitHub org suspension. That is
+# a template-registry resolution issue (ADR-002 / local-build mode)
+# and belongs in a separate change that touches workspace-server, not
+# this workflow file.

 on:
   push:
@@ -78,11 +131,14 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 15
     env:
-      DATABASE_URL: postgres://dev:dev@localhost:15432/molecule?sslmode=disable
-      REDIS_URL: redis://localhost:16379
+      # Unique per-run container names so concurrent runs on the host-
+      # network act_runner don't collide on name OR port.
+      # `${RUN_ID}-${RUN_ATTEMPT}` stays unique across reruns of the
+      # same run_id. DATABASE_URL and REDIS_URL are exported later
+      # (after the `docker port` lookups) since we let Docker assign
+      # ephemeral host ports; PORT is the platform server's own listen
+      # port and stays fixed at 8080.
+      PG_CONTAINER: pg-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
+      REDIS_CONTAINER: redis-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
       PORT: "8080"
-      PG_CONTAINER: molecule-ci-postgres
-      REDIS_CONTAINER: molecule-ci-redis
     steps:
       - name: No-op pass (paths filter excluded this commit)
         if: needs.detect-changes.outputs.api != 'true'
@@ -97,11 +153,53 @@ jobs:
           go-version: 'stable'
           cache: true
           cache-dependency-path: workspace-server/go.sum
+      - name: Pre-pull alpine + ensure provisioner network (Issue #94 items #2 + #3)
+        if: needs.detect-changes.outputs.api == 'true'
+        run: |
+          # Provisioner uses alpine:latest for ephemeral token-write
+          # containers (workspace-server/internal/handlers/container_files.go).
+          # Pre-pull so the first provision in test_api.sh doesn't race
+          # the daemon's pull cache. Idempotent — `docker pull` is a no-op
+          # when the image is already present.
+          docker pull alpine:latest >/dev/null
+          # Provisioner attaches workspace containers to
+          # molecule-monorepo-net (workspace-server/internal/provisioner/
+          # provisioner.go::DefaultNetwork). The bridge already exists on
+          # the operator host's docker daemon — `network create` is made
+          # idempotent via `|| true`.
+          docker network create molecule-monorepo-net >/dev/null 2>&1 || true
+          echo "alpine:latest pre-pulled; molecule-monorepo-net ensured."
       - name: Start Postgres (docker)
         if: needs.detect-changes.outputs.api == 'true'
        run: |
+          # Defensive cleanup — only matches THIS run's container name,
+          # so it cannot kill a sibling run's postgres. (Pre-fix, the
+          # name was static and this rm hit other runs' containers.)
           docker rm -f "$PG_CONTAINER" 2>/dev/null || true
-          docker run -d --name "$PG_CONTAINER" -e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule -p 15432:5432 postgres:16
+          # `-p 0:5432` requests an ephemeral host port; we read it back
+          # below and export DATABASE_URL.
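+          # (Editor's sketch, hedged.) If the two-step awk/head parse of
+          # `docker port` below ever proves brittle, an equivalent
+          # single-call lookup via the inspect go-template would be:
+          #
+          #   PG_PORT=$(docker inspect "$PG_CONTAINER" --format \
+          #     '{{(index (index .NetworkSettings.Ports "5432/tcp") 0).HostPort}}')
+          #
+          # Same result, no line-order assumptions; left as a comment
+          # because the awk path is what this workflow actually runs.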
+ docker run -d --name "$PG_CONTAINER" \ + -e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule \ + -p 0:5432 postgres:16 >/dev/null + # Resolve the host-side port assignment. `docker port` prints + # `0.0.0.0:NNNN` (and on host-net runners may also print an + # IPv6 line — take the first IPv4 line). + PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}') + if [ -z "$PG_PORT" ]; then + # Fallback: any first line. Some Docker versions print only + # one line. + PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | head -1 | awk -F: '{print $NF}') + fi + if [ -z "$PG_PORT" ]; then + echo "::error::Could not resolve host port for $PG_CONTAINER" + docker port "$PG_CONTAINER" 5432/tcp || true + docker logs "$PG_CONTAINER" || true + exit 1 + fi + # 127.0.0.1 (NOT localhost) — IPv6 first-resolve flake (#92). + echo "PG_PORT=${PG_PORT}" >> "$GITHUB_ENV" + echo "DATABASE_URL=postgres://dev:dev@127.0.0.1:${PG_PORT}/molecule?sslmode=disable" >> "$GITHUB_ENV" + echo "Postgres host port: ${PG_PORT}" for i in $(seq 1 30); do if docker exec "$PG_CONTAINER" pg_isready -U dev >/dev/null 2>&1; then echo "Postgres ready after ${i}s" @@ -116,7 +214,20 @@ jobs: if: needs.detect-changes.outputs.api == 'true' run: | docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true - docker run -d --name "$REDIS_CONTAINER" -p 16379:6379 redis:7 + docker run -d --name "$REDIS_CONTAINER" -p 0:6379 redis:7 >/dev/null + REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}') + if [ -z "$REDIS_PORT" ]; then + REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | head -1 | awk -F: '{print $NF}') + fi + if [ -z "$REDIS_PORT" ]; then + echo "::error::Could not resolve host port for $REDIS_CONTAINER" + docker port "$REDIS_CONTAINER" 6379/tcp || true + docker logs "$REDIS_CONTAINER" || true + exit 1 + fi + echo "REDIS_PORT=${REDIS_PORT}" >> "$GITHUB_ENV" + echo "REDIS_URL=redis://127.0.0.1:${REDIS_PORT}" >> "$GITHUB_ENV" + echo "Redis host port: ${REDIS_PORT}" for i in $(seq 1 15); do if docker exec "$REDIS_CONTAINER" redis-cli ping 2>/dev/null | grep -q PONG; then echo "Redis ready after ${i}s" @@ -135,13 +246,15 @@ jobs: if: needs.detect-changes.outputs.api == 'true' working-directory: workspace-server run: | + # DATABASE_URL + REDIS_URL exported by the start-postgres / + # start-redis steps point at this run's per-run host ports. ./platform-server > platform.log 2>&1 & echo $! > platform.pid - name: Wait for /health if: needs.detect-changes.outputs.api == 'true' run: | for i in $(seq 1 30); do - if curl -sf http://localhost:8080/health > /dev/null; then + if curl -sf http://127.0.0.1:8080/health > /dev/null; then echo "Platform up after ${i}s" exit 0 fi @@ -185,6 +298,9 @@ jobs: kill "$(cat workspace-server/platform.pid)" 2>/dev/null || true fi - name: Stop service containers + # always() so containers don't leak when test steps fail. The + # cleanup is best-effort: if the container is already gone + # (e.g. concurrent rerun race), don't fail the job. 
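+        # (Editor's note, hedged.) `always()` is deliberately broader
+        # than `success() || failure()` — it also fires when the run is
+        # cancelled, so
+        #   if: always() && needs.detect-changes.outputs.api == 'true'
+        # cleans up on success, failure, AND cancellation, whereas a
+        # bare `if: needs...` (implicit success()) would leak the
+        # containers on any earlier red step.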
if: always() && needs.detect-changes.outputs.api == 'true' run: | docker rm -f "$PG_CONTAINER" 2>/dev/null || true diff --git a/.github/workflows/handlers-postgres-integration.yml b/.github/workflows/handlers-postgres-integration.yml index 98927ac9..05216b59 100644 --- a/.github/workflows/handlers-postgres-integration.yml +++ b/.github/workflows/handlers-postgres-integration.yml @@ -14,12 +14,42 @@ name: Handlers Postgres Integration # self-review caught it took 2 minutes to set up and would have caught # the bug at PR-time. # -# This job spins a Postgres service container, applies the migration, -# and runs `go test -tags=integration` against a live DB. Required -# check on staging branch protection — backend handler PRs cannot -# merge without a real-DB regression gate. +# Why this workflow does NOT use `services: postgres:` (Class B fix) +# ------------------------------------------------------------------ +# Our act_runner config has `container.network: host` (operator host +# /opt/molecule/runners/config.yaml), which act_runner applies to BOTH +# the job container AND every service container. With host-net, two +# concurrent runs of this workflow both try to bind 0.0.0.0:5432 — the +# second postgres FATALs with `could not create any TCP/IP sockets: +# Address in use`, and Docker auto-removes it (act_runner sets +# AutoRemove:true on service containers). By the time the migrations +# step runs `psql`, the postgres container is gone, hence +# `Connection refused` then `failed to remove container: No such +# container` at cleanup time. # -# Cost: ~30s job (postgres pull from GH cache + go build + 4 tests). +# Per-job `container.network` override is silently ignored by +# act_runner — `--network and --net in the options will be ignored.` +# appears in the runner log. Documented constraint. +# +# So we sidestep `services:` entirely. The job container still uses +# host-net (inherited from runner config; required for cache server +# discovery on the bridge IP 172.18.0.17:42631). We launch a sibling +# postgres on the existing `molecule-monorepo-net` bridge with a +# UNIQUE name per run — `pg-handlers-${RUN_ID}-${RUN_ATTEMPT}` — and +# read its bridge IP via `docker inspect`. A host-net job container +# can reach a bridge-net container directly via the bridge IP (verified +# manually on operator host 2026-05-08). +# +# Trade-offs vs. the original `services:` shape: +# + No host-port collision; N parallel runs share the bridge cleanly +# + `if: always()` cleanup runs even on test-step failure +# - One more step in the workflow (+~3 lines) +# - Requires `molecule-monorepo-net` to exist on the operator host +# (it does; declared in docker-compose.yml + docker-compose.infra.yml) +# +# Class B Hongming-owned CICD red sweep, 2026-05-08. +# +# Cost: ~30s job (postgres pull from cache + go build + 4 tests). on: push: @@ -59,20 +89,14 @@ jobs: name: Handlers Postgres Integration needs: detect-changes runs-on: ubuntu-latest - services: - postgres: - image: postgres:15-alpine - env: - POSTGRES_PASSWORD: test - POSTGRES_DB: molecule - ports: - - 5432:5432 - # GHA spins this with --health-cmd built in for postgres images. - options: >- - --health-cmd pg_isready - --health-interval 5s - --health-timeout 5s - --health-retries 10 + env: + # Unique name per run so concurrent jobs don't collide on the + # bridge network. ${RUN_ID}-${RUN_ATTEMPT} is unique even across + # workflow_dispatch reruns of the same run_id. 
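+      # (Editor's sketch, hedged — reuses the name and network defined
+      # just below.) The "host-net job container can reach a bridge-net
+      # sibling" claim is cheap to re-verify from any host-net shell:
+      #
+      #   PG_IP=$(docker inspect "$PG_NAME" --format \
+      #     '{{(index .NetworkSettings.Networks "molecule-monorepo-net").IPAddress}}')
+      #   pg_isready -h "$PG_IP" -p 5432 -U postgres   # exits 0 once up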
+ PG_NAME: pg-handlers-${{ github.run_id }}-${{ github.run_attempt }} + # Bridge network already exists on the operator host (declared + # in docker-compose.yml + docker-compose.infra.yml). + PG_NETWORK: molecule-monorepo-net defaults: run: working-directory: workspace-server @@ -89,16 +113,57 @@ jobs: with: go-version: 'stable' + - if: needs.detect-changes.outputs.handlers == 'true' + name: Start sibling Postgres on bridge network + working-directory: . + run: | + # Sanity: the bridge network must exist on the operator host. + # Hard-fail loud if it doesn't — easier to spot than a silent + # auto-create that diverges from the rest of the stack. + if ! docker network inspect "${PG_NETWORK}" >/dev/null 2>&1; then + echo "::error::Bridge network '${PG_NETWORK}' missing on operator host. Re-run docker-compose.infra.yml or check ops handbook." + exit 1 + fi + + # If a stale container with the same name exists (rerun on + # the same run_id), wipe it first. + docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true + + docker run -d \ + --name "${PG_NAME}" \ + --network "${PG_NETWORK}" \ + --health-cmd "pg_isready -U postgres" \ + --health-interval 5s \ + --health-timeout 5s \ + --health-retries 10 \ + -e POSTGRES_PASSWORD=test \ + -e POSTGRES_DB=molecule \ + postgres:15-alpine >/dev/null + + # Read back the bridge IP. Always present immediately after + # `docker run -d` for bridge networks. + PG_HOST=$(docker inspect "${PG_NAME}" \ + --format "{{(index .NetworkSettings.Networks \"${PG_NETWORK}\").IPAddress}}") + if [ -z "${PG_HOST}" ]; then + echo "::error::Could not resolve PG_HOST for ${PG_NAME} on ${PG_NETWORK}" + docker logs "${PG_NAME}" || true + exit 1 + fi + echo "PG_HOST=${PG_HOST}" >> "$GITHUB_ENV" + echo "INTEGRATION_DB_URL=postgres://postgres:test@${PG_HOST}:5432/molecule?sslmode=disable" >> "$GITHUB_ENV" + echo "Started ${PG_NAME} at ${PG_HOST}:5432" + - if: needs.detect-changes.outputs.handlers == 'true' name: Apply migrations to Postgres service env: PGPASSWORD: test run: | - # Wait for postgres to actually accept connections (the - # GHA --health-cmd is best-effort but psql can still race). + # Wait for postgres to actually accept connections. Docker's + # health-cmd handles container-side readiness, but the wire + # to the bridge IP is best-tested with pg_isready directly. for i in {1..15}; do - if pg_isready -h localhost -p 5432 -U postgres -q; then break; fi - echo "waiting for postgres..."; sleep 2 + if pg_isready -h "${PG_HOST}" -p 5432 -U postgres -q; then break; fi + echo "waiting for postgres at ${PG_HOST}:5432..."; sleep 2 done # Apply every .up.sql in lexicographic order with @@ -131,7 +196,7 @@ jobs: # not fine once a cross-table atomicity test came in. set +e for migration in $(ls migrations/*.sql 2>/dev/null | grep -v '\.down\.sql$' | sort); do - if psql -h localhost -U postgres -d molecule -v ON_ERROR_STOP=1 \ + if psql -h "${PG_HOST}" -U postgres -d molecule -v ON_ERROR_STOP=1 \ -f "$migration" >/dev/null 2>&1; then echo "✓ $(basename "$migration")" else @@ -145,7 +210,7 @@ jobs: # fail if any didn't land — that would be a real regression we # want loud. for tbl in delegations workspaces activity_logs pending_uploads; do - if ! psql -h localhost -U postgres -d molecule -tA \ + if ! 
psql -h "${PG_HOST}" -U postgres -d molecule -tA \ -c "SELECT 1 FROM information_schema.tables WHERE table_name = '$tbl'" \ | grep -q 1; then echo "::error::$tbl table missing after migration replay — handler integration tests would be meaningless" @@ -156,16 +221,32 @@ jobs: - if: needs.detect-changes.outputs.handlers == 'true' name: Run integration tests - env: - INTEGRATION_DB_URL: postgres://postgres:test@localhost:5432/molecule?sslmode=disable run: | + # INTEGRATION_DB_URL is exported by the start-postgres step; + # points at the per-run bridge IP, not 127.0.0.1, so concurrent + # workflow runs don't fight over a host-net 5432 port. go test -tags=integration -timeout 5m -v ./internal/handlers/ -run "^TestIntegration_" - - if: needs.detect-changes.outputs.handlers == 'true' && failure() + - if: failure() && needs.detect-changes.outputs.handlers == 'true' name: Diagnostic dump on failure env: PGPASSWORD: test run: | - echo "::group::delegations table state" - psql -h localhost -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true + echo "::group::postgres container status" + docker ps -a --filter "name=${PG_NAME}" --format '{{.Status}} {{.Names}}' || true + docker logs "${PG_NAME}" 2>&1 | tail -50 || true echo "::endgroup::" + echo "::group::delegations table state" + psql -h "${PG_HOST}" -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true + echo "::endgroup::" + + - if: always() && needs.detect-changes.outputs.handlers == 'true' + name: Stop sibling Postgres + working-directory: . + run: | + # always() so containers don't leak when migrations or tests + # fail. The cleanup is best-effort: if the container is + # already gone (e.g. concurrent rerun race), don't fail the job. + docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true + echo "Cleaned up ${PG_NAME}" + diff --git a/.github/workflows/pr-guards.yml b/.github/workflows/pr-guards.yml index 151757fe..7dd00c16 100644 --- a/.github/workflows/pr-guards.yml +++ b/.github/workflows/pr-guards.yml @@ -1,14 +1,25 @@ name: pr-guards -# Thin caller that delegates to the molecule-ci reusable guard. Today -# the guard is just "disable auto-merge when a new commit is pushed -# after auto-merge was enabled" — added 2026-04-27 after PR #2174 -# auto-merged with only its first commit because the second commit -# was pushed after the merge queue had locked the PR's SHA. +# PR-time guards. Today the only guard is "disable auto-merge when a +# new commit is pushed after auto-merge was enabled" — added 2026-04-27 +# after PR #2174 auto-merged with only its first commit because the +# second commit was pushed after the merge queue had locked the PR's +# SHA. # -# When more PR-time guards land in molecule-ci, add them here as -# additional jobs that share the same pull_request:synchronize -# trigger. +# Why this is inlined (not delegated to molecule-ci's reusable +# workflow): the reusable workflow uses `gh pr merge --disable-auto`, +# which calls GitHub's GraphQL API. Gitea has no GraphQL endpoint and +# returns HTTP 405 on /api/graphql, so the job failed on every Gitea +# PR push since the 2026-05-06 migration. Gitea also has no `--auto` +# merge primitive that this job could be acting on, so the right +# behaviour on Gitea is "no-op + green status" — not a 405. +# +# Inlining (vs. an `if:` on the `uses:` line) keeps the job ALWAYS +# running, which matters for branch protection: required-check names +# need a job that emits SUCCESS terminal state, not SKIPPED. 
See
+# `feedback_branch_protection_check_name_parity` and
+# `feedback_pr_merge_safety_guards`.
+#
+# Issue #88 item 1.

 on:
   pull_request:

@@ -19,4 +30,34 @@ permissions:

 jobs:
   disable-auto-merge-on-push:
-    uses: molecule-ai/molecule-ci/.github/workflows/disable-auto-merge-on-push.yml@main
+    runs-on: ubuntu-latest
+    steps:
+      # Detect Gitea Actions. act_runner sets GITEA_ACTIONS=true in the
+      # step env on every job. Belt-and-suspenders: also check the repo
+      # URL's host, which is independent of any runner-side env config
+      # (covers a future Gitea host where the env var is forgotten).
+      - name: Detect runner host
+        id: host
+        run: |
+          if [[ "${GITEA_ACTIONS:-}" == "true" ]] || [[ "${{ github.server_url }}" == *moleculesai.app* ]] || [[ "${{ github.event.repository.html_url }}" == *moleculesai.app* ]]; then
+            echo "is_gitea=true" >> "$GITHUB_OUTPUT"
+            echo "::notice::Gitea Actions detected — auto-merge gating is not applicable here (Gitea has no --auto merge primitive). Job will no-op."
+          else
+            echo "is_gitea=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Disable auto-merge (GitHub only)
+        if: steps.host.outputs.is_gitea != 'true'
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR: ${{ github.event.pull_request.number }}
+          REPO: ${{ github.repository }}
+          # head.sha, not github.sha — on pull_request events github.sha
+          # is the synthetic merge commit, so the posted short SHA would
+          # name a commit nobody pushed.
+          NEW_SHA: ${{ github.event.pull_request.head.sha }}
+        run: |
+          set -eu
+          gh pr merge "$PR" --disable-auto -R "$REPO" || true
+          gh pr comment "$PR" -R "$REPO" --body "🔒 Auto-merge disabled — new commit (\`${NEW_SHA:0:7}\`) pushed after auto-merge was enabled. The merge queue locks SHAs at entry, so subsequent pushes can race. Verify the new commit and re-enable with \`gh pr merge --auto\`."
+
+      - name: Gitea no-op
+        if: steps.host.outputs.is_gitea == 'true'
+        run: echo "Gitea Actions — auto-merge gating not applicable; no-op (job intentionally green so branch protection's required-check name lands SUCCESS)."
diff --git a/.github/workflows/publish-runtime.yml b/.github/workflows/publish-runtime.yml
index fa8f64b3..4147c07f 100644
--- a/.github/workflows/publish-runtime.yml
+++ b/.github/workflows/publish-runtime.yml
@@ -282,42 +282,33 @@ jobs:
           echo "::error::Refusing to fan out cascade against stale or corrupt PyPI surfaces."
           exit 1

-      - name: Fan out repository_dispatch
+      - name: Fan out via push to .runtime-version
         env:
-          # Fine-grained PAT with `actions:write` on the 8 template repos.
-          # GITHUB_TOKEN can't fire dispatches across repos — needs an explicit
-          # token. Stored as a repo secret; rotate per the standard schedule.
-          DISPATCH_TOKEN: ${{ secrets.TEMPLATE_DISPATCH_TOKEN }}
-          # Single source of truth: the publish job's output, which handles
-          # tag/manual-input/auto-bump uniformly. The previous fallback
-          # (`steps.version.outputs.version` from inside the cascade job)
-          # was a dead reference — different job, no shared step scope.
+          # Gitea PAT with write:repository scope on the 8 cascade-active
+          # template repos. Used here for `git push` (NOT for an API
+          # dispatch — Gitea 1.22.6 has no repository_dispatch endpoint;
+          # empirically verified across 6 candidate paths in molecule-
+          # core#20 issuecomment-913). The push trips each template's
+          # existing `on: push: branches: [main]` trigger on
+          # publish-image.yml, which then reads the updated
+          # .runtime-version via its resolve-version job.
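+          # (Editor's sketch of the consumer side, hedged — the job and
+          # step names inside each template repo are assumptions, not
+          # verified against template source.) The contract this push
+          # relies on is roughly:
+          #
+          #   # molecule-ai-workspace-template-$tpl/.github/workflows/publish-image.yml
+          #   on:
+          #     push:
+          #       branches: [main]
+          #   jobs:
+          #     resolve-version:
+          #       runs-on: ubuntu-latest
+          #       outputs:
+          #         version: ${{ steps.v.outputs.version }}
+          #       steps:
+          #         - uses: actions/checkout@v4
+          #         - id: v
+          #           run: echo "version=$(cat .runtime-version)" >> "$GITHUB_OUTPUT"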
+ DISPATCH_TOKEN: ${{ secrets.DISPATCH_TOKEN }} RUNTIME_VERSION: ${{ needs.publish.outputs.version }} run: | set +e # don't abort on a single repo failure — collect them all - # Schedule-vs-dispatch behaviour split (hardened 2026-04-28 - # after the sweep-cf-orphans soft-skip incident — same class - # of bug): - # - # The earlier "skipping cascade. templates will pick up the - # new version on their own next rebuild" message was wrong — - # templates only build on this dispatch trigger; without it - # they stay pinned to whatever runtime version they last saw. - # A silent skip here means "PyPI is current, templates are - # not" and the gap is invisible until someone notices a - # template still on the old version weeks later. - # - # - push → exit 1 (red CI surfaces the gap) - # - workflow_dispatch → exit 0 with a warning (operator - # ran this ad-hoc; let them rerun - # after fixing the secret) + + # Soft-skip on workflow_dispatch when the token is missing + # (operator ad-hoc test); hard-fail on push so unattended + # publishes can't silently skip the cascade. Same shape as + # the original v1, intentional split per the schedule-vs- + # dispatch hardening 2026-04-28. if [ -z "$DISPATCH_TOKEN" ]; then if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - echo "::warning::TEMPLATE_DISPATCH_TOKEN secret not set — skipping cascade." + echo "::warning::DISPATCH_TOKEN secret not set — skipping cascade." echo "::warning::set it at Settings → Secrets and Variables → Actions, then rerun. Templates will stay on the prior runtime version until either this token is set or each template is rebuilt manually." exit 0 fi - echo "::error::TEMPLATE_DISPATCH_TOKEN secret missing — cascade cannot fan out." + echo "::error::DISPATCH_TOKEN secret missing — cascade cannot fan out." echo "::error::PyPI was published, but the 8 template repos will NOT pick up the new version until this token is restored and a republish dispatches the cascade." echo "::error::set it at Settings → Secrets and Variables → Actions; then re-trigger publish-runtime via workflow_dispatch." exit 1 @@ -327,37 +318,119 @@ jobs: echo "::error::publish job did not expose a version output — cascade cannot fan out" exit 1 fi - # All 9 active workspace template repos. The PR #2536 pruning - # ("deprecated, no shipping images") was empirically wrong: - # continuous-synth-e2e.yml defaults to langgraph as its primary - # canary (line 44), and every excluded template had successful - # publish-image runs as of 2026-05-03 — none were dormant. - # Symptom of the prune: today's a2a-sdk strict-mode fix - # (#2566 / commit e1628c4) cascaded to 4 templates but never - # reached langgraph, so the synth-E2E correctly canary'd a fix - # that had landed but not deployed. Re-added the 5 templates. - # Long-term: derive this list from manifest.json so cascade - # scope can't drift from E2E scope — tracked in RFC #388 as a - # Phase-1 invariant. + + # All 9 workspace templates declared in manifest.json. The list + # MUST stay aligned with manifest.json's workspace_templates — + # cascade-list-drift-gate.yml enforces this in CI per the + # codex-stuck-on-stale-runtime invariant from PR #2556. + # Long-term goal: derive this list from manifest.json so it + # can't drift even on a manifest edit (RFC #388 Phase-1). 
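+          # (Editor's sketch of that RFC #388 derivation, hedged — the
+          # `workspace_templates` key name is taken from the comment
+          # above, not checked against manifest.json:)
+          #
+          #   TEMPLATES=$(jq -r '.workspace_templates[]' manifest.json | tr '\n' ' ')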
+ # + # Per-template publish-image.yml presence is checked at + # cascade-time below: codex doesn't ship one today, so the + # cascade soft-skips it with an informational message rather + # than dropping it from this list (which would re-introduce + # the drift the gate exists to catch). + GITEA_URL="${GITEA_URL:-https://git.moleculesai.app}" TEMPLATES="claude-code hermes openclaw codex langgraph crewai autogen deepagents gemini-cli" FAILED="" + SKIPPED="" + + # Configure git identity once. The persona owning DISPATCH_TOKEN + # is the same identity that authored this commit on each + # template; using a generic "publish-runtime cascade" co-author + # trailer in the message keeps the audit trail honest about the + # workflow-driven origin. + git config --global user.name "publish-runtime cascade" + git config --global user.email "publish-runtime@moleculesai.app" + + WORKDIR="$(mktemp -d)" for tpl in $TEMPLATES; do REPO="molecule-ai/molecule-ai-workspace-template-$tpl" - STATUS=$(curl -sS -o /tmp/dispatch.out -w "%{http_code}" \ - -X POST "https://api.github.com/repos/$REPO/dispatches" \ - -H "Authorization: Bearer $DISPATCH_TOKEN" \ - -H "Accept: application/vnd.github+json" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - -d "{\"event_type\":\"runtime-published\",\"client_payload\":{\"runtime_version\":\"$VERSION\"}}") - if [ "$STATUS" = "204" ]; then - echo "✓ dispatched $tpl ($VERSION)" - else - echo "::warning::✗ failed to dispatch $tpl: HTTP $STATUS — $(cat /tmp/dispatch.out)" + CLONE="$WORKDIR/$tpl" + + # Pre-check: skip templates without a publish-image.yml. + # The cascade's job is to trip the template's on-push + # rebuild — if there's no rebuild workflow, pushing a + # .runtime-version commit is just noise on the target + # repo. Use the Gitea contents API (no clone required for + # the probe). 200 = present; 404 = absent. + HTTP=$(curl -sS -o /dev/null -w "%{http_code}" \ + -H "Authorization: token $DISPATCH_TOKEN" \ + "$GITEA_URL/api/v1/repos/$REPO/contents/.github/workflows/publish-image.yml") + if [ "$HTTP" = "404" ]; then + echo "↷ $tpl has no publish-image.yml — soft-skip (informational; manifest still tracks it)" + SKIPPED="$SKIPPED $tpl" + continue + fi + if [ "$HTTP" != "200" ]; then + echo "::warning::$tpl publish-image.yml probe returned HTTP $HTTP — proceeding anyway, push will surface the real failure if any" + fi + + # Use a per-template attempt loop so a transient race (e.g. + # human pushing to the same template at the same instant) + # doesn't lose the cascade. Bounded retries (3) — beyond + # that we surface the failure and let the operator retry. + attempt=0 + success=false + while [ $attempt -lt 3 ]; do + attempt=$((attempt + 1)) + rm -rf "$CLONE" + if ! git clone --depth=1 \ + "https://x-access-token:${DISPATCH_TOKEN}@${GITEA_URL#https://}/$REPO.git" \ + "$CLONE" >/tmp/clone.log 2>&1; then + echo "::warning::clone $tpl attempt $attempt failed: $(tail -n3 /tmp/clone.log)" + sleep 2 + continue + fi + + cd "$CLONE" + echo "$VERSION" > .runtime-version + + # Idempotency guard: if the file already matches, this + # publish is a re-run for a version already cascaded. + # Don't push a no-op commit (would spuriously re-trip the + # template's on-push and rebuild for nothing). 
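+          # (Editor's caveat, hedged.) `git diff --quiet` compares the
+          # worktree against the index, so an UNTRACKED .runtime-version
+          # — a template that never shipped the file — also reads as
+          # "no change" and would be wrongly skipped. If that case is
+          # ever live, a tracked-ness probe closes it:
+          #
+          #   if git ls-files --error-unmatch .runtime-version >/dev/null 2>&1 \
+          #      && git diff --quiet -- .runtime-version; then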
+ if git diff --quiet -- .runtime-version; then + echo "✓ $tpl already at $VERSION — no commit needed (idempotent)" + success=true + cd - >/dev/null + break + fi + + git add .runtime-version + git commit -m "chore: pin runtime to $VERSION (publish-runtime cascade)" \ + -m "Co-Authored-By: publish-runtime cascade " \ + >/dev/null + + if git push origin HEAD:main >/tmp/push.log 2>&1; then + echo "✓ $tpl pushed $VERSION on attempt $attempt" + success=true + cd - >/dev/null + break + fi + + # Likely a non-fast-forward — pull-rebase and retry. + # Don't force-push: that would silently overwrite a racing + # human/cascade commit. + echo "::warning::push $tpl attempt $attempt failed, pull-rebasing: $(tail -n3 /tmp/push.log)" + git pull --rebase origin main >/tmp/rebase.log 2>&1 || true + cd - >/dev/null + done + + if [ "$success" != "true" ]; then FAILED="$FAILED $tpl" fi done + rm -rf "$WORKDIR" + if [ -n "$FAILED" ]; then - echo "::warning::Cascade incomplete. Failed templates:$FAILED" - # Don't fail the whole job — PyPI publish already succeeded; - # operators can retry the failed templates manually. + echo "::error::Cascade incomplete after 3 retries each. Failed templates:$FAILED" + echo "::error::PyPI publish succeeded; failed templates lag the new version. Re-run this workflow_dispatch with the same version to retry only the laggers (idempotent — already-cascaded templates skip)." + exit 1 + fi + if [ -n "$SKIPPED" ]; then + echo "Cascade complete: pinned $VERSION on cascade-active templates. Soft-skipped (no publish-image.yml):$SKIPPED" + else + echo "Cascade complete: $VERSION pinned across all manifest workspace_templates." fi diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 601533f4..f0d0a9dd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -22,7 +22,7 @@ development workflow, conventions, and how to get your changes merged. ```bash # Clone the repo -git clone https://github.com/Molecule-AI/molecule-core.git +git clone https://git.moleculesai.app/molecule-ai/molecule-core.git cd molecule-core # Install git hooks @@ -57,7 +57,7 @@ See `CLAUDE.md` for a full list of environment variables and their purposes. This repo is scoped to **code** (canvas, workspace, workspace-server, related infra). Public content (blog posts, marketing copy, OG images, SEO briefs, -DevRel demos) lives in [`Molecule-AI/docs`](https://github.com/Molecule-AI/docs). +DevRel demos) lives in [`Molecule-AI/docs`](https://git.moleculesai.app/molecule-ai/docs). The `Block forbidden paths` CI gate fails any PR that writes to `marketing/` or other removed paths — open against `Molecule-AI/docs` instead. @@ -110,7 +110,7 @@ causing a render loop when any node position changed. 1. **Repo-wide:** "Automatically delete head branches" is on. Once a PR merges, the branch is deleted server-side. Any subsequent `git push` to that branch fails with `remote rejected — no such branch`. -2. **CI:** the `pr-guards` workflow (calling [molecule-ci `disable-auto-merge-on-push`](https://github.com/Molecule-AI/molecule-ci/blob/main/.github/workflows/disable-auto-merge-on-push.yml)) fires on every push to an open PR. If auto-merge was already enabled, it's disabled and a comment is posted. You must explicitly re-enable after verifying the new commit. +2. **CI:** the `pr-guards` workflow (calling [molecule-ci `disable-auto-merge-on-push`](https://git.moleculesai.app/molecule-ai/molecule-ci/src/branch/main/.github/workflows/disable-auto-merge-on-push.yml)) fires on every push to an open PR. 
If auto-merge was already enabled, it's disabled and a comment is posted. You must explicitly re-enable after verifying the new commit. **Workflow rules that follow from the guards:** - Push **all** commits before running `gh pr merge --auto`. @@ -180,9 +180,9 @@ and run CI manually. Code in this repo lands in molecule-core. Some related runtime artifacts live in their own repos: -- [`Molecule-AI/molecule-ai-workspace-runtime`](https://github.com/Molecule-AI/molecule-ai-workspace-runtime) — Python adapter SDK (`molecule_runtime`) that runs inside containerized Molecule workspaces. Bridges Claude Code SDK / hermes / langgraph / etc. → A2A queue. -- [`Molecule-AI/molecule-sdk-python`](https://github.com/Molecule-AI/molecule-sdk-python) — `A2AServer` + `RemoteAgentClient` for external agents that register over the public `/registry/register` flow. -- [`Molecule-AI/molecule-mcp-claude-channel`](https://github.com/Molecule-AI/molecule-mcp-claude-channel) — Claude Code channel plugin. Bridges A2A traffic into a running Claude Code session via MCP `notifications/claude/channel`. Polling-based (no tunnel required); install with `claude --channels plugin:molecule@Molecule-AI/molecule-mcp-claude-channel`. +- [`Molecule-AI/molecule-ai-workspace-runtime`](https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-runtime) — Python adapter SDK (`molecule_runtime`) that runs inside containerized Molecule workspaces. Bridges Claude Code SDK / hermes / langgraph / etc. → A2A queue. +- [`Molecule-AI/molecule-sdk-python`](https://git.moleculesai.app/molecule-ai/molecule-sdk-python) — `A2AServer` + `RemoteAgentClient` for external agents that register over the public `/registry/register` flow. +- [`Molecule-AI/molecule-mcp-claude-channel`](https://git.moleculesai.app/molecule-ai/molecule-mcp-claude-channel) — Claude Code channel plugin. Bridges A2A traffic into a running Claude Code session via MCP `notifications/claude/channel`. Polling-based (no tunnel required); install with `claude --channels plugin:molecule@Molecule-AI/molecule-mcp-claude-channel`. When extending the **A2A surface** in molecule-core (`workspace-server/internal/handlers/a2a_proxy.go` etc.), consider whether the change has a downstream impact on the runtime SDK or the channel plugin — they're versioned independently but share the wire shape. diff --git a/README.md b/README.md index c054253d..d455d731 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@

- Molecule AI Icon Logo + Molecule AI

@@ -39,8 +39,8 @@ Workspace Runtime

-[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/new/template?template=https://github.com/Molecule-AI/molecule-monorepo) -[![Deploy to Render](https://render.com/images/deploy-to-render-button.svg)](https://render.com/deploy?repo=https://github.com/Molecule-AI/molecule-monorepo) +[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/new/template?template=https://git.moleculesai.app/molecule-ai/molecule-core) +[![Deploy to Render](https://render.com/images/deploy-to-render-button.svg)](https://render.com/deploy?repo=https://git.moleculesai.app/molecule-ai/molecule-core)
@@ -53,8 +53,8 @@ Molecule AI is the most powerful way to govern an AI agent organization in produ It combines the parts that are usually scattered across demos, internal glue code, and framework-specific tooling into one product: - one org-native control plane for teams, roles, hierarchy, and lifecycle -- one runtime layer that lets LangGraph, DeepAgents, Claude Code, CrewAI, AutoGen, and OpenClaw run side by side -- one memory model that keeps recall, sharing, and skill evolution aligned with organizational boundaries +- one runtime layer that lets **eight** agent runtimes — LangGraph, DeepAgents, Claude Code, CrewAI, AutoGen, **Hermes**, **Gemini CLI**, and OpenClaw — run side by side behind one workspace contract +- one memory model that keeps recall, sharing, and skill evolution aligned with organizational boundaries (Memory v2 backed by pgvector for semantic recall) - one operational surface for observing, pausing, restarting, inspecting, and improving live workspaces Most teams can build a workflow, a strong single agent, a coding agent, or a custom multi-agent graph. @@ -75,7 +75,7 @@ You do not wire collaboration paths by hand. Hierarchy defines the default commu ### 3. Runtime choice stops being a dead-end decision -LangGraph, DeepAgents, Claude Code, CrewAI, AutoGen, and OpenClaw can all plug into the same workspace abstraction. Teams can standardize governance without forcing every group onto one runtime. +LangGraph, DeepAgents, Claude Code, CrewAI, AutoGen, Hermes, Gemini CLI, and OpenClaw can all plug into the same workspace abstraction. Teams can standardize governance without forcing every group onto one runtime. ### 4. Memory is treated like infrastructure @@ -117,6 +117,8 @@ Molecule AI is not trying to replace the frameworks below. 
It is the system that | **Claude Code** | Shipping on `main` | Real coding workflows, CLI-native continuity | Secure workspace abstraction, A2A delegation, org boundaries, shared control plane | | **CrewAI** | Shipping on `main` | Role-based crews | Persistent workspace identity, policy consistency, shared canvas and registry | | **AutoGen** | Shipping on `main` | Assistant/tool orchestration | Standardized deployment, hierarchy-aware collaboration, shared ops plane | +| **Hermes 4** | Shipping on `main` | Hybrid reasoning, native tools, json_schema (NousResearch/hermes-agent) | Option B upstream hook, A2A bridge to OpenAI-compat API, multi-provider provider derivation | +| **Gemini CLI** | Shipping on `main` | Google Gemini CLI continuity | Workspace lifecycle, A2A, hierarchy-aware collaboration, shared ops plane | | **OpenClaw** | Shipping on `main` | CLI-native runtime with its own session model | Workspace lifecycle, templates, activity logs, topology-aware collaboration | | **NemoClaw** | WIP on `feat/nemoclaw-t4-docker` | NVIDIA-oriented runtime path | Planned to join the same abstraction once merged; not yet part of `main` | @@ -182,9 +184,10 @@ The result is not just “an agent that learns.” It is **an organization that ## What Ships In `main` -### Canvas +### Canvas (v4) - Next.js 15 + React Flow + Zustand +- **warm-paper theme system** — light / dark / follow-system, SSR cookie + nonce'd boot script + ThemeProvider; terminal + code surfaces stay dark unconditionally - drag-to-nest team building - empty-state deployment + onboarding wizard - template palette @@ -193,8 +196,9 @@ The result is not just “an agent that learns.” It is **an organization that ### Platform -- Go/Gin control plane -- workspace CRUD and provisioning +- Go 1.25 / Gin control plane (80+ HTTP endpoints + Gorilla WebSocket fanout) +- workspace CRUD and provisioning (pluggable Provisioner — Docker locally, EC2 + SSM in production) +- **A2A response path is a typed discriminated union (RFC #2967)** — frozen dataclasses + total parser; 100% unit + adversarial fuzz coverage - registry and heartbeats - browser-safe A2A proxy - team expansion/collapse @@ -204,10 +208,10 @@ The result is not just “an agent that learns.” It is **an organization that ### Runtime -- unified `workspace/` image -- adapter-driven execution +- unified `workspace/` image; thin AMI in production (us-east-2) +- adapter-driven execution across **8 runtimes** (Claude Code, Hermes, Gemini CLI, LangGraph, DeepAgents, CrewAI, AutoGen, OpenClaw) - Agent Card registration -- awareness-backed memory integration +- awareness-backed memory integration; **Memory v2 backed by pgvector** for semantic recall - plugin-mounted shared rules/skills - hot-reloadable local skills - coordinator-only delegation path @@ -221,6 +225,21 @@ The result is not just “an agent that learns.” It is **an organization that - runtime tiers - direct workspace inspection through terminal and files +### SaaS (via [`molecule-controlplane`](https://git.moleculesai.app/molecule-ai/molecule-controlplane)) + +- multi-tenant on AWS EC2 + Neon (per-tenant Postgres branch) + Cloudflare Tunnels (per-tenant, no public ports) +- WorkOS AuthKit + Stripe Checkout + Customer Portal +- AWS KMS envelope encryption (DB / Redis connection strings); AWS Secrets Manager for tenant bootstrap +- `tenant_resources` audit table + 30-min boot-event-aware reconciler — every CF / AWS lifecycle event recorded, claim vs live state diffed + +### Bring your own Claude Code session (via 
[`molecule-mcp-claude-channel`](https://git.moleculesai.app/molecule-ai/molecule-mcp-claude-channel)) + +- Claude Code plugin that bridges Molecule A2A traffic into a local Claude Code session via MCP +- subscribe to one or more workspaces; peer messages surface as conversation turns; replies route back through Molecule's A2A +- no tunnel, no public endpoint — the plugin self-registers each watched workspace as `delivery_mode=poll` and long-polls `/activity?since_id=…` +- multi-tenant friendly: one plugin install can watch workspaces across multiple Molecule tenants (`MOLECULE_PLATFORM_URLS` per-workspace) +- install via the standard marketplace flow: `/plugin marketplace add Molecule-AI/molecule-mcp-claude-channel` → `/plugin install molecule-channel@molecule-mcp-claude-channel` + ## Built For Teams That Need More Than A Demo Molecule AI is especially strong when you need to run: @@ -233,24 +252,30 @@ Molecule AI is especially strong when you need to run: ## Architecture ```text -Canvas (Next.js :3000) <--HTTP / WS--> Platform (Go :8080) <---> Postgres + Redis - | | - | +--> Docker provisioner / bundles / templates / secrets +Canvas (Next.js 15, warm-paper :3000) <--HTTP / WS--> Platform (Go 1.25 :8080) <---> Postgres + Redis + | | + | +--> Provisioner: Docker (local) / EC2 + SSM (prod) + | +--> bundles · templates · secrets · KMS | - +-------------------- shows --------------------> workspaces, teams, tasks, traces, events + +------------------------- shows ------------------------> workspaces, teams, tasks, traces, events -Workspace Runtime (Python image with adapters) - - LangGraph / DeepAgents / Claude Code / CrewAI / AutoGen / OpenClaw - - Agent Card + A2A server - - heartbeat + activity + awareness-backed memory +Workspace Runtime (Python ≥3.11, image with adapters) + - 8 adapters: LangGraph / DeepAgents / Claude Code / CrewAI / AutoGen / Hermes / Gemini CLI / OpenClaw + - Agent Card + A2A server (typed-SSOT response path, RFC #2967) + - heartbeat + activity + awareness-backed memory (Memory v2 — pgvector semantic recall) - skills + plugins + hot reload + +SaaS Control Plane (molecule-controlplane, private) + - per-tenant EC2 + Neon (Postgres branch) + Cloudflare Tunnel + - WorkOS · Stripe · KMS · AWS Secrets Manager + - tenant_resources audit + 30-min reconciler ``` ## Quick Start ```bash -git clone https://github.com/Molecule-AI/molecule-monorepo.git -cd molecule-monorepo +git clone https://git.moleculesai.app/molecule-ai/molecule-core.git +cd molecule-core cp .env.example .env # Defaults boot the stack locally out of the box. See .env.example for @@ -303,7 +328,11 @@ Then open `http://localhost:3000`: ## Current Scope -The current `main` branch already includes the core platform, canvas, memory model, six production adapters, skill lifecycle, and operational surfaces. Adjacent runtime work such as **NemoClaw** remains branch-level until merged, and this README keeps that distinction explicit on purpose. +The current `main` branch ships the core platform, Canvas v4 (warm-paper themed), Memory v2 (pgvector semantic recall), the typed-SSOT A2A response path (RFC #2967), **eight production adapters** (Claude Code, Hermes, Gemini CLI, LangGraph, DeepAgents, CrewAI, AutoGen, OpenClaw), skill lifecycle, and operational surfaces. 
+ +The companion private repo [`molecule-controlplane`](https://git.moleculesai.app/molecule-ai/molecule-controlplane) provides the SaaS surface — multi-tenant orchestration on EC2 + Neon + Cloudflare Tunnels, KMS envelope encryption, WorkOS auth, Stripe billing, and a `tenant_resources` audit table with a 30-min reconciler. + +Adjacent runtime work such as **NemoClaw** remains branch-level until merged, and this README keeps that distinction explicit on purpose. ## License diff --git a/README.zh-CN.md b/README.zh-CN.md index 20df5685..d85fe3b8 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -1,7 +1,7 @@

- Molecule AI 图案 Logo + Molecule AI

@@ -38,8 +38,8 @@ Workspace Runtime

-[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/new/template?template=https://github.com/Molecule-AI/molecule-core) -[![Deploy to Render](https://render.com/images/deploy-to-render-button.svg)](https://render.com/deploy?repo=https://github.com/Molecule-AI/molecule-core) +[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/new/template?template=https://git.moleculesai.app/molecule-ai/molecule-core) +[![Deploy to Render](https://render.com/images/deploy-to-render-button.svg)](https://render.com/deploy?repo=https://git.moleculesai.app/molecule-ai/molecule-core)
@@ -52,8 +52,8 @@ Molecule AI 是目前最强的 AI Agent 组织治理方案之一,用来把 age 它把过去分散在 demo、内部胶水代码和各类 framework 私有工具里的关键能力,收敛成一个产品: - 一套组织原生 control plane,管理团队、角色、层级和生命周期 -- 一套 runtime abstraction,让 LangGraph、DeepAgents、Claude Code、CrewAI、AutoGen、OpenClaw 并存运行 -- 一套与组织边界对齐的 memory 模型,把 recall、sharing 和 skill evolution 放进同一体系 +- 一套 runtime abstraction,让 **8 个** agent runtime —— LangGraph、DeepAgents、Claude Code、CrewAI、AutoGen、**Hermes**、**Gemini CLI**、OpenClaw —— 共用一套 workspace 契约 +- 一套与组织边界对齐的 memory 模型,把 recall、sharing 和 skill evolution 放进同一体系(Memory v2 由 pgvector 支撑语义召回) - 一套面向线上 workspace 的运维面,统一完成观测、暂停、重启、检查和持续改进 今天很多团队能做好 workflow、单 agent、coding agent,或者自定义 multi-agent graph 中的一种。 @@ -74,7 +74,7 @@ Molecule AI 填的就是这个空白。 ### 3. Runtime 选择不再是死路 -LangGraph、DeepAgents、Claude Code、CrewAI、AutoGen、OpenClaw 都可以挂到同一个 workspace abstraction 下。团队可以统一治理方式,而不必统一到底层 runtime。 +LangGraph、DeepAgents、Claude Code、CrewAI、AutoGen、Hermes、Gemini CLI、OpenClaw 都可以挂到同一个 workspace abstraction 下。团队可以统一治理方式,而不必统一到底层 runtime。 ### 4. Memory 被当成基础设施来做 @@ -116,6 +116,8 @@ Molecule AI 并不是要替代下面这些 framework,而是把它们纳入更 | **Claude Code** | `main` 已支持 | 真实编码工作流、CLI-native continuity | 安全 workspace 抽象、A2A delegation、组织边界、共享 control plane | | **CrewAI** | `main` 已支持 | 角色型 crew 模式清晰 | 持久 workspace 身份、统一策略、共享 Canvas 和 registry | | **AutoGen** | `main` 已支持 | assistant/tool orchestration | 统一部署、层级协作、共享运维平面 | +| **Hermes 4** | `main` 已支持 | 混合推理、原生工具调用、json_schema 输出(NousResearch/hermes-agent) | Option B 上游 hook、A2A 桥接 OpenAI 兼容 API、多 provider 自动派生 | +| **Gemini CLI** | `main` 已支持 | Google Gemini CLI 持续会话 | workspace 生命周期、A2A、层级感知协作、共享运维平面 | | **OpenClaw** | `main` 已支持 | CLI-native runtime,自有 session 模型 | workspace 生命周期、templates、activity logs、拓扑感知协作 | | **NemoClaw** | `feat/nemoclaw-t4-docker` 分支 WIP | NVIDIA 方向 runtime 路线 | 计划并入同一抽象层,但当前还不是 `main` 已合并能力 | @@ -181,9 +183,10 @@ Molecule AI 并不是要替代下面这些 framework,而是把它们纳入更 ## `main` 分支已经具备什么 -### Canvas +### Canvas(v4) - Next.js 15 + React Flow + Zustand +- **warm-paper 主题系统** —— light / dark / 跟随系统;SSR cookie + nonce'd boot 脚本 + ThemeProvider;终端与代码面板始终保持深色 - drag-to-nest 团队构建 - empty state + onboarding wizard - template palette @@ -192,8 +195,9 @@ Molecule AI 并不是要替代下面这些 framework,而是把它们纳入更 ### Platform -- Go/Gin control plane -- workspace CRUD 和 provisioning +- Go 1.25 / Gin control plane(80+ HTTP 端点 + Gorilla WebSocket fanout) +- workspace CRUD 和 provisioning(可插拔 Provisioner —— 本地 Docker、生产 EC2 + SSM) +- **A2A 响应路径已收敛为类型化的判别联合(RFC #2967)** —— 冻结 dataclass + 全量 parser;100% 单元测试 + 对抗性 fuzz 覆盖 - registry 与 heartbeat - 浏览器安全的 A2A proxy - team expansion/collapse @@ -203,10 +207,10 @@ Molecule AI 并不是要替代下面这些 framework,而是把它们纳入更 ### Runtime -- 统一 `workspace/` 镜像 -- adapter 驱动执行 +- 统一 `workspace/` 镜像;生产环境采用 thin AMI(us-east-2) +- adapter 驱动执行,覆盖 **8 个 runtime**(Claude Code、Hermes、Gemini CLI、LangGraph、DeepAgents、CrewAI、AutoGen、OpenClaw) - Agent Card 注册 -- awareness-backed memory +- awareness-backed memory;**Memory v2 由 pgvector 支撑**语义召回 - plugin 挂载共享 rules/skills - 本地 skills 热加载 - coordinator-only delegation 路径 @@ -220,6 +224,21 @@ Molecule AI 并不是要替代下面这些 framework,而是把它们纳入更 - runtime tiers - 终端与文件层面的 workspace 直接排障 +### SaaS(由 [`molecule-controlplane`](https://git.moleculesai.app/molecule-ai/molecule-controlplane) 提供) + +- 多租户运行在 AWS EC2 + Neon(每租户一个 Postgres branch)+ Cloudflare Tunnels(每租户一条隧道,对外不开任何端口) +- WorkOS AuthKit + Stripe Checkout + Customer Portal +- AWS KMS 信封加密(DB / Redis 连接串);AWS Secrets Manager 负责租户 bootstrap +- `tenant_resources` 审计表 + 30 分钟 boot-event-aware reconciler —— 每个 CF / AWS lifecycle 
事件都有记录,每 30 分钟比对 claim 与实际状态 + +### 在 Claude Code 里直接接入(由 [`molecule-mcp-claude-channel`](https://git.moleculesai.app/molecule-ai/molecule-mcp-claude-channel) 提供) + +- 把 Molecule A2A 流量桥接到本地 Claude Code 会话的 MCP 插件 +- 订阅一个或多个 workspace;peer 的消息会以 user-turn 出现,回复会经 Molecule A2A 路由出去 +- 无需公网隧道、无需公开端点 —— 插件启动时自动把每个 watched workspace 注册成 `delivery_mode=poll`,长轮询 `/activity?since_id=…` +- 多租户友好:单次安装即可同时 watch 跨多个 Molecule 租户的 workspace(`MOLECULE_PLATFORM_URLS` 按 workspace 配置) +- 通过标准 marketplace 流程安装:`/plugin marketplace add Molecule-AI/molecule-mcp-claude-channel` → `/plugin install molecule-channel@molecule-mcp-claude-channel` + ## 适合什么团队 Molecule AI 特别适合下面这些场景: @@ -232,23 +251,29 @@ Molecule AI 特别适合下面这些场景: ## 架构总览 ```text -Canvas (Next.js :3000) <--HTTP / WS--> Platform (Go :8080) <---> Postgres + Redis - | | - | +--> Docker provisioner / bundles / templates / secrets +Canvas (Next.js 15, warm-paper :3000) <--HTTP / WS--> Platform (Go 1.25 :8080) <---> Postgres + Redis + | | + | +--> Provisioner: Docker (本地) / EC2 + SSM (生产) + | +--> bundles · templates · secrets · KMS | - +-------------------- 展示 --------------------> workspaces, teams, tasks, traces, events + +------------------------- 展示 ------------------------> workspaces, teams, tasks, traces, events -Workspace Runtime (Python image with adapters) - - LangGraph / DeepAgents / Claude Code / CrewAI / AutoGen / OpenClaw - - Agent Card + A2A server - - heartbeat + activity + awareness-backed memory +Workspace Runtime (Python ≥3.11,含 adapter 集合的镜像) + - 8 个 adapter: LangGraph / DeepAgents / Claude Code / CrewAI / AutoGen / Hermes / Gemini CLI / OpenClaw + - Agent Card + A2A server(typed-SSOT 响应路径,RFC #2967) + - heartbeat + activity + awareness-backed memory(Memory v2 —— pgvector 语义召回) - skills + plugins + hot reload + +SaaS Control Plane (molecule-controlplane,私有) + - 每租户 EC2 + Neon (Postgres branch) + Cloudflare Tunnel + - WorkOS · Stripe · KMS · AWS Secrets Manager + - tenant_resources 审计 + 30 分钟 reconciler ``` ## 快速开始 ```bash -git clone https://github.com/Molecule-AI/molecule-core.git +git clone https://git.moleculesai.app/molecule-ai/molecule-core.git cd molecule-core cp .env.example .env @@ -296,7 +321,11 @@ npm run dev ## 当前范围说明 -当前 `main` 已经包含核心平台、Canvas、memory model、6 个正式 adapter、skill lifecycle 和主要运维面。像 **NemoClaw** 这样的相邻 runtime 路线仍然属于分支级工作,只有合并后才会进入正式支持列表,这里会明确区分。 +当前 `main` 已经包含核心平台、Canvas v4(warm-paper 主题)、Memory v2(pgvector 语义召回)、typed-SSOT A2A 响应路径(RFC #2967)、**8 个正式 adapter**(Claude Code、Hermes、Gemini CLI、LangGraph、DeepAgents、CrewAI、AutoGen、OpenClaw)、skill lifecycle,以及主要运维面。 + +配套的私有仓库 [`molecule-controlplane`](https://git.moleculesai.app/molecule-ai/molecule-controlplane) 提供 SaaS 层 —— 多租户编排(EC2 + Neon + Cloudflare Tunnels)、KMS 信封加密、WorkOS 鉴权、Stripe 计费,以及 `tenant_resources` 审计表加 30 分钟 reconciler。 + +像 **NemoClaw** 这样的相邻 runtime 路线仍然属于分支级工作,只有合并后才会进入正式支持列表,这里会明确区分。 ## License diff --git a/canvas/src/app/pricing/page.tsx b/canvas/src/app/pricing/page.tsx index 3ef6f319..73748770 100644 --- a/canvas/src/app/pricing/page.tsx +++ b/canvas/src/app/pricing/page.tsx @@ -41,7 +41,7 @@ export default function PricingPage() {

We publish the{" "} full source on GitHub diff --git a/canvas/src/components/tabs/ChatTab.tsx b/canvas/src/components/tabs/ChatTab.tsx index f343b63c..21e9f665 100644 --- a/canvas/src/components/tabs/ChatTab.tsx +++ b/canvas/src/components/tabs/ChatTab.tsx @@ -13,7 +13,6 @@ import { AttachmentPreview } from "./chat/AttachmentPreview"; import { extractFilesFromTask } from "./chat/message-parser"; import { AgentCommsPanel } from "./chat/AgentCommsPanel"; import { appendActivityLine } from "./chat/activityLog"; -import { activityRowToMessages, type ActivityRowForHydration } from "./chat/historyHydration"; import { runtimeDisplayName } from "@/lib/runtime-names"; import { ConfirmDialog } from "@/components/ConfirmDialog"; @@ -50,38 +49,12 @@ interface A2AResponse { }; } -/** Detect activity-log rows that the workspace's own runtime fired - * against itself but were misclassified as canvas-source. The proper - * fix is the X-Workspace-ID header from `self_source_headers()` in - * workspace/platform_auth.py, which makes the platform record - * source_id = workspace_id. But three failure modes still leak a - * self-message into "My Chat": - * - * 1. Historical rows already in the DB with source_id=NULL. - * 2. Workspace containers running pre-fix heartbeat.py / main.py - * (the fix only takes effect after an image rebuild + redeploy). - * 3. Future internal triggers added without the helper. - * - * This client-side filter recognises the heartbeat trigger by its - * exact prefix — the heartbeat assembles - * - * "Delegation results are ready. Review them and take appropriate - * action:\n" + summary_lines + report_instruction - * - * in workspace/heartbeat.py. The prefix is template-fixed so a - * string match is reliable. If the heartbeat copy ever changes, - * update this constant in the same commit. - * - * This is a backstop, not the primary defence — the X-Workspace-ID - * header is. Filtering content is fragile to copy edits, so keep - * the list narrow. */ -const INTERNAL_SELF_MESSAGE_PREFIXES = [ - "Delegation results are ready. Review them and take appropriate action", -]; - -function isInternalSelfMessage(text: string): boolean { - return INTERNAL_SELF_MESSAGE_PREFIXES.some((p) => text.startsWith(p)); -} +// Internal-self-message filtering moved server-side in RFC #2945 +// PR-C/D — the platform's /chat-history endpoint applies the +// IsInternalSelfMessage predicate before returning rows, so the +// client no longer needs the local backstop on the history path. +// The proper fix is still X-Workspace-ID header (source_id=workspace_id); +// the platform-side prefix filter handles the residual cases. // extractReplyText pulls the agent's text reply out of an A2A response. // Concatenates ALL text parts (joined with "\n") rather than returning @@ -134,8 +107,19 @@ const INITIAL_HISTORY_LIMIT = 10; const OLDER_HISTORY_BATCH = 20; /** - * Load chat history from the activity_logs database via the platform API. - * Uses source=canvas to only get user-initiated messages (not agent-to-agent). + * Load chat history from the platform's typed /chat-history endpoint. + * + * Server-side rendering of activity_logs rows into ChatMessage shape + * lives in workspace-server/internal/messagestore/postgres_store.go + * (RFC #2945 PR-C/D). The server already applies the canvas-source + * filter, the internal-self-message predicate, the role decision + * (status=error vs agent-error prefix → system), and the v0/v1 + * file-shape extraction. Canvas just renders what it receives. 
+ * + * Wire shape (mirrors ChatMessage exactly, no per-row mapping needed): + * + * GET /workspaces/:id/chat-history?limit=N&before_ts=T + * 200 → {"messages": ChatMessage[], "reached_end": boolean} * * Pagination: * - Pass `limit` to bound the page size (newest-first from server). @@ -143,10 +127,10 @@ const OLDER_HISTORY_BATCH = 20; * timestamp. Combined with limit, this yields the next-older page * when scrolling backward through history. * - * `reachedEnd` is true when the server returned fewer rows than asked - * for — caller uses this to disable further older-batch fetches. - * (Counts row-level returns, not chat-bubble count: each row may - * produce 1-2 bubbles.) + * `reachedEnd` is propagated from the server. The server computes it + * by comparing rowCount vs limit so a partial last page is correctly + * detected even when the row→bubble fan-out is non-1:1 (each row + * produces 1-2 bubbles). */ async function loadMessagesFromDB( workspaceId: string, @@ -154,25 +138,23 @@ async function loadMessagesFromDB( beforeTs?: string, ): Promise<{ messages: ChatMessage[]; error: string | null; reachedEnd: boolean }> { try { - const params = new URLSearchParams({ - type: "a2a_receive", - source: "canvas", - limit: String(limit), - }); + const params = new URLSearchParams({ limit: String(limit) }); if (beforeTs) params.set("before_ts", beforeTs); - const activities = await api.get( - `/workspaces/${workspaceId}/activity?${params.toString()}`, + const resp = await api.get<{ messages: ChatMessage[]; reached_end: boolean }>( + `/workspaces/${workspaceId}/chat-history?${params.toString()}`, ); - const messages: ChatMessage[] = []; - // Activities are newest-first, reverse for chronological order. - // Per-row mapping lives in chat/historyHydration.ts so it can be - // unit-tested without spinning up the full ChatTab component - // (regression cover for the timestamp-collapse bug). - for (const a of [...activities].reverse()) { - messages.push(...activityRowToMessages(a, isInternalSelfMessage)); - } - return { messages, error: null, reachedEnd: activities.length < limit }; + // Server emits oldest-first within the page (RFC #2945 PR-C-2 + // post-fix: server reverses row-aware before returning so the + // wire is display-ready). Canvas appends/prepends without + // reordering — this avoids the pair-flip bug a naive flat + // reverse causes when each row produces a (user, agent) pair + // with the same timestamp. + return { + messages: resp.messages ?? [], + error: null, + reachedEnd: resp.reached_end, + }; } catch (err) { return { messages: [], diff --git a/canvas/src/components/tabs/ConfigTab.tsx b/canvas/src/components/tabs/ConfigTab.tsx index 2250f3f1..ab229632 100644 --- a/canvas/src/components/tabs/ConfigTab.tsx +++ b/canvas/src/components/tabs/ConfigTab.tsx @@ -21,20 +21,39 @@ interface Props { // --- Agent Card Section --- function AgentCardSection({ workspaceId }: { workspaceId: string }) { - const [card, setCard] = useState | null>(null); - const [loading, setLoading] = useState(true); + // Initial card value comes from the canvas store — node.data.agentCard + // is hydrated by the platform stream when the workspace appears in the + // graph, so reading it here avoids a duplicate `GET /workspaces/${id}` + // (the parent ConfigTab.loadConfig already fetches workspace metadata, + // and refetching here adds a serialised RTT to the panel-open path — + // contributed to the ~20s detail-panel load reported in core#11). 
+ // Local state still tracks the edited/saved value so the editor flow + // is unchanged. + const storeCard = useCanvasStore((s) => { + // Defensive against test mocks that omit `nodes` (some test files + // stub the store with a minimal shape). In production `nodes` is + // always an array — empty or not — so the optional chaining only + // matters for the test path. + const node = s.nodes?.find?.((n) => n.id === workspaceId); + return (node?.data.agentCard as + | Record + | null + | undefined) ?? null; + }); + const [card, setCard] = useState | null>(storeCard); const [editing, setEditing] = useState(false); const [draft, setDraft] = useState(""); const [saving, setSaving] = useState(false); const [error, setError] = useState(null); const [success, setSuccess] = useState(false); + // If the store updates while this section is mounted (another tab + // pushed an update via the platform event stream), reflect that — + // unless the user is mid-edit, in which case we don't clobber their + // unsaved draft. useEffect(() => { - api.get>(`/workspaces/${workspaceId}`) - .then((ws) => setCard((ws.agent_card as Record) || null)) - .catch(() => {}) - .finally(() => setLoading(false)); - }, [workspaceId]); + if (!editing) setCard(storeCard); + }, [storeCard, editing]); const handleSave = async () => { setError(null); @@ -53,9 +72,7 @@ function AgentCardSection({ workspaceId }: { workspaceId: string }) { return (

- {loading ? ( -
Loading...
- ) : editing ? ( + {editing ? (