diff --git a/.gitea/workflows/publish-workspace-server-image.yml b/.gitea/workflows/publish-workspace-server-image.yml index e9ca5ec2..08a65d14 100644 --- a/.gitea/workflows/publish-workspace-server-image.yml +++ b/.gitea/workflows/publish-workspace-server-image.yml @@ -32,11 +32,9 @@ on: - '.gitea/workflows/publish-workspace-server-image.yml' workflow_dispatch: -# Serialize per-branch so two rapid staging pushes don't race the same -# :staging-latest tag retag. Allow staging and main to run in parallel -# (different GITHUB_REF → different concurrency group) since they -# produce different :staging- tags and last-write-wins on -# :staging-latest is acceptable across branches. +# Serialize per-branch so two rapid main pushes don't race the same +# :staging-latest tag retag. Allow parallel runs across refs, as they produce +# different :staging-<sha> tags and last-write-wins on :staging-latest is acceptable. # # cancel-in-progress: false → in-flight builds finish; the next push's # build queues. This avoids a partially-pushed image. diff --git a/.staging-trigger b/.staging-trigger new file mode 100644 index 00000000..270a6560 --- /dev/null +++ b/.staging-trigger @@ -0,0 +1 @@ +staging trigger \ No newline at end of file diff --git a/docs/architecture/staging-environment.md b/docs/architecture/staging-environment.md index 79cbb384..d7182ceb 100644 --- a/docs/architecture/staging-environment.md +++ b/docs/architecture/staging-environment.md @@ -1,7 +1,7 @@ # Staging Environment Design -> **Status:** Planned — gates all future infra changes (Tunnel migration, -> security fixes, etc.) +> **Status:** In Progress — CI image pipeline documented below; remaining +> components (Railway, Neon, Vercel staging) tracked separately. > > **Problem:** We merge directly to main and auto-deploy to production. 
> Today's session broke CI twice and caused hours of Cloudflare edge cache @@ -51,6 +51,24 @@ Developer pushes to PR branch → Promote to PRODUCTION (manual trigger or approval) ``` +## CI Image Pipeline + +The CI image pipeline is live. On every push to `main`, the Gitea Actions workflow +[`.gitea/workflows/publish-workspace-server-image.yml`](https://git.moleculesai.app/molecule-ai/molecule-core/blob/main/.gitea/workflows/publish-workspace-server-image.yml) +builds two ECR images and pushes each one under two tags: + +| Tag | Meaning | +|-----|---------| +| `:staging-<sha>` | Per-commit tag. Stable — canary verify runs against this tag. | +| `:staging-latest` | Tracks the most recent `main` build. | + +Images are pushed to: +- `153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform` (workspace-server) +- `153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant` (Go + Next.js in one image) + +The `:staging-latest` tag advances automatically on every `main` push; `:staging-<sha>` is immutable. +Canary verification jobs reference `:staging-<sha>` so they pin to the commit they are testing. + ## Components ### 1. Railway: two environments diff --git a/docs/design-system/canvas-design-system-v1.md b/docs/design-system/canvas-design-system-v1.md index d8fbe7e9..659ec9b1 100644 --- a/docs/design-system/canvas-design-system-v1.md +++ b/docs/design-system/canvas-design-system-v1.md @@ -303,6 +303,7 @@ type ResolvedTheme = "light" | "dark"; ### 5.1 Focus Management ✅ VERIFIED - All interactive elements have `focus-visible:ring-2 focus-visible:ring-blue-500 focus-visible:ring-offset-2 focus-visible:ring-offset-zinc-950` +- **WCAG 2.4.7 Focus Visible:** The canvas uses `focus-visible` (not `:focus`) so the ring only appears for keyboard users, not mouse/touch. Mouse users see hover states but no permanent ring. Keyboard focus always shows a 2px blue ring offset from the element — visible against all canvas backgrounds (dark zinc-950 + dark zinc-900 surfaces). 
- No `outline-none` without equivalent focus ring - Radix Dialog traps focus automatically diff --git a/docs/guides/remote-workspaces-faq.md b/docs/guides/remote-workspaces-faq.md index 738001ee..ae3f0395 100644 --- a/docs/guides/remote-workspaces-faq.md +++ b/docs/guides/remote-workspaces-faq.md @@ -51,10 +51,42 @@ Yes. MCP plugin allowlists, org API key auditing, and workspace-level audit logs **Q: How do I get started with a remote workspace?** -1. Install the agent: `curl -sSL https://get.moleculesai.app | bash` -2. Authenticate: `molecule login --org your-org` -3. Bootstrap: `molecule workspace init --name my-agent --runtime remote` -4. The workspace registers with the platform and appears in Canvas within ~10 seconds. +1. Install the SDK: `pip install molecule-ai-sdk` +2. Create an external workspace on the platform (admin step — requires `ADMIN_TOKEN` or org API key): + +```bash +export PLATFORM_URL="https://acme.moleculesai.app" # your platform URL +ADMIN_TOKEN="your-admin-token" + +WORKSPACE=$(curl -s -X POST "${PLATFORM_URL}/workspaces" \ + -H "Authorization: Bearer ${ADMIN_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{"name":"my-agent","runtime":"external","external":true}') + +export WORKSPACE_ID=$(echo "$WORKSPACE" | jq -r '.id') +echo "Workspace ID: ${WORKSPACE_ID}" +``` + +3. Register the agent and start the heartbeat loop (run it from the same shell, so the exported `WORKSPACE_ID` and `PLATFORM_URL` are visible to Python): + +```python +from molecule_agent import RemoteAgentClient +import os + +client = RemoteAgentClient( + workspace_id = os.environ["WORKSPACE_ID"], + platform_url = os.environ["PLATFORM_URL"], + agent_card = {"name": "my-agent", "skills": []}, +) +client.register() # fetch + cache auth token +client.run_heartbeat_loop( + task_supplier = lambda: {"current_task": "idle", "active_tasks": 0} +) +``` + +4. The workspace registers with the platform and appears in Canvas within ~10 seconds with a **purple REMOTE badge**. 
+ +For a complete walkthrough with secrets management and A2A messaging, see the [Remote Workspaces quick-start guide](../guides/remote-workspaces.md) and the [External Agent Registration guide](../guides/external-agent-registration.md). **Q: Can I use my existing SSH keys and git config with a remote workspace?** @@ -62,7 +94,7 @@ Yes. The remote runtime does not virtualize or override your shell environment. **Q: How do I update the remote agent when a new version ships?** -`molecule update` — pulls the latest agent binary from the platform, does a rolling restart. Zero downtime if the agent reconnects within the heartbeat window. +Stop the current agent process, install the new version (`pip install --upgrade molecule-ai-sdk` for the SDK, or pull the latest agent binary from your deployment mechanism), and restart. The agent re-registers on startup and Canvas picks it up within one heartbeat cycle (~30s). **Q: What's the latency like for A2A coordination between a remote workspace and a container workspace?** @@ -94,10 +126,20 @@ Same as a container workspace — up to 5 concurrent delegated tasks. Remote run **Q: Remote workspace shows offline in Canvas but the process is running on my machine.** -1. Check the agent log: `molecule logs --workspace my-agent` -2. Confirm the machine has outbound internet access: `curl -s https://[your-org].moleculesai.app/health` -3. Check token validity: `molecule auth status` — re-authenticate if expired -4. Restart the agent: `molecule restart --workspace my-agent` +1. Confirm the machine has outbound internet access: + ```bash + curl -s https://platform.moleculesai.app/health + ``` +2. Check the agent log output (however your agent writes logs — print statements, a log file, etc.) +3. 
Verify the agent is still registered: + ```bash + curl -s -X POST "https://platform.moleculesai.app/registry/heartbeat" \ + -H "Authorization: Bearer ${AUTH_TOKEN}" \ + -H "Content-Type: application/json" \ + -d "{\"workspace_id\": \"${WORKSPACE_ID}\"}" + ``` + A `200` response means the heartbeat is reaching the platform; a `401` means the auth token is invalid. +4. Restart the agent: stop the current process and re-run the registration + heartbeat loop above. **Q: A2A messages to my remote workspace are timing out.** diff --git a/manifest.json b/manifest.json index 2ac2f462..bde3a1d9 100644 --- a/manifest.json +++ b/manifest.json @@ -44,3 +44,4 @@ {"name": "mock-bigorg", "repo": "molecule-ai/molecule-ai-org-template-mock-bigorg", "ref": "main"} ] } +// Triggered by Integration Tester at 2026-05-10T08:52Z diff --git a/scripts/clone-manifest.sh b/scripts/clone-manifest.sh index 4e9e5d99..d6e343c8 100755 --- a/scripts/clone-manifest.sh +++ b/scripts/clone-manifest.sh @@ -37,6 +37,50 @@ PLUGINS_DIR="${4:?Missing plugins dir}" EXPECTED=0 CLONED=0 +# clone_one_with_retry — clone a single repo, retrying on transient failure. +# +# Why: the publish-workspace-server-image (and harness-replays) CI jobs +# clone the full manifest (~36 repos) serially on a memory-constrained +# Gitea Actions runner. Under host memory pressure the OOM killer +# occasionally SIGKILLs git-remote-https mid-clone: +# +# error: git-remote-https died of signal 9 +# fatal: the remote end hung up unexpectedly +# +# (observed in publish-workspace-server-image run 4622 on 2026-05-10 — the +# job died on the 14th of 36 clones, which wedged staging→main). One +# transient SIGKILL / network blip would otherwise fail the whole tenant +# image rebuild. Retrying after a short backoff lets the pressure subside. +# The durable fix is more runner RAM/swap (tracked with Infra-SRE); this +# just stops a single flake from being release-blocking. 
+# +# Args: target_dir name clone_url display_url ref (returns 0 on success, 1 once every attempt has failed) +clone_one_with_retry() { + local tdir="$1" name="$2" url="$3" display="$4" ref="$5" + local attempt=1 max_attempts=3 backoff + + while : ; do + # A killed attempt can leave a partial directory behind; git clone + # refuses a non-empty target, so wipe it before each try (:? aborts instead of wiping a parent dir if either part is empty). + rm -rf "${tdir:?}/${name:?}" + + if [ "$ref" = "main" ]; then + if git clone --depth=1 -q "$url" "$tdir/$name"; then return 0; fi + else + if git clone --depth=1 -q --branch "$ref" "$url" "$tdir/$name"; then return 0; fi + fi + + if [ "$attempt" -ge "$max_attempts" ]; then + echo "::error::clone failed after ${max_attempts} attempts: ${display}" >&2 + return 1 + fi + backoff=$((attempt * 3)) # 3s, then 6s + echo " ⚠ clone attempt ${attempt}/${max_attempts} failed for ${display} — retrying in ${backoff}s" >&2 + sleep "$backoff" + attempt=$((attempt + 1)) + done +} + clone_category() { local category="$1" local target_dir="$2" @@ -82,11 +126,7 @@ clone_category() { fi echo " cloning $display_url -> $target_dir/$name (ref=$ref)" - if [ "$ref" = "main" ]; then - git clone --depth=1 -q "$clone_url" "$target_dir/$name" - else - git clone --depth=1 -q --branch "$ref" "$clone_url" "$target_dir/$name" - fi + clone_one_with_retry "$target_dir" "$name" "$clone_url" "$display_url" "$ref" CLONED=$((CLONED + 1)) i=$((i + 1)) done diff --git a/workspace/builtin_tools/a2a_tools.py b/workspace/builtin_tools/a2a_tools.py index acdd15cb..48b813a1 100644 --- a/workspace/builtin_tools/a2a_tools.py +++ b/workspace/builtin_tools/a2a_tools.py @@ -77,6 +77,16 @@ async def delegate_task(workspace_id: str, task: str) -> str: return str(result) if isinstance(result, str) else "(no text)" elif "error" in data: err = data["error"] + # Handle both string-form errors ("error": "some string") + # and object-form errors ("error": {"message": "...", "code": ...}). 
+ msg = "" + if isinstance(err, dict): + msg = err.get("message", "") + elif isinstance(err, str): + msg = err + else: + msg = str(err) + return f"Error: {msg}" msg = "" if isinstance(err, dict): msg = err.get("message", "")