merge: resolve scheduler conflicts with main (#85 panic-recover + supervised heartbeat)

This commit is contained in:
Hongming Wang 2026-04-15 00:12:29 -07:00
commit ba375e8551
10 changed files with 749 additions and 19 deletions

View File

@ -247,6 +247,9 @@ point for "what else is out there."
- **GitHub issue #15** — Provisioner: auto-refresh `CLAUDE_CODE_OAUTH_TOKEN` from `global_secrets` on workspace restart → **DONE** via PR #64 (`SetGlobal` / `DeleteGlobal` now fan out `RestartByID` to every affected workspace).
- **GitHub issue #19 Layer 1** — Platform-generated restart context → **DONE** via PR #65 (synthetic A2A `message/send` with `metadata.kind=restart_context`, `system:restart-context` caller prefix, 30s re-register wait). Layer 2 deferred to issue #66 (see Backlog item 15 above).
### Recently launched (2026-04-15 tick-9)
- **Phase 32 Phase B.2 (image pipeline)** — PR #80 (merged `c3cc8e87`) adds `.github/workflows/publish-platform-image.yml`: on every main-merge touching `platform/**`, builds `platform/Dockerfile` and pushes `ghcr.io/molecule-ai/platform:latest` + `:sha-<commit>` to GHCR. Paired with the private `molecule-controlplane` Fly + Neon provisioner (PR #3 there, merged `2e85d5ad`) that reads `TENANT_IMAGE` env and boots tenant Fly Machines from this image. Tick-8 docs-sync PR #79 (merged `d53a1287`) also landed.
### Recently launched (2026-04-14 tick-8)
- **Phase 32 PR #1**`TenantGuard` middleware (PR #78, merged `57a05686`). Public repo's only SaaS hook: when `MOLECULE_ORG_ID` env is set, non-allowlisted requests require matching `X-Molecule-Org-Id` header or 404. Unset → passthrough (self-hosted unchanged). Allowlist is exact-match: `/health` + `/metrics`. Paired with the private `Molecule-AI/molecule-controlplane` repo scaffolded this tick (Fly Machines provisioner stub, `/cp/orgs` CRUD, subdomain→fly-replay router, migrations 001-003 for `organizations`/`org_instances`/`org_members`). +6 `TestTenantGuard_*` tests. Phase 32 plan: follow-up PRs wire real Fly provisioner, WorkOS AuthKit, Stripe, Cloudflare, signup UX — all in the private repo except the single public middleware.

View File

@ -1,5 +1,6 @@
import type { Metadata } from "next";
import "./globals.css";
import { AuthGate } from "@/components/AuthGate";
export const metadata: Metadata = {
title: "Molecule AI",
@ -13,7 +14,13 @@ export default function RootLayout({
}) {
return (
<html lang="en">
<body className="bg-zinc-950 text-white">{children}</body>
<body className="bg-zinc-950 text-white">
{/* AuthGate is a client component; it checks the session on mount
and bounces anonymous users to the control plane's login page
when running on a tenant subdomain. Non-SaaS hosts (localhost,
vercel preview URL, apex) pass through unchanged. */}
<AuthGate>{children}</AuthGate>
</body>
</html>
);
}

View File

@ -0,0 +1,68 @@
"use client";
/**
* AuthGate wraps the canvas root so every page is gated on a valid session.
* Anonymous users get bounced to app.moleculesai.app/cp/auth/login?return_to=<here>.
*
* In non-SaaS mode (no tenant slug: local dev, apex, Vercel preview URLs),
* the gate is a pass-through: canvas works without auth for local dev.
* This mirrors the control plane's "disabled provider" fallback.
*/
import { useEffect, useState, type ReactNode } from "react";
import { fetchSession, redirectToLogin, type Session } from "@/lib/auth";
import { getTenantSlug } from "@/lib/tenant";
// Discriminated union of the gate's render states:
// - "loading": the session probe is in flight; the gate renders nothing.
// - "anonymous": no session. `skipRedirect` is true only in non-SaaS mode
//   (no tenant slug), where children render un-gated; false means the
//   redirect-to-login effect fires and nothing is rendered meanwhile.
// - "authenticated": /cp/auth/me returned a session; children render.
export type AuthGateState =
| { kind: "loading" }
| { kind: "anonymous"; skipRedirect: boolean }
| { kind: "authenticated"; session: Session };
/**
 * Gate component: probes the control-plane session on mount and either
 * renders children (authenticated, or non-SaaS pass-through) or bounces
 * the browser to the login page. Renders nothing while the probe or the
 * redirect is in flight, so unauthenticated content never flashes.
 */
export function AuthGate({ children }: { children: ReactNode }) {
  const [gate, setGate] = useState<AuthGateState>({ kind: "loading" });

  useEffect(() => {
    // Non-SaaS hosts (local dev, Vercel previews, the apex domain) carry no
    // tenant slug — skip the gate entirely so canvas works without auth.
    if (!getTenantSlug()) {
      setGate({ kind: "anonymous", skipRedirect: true });
      return;
    }
    let disposed = false;
    const probe = async () => {
      try {
        const session = await fetchSession();
        if (disposed) return;
        setGate(
          session
            ? { kind: "authenticated", session }
            : { kind: "anonymous", skipRedirect: false },
        );
      } catch {
        // Network error — fail closed (show signin) so a transient
        // outage doesn't leak the canvas UI to an unauth'd user.
        if (!disposed) setGate({ kind: "anonymous", skipRedirect: false });
      }
    };
    void probe();
    return () => {
      disposed = true;
    };
  }, []);

  useEffect(() => {
    if (gate.kind === "anonymous" && !gate.skipRedirect) {
      redirectToLogin("sign-in");
    }
  }, [gate]);

  const passThrough = gate.kind === "anonymous" && gate.skipRedirect;
  if (gate.kind !== "authenticated" && !passThrough) {
    // Either still loading (canvas has its own loading UI downstream) or the
    // redirect effect above is firing — render nothing in the interim to
    // avoid a flash of unauthenticated content.
    return null;
  }
  return <>{children}</>;
}

View File

@ -0,0 +1,69 @@
/**
* @vitest-environment jsdom
*/
import { describe, it, expect, vi, afterEach } from "vitest";
import { fetchSession, redirectToLogin } from "../auth";
// Reset the stubbed global `fetch` and any spies after every test so one
// test's mock can never leak into the next.
afterEach(() => {
vi.unstubAllGlobals();
vi.restoreAllMocks();
});
describe("fetchSession", () => {
  it("returns session on 200", async () => {
    const payload = { user_id: "u1", org_id: "o1", email: "a@x.com" };
    vi.stubGlobal(
      "fetch",
      vi.fn(async () => ({ ok: true, status: 200, json: async () => payload })),
    );
    await expect(fetchSession()).resolves.toEqual(payload);
  });

  it("returns null on 401 without throwing", async () => {
    vi.stubGlobal("fetch", vi.fn(async () => ({ ok: false, status: 401 })));
    await expect(fetchSession()).resolves.toBeNull();
  });

  it("throws on 500 so transient outages aren't treated as 'anonymous'", async () => {
    vi.stubGlobal(
      "fetch",
      vi.fn(async () => ({ ok: false, status: 500, statusText: "oops" })),
    );
    await expect(fetchSession()).rejects.toThrow("500");
  });

  it("sends credentials:include for cross-origin cookies", async () => {
    const stubbedFetch = vi.fn(async () => ({ ok: false, status: 401 }));
    vi.stubGlobal("fetch", stubbedFetch);
    await fetchSession();
    // Inspect the recorded call rather than re-matching the whole arg list.
    const [url, init] = stubbedFetch.mock.calls[0];
    expect(url).toContain("/cp/auth/me");
    expect(init).toMatchObject({ credentials: "include" });
  });
});
describe("redirectToLogin", () => {
  it("sets window.location to cp login URL with return_to", () => {
    const startingHref = "https://acme.moleculesai.app/dashboard";
    Object.defineProperty(window, "location", {
      writable: true,
      value: { href: startingHref },
    });
    redirectToLogin("sign-in");
    // href now holds the redirect target; the original URL must appear
    // percent-encoded in the query string.
    const landed = (window.location as unknown as { href: string }).href;
    expect(landed).toContain("/cp/auth/login");
    expect(landed).toContain(encodeURIComponent(startingHref));
  });

  it("uses signup path for sign-up screenHint", () => {
    Object.defineProperty(window, "location", {
      writable: true,
      value: { href: "https://acme.moleculesai.app/" },
    });
    redirectToLogin("sign-up");
    const landed = (window.location as unknown as { href: string }).href;
    expect(landed).toContain("/cp/auth/signup");
  });
});

51
canvas/src/lib/auth.ts Normal file
View File

@ -0,0 +1,51 @@
/**
* Canvas-side session detection. Calls /cp/auth/me on the control plane
* (via same-origin PLATFORM_URL) and returns the session or null.
*
* 401 is the "anonymous" signal and does NOT throw — the caller decides
* whether to redirect. Network errors do throw so React error boundaries
* can surface them.
*/
import { PLATFORM_URL } from "./api";
// Shape of the JSON payload returned by GET /cp/auth/me on a 200.
// Field names mirror the control plane's response verbatim (snake_case).
export interface Session {
user_id: string;
org_id: string;
email: string;
}
// Base path prefix for auth endpoints on the control plane; joined with
// PLATFORM_URL to build absolute URLs (e.g. `${PLATFORM_URL}/cp/auth/me`).
const AUTH_BASE = "/cp/auth";
/**
 * Probe the control plane's /cp/auth/me endpoint for the current session.
 *
 * The session cookie rides along via `credentials: "include"` (mandatory
 * for cross-origin requests). Resolution contract:
 *   - 200 → the parsed Session
 *   - 401 → null (the "anonymous" signal; the caller decides whether to
 *     redirect)
 *   - anything else → throws, so callers never mistake a 5xx for
 *     "not logged in"
 */
export async function fetchSession(): Promise<Session | null> {
  const response = await fetch(`${PLATFORM_URL}${AUTH_BASE}/me`, {
    credentials: "include",
  });
  if (response.status === 401) {
    return null;
  }
  if (!response.ok) {
    throw new Error(`/cp/auth/me: ${response.status} ${response.statusText}`);
  }
  return response.json();
}
/**
 * Bounce the browser to the control plane's login (or signup) page.
 *
 * A `return_to` query param carries the full current URL — including query
 * and hash — so the user lands back exactly where they were once auth
 * completes. Same-origin safety is enforced on the CP side (isSafeReturnTo
 * rejects cross-domain / http / protocol-relative URLs). No-op during SSR,
 * where `window` does not exist.
 */
export function redirectToLogin(screenHint: "sign-up" | "sign-in" = "sign-in"): void {
  if (typeof window === "undefined") {
    return;
  }
  const endpoint = screenHint === "sign-up" ? "signup" : "login";
  const currentUrl = window.location.href;
  window.location.href = `${PLATFORM_URL}${AUTH_BASE}/${endpoint}?return_to=${encodeURIComponent(currentUrl)}`;
}

View File

@ -0,0 +1,37 @@
# Edit history — 2026-04-15
## tick-9: Phase 32 Phase B.2 image pipeline (PR #80) + tick-8 docs sync (PR #79)
Two merges:
### PR #79 — `docs: sync documentation with 2026-04-14 tick-8 merge (#78)`
Merge commit `d53a1287`. Tick-8 docs sync for the TenantGuard middleware.
Pure docs; CLAUDE.md test count + PLAN.md tick-8 block + edit-history entry.
### PR #80 — `feat(ci): publish-platform-image → ghcr.io/molecule-ai/platform (Phase B.2)`
Merge commit `c3cc8e87`. Noteworthy: ci-infra.
Adds `.github/workflows/publish-platform-image.yml`:
- Trigger: push to main touching `platform/**`; also `workflow_dispatch`.
- Builds `platform/Dockerfile` via `docker/build-push-action@v5`.
- Pushes two tags per run: `ghcr.io/molecule-ai/platform:latest` (floating)
and `:sha-<short-commit>` (immutable, pin-friendly).
- GHA cache via `cache-from/cache-to: type=gha` for warm rebuilds.
- Permissions: `contents:read` + `packages:write`; authenticates to GHCR
using the built-in `GITHUB_TOKEN`, no extra secrets.
- OCI labels propagate source URL + commit SHA for provenance.
Purpose: pairs with the private `molecule-controlplane` Fly + Neon
provisioner (PR #3 there, merged `2e85d5ad`) which reads
`TENANT_IMAGE=ghcr.io/molecule-ai/platform:<tag>` from env and spawns
each tenant Fly Machine from this image.
### Deployment state (informational — not in any repo)
- Fly apps (`molecule-cp`, `molecule-tenant`): **pending CEO** (`flyctl apps create`).
- Fly billing card: **pending CEO**.
- First real tenant provision: **blocked** on the two above.
### File deltas (public repo)
- `.github/workflows/publish-platform-image.yml` — new.
- `CLAUDE.md` — tick-9 block for the new CI workflow.
- `PLAN.md` — new "Recently launched (2026-04-15 tick-9)" entry.

View File

@ -51,6 +51,7 @@ defaults:
infra: [DevOps Engineer]
qa: [QA Engineer]
performance: [Backend Engineer]
docs: [Documentation Specialist]
mixed: [Dev Lead]
# workspace_dir: not set by default — each agent gets an isolated Docker volume
@ -127,19 +128,73 @@ workspaces:
4. Read /workspace/repo/docs/product/overview.md to understand the product
5. Use commit_memory to save key product facts for later recall
6. Wait for tasks from PM.
schedules:
- name: Hourly ecosystem watch
cron_expr: "8 * * * *"
prompt: |
Daily survey for new agent-infra / AI-agent projects worth tracking.
1. Pull docs/ecosystem-watch.md to know what's already tracked.
2. Browse the web for last 24h:
- github.com/trending?since=daily&language=python (and typescript, go)
- HN front page, anything about agent frameworks
- Twitter/X mentions of new agent SDKs, MCP servers, frameworks
3. Cross-reference: skip anything already in ecosystem-watch.md.
4. For each genuinely new + relevant project (1-3 max per day):
- Add an entry under "## Entries" using the existing template
(Pitch / Shape / Overlap / Differentiation / Worth borrowing /
Terminology collisions / Signals to react to / Last reviewed + stars)
- Keep each entry ≤200 words.
5. If a finding suggests a concrete improvement to plugins/, workspace-template/,
or org-templates/, file a GH issue (`gh issue create`) with the proposal.
6. Commit additions to a branch named chore/eco-watch-YYYY-MM-DD. PUSH it
(per the repo "always raise PR" policy) and open a PR.
7. Routing: delegate_task to PM with summary
(audit_summary metadata: category=research, severity=info,
issues=[<gh issue numbers>], top_recommendation=<one-liner>).
8. If nothing notable today, skip the commit and PM-message a one-line "clean".
enabled: true
children:
- name: Market Analyst
role: Market sizing, trends, user research
files_dir: market-analyst
plugins: [browser-automation] # UNION with defaults (#71)
plugins: [browser-automation]
- name: Technical Researcher
role: AI frameworks and protocol evaluation
files_dir: technical-researcher
plugins: [browser-automation] # UNION with defaults (#71)
plugins: [browser-automation]
schedules:
- name: Hourly plugin curation
cron_expr: "22 * * * *"
prompt: |
Weekly survey of `plugins/` and `workspace-template/builtin_tools/` for
evolution opportunities. The team should keep gaining capabilities.
1. Inventory:
- ls plugins/ — every plugin and its plugin.yaml description
- ls workspace-template/builtin_tools/*.py — every builtin tool
- cat org-templates/molecule-dev/org.yaml — see how plugins are wired
2. Gap analysis:
- Any builtin_tool not exposed via a plugin?
- Any role with no plugins beyond defaults that *should* have extras?
- Any plugin that's installed everywhere via defaults but is rarely used?
3. External survey (use browser-automation):
- github.com/topics/ai-agents (last week)
- github.com/topics/mcp-server (last week)
- claude.ai/cookbook, openai/swarm releases
- anthropic blog, openai blog, langchain blog (last week)
4. For 1-3 highest-value findings, file a GH issue with concrete proposal:
- "Plugin proposal: <name> — wraps <upstream tool> for <role(s)>"
- body: what it does, which roles benefit, integration sketch (~30 lines),
upstream link, license check.
5. Routing: delegate_task to PM with audit_summary metadata
(category=plugins, issues=[…], top_recommendation=…).
6. If nothing notable this week, PM-message a one-line "clean".
enabled: true
- name: Competitive Intelligence
role: Competitor tracking and feature comparison
files_dir: competitive-intelligence
plugins: [browser-automation] # UNION with defaults (#71)
plugins: [browser-automation]
- name: Dev Lead
role: Engineering planning and team coordination
@ -155,6 +210,51 @@ workspaces:
4. Run: cd /workspace/repo && git log --oneline -5
5. Use commit_memory to save the architecture summary and recent changes
6. Wait for tasks from PM.
schedules:
- name: Hourly template fitness audit
cron_expr: "15 * * * *"
prompt: |
Daily audit of `org-templates/molecule-dev/`. Catches drift, stale prompts,
missing schedules, and gaps that block the team-runs-24/7 goal. Symptom
of prior incident (issue #85): cron scheduler died silently for 10+ hours
and nobody noticed because no one was watching template fitness.
1. CHECK SCHEDULES ARE FIRING:
For every workspace_schedule in the platform DB:
curl -s http://host.docker.internal:8080/workspaces/<id>/schedules
Compare last_run_at to now() vs cron interval. Anything more than 2x
the interval behind = STALE. File issue against platform.
2. CHECK SYSTEM PROMPTS ARE FRESH:
cd /workspace/repo
for f in org-templates/molecule-dev/*/system-prompt.md; do
echo "$(git log -1 --format='%ar' -- "$f") $f"
done
Anything not touched in 30+ days might be stale relative to recent
platform changes. Spot-check vs CLAUDE.md and recent merges.
3. CHECK ROLES HAVE PLUGINS THEY NEED:
yq '.workspaces[] | (.name, .plugins)' org-templates/molecule-dev/org.yaml
(or python+yaml). Roles inherit defaults; flag any role that should
plausibly have role-specific extras (compare role description vs
plugins list).
4. CHECK CRONS COVER THE EVOLUTION LEVERS:
The team must keep evolving plugins, template, channels, watchlist.
Verify schedules exist for: ecosystem-watch (Research Lead),
plugin-curation (Technical Researcher), template-fitness (you,
this cron), channel-expansion (DevOps).
Any missing? File issue.
5. CHECK CHANNELS:
Today only PM has telegram. Should any other role have a channel?
(Security Auditor → email on critical findings; DevOps → Slack on
build breaks; etc.) File issue if a channel gap is meaningful.
6. ROUTING: delegate_task to PM with audit_summary metadata
(category=template, severity=…, issues=[…], top_recommendation=…).
7. If everything is fit and current, PM-message one-line "clean".
enabled: true
children:
- name: Frontend Engineer
role: >-
@ -227,6 +327,37 @@ workspaces:
4. Read /workspace/repo/.github/workflows/ci.yml
5. Use commit_memory to save CI pipeline structure
6. Wait for tasks from Dev Lead.
schedules:
- name: Hourly channel expansion survey
cron_expr: "47 * * * *"
prompt: |
Weekly survey of channel integrations (Telegram, Slack, Discord, email,
webhooks). The team should grow its external comms surface where useful,
not stay locked at "PM-only Telegram".
1. INVENTORY:
yq '.workspaces[] | {name: .name, channels: .channels}' \
org-templates/molecule-dev/org.yaml 2>/dev/null
(or python+yaml). List which roles have which channels.
2. PLATFORM CAPABILITY CHECK:
grep -rE "channel|telegram|slack|discord|webhook" \
platform/internal/handlers/ --include="*.go" -l
What channel types does the platform actually support today?
3. GAP ANALYSIS:
- PM has Telegram → can the user reach OTHER roles directly?
- Security Auditor: would email-on-critical-finding help?
- DevOps Engineer: would Slack-on-CI-break help?
- Any role that produces high-value asynchronous output but the
user has to poll memory to see it?
4. EXTERNAL: are there channel platforms we should consider adding?
(Discord for community, GitHub Discussions for product, etc.)
5. For the top 1-2 gaps, file a GH issue:
- "Channel proposal: <type> for <role>" with rationale, integration
sketch, secret requirements (e.g. SLACK_BOT_TOKEN as global secret).
6. ROUTING: delegate_task to PM with audit_summary metadata
(category=channels, issues=[…], top_recommendation=…).
7. If no gap this week, PM-message a one-line "clean".
enabled: true
- name: Security Auditor
role: >-
Owns security posture across the full stack: Go/Gin handlers
@ -488,3 +619,185 @@ workspaces:
d. Save to memory key 'uiux-audit-latest' as a secondary record only.
enabled: true
- name: Documentation Specialist
role: >-
Owns end-to-end documentation across THREE Molecule AI repos:
(1) the platform monorepo (public, Molecule-AI/molecule-monorepo) —
internal architecture, READMEs, edit-history, public API references;
(2) the docs site (public, Molecule-AI/docs) — Fumadocs + Next.js 15,
deployed to doc.moleculesai.app, customer-facing;
(3) the SaaS controlplane (PRIVATE, Molecule-AI/molecule-controlplane) —
Go service that provisions tenants on Fly Machines, with the strict
rule that private implementation details NEVER leak into the public
docs site. Documents controlplane changes only in its own internal
README and the platform monorepo's docs/saas/ section (which itself
is gated). Public docs only describe the SaaS PRODUCT (signup, billing,
tenant lifecycle, multi-tenant data isolation guarantees) — not the
provisioner's internals.
Watches PRs landing on all three repos and opens corresponding docs
PRs whenever a public API changes, a new template/plugin/channel
lands, a user-facing concept evolves, or an ecosystem-watch entry
needs publishing. Holds the line on terminology consistency — every
concept has exactly one canonical name across all three repos.
Definition of done: every public surface has accurate, current,
example-rich documentation; every merged PR that touches a public
surface has a paired docs PR open within one cron tick; every stub
page on the docs site eventually gets backfilled; controlplane
internal docs stay current; nothing private leaks to public.
tier: 3
model: opus
files_dir: documentation-specialist
canvas: { x: 900, y: 250 }
# Documentation Specialist needs browser-automation to crawl the live
# docs site (visual regressions, broken links, dead anchors) plus
# update-docs skill (already in defaults) for cross-repo docs sync.
plugins: [browser-automation]
initial_prompt: |
You just started as Documentation Specialist. Set up silently — do NOT contact other agents.
⚠️ PRIVACY RULE (read first, never violate):
molecule-controlplane is a PRIVATE repo. Its source code, file paths,
internal endpoints, schema details, infra config, billing/auth
implementation — none of that goes into the public docs site
(Molecule-AI/docs) or the public README in molecule-monorepo. Public
docs may describe the SaaS PRODUCT (signup, billing, tenant isolation
guarantees) but never the provisioner's internals. When in doubt:
don't publish.
1. Clone all three repos:
git clone https://github.com/${GITHUB_REPO}.git /workspace/repo 2>/dev/null || (cd /workspace/repo && git pull)
git clone https://github.com/Molecule-AI/docs.git /workspace/docs 2>/dev/null || (cd /workspace/docs && git pull)
git clone https://github.com/Molecule-AI/molecule-controlplane.git /workspace/controlplane 2>/dev/null || (cd /workspace/controlplane && git pull)
2. Read /workspace/repo/CLAUDE.md — full architecture, what's public-facing
3. Read /configs/system-prompt.md
4. Read /workspace/docs/README.md and /workspace/docs/content/docs/index.mdx
5. Read /workspace/controlplane/README.md and /workspace/controlplane/PLAN.md
— understand what the SaaS provisioner does (private) vs what users see (public)
6. Run: cd /workspace/docs && ls content/docs/*.mdx
— note which pages are stubs ("Coming soon" marker) vs hand-written
7. Run: cd /workspace/repo && git log --oneline -20 -- platform/internal/handlers/ org-templates/ plugins/
— note recent public-surface changes in the platform repo
8. Run: cd /workspace/controlplane && git log --oneline -20
— note recent controlplane changes (these need internal docs only)
9. Use commit_memory to save:
- Stubs that need backfilling (docs site)
- Recent platform PRs that have NO docs PR yet
- Recent controlplane PRs whose internal README needs an update
- Public concepts that lack a canonical naming entry
10. Wait for tasks from PM. Your owned surfaces are:
- https://github.com/Molecule-AI/docs (customer site, Fumadocs) — PUBLIC
- /workspace/repo/docs/ (internal architecture / edit-history) — PUBLIC
- /workspace/repo/README.md and per-package READMEs — PUBLIC
- /workspace/controlplane/README.md, PLAN.md, internal docs — PRIVATE
schedules:
- name: Daily docs sync — backfill stubs and pair recent platform PRs
cron_expr: "0 9 * * *"
prompt: |
Daily documentation maintenance. Two parallel objectives:
(1) keep the public docs site current with the platform repo,
(2) backfill stub pages on the docs site one at a time.
SETUP:
cd /workspace/repo && git pull 2>/dev/null || true
cd /workspace/docs && git pull 2>/dev/null || true
cd /workspace/controlplane && git pull 2>/dev/null || true
1a. PAIR RECENT PLATFORM PRS (last 24h):
cd /workspace/repo
gh pr list --repo Molecule-AI/molecule-monorepo --state merged \
--search "merged:>$(date -u -d '24 hours ago' +%Y-%m-%dT%H:%M:%SZ)" \
--json number,title,files
For each merged PR that touches a public surface
(platform/internal/handlers/, plugins/*, org-templates/*,
docs/architecture.md, README.md, workspace-template/adapters/*):
- Identify which docs page(s) on the public site cover that surface.
- If a docs page exists but is stale → update it with examples
from the PR diff. Open a PR to Molecule-AI/docs with the change.
- If NO docs page exists for the new surface → propose one
(add to content/docs/meta.json + new .mdx file). Open a PR.
- Always close PRs with `Closes platform PR #N` so the link is durable.
1b. PAIR RECENT CONTROLPLANE PRS (last 24h):
cd /workspace/controlplane
gh pr list --repo Molecule-AI/molecule-controlplane --state merged \
--search "merged:>$(date -u -d '24 hours ago' +%Y-%m-%dT%H:%M:%SZ)" \
--json number,title,files
⚠️ PRIVATE REPO. Two cases:
(i) Internal-only change (handler, schema, infra, fly.toml,
billing logic): update README.md + PLAN.md + any
docs/internal/*.md inside molecule-controlplane itself.
Open the PR against Molecule-AI/molecule-controlplane.
NEVER mention these changes in /workspace/docs.
(ii) Customer-facing change (new tier, new region, new SLA,
pricing change, signup flow change): write a sanitized
description for the PUBLIC docs site (e.g. "We now offer
EU-region tenants" — NOT "controlplane reads FLY_REGION
from env and passes it to provisioner.go:142"). Open a
PR against Molecule-AI/docs.
When unsure which category a change falls into: default to
INTERNAL-only and ask PM for explicit approval before publishing.
2. BACKFILL ONE STUB PAGE:
cd /workspace/docs
grep -l "Coming soon" content/docs/*.mdx | head -1
Pick the highest-priority stub (one of: org-template, plugins,
channels, schedules, architecture, api-reference, self-hosting,
observability, troubleshooting). Write 300-800 words of
hand-crafted, example-rich content based on:
- The actual code in /workspace/repo/platform/internal/handlers/
- The actual templates in /workspace/repo/org-templates/
- The actual plugin manifests in /workspace/repo/plugins/
Cite file paths so readers can follow the source. Open a PR.
3. LINK + ANCHOR CHECK:
Use the browser-automation plugin to crawl
https://doc.moleculesai.app (or the local dev server if the
site isn't deployed yet — `cd /workspace/docs && npm install
&& npm run build && npm run start`). Report broken links and
missing anchors back to PM.
4. ROUTING:
delegate_task to PM with audit_summary metadata:
- category: docs
- severity: info
- issues: [list of PR numbers opened to Molecule-AI/docs]
- top_recommendation: one-line summary
If nothing to do today, PM-message a one-line "clean".
5. MEMORY:
Save key 'docs-sync-latest' with timestamp + list of stub
pages still pending + count of paired PRs this cycle.
enabled: true
- name: Weekly terminology + freshness audit
cron_expr: "0 11 * * 1"
prompt: |
Weekly audit of documentation freshness and terminology consistency.
1. STALE PAGE DETECTION:
cd /workspace/docs && for f in content/docs/*.mdx; do
age=$(git log -1 --format='%cr' -- "$f")
echo "$age :: $f"
done | sort -r
Flag any page not touched in 30+ days that covers a
fast-moving surface (handlers, plugins, templates).
2. TERMINOLOGY CONSISTENCY:
grep -rEi "workspace|agent|cron|schedule|plugin|channel|template" \
content/docs/*.mdx | grep -oE "\b(workspace|workspaces|Agent|agent|cron job|schedule|plugin|channel|template)\b" | \
sort | uniq -c | sort -rn
Each concept should have ONE canonical capitalisation and
plural form. Open a PR fixing inconsistencies.
3. LINK ROT:
grep -rE "\\[.*\\]\\(http[^)]+\\)" content/docs/*.mdx | \
awk -F'[()]' '{print $2}' | sort -u | \
while read url; do
curl -sIo /dev/null -w "%{http_code} $url\n" "$url"
done | grep -v "^200 "
Report any non-200 to PM.
4. ROUTING + MEMORY:
Same audit_summary contract as the daily cron.
Save findings to memory key 'docs-weekly-audit'.
enabled: true

View File

@ -7,6 +7,13 @@ import (
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
)
// maxAncestorWalk caps the depth of the parent-chain walk in
// CanCommunicate. Org trees are realistically 3-5 deep
// (PM → Dev Lead → Backend Engineer is depth 3); 32 is a safety
// ceiling so a malformed cycle in the workspaces table can't loop
// forever. A walk that hits the cap is logged and treated as
// "not an ancestor" (fail-secure).
const maxAncestorWalk = 32
type workspaceRef struct {
ID string
ParentID *string
@ -26,8 +33,51 @@ func getWorkspaceRef(id string) (*workspaceRef, error) {
return &ws, nil
}
// CanCommunicate checks if two workspaces can talk to each other
// based on the hierarchy rules: siblings, parent-child, root-level siblings.
// isAncestorOf returns true if `ancestorID` is found anywhere on the
// parent-chain walk starting from `childID`. Walks at most maxAncestorWalk
// steps so a corrupt parent-cycle cannot loop forever. Returns false on any
// DB lookup error (logged) — fail-secure.
// isAncestorOf reports whether ancestorID appears anywhere on the parent
// chain above childID. The walk is bounded by maxAncestorWalk so a corrupt
// parent-cycle in the workspaces table cannot spin forever. Any DB lookup
// failure is logged and treated as "not an ancestor" — fail-secure.
func isAncestorOf(ancestorID, childID string) bool {
	cursor := childID
	for step := 0; step < maxAncestorWalk; step++ {
		node, err := getWorkspaceRef(cursor)
		if err != nil {
			log.Printf("isAncestorOf: walk lookup %s: %v", cursor, err)
			return false
		}
		parent := node.ParentID
		if parent == nil {
			// Reached a root without ever matching ancestorID.
			return false
		}
		if *parent == ancestorID {
			return true
		}
		cursor = *parent
	}
	log.Printf("isAncestorOf: walk exceeded maxAncestorWalk=%d from %s — corrupt parent chain?",
		maxAncestorWalk, childID)
	return false
}
// CanCommunicate checks if two workspaces can talk to each other based on
// the org hierarchy. The rules:
//
// - self → self
// - siblings (same parent, including both root-level)
// - any ancestor → any descendant (e.g. PM → Backend Engineer)
// - any descendant → any ancestor (e.g. Security Auditor → PM)
//
// The third and fourth rules generalise the previous "direct parent ↔
// child" check. Originally this was strict 1-step parent/child only,
// which broke the audit-routing contract: Security Auditor (under Dev
// Lead, under PM) could not call delegate_task on PM to deliver an
// audit_summary, so it fell back to delegating to Dev Lead — bypassing
// PM's category_routing entirely.
//
// The relaxation preserves the hierarchy intent (no horizontal cross-team
// chatter — Frontend Engineer cannot directly message Backend Engineer
// unless they share a parent, which they do under Dev Lead) while
// unblocking the leadership-chain pattern that is fundamental to how
// audit summaries fan out across the org.
func CanCommunicate(callerID, targetID string) bool {
if callerID == targetID {
return true
@ -54,15 +104,27 @@ func CanCommunicate(callerID, targetID string) bool {
return true
}
// Parent talking to child
// Direct parent → child (fast path; avoids the ancestor walk)
if target.ParentID != nil && caller.ID == *target.ParentID {
return true
}
// Child talking up to parent
// Direct child → parent (fast path)
if caller.ParentID != nil && target.ID == *caller.ParentID {
return true
}
// Distant ancestor → descendant: caller is somewhere up target's chain.
// Triggers extra DB lookups, only reached when the fast paths above didn't match.
if target.ParentID != nil && isAncestorOf(callerID, *target.ParentID) {
return true
}
// Distant descendant → ancestor: target is somewhere up caller's chain.
// (e.g. Security Auditor → PM, where Security Auditor's parent is Dev Lead.)
if caller.ParentID != nil && isAncestorOf(targetID, *caller.ParentID) {
return true
}
return false
}

View File

@ -97,9 +97,13 @@ func TestCanCommunicate_ChildToParent(t *testing.T) {
func TestCanCommunicate_Denied_DifferentParents(t *testing.T) {
mock := setupMockDB(t)
// ws-a (parent: p1) and ws-b (parent: p2) — not siblings
// ws-a (parent: p1) and ws-b (parent: p2) — not siblings, no shared ancestor.
expectLookup(mock, "ws-a", ptr("p1"))
expectLookup(mock, "ws-b", ptr("p2"))
// Walk #1: isAncestorOf(ws-a, p2) → p2 is parentless, false.
expectLookup(mock, "p2", nil)
// Walk #2: isAncestorOf(ws-b, p1) → p1 is parentless, false.
expectLookup(mock, "p1", nil)
if CanCommunicate("ws-a", "ws-b") {
t.Error("workspaces with different parents should NOT communicate")
@ -108,9 +112,15 @@ func TestCanCommunicate_Denied_DifferentParents(t *testing.T) {
func TestCanCommunicate_Denied_CousinToRoot(t *testing.T) {
mock := setupMockDB(t)
// ws-child (parent: ws-mid) and ws-root (no parent, NOT ws-mid)
// ws-child (parent: ws-mid, which has its own root ws-other-root) and
// ws-root (a different parentless workspace).
// The ancestor walk from ws-child should reach ws-other-root but never
// ws-root, so communication is denied.
expectLookup(mock, "ws-child", ptr("ws-mid"))
expectLookup(mock, "ws-root", nil)
// Ancestor walk: starts at *caller.ParentID = ws-mid. Walks ws-mid → ws-other-root → nil.
expectLookup(mock, "ws-mid", ptr("ws-other-root"))
expectLookup(mock, "ws-other-root", nil)
if CanCommunicate("ws-child", "ws-root") {
t.Error("child should NOT communicate with unrelated root workspace")
@ -136,13 +146,75 @@ func TestCanCommunicate_Denied_TargetNotFound(t *testing.T) {
}
}
func TestCanCommunicate_Denied_Grandchild(t *testing.T) {
func TestCanCommunicate_Allowed_GrandparentToGrandchild(t *testing.T) {
mock := setupMockDB(t)
// ws-grandparent and ws-grandchild (parent: ws-mid, NOT ws-grandparent)
expectLookup(mock, "ws-grandparent", nil)
expectLookup(mock, "ws-grandchild", ptr("ws-mid"))
// PM (no parent) → Backend Engineer (parent: Dev Lead, parent: PM).
// Originally rejected ("grandparent should NOT communicate with grandchild
// directly") — that broke audit_summary routing because Security Auditor
// could not delegate up to PM. The hierarchy is now ancestor↔descendant.
expectLookup(mock, "ws-pm", nil)
expectLookup(mock, "ws-be", ptr("ws-dl"))
// Ancestor walk: target.ParentID = ws-dl. isAncestorOf(ws-pm, ws-dl).
// Walks ws-dl → ws-pm → match. (Walk lookup #1: ws-dl.)
expectLookup(mock, "ws-dl", ptr("ws-pm"))
if CanCommunicate("ws-grandparent", "ws-grandchild") {
t.Error("grandparent should NOT communicate with grandchild directly")
if !CanCommunicate("ws-pm", "ws-be") {
t.Error("PM should be able to communicate with Backend Engineer (descendant)")
}
}
// Grandchild → grandparent must be allowed: this models the Security
// Auditor (child of Dev Lead) delivering its audit_summary up to the PM
// two levels above it. Mock expectations mirror the exact lookup order
// CanCommunicate performs.
func TestCanCommunicate_Allowed_GrandchildToGrandparent(t *testing.T) {
	db := setupMockDB(t)
	// Row fetches for both endpoints: caller ws-sec (parent: ws-dl) and
	// target ws-pm (a root workspace with no parent).
	expectLookup(db, "ws-sec", ptr("ws-dl"))
	expectLookup(db, "ws-pm", nil)
	// Fast paths all miss — target has no parent, and ws-dl != ws-pm —
	// so the descendant→ancestor walk runs: isAncestorOf(ws-pm, ws-dl)
	// fetches ws-dl, sees its parent is ws-pm, and matches.
	expectLookup(db, "ws-dl", ptr("ws-pm"))
	if !CanCommunicate("ws-sec", "ws-pm") {
		t.Error("Security Auditor should be able to send audit_summary up to PM")
	}
}
// A four-deep chain (ws-leaf under ws-l3 under ws-l2 under ws-l1)
// verifies the ancestor walk follows more than a single hop: ws-leaf
// must be allowed to reach the root ws-l1.
func TestCanCommunicate_Allowed_DeepAncestor(t *testing.T) {
	db := setupMockDB(t)
	// Endpoint rows: caller ws-leaf (parent: ws-l3), target ws-l1 (root).
	expectLookup(db, "ws-leaf", ptr("ws-l3"))
	expectLookup(db, "ws-l1", nil)
	// Walk starts at the caller's parent: ws-l3 → ws-l2 (no match yet),
	// then ws-l2 → ws-l1, which is the target — allowed.
	expectLookup(db, "ws-l3", ptr("ws-l2"))
	expectLookup(db, "ws-l2", ptr("ws-l1"))
	if !CanCommunicate("ws-leaf", "ws-l1") {
		t.Error("4-level descendant should reach root ancestor")
	}
}
// Two disjoint subtrees:
//
//	A: ws-a-leaf → ws-a-mid → ws-a-root
//	B: ws-b-leaf → ws-b-mid → ws-b-root
//
// Even though both endpoints have ancestry, there is no shared ancestor,
// so ws-a-leaf → ws-b-root must be denied.
func TestCanCommunicate_Denied_UnrelatedAncestors(t *testing.T) {
	db := setupMockDB(t)
	expectLookup(db, "ws-a-leaf", ptr("ws-a-mid"))
	expectLookup(db, "ws-b-root", nil)
	// isAncestorOf(ws-b-root, ws-a-mid) climbs tree A only:
	// ws-a-mid → ws-a-root (not the target), and ws-a-root is
	// parentless, so the walk terminates with false.
	expectLookup(db, "ws-a-mid", ptr("ws-a-root"))
	expectLookup(db, "ws-a-root", nil)
	if CanCommunicate("ws-a-leaf", "ws-b-root") {
		t.Error("workspaces in different subtrees should NOT communicate via the walk")
	}
}

View File

@ -48,22 +48,70 @@ type scheduleRow struct {
// Scheduler owns the periodic tick loop: it holds the A2A proxy and
// broadcaster used to dispatch work, and tracks tick liveness for the
// health endpoints. NOTE(review): the poll loop itself lives in Start();
// only the state is declared here.
type Scheduler struct {
	proxy       A2AProxy
	broadcaster Broadcaster
	// lastTickAt records the wall-clock time of the most recent tick
	// (whether it fired schedules or not). Read by Healthy() and the
	// /admin/scheduler/health endpoint to detect stuck-tick conditions.
	// Guarded by mu rather than atomics; the tick rate is pollInterval
	// (30s per Start's log line), so lock contention is trivial.
	mu         sync.RWMutex
	lastTickAt time.Time
}
// New constructs a Scheduler wired to the given A2A proxy and
// broadcaster. lastTickAt starts as the zero Time, so Healthy() reports
// false until Start() has run.
func New(proxy A2AProxy, broadcaster Broadcaster) *Scheduler {
	s := &Scheduler{
		proxy:       proxy,
		broadcaster: broadcaster,
	}
	return s
}
// LastTickAt reports when the most recent tick completed. The zero Time
// means no tick has run yet — Start() was never called, or the first
// interval has not elapsed since process start.
func (s *Scheduler) LastTickAt() time.Time {
	s.mu.RLock()
	t := s.lastTickAt
	s.mu.RUnlock()
	return t
}
// Healthy reports scheduler liveness: true when a tick completed within
// the last 2×pollInterval, i.e. at most one missed tick is tolerated.
// Intended for /health and /admin/scheduler/health. A zero LastTickAt
// (never ticked) is always unhealthy.
func (s *Scheduler) Healthy() bool {
	last := s.LastTickAt()
	return !last.IsZero() && time.Since(last) < 2*pollInterval
}
// Start runs the scheduler poll loop. Blocks until ctx is cancelled.
//
// Defends against panics inside tick() so a single bad row / bad cron
// expression / DB blip can't permanently kill the scheduler. Without
// this recover the goroutine dies and the only signal to the operator
// is "no crons firing" — which we observed as a 12+ hour silent outage
// on 2026-04-14 (issue #85).
func (s *Scheduler) Start(ctx context.Context) {
ticker := time.NewTicker(pollInterval)
defer ticker.Stop()
log.Printf("Scheduler: started (poll interval=%s)", pollInterval)
// Heartbeat before the first tick so /admin/liveness doesn't flag stale
// during the initial 30s interval after startup.
tickWithRecover := func() {
defer func() {
if r := recover(); r != nil {
log.Printf("Scheduler: PANIC in tick — recovered: %v (next tick in %s)", r, pollInterval)
}
}()
s.tick(ctx)
s.mu.Lock()
s.lastTickAt = time.Now()
s.mu.Unlock()
}
// Heartbeat + initial lastTickAt so /admin/liveness and Healthy() both
// pass during the first 30s interval after startup.
supervised.Heartbeat("scheduler")
s.mu.Lock()
s.lastTickAt = time.Now()
s.mu.Unlock()
for {
select {
@ -71,7 +119,7 @@ func (s *Scheduler) Start(ctx context.Context) {
log.Println("Scheduler: stopped")
return
case <-ticker.C:
s.tick(ctx)
tickWithRecover()
supervised.Heartbeat("scheduler")
}
}