merge: resolve scheduler conflicts with main (#85 panic-recover + supervised heartbeat)
This commit is contained in:
commit
ba375e8551
3
PLAN.md
3
PLAN.md
@ -247,6 +247,9 @@ point for "what else is out there."
|
||||
- **GitHub issue #15** — Provisioner: auto-refresh `CLAUDE_CODE_OAUTH_TOKEN` from `global_secrets` on workspace restart → **DONE** via PR #64 (`SetGlobal` / `DeleteGlobal` now fan out `RestartByID` to every affected workspace).
|
||||
- **GitHub issue #19 Layer 1** — Platform-generated restart context → **DONE** via PR #65 (synthetic A2A `message/send` with `metadata.kind=restart_context`, `system:restart-context` caller prefix, 30s re-register wait). Layer 2 deferred to issue #66 (see Backlog item 15 above).
|
||||
|
||||
### Recently launched (2026-04-15 tick-9)
|
||||
- **Phase 32 Phase B.2 (image pipeline)** — PR #80 (merged `c3cc8e87`) adds `.github/workflows/publish-platform-image.yml`: on every main-merge touching `platform/**`, builds `platform/Dockerfile` and pushes `ghcr.io/molecule-ai/platform:latest` + `:sha-<commit>` to GHCR. Paired with the private `molecule-controlplane` Fly + Neon provisioner (PR #3 there, merged `2e85d5ad`) that reads `TENANT_IMAGE` env and boots tenant Fly Machines from this image. Tick-8 docs-sync PR #79 (merged `d53a1287`) also landed.
|
||||
|
||||
### Recently launched (2026-04-14 tick-8)
|
||||
- **Phase 32 PR #1** — `TenantGuard` middleware (PR #78, merged `57a05686`). Public repo's only SaaS hook: when `MOLECULE_ORG_ID` env is set, non-allowlisted requests require matching `X-Molecule-Org-Id` header or 404. Unset → passthrough (self-hosted unchanged). Allowlist is exact-match: `/health` + `/metrics`. Paired with the private `Molecule-AI/molecule-controlplane` repo scaffolded this tick (Fly Machines provisioner stub, `/cp/orgs` CRUD, subdomain→fly-replay router, migrations 001-003 for `organizations`/`org_instances`/`org_members`). +6 `TestTenantGuard_*` tests. Phase 32 plan: follow-up PRs wire real Fly provisioner, WorkOS AuthKit, Stripe, Cloudflare, signup UX — all in the private repo except the single public middleware.
|
||||
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import type { Metadata } from "next";
|
||||
import "./globals.css";
|
||||
import { AuthGate } from "@/components/AuthGate";
|
||||
|
||||
export const metadata: Metadata = {
|
||||
title: "Molecule AI",
|
||||
@ -13,7 +14,13 @@ export default function RootLayout({
|
||||
}) {
|
||||
return (
|
||||
<html lang="en">
|
||||
<body className="bg-zinc-950 text-white">{children}</body>
|
||||
<body className="bg-zinc-950 text-white">
|
||||
{/* AuthGate is a client component; it checks the session on mount
|
||||
and bounces anonymous users to the control plane's login page
|
||||
when running on a tenant subdomain. Non-SaaS hosts (localhost,
|
||||
vercel preview URL, apex) pass through unchanged. */}
|
||||
<AuthGate>{children}</AuthGate>
|
||||
</body>
|
||||
</html>
|
||||
);
|
||||
}
|
||||
|
||||
68
canvas/src/components/AuthGate.tsx
Normal file
68
canvas/src/components/AuthGate.tsx
Normal file
@ -0,0 +1,68 @@
|
||||
"use client";
|
||||
|
||||
/**
|
||||
* AuthGate wraps the canvas root so every page is gated on a valid session.
|
||||
* Anonymous users get bounced to app.moleculesai.app/cp/auth/login?return_to=<here>.
|
||||
*
|
||||
* In non-SaaS mode (no tenant slug — local dev, apex, vercel preview URL),
|
||||
* the gate is a pass-through: canvas works without auth for local dev.
|
||||
* This mirrors the control plane's "disabled provider" fallback.
|
||||
*/
|
||||
import { useEffect, useState, type ReactNode } from "react";
|
||||
import { fetchSession, redirectToLogin, type Session } from "@/lib/auth";
|
||||
import { getTenantSlug } from "@/lib/tenant";
|
||||
|
||||
export type AuthGateState =
  | { kind: "loading" }
  | { kind: "anonymous"; skipRedirect: boolean }
  | { kind: "authenticated"; session: Session };

/**
 * Gates the canvas behind a control-plane session check.
 *
 * On mount: if no tenant slug is present (local dev, vercel previews, the
 * apex host) the gate is a pure pass-through. Otherwise the session is
 * probed; a missing session (or a network error — fail closed) flips the
 * state to anonymous, which triggers a redirect to the login page.
 */
export function AuthGate({ children }: { children: ReactNode }) {
  const [state, setState] = useState<AuthGateState>({ kind: "loading" });

  useEffect(() => {
    // Non-SaaS hosts (no tenant slug) bypass the gate entirely so the
    // canvas keeps working without auth in local dev / preview setups.
    if (!getTenantSlug()) {
      setState({ kind: "anonymous", skipRedirect: true });
      return;
    }
    let disposed = false;
    (async () => {
      try {
        const session = await fetchSession();
        if (disposed) return;
        setState(
          session
            ? { kind: "authenticated", session }
            : { kind: "anonymous", skipRedirect: false },
        );
      } catch {
        // Network error — fail closed (show signin) so a transient
        // outage doesn't leak the canvas UI to an unauth'd user.
        if (!disposed) {
          setState({ kind: "anonymous", skipRedirect: false });
        }
      }
    })();
    return () => {
      disposed = true;
    };
  }, []);

  useEffect(() => {
    // Kick off the redirect as a side effect once we know the visitor
    // is anonymous on a tenant host.
    if (state.kind === "anonymous" && !state.skipRedirect) {
      redirectToLogin("sign-in");
    }
  }, [state]);

  if (state.kind === "loading") {
    // Minimal placeholder; canvas has its own loading UI downstream.
    return null;
  }
  const redirecting = state.kind === "anonymous" && !state.skipRedirect;
  if (redirecting) {
    // Redirect is already firing from the effect above; render nothing
    // in the interim to avoid a flash of unauthenticated content.
    return null;
  }
  return <>{children}</>;
}
|
||||
69
canvas/src/lib/__tests__/auth.test.ts
Normal file
69
canvas/src/lib/__tests__/auth.test.ts
Normal file
@ -0,0 +1,69 @@
|
||||
/**
|
||||
* @vitest-environment jsdom
|
||||
*/
|
||||
import { describe, it, expect, vi, afterEach } from "vitest";
|
||||
import { fetchSession, redirectToLogin } from "../auth";
|
||||
|
||||
// Reset stubbed globals (fetch, location) and all mock state after every
// test so one test's stubs can never leak into the next.
afterEach(() => {
  vi.unstubAllGlobals();
  vi.restoreAllMocks();
});
|
||||
|
||||
describe("fetchSession", () => {
|
||||
it("returns session on 200", async () => {
|
||||
vi.stubGlobal("fetch", vi.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
status: 200,
|
||||
json: async () => ({ user_id: "u1", org_id: "o1", email: "a@x.com" }),
|
||||
}));
|
||||
const s = await fetchSession();
|
||||
expect(s).toEqual({ user_id: "u1", org_id: "o1", email: "a@x.com" });
|
||||
});
|
||||
|
||||
it("returns null on 401 without throwing", async () => {
|
||||
vi.stubGlobal("fetch", vi.fn().mockResolvedValue({ ok: false, status: 401 }));
|
||||
const s = await fetchSession();
|
||||
expect(s).toBeNull();
|
||||
});
|
||||
|
||||
it("throws on 500 so transient outages aren't treated as 'anonymous'", async () => {
|
||||
vi.stubGlobal("fetch", vi.fn().mockResolvedValue({ ok: false, status: 500, statusText: "oops" }));
|
||||
await expect(fetchSession()).rejects.toThrow("500");
|
||||
});
|
||||
|
||||
it("sends credentials:include for cross-origin cookies", async () => {
|
||||
const fetchMock = vi.fn().mockResolvedValue({ ok: false, status: 401 });
|
||||
vi.stubGlobal("fetch", fetchMock);
|
||||
await fetchSession();
|
||||
expect(fetchMock).toHaveBeenCalledWith(
|
||||
expect.stringContaining("/cp/auth/me"),
|
||||
expect.objectContaining({ credentials: "include" }),
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe("redirectToLogin", () => {
|
||||
it("sets window.location to cp login URL with return_to", () => {
|
||||
const href = "https://acme.moleculesai.app/dashboard";
|
||||
Object.defineProperty(window, "location", {
|
||||
writable: true,
|
||||
value: { href },
|
||||
});
|
||||
redirectToLogin("sign-in");
|
||||
// href now holds the redirect target. encodeURIComponent(href) must
|
||||
// appear in the query.
|
||||
expect((window.location as unknown as { href: string }).href).toContain("/cp/auth/login");
|
||||
expect((window.location as unknown as { href: string }).href).toContain(
|
||||
encodeURIComponent(href),
|
||||
);
|
||||
});
|
||||
|
||||
it("uses signup path for sign-up screenHint", () => {
|
||||
Object.defineProperty(window, "location", {
|
||||
writable: true,
|
||||
value: { href: "https://acme.moleculesai.app/" },
|
||||
});
|
||||
redirectToLogin("sign-up");
|
||||
expect((window.location as unknown as { href: string }).href).toContain("/cp/auth/signup");
|
||||
});
|
||||
});
|
||||
51
canvas/src/lib/auth.ts
Normal file
51
canvas/src/lib/auth.ts
Normal file
@ -0,0 +1,51 @@
|
||||
/**
|
||||
* Canvas-side session detection. Calls /cp/auth/me on the control plane
|
||||
* (via same-origin → PLATFORM_URL) and returns the session or null.
|
||||
*
|
||||
* 401 is the "anonymous" signal and does NOT throw — the caller decides
|
||||
* whether to redirect. Network errors do throw so React error boundaries
|
||||
* can surface them.
|
||||
*/
|
||||
import { PLATFORM_URL } from "./api";
|
||||
|
||||
/**
 * Session payload returned by the control plane's /cp/auth/me endpoint.
 * Field names mirror the wire JSON (snake_case), so they are
 * intentionally not camelCase.
 */
export interface Session {
  // Stable identifier of the signed-in user.
  user_id: string;
  // Organization (tenant) this session belongs to.
  org_id: string;
  // Email address associated with the account.
  email: string;
}

// Base path prefix for auth endpoints on the control plane.
const AUTH_BASE = "/cp/auth";
|
||||
|
||||
/**
|
||||
* fetchSession probes /cp/auth/me with the session cookie (credentials:
|
||||
* include mandatory cross-origin). Returns the Session on 200, null on
|
||||
* 401 (anonymous), throws on anything else so callers don't silently
|
||||
* treat a 5xx as "not logged in".
|
||||
*/
|
||||
export async function fetchSession(): Promise<Session | null> {
|
||||
const res = await fetch(`${PLATFORM_URL}${AUTH_BASE}/me`, {
|
||||
credentials: "include",
|
||||
});
|
||||
if (res.status === 401) return null;
|
||||
if (!res.ok) {
|
||||
throw new Error(`/cp/auth/me: ${res.status} ${res.statusText}`);
|
||||
}
|
||||
return res.json();
|
||||
}
|
||||
|
||||
/**
|
||||
* redirectToLogin bounces the browser to the control plane's login page
|
||||
* with a `return_to` param so the user lands back on the current URL
|
||||
* after signup/login completes. Same-origin safety is enforced on the
|
||||
* CP side (isSafeReturnTo rejects cross-domain / http / protocol-
|
||||
* relative URLs). Uses window.location.href so the full URL including
|
||||
* query + hash survives the round trip.
|
||||
*/
|
||||
export function redirectToLogin(screenHint: "sign-up" | "sign-in" = "sign-in"): void {
|
||||
if (typeof window === "undefined") return;
|
||||
const returnTo = window.location.href;
|
||||
const path = screenHint === "sign-up" ? "signup" : "login";
|
||||
const dest = `${PLATFORM_URL}${AUTH_BASE}/${path}?return_to=${encodeURIComponent(returnTo)}`;
|
||||
window.location.href = dest;
|
||||
}
|
||||
37
docs/edit-history/2026-04-15.md
Normal file
37
docs/edit-history/2026-04-15.md
Normal file
@ -0,0 +1,37 @@
|
||||
# Edit history — 2026-04-15
|
||||
|
||||
## tick-9: Phase 32 Phase B.2 image pipeline (PR #80) + tick-8 docs sync (PR #79)
|
||||
|
||||
Two merges:
|
||||
|
||||
### PR #79 — `docs: sync documentation with 2026-04-14 tick-8 merge (#78)`
|
||||
Merge commit `d53a1287`. Tick-8 docs sync for the TenantGuard middleware.
|
||||
Pure docs; CLAUDE.md test count + PLAN.md tick-8 block + edit-history entry.
|
||||
|
||||
### PR #80 — `feat(ci): publish-platform-image → ghcr.io/molecule-ai/platform (Phase B.2)`
|
||||
Merge commit `c3cc8e87`. Noteworthy: ci-infra.
|
||||
|
||||
Adds `.github/workflows/publish-platform-image.yml`:
|
||||
- Trigger: push to main touching `platform/**`; also `workflow_dispatch`.
|
||||
- Builds `platform/Dockerfile` via `docker/build-push-action@v5`.
|
||||
- Pushes two tags per run: `ghcr.io/molecule-ai/platform:latest` (floating)
|
||||
and `:sha-<short-commit>` (immutable, pin-friendly).
|
||||
- GHA cache via `cache-from/cache-to: type=gha` for warm rebuilds.
|
||||
- Permissions: `contents:read` + `packages:write`; authenticates to GHCR
|
||||
using the built-in `GITHUB_TOKEN`, no extra secrets.
|
||||
- OCI labels propagate source URL + commit SHA for provenance.
|
||||
|
||||
Purpose: pairs with the private `molecule-controlplane` Fly + Neon
|
||||
provisioner (PR #3 there, merged `2e85d5ad`) which reads
|
||||
`TENANT_IMAGE=ghcr.io/molecule-ai/platform:<tag>` from env and spawns
|
||||
each tenant Fly Machine from this image.
|
||||
|
||||
### Deployment state (informational — not in any repo)
|
||||
- Fly apps (`molecule-cp`, `molecule-tenant`): **pending CEO** (`flyctl apps create`).
|
||||
- Fly billing card: **pending CEO**.
|
||||
- First real tenant provision: **blocked** on the two above.
|
||||
|
||||
### File deltas (public repo)
|
||||
- `.github/workflows/publish-platform-image.yml` — new.
|
||||
- `CLAUDE.md` — tick-9 block for the new CI workflow.
|
||||
- `PLAN.md` — new "Recently launched (2026-04-15 tick-9)" entry.
|
||||
@ -51,6 +51,7 @@ defaults:
|
||||
infra: [DevOps Engineer]
|
||||
qa: [QA Engineer]
|
||||
performance: [Backend Engineer]
|
||||
docs: [Documentation Specialist]
|
||||
mixed: [Dev Lead]
|
||||
|
||||
# workspace_dir: not set by default — each agent gets an isolated Docker volume
|
||||
@ -127,19 +128,73 @@ workspaces:
|
||||
4. Read /workspace/repo/docs/product/overview.md to understand the product
|
||||
5. Use commit_memory to save key product facts for later recall
|
||||
6. Wait for tasks from PM.
|
||||
schedules:
|
||||
- name: Hourly ecosystem watch
|
||||
cron_expr: "8 * * * *"
|
||||
prompt: |
|
||||
Daily survey for new agent-infra / AI-agent projects worth tracking.
|
||||
|
||||
1. Pull docs/ecosystem-watch.md to know what's already tracked.
|
||||
2. Browse the web for last 24h:
|
||||
- github.com/trending?since=daily&language=python (and typescript, go)
|
||||
- HN front page, anything about agent frameworks
|
||||
- Twitter/X mentions of new agent SDKs, MCP servers, frameworks
|
||||
3. Cross-reference: skip anything already in ecosystem-watch.md.
|
||||
4. For each genuinely new + relevant project (1-3 max per day):
|
||||
- Add an entry under "## Entries" using the existing template
|
||||
(Pitch / Shape / Overlap / Differentiation / Worth borrowing /
|
||||
Terminology collisions / Signals to react to / Last reviewed + stars)
|
||||
- Keep each entry ≤200 words.
|
||||
5. If a finding suggests a concrete improvement to plugins/, workspace-template/,
|
||||
or org-templates/, file a GH issue (`gh issue create`) with the proposal.
|
||||
6. Commit additions to a branch named chore/eco-watch-YYYY-MM-DD. PUSH it
|
||||
(per the repo "always raise PR" policy) and open a PR.
|
||||
7. Routing: delegate_task to PM with summary
|
||||
(audit_summary metadata: category=research, severity=info,
|
||||
issues=[<gh issue numbers>], top_recommendation=<one-liner>).
|
||||
8. If nothing notable today, skip the commit and PM-message a one-line "clean".
|
||||
enabled: true
|
||||
children:
|
||||
- name: Market Analyst
|
||||
role: Market sizing, trends, user research
|
||||
files_dir: market-analyst
|
||||
plugins: [browser-automation] # UNION with defaults (#71)
|
||||
plugins: [browser-automation]
|
||||
- name: Technical Researcher
|
||||
role: AI frameworks and protocol evaluation
|
||||
files_dir: technical-researcher
|
||||
plugins: [browser-automation] # UNION with defaults (#71)
|
||||
plugins: [browser-automation]
|
||||
schedules:
|
||||
- name: Hourly plugin curation
|
||||
cron_expr: "22 * * * *"
|
||||
prompt: |
|
||||
Weekly survey of `plugins/` and `workspace-template/builtin_tools/` for
|
||||
evolution opportunities. The team should keep gaining capabilities.
|
||||
|
||||
1. Inventory:
|
||||
- ls plugins/ — every plugin and its plugin.yaml description
|
||||
- ls workspace-template/builtin_tools/*.py — every builtin tool
|
||||
- cat org-templates/molecule-dev/org.yaml — see how plugins are wired
|
||||
2. Gap analysis:
|
||||
- Any builtin_tool not exposed via a plugin?
|
||||
- Any role with no plugins beyond defaults that *should* have extras?
|
||||
- Any plugin that's installed everywhere via defaults but is rarely used?
|
||||
3. External survey (use browser-automation):
|
||||
- github.com/topics/ai-agents (last week)
|
||||
- github.com/topics/mcp-server (last week)
|
||||
- claude.ai/cookbook, openai/swarm releases
|
||||
- anthropic blog, openai blog, langchain blog (last week)
|
||||
4. For 1-3 highest-value findings, file a GH issue with concrete proposal:
|
||||
- "Plugin proposal: <name> — wraps <upstream tool> for <role(s)>"
|
||||
- body: what it does, which roles benefit, integration sketch (~30 lines),
|
||||
upstream link, license check.
|
||||
5. Routing: delegate_task to PM with audit_summary metadata
|
||||
(category=plugins, issues=[…], top_recommendation=…).
|
||||
6. If nothing notable this week, PM-message a one-line "clean".
|
||||
enabled: true
|
||||
- name: Competitive Intelligence
|
||||
role: Competitor tracking and feature comparison
|
||||
files_dir: competitive-intelligence
|
||||
plugins: [browser-automation] # UNION with defaults (#71)
|
||||
plugins: [browser-automation]
|
||||
|
||||
- name: Dev Lead
|
||||
role: Engineering planning and team coordination
|
||||
@ -155,6 +210,51 @@ workspaces:
|
||||
4. Run: cd /workspace/repo && git log --oneline -5
|
||||
5. Use commit_memory to save the architecture summary and recent changes
|
||||
6. Wait for tasks from PM.
|
||||
schedules:
|
||||
- name: Hourly template fitness audit
|
||||
cron_expr: "15 * * * *"
|
||||
prompt: |
|
||||
Daily audit of `org-templates/molecule-dev/`. Catches drift, stale prompts,
|
||||
missing schedules, and gaps that block the team-runs-24/7 goal. Symptom
|
||||
of prior incident (issue #85): cron scheduler died silently for 10+ hours
|
||||
and nobody noticed because no one was watching template fitness.
|
||||
|
||||
1. CHECK SCHEDULES ARE FIRING:
|
||||
For every workspace_schedule in the platform DB:
|
||||
curl -s http://host.docker.internal:8080/workspaces/<id>/schedules
|
||||
Compare last_run_at to now() vs cron interval. Anything more than 2x
|
||||
the interval behind = STALE. File issue against platform.
|
||||
|
||||
2. CHECK SYSTEM PROMPTS ARE FRESH:
|
||||
cd /workspace/repo
|
||||
for f in org-templates/molecule-dev/*/system-prompt.md; do
|
||||
echo "$(git log -1 --format='%ar' -- "$f") $f"
|
||||
done
|
||||
Anything not touched in 30+ days might be stale relative to recent
|
||||
platform changes. Spot-check vs CLAUDE.md and recent merges.
|
||||
|
||||
3. CHECK ROLES HAVE PLUGINS THEY NEED:
|
||||
yq '.workspaces[] | (.name, .plugins)' org-templates/molecule-dev/org.yaml
|
||||
(or python+yaml). Roles inherit defaults; flag any role that should
|
||||
plausibly have role-specific extras (compare role description vs
|
||||
plugins list).
|
||||
|
||||
4. CHECK CRONS COVER THE EVOLUTION LEVERS:
|
||||
The team must keep evolving plugins, template, channels, watchlist.
|
||||
Verify schedules exist for: ecosystem-watch (Research Lead),
|
||||
plugin-curation (Technical Researcher), template-fitness (you,
|
||||
this cron), channel-expansion (DevOps).
|
||||
Any missing? File issue.
|
||||
|
||||
5. CHECK CHANNELS:
|
||||
Today only PM has telegram. Should any other role have a channel?
|
||||
(Security Auditor → email on critical findings; DevOps → Slack on
|
||||
build breaks; etc.) File issue if a channel gap is meaningful.
|
||||
|
||||
6. ROUTING: delegate_task to PM with audit_summary metadata
|
||||
(category=template, severity=…, issues=[…], top_recommendation=…).
|
||||
7. If everything is fit and current, PM-message one-line "clean".
|
||||
enabled: true
|
||||
children:
|
||||
- name: Frontend Engineer
|
||||
role: >-
|
||||
@ -227,6 +327,37 @@ workspaces:
|
||||
4. Read /workspace/repo/.github/workflows/ci.yml
|
||||
5. Use commit_memory to save CI pipeline structure
|
||||
6. Wait for tasks from Dev Lead.
|
||||
schedules:
|
||||
- name: Hourly channel expansion survey
|
||||
cron_expr: "47 * * * *"
|
||||
prompt: |
|
||||
Weekly survey of channel integrations (Telegram, Slack, Discord, email,
|
||||
webhooks). The team should grow its external comms surface where useful,
|
||||
not stay locked at "PM-only Telegram".
|
||||
|
||||
1. INVENTORY:
|
||||
yq '.workspaces[] | {name: .name, channels: .channels}' \
|
||||
org-templates/molecule-dev/org.yaml 2>/dev/null
|
||||
(or python+yaml). List which roles have which channels.
|
||||
2. PLATFORM CAPABILITY CHECK:
|
||||
grep -rE "channel|telegram|slack|discord|webhook" \
|
||||
platform/internal/handlers/ --include="*.go" -l
|
||||
What channel types does the platform actually support today?
|
||||
3. GAP ANALYSIS:
|
||||
- PM has Telegram → can the user reach OTHER roles directly?
|
||||
- Security Auditor: would email-on-critical-finding help?
|
||||
- DevOps Engineer: would Slack-on-CI-break help?
|
||||
- Any role that produces high-value asynchronous output but the
|
||||
user has to poll memory to see it?
|
||||
4. EXTERNAL: are there channel platforms we should consider adding?
|
||||
(Discord for community, GitHub Discussions for product, etc.)
|
||||
5. For the top 1-2 gaps, file a GH issue:
|
||||
- "Channel proposal: <type> for <role>" with rationale, integration
|
||||
sketch, secret requirements (e.g. SLACK_BOT_TOKEN as global secret).
|
||||
6. ROUTING: delegate_task to PM with audit_summary metadata
|
||||
(category=channels, issues=[…], top_recommendation=…).
|
||||
7. If no gap this week, PM-message a one-line "clean".
|
||||
enabled: true
|
||||
- name: Security Auditor
|
||||
role: >-
|
||||
Owns security posture across the full stack: Go/Gin handlers
|
||||
@ -488,3 +619,185 @@ workspaces:
|
||||
|
||||
d. Save to memory key 'uiux-audit-latest' as a secondary record only.
|
||||
enabled: true
|
||||
|
||||
- name: Documentation Specialist
|
||||
role: >-
|
||||
Owns end-to-end documentation across THREE Molecule AI repos:
|
||||
(1) the platform monorepo (public, Molecule-AI/molecule-monorepo) —
|
||||
internal architecture, READMEs, edit-history, public API references;
|
||||
(2) the docs site (public, Molecule-AI/docs) — Fumadocs + Next.js 15,
|
||||
deployed to doc.moleculesai.app, customer-facing;
|
||||
(3) the SaaS controlplane (PRIVATE, Molecule-AI/molecule-controlplane) —
|
||||
Go service that provisions tenants on Fly Machines, with the strict
|
||||
rule that private implementation details NEVER leak into the public
|
||||
docs site. Documents controlplane changes only in its own internal
|
||||
README and the platform monorepo's docs/saas/ section (which itself
|
||||
is gated). Public docs only describe the SaaS PRODUCT (signup, billing,
|
||||
tenant lifecycle, multi-tenant data isolation guarantees) — not the
|
||||
provisioner's internals.
|
||||
Watches PRs landing on all three repos and opens corresponding docs
|
||||
PRs whenever a public API changes, a new template/plugin/channel
|
||||
lands, a user-facing concept evolves, or an ecosystem-watch entry
|
||||
needs publishing. Holds the line on terminology consistency — every
|
||||
concept has exactly one canonical name across all three repos.
|
||||
Definition of done: every public surface has accurate, current,
|
||||
example-rich documentation; every merged PR that touches a public
|
||||
surface has a paired docs PR open within one cron tick; every stub
|
||||
page on the docs site eventually gets backfilled; controlplane
|
||||
internal docs stay current; nothing private leaks to public.
|
||||
tier: 3
|
||||
model: opus
|
||||
files_dir: documentation-specialist
|
||||
canvas: { x: 900, y: 250 }
|
||||
# Documentation Specialist needs browser-automation to crawl the live
|
||||
# docs site (visual regressions, broken links, dead anchors) plus
|
||||
# update-docs skill (already in defaults) for cross-repo docs sync.
|
||||
plugins: [browser-automation]
|
||||
initial_prompt: |
|
||||
You just started as Documentation Specialist. Set up silently — do NOT contact other agents.
|
||||
|
||||
⚠️ PRIVACY RULE (read first, never violate):
|
||||
molecule-controlplane is a PRIVATE repo. Its source code, file paths,
|
||||
internal endpoints, schema details, infra config, billing/auth
|
||||
implementation — none of that goes into the public docs site
|
||||
(Molecule-AI/docs) or the public README in molecule-monorepo. Public
|
||||
docs may describe the SaaS PRODUCT (signup, billing, tenant isolation
|
||||
guarantees) but never the provisioner's internals. When in doubt:
|
||||
don't publish.
|
||||
|
||||
1. Clone all three repos:
|
||||
git clone https://github.com/${GITHUB_REPO}.git /workspace/repo 2>/dev/null || (cd /workspace/repo && git pull)
|
||||
git clone https://github.com/Molecule-AI/docs.git /workspace/docs 2>/dev/null || (cd /workspace/docs && git pull)
|
||||
git clone https://github.com/Molecule-AI/molecule-controlplane.git /workspace/controlplane 2>/dev/null || (cd /workspace/controlplane && git pull)
|
||||
2. Read /workspace/repo/CLAUDE.md — full architecture, what's public-facing
|
||||
3. Read /configs/system-prompt.md
|
||||
4. Read /workspace/docs/README.md and /workspace/docs/content/docs/index.mdx
|
||||
5. Read /workspace/controlplane/README.md and /workspace/controlplane/PLAN.md
|
||||
— understand what the SaaS provisioner does (private) vs what users see (public)
|
||||
6. Run: cd /workspace/docs && ls content/docs/*.mdx
|
||||
— note which pages are stubs ("Coming soon" marker) vs hand-written
|
||||
7. Run: cd /workspace/repo && git log --oneline -20 -- platform/internal/handlers/ org-templates/ plugins/
|
||||
— note recent public-surface changes in the platform repo
|
||||
8. Run: cd /workspace/controlplane && git log --oneline -20
|
||||
— note recent controlplane changes (these need internal docs only)
|
||||
9. Use commit_memory to save:
|
||||
- Stubs that need backfilling (docs site)
|
||||
- Recent platform PRs that have NO docs PR yet
|
||||
- Recent controlplane PRs whose internal README needs an update
|
||||
- Public concepts that lack a canonical naming entry
|
||||
10. Wait for tasks from PM. Your owned surfaces are:
|
||||
- https://github.com/Molecule-AI/docs (customer site, Fumadocs) — PUBLIC
|
||||
- /workspace/repo/docs/ (internal architecture / edit-history) — PUBLIC
|
||||
- /workspace/repo/README.md and per-package READMEs — PUBLIC
|
||||
- /workspace/controlplane/README.md, PLAN.md, internal docs — PRIVATE
|
||||
schedules:
|
||||
- name: Daily docs sync — backfill stubs and pair recent platform PRs
|
||||
cron_expr: "0 9 * * *"
|
||||
prompt: |
|
||||
Daily documentation maintenance. Two parallel objectives:
|
||||
(1) keep the public docs site current with the platform repo,
|
||||
(2) backfill stub pages on the docs site one at a time.
|
||||
|
||||
SETUP:
|
||||
cd /workspace/repo && git pull 2>/dev/null || true
|
||||
cd /workspace/docs && git pull 2>/dev/null || true
|
||||
cd /workspace/controlplane && git pull 2>/dev/null || true
|
||||
|
||||
1a. PAIR RECENT PLATFORM PRS (last 24h):
|
||||
cd /workspace/repo
|
||||
gh pr list --repo Molecule-AI/molecule-monorepo --state merged \
|
||||
--search "merged:>$(date -u -d '24 hours ago' +%Y-%m-%dT%H:%M:%SZ)" \
|
||||
--json number,title,files
|
||||
For each merged PR that touches a public surface
|
||||
(platform/internal/handlers/, plugins/*, org-templates/*,
|
||||
docs/architecture.md, README.md, workspace-template/adapters/*):
|
||||
- Identify which docs page(s) on the public site cover that surface.
|
||||
- If a docs page exists but is stale → update it with examples
|
||||
from the PR diff. Open a PR to Molecule-AI/docs with the change.
|
||||
- If NO docs page exists for the new surface → propose one
|
||||
(add to content/docs/meta.json + new .mdx file). Open a PR.
|
||||
- Always close PRs with `Closes platform PR #N` so the link is durable.
|
||||
|
||||
1b. PAIR RECENT CONTROLPLANE PRS (last 24h):
|
||||
cd /workspace/controlplane
|
||||
gh pr list --repo Molecule-AI/molecule-controlplane --state merged \
|
||||
--search "merged:>$(date -u -d '24 hours ago' +%Y-%m-%dT%H:%M:%SZ)" \
|
||||
--json number,title,files
|
||||
⚠️ PRIVATE REPO. Two cases:
|
||||
(i) Internal-only change (handler, schema, infra, fly.toml,
|
||||
billing logic): update README.md + PLAN.md + any
|
||||
docs/internal/*.md inside molecule-controlplane itself.
|
||||
Open the PR against Molecule-AI/molecule-controlplane.
|
||||
NEVER mention these changes in /workspace/docs.
|
||||
(ii) Customer-facing change (new tier, new region, new SLA,
|
||||
pricing change, signup flow change): write a sanitized
|
||||
description for the PUBLIC docs site (e.g. "We now offer
|
||||
EU-region tenants" — NOT "controlplane reads FLY_REGION
|
||||
from env and passes it to provisioner.go:142"). Open a
|
||||
PR against Molecule-AI/docs.
|
||||
When unsure which category a change falls into: default to
|
||||
INTERNAL-only and ask PM for explicit approval before publishing.
|
||||
|
||||
2. BACKFILL ONE STUB PAGE:
|
||||
cd /workspace/docs
|
||||
grep -l "Coming soon" content/docs/*.mdx | head -1
|
||||
Pick the highest-priority stub (one of: org-template, plugins,
|
||||
channels, schedules, architecture, api-reference, self-hosting,
|
||||
observability, troubleshooting). Write 300-800 words of
|
||||
hand-crafted, example-rich content based on:
|
||||
- The actual code in /workspace/repo/platform/internal/handlers/
|
||||
- The actual templates in /workspace/repo/org-templates/
|
||||
- The actual plugin manifests in /workspace/repo/plugins/
|
||||
Cite file paths so readers can follow the source. Open a PR.
|
||||
|
||||
3. LINK + ANCHOR CHECK:
|
||||
Use the browser-automation plugin to crawl
|
||||
https://doc.moleculesai.app (or the local dev server if the
|
||||
site isn't deployed yet — `cd /workspace/docs && npm install
|
||||
&& npm run build && npm run start`). Report broken links and
|
||||
missing anchors back to PM.
|
||||
|
||||
4. ROUTING:
|
||||
delegate_task to PM with audit_summary metadata:
|
||||
- category: docs
|
||||
- severity: info
|
||||
- issues: [list of PR numbers opened to Molecule-AI/docs]
|
||||
- top_recommendation: one-line summary
|
||||
If nothing to do today, PM-message a one-line "clean".
|
||||
|
||||
5. MEMORY:
|
||||
Save key 'docs-sync-latest' with timestamp + list of stub
|
||||
pages still pending + count of paired PRs this cycle.
|
||||
enabled: true
|
||||
- name: Weekly terminology + freshness audit
|
||||
cron_expr: "0 11 * * 1"
|
||||
prompt: |
|
||||
Weekly audit of documentation freshness and terminology consistency.
|
||||
|
||||
1. STALE PAGE DETECTION:
|
||||
cd /workspace/docs && for f in content/docs/*.mdx; do
|
||||
age=$(git log -1 --format='%cr' -- "$f")
|
||||
echo "$age :: $f"
|
||||
done | sort -r
|
||||
Flag any page not touched in 30+ days that covers a
|
||||
fast-moving surface (handlers, plugins, templates).
|
||||
|
||||
2. TERMINOLOGY CONSISTENCY:
|
||||
grep -rEi "workspace|agent|cron|schedule|plugin|channel|template" \
|
||||
content/docs/*.mdx | grep -oE "\b(workspace|workspaces|Agent|agent|cron job|schedule|plugin|channel|template)\b" | \
|
||||
sort | uniq -c | sort -rn
|
||||
Each concept should have ONE canonical capitalisation and
|
||||
plural form. Open a PR fixing inconsistencies.
|
||||
|
||||
3. LINK ROT:
|
||||
grep -rE "\\[.*\\]\\(http[^)]+\\)" content/docs/*.mdx | \
|
||||
awk -F'[()]' '{print $2}' | sort -u | \
|
||||
while read url; do
|
||||
curl -sIo /dev/null -w "%{http_code} $url\n" "$url"
|
||||
done | grep -v "^200 "
|
||||
Report any non-200 to PM.
|
||||
|
||||
4. ROUTING + MEMORY:
|
||||
Same audit_summary contract as the daily cron.
|
||||
Save findings to memory key 'docs-weekly-audit'.
|
||||
enabled: true
|
||||
|
||||
@ -7,6 +7,13 @@ import (
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
)
|
||||
|
||||
// maxAncestorWalk caps the depth of the parent-chain walk in
// isAncestorOf (used by CanCommunicate). Org trees are realistically
// 3-5 deep (PM → Dev Lead → Backend Engineer is depth 3); 32 is a safety
// ceiling so a malformed cycle in the workspaces table can't loop
// forever.
const maxAncestorWalk = 32
|
||||
|
||||
type workspaceRef struct {
|
||||
ID string
|
||||
ParentID *string
|
||||
@ -26,8 +33,51 @@ func getWorkspaceRef(id string) (*workspaceRef, error) {
|
||||
return &ws, nil
|
||||
}
|
||||
|
||||
// CanCommunicate checks if two workspaces can talk to each other
|
||||
// based on the hierarchy rules: siblings, parent-child, root-level siblings.
|
||||
// isAncestorOf returns true if `ancestorID` is found anywhere on the
|
||||
// parent-chain walk starting from `childID`. Walks at most maxAncestorWalk
|
||||
// steps so a corrupt parent-cycle cannot loop forever. Returns false on any
|
||||
// DB lookup error (logged) — fail-secure.
|
||||
func isAncestorOf(ancestorID, childID string) bool {
|
||||
current := childID
|
||||
for i := 0; i < maxAncestorWalk; i++ {
|
||||
ref, err := getWorkspaceRef(current)
|
||||
if err != nil {
|
||||
log.Printf("isAncestorOf: walk lookup %s: %v", current, err)
|
||||
return false
|
||||
}
|
||||
if ref.ParentID == nil {
|
||||
return false
|
||||
}
|
||||
if *ref.ParentID == ancestorID {
|
||||
return true
|
||||
}
|
||||
current = *ref.ParentID
|
||||
}
|
||||
log.Printf("isAncestorOf: walk exceeded maxAncestorWalk=%d from %s — corrupt parent chain?",
|
||||
maxAncestorWalk, childID)
|
||||
return false
|
||||
}
|
||||
|
||||
// CanCommunicate checks if two workspaces can talk to each other based on
|
||||
// the org hierarchy. The rules:
|
||||
//
|
||||
// - self → self
|
||||
// - siblings (same parent, including both root-level)
|
||||
// - any ancestor → any descendant (e.g. PM → Backend Engineer)
|
||||
// - any descendant → any ancestor (e.g. Security Auditor → PM)
|
||||
//
|
||||
// The third and fourth rules generalise the previous "direct parent ↔
|
||||
// child" check. Originally this was strict 1-step parent/child only,
|
||||
// which broke the audit-routing contract: Security Auditor (under Dev
|
||||
// Lead, under PM) could not call delegate_task on PM to deliver an
|
||||
// audit_summary, so it fell back to delegating to Dev Lead — bypassing
|
||||
// PM's category_routing entirely.
|
||||
//
|
||||
// The relaxation preserves the hierarchy intent (no horizontal cross-team
|
||||
// chatter — Frontend Engineer cannot directly message Backend Engineer
|
||||
// unless they share a parent, which they do under Dev Lead) while
|
||||
// unblocking the leadership-chain pattern that is fundamental to how
|
||||
// audit summaries fan out across the org.
|
||||
func CanCommunicate(callerID, targetID string) bool {
|
||||
if callerID == targetID {
|
||||
return true
|
||||
@ -54,15 +104,27 @@ func CanCommunicate(callerID, targetID string) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
// Parent talking to child
|
||||
// Direct parent → child (fast path; avoids the ancestor walk)
|
||||
if target.ParentID != nil && caller.ID == *target.ParentID {
|
||||
return true
|
||||
}
|
||||
|
||||
// Child talking up to parent
|
||||
// Direct child → parent (fast path)
|
||||
if caller.ParentID != nil && target.ID == *caller.ParentID {
|
||||
return true
|
||||
}
|
||||
|
||||
// Distant ancestor → descendant: caller is somewhere up target's chain.
|
||||
// Triggers extra DB lookups, only reached when the fast paths above didn't match.
|
||||
if target.ParentID != nil && isAncestorOf(callerID, *target.ParentID) {
|
||||
return true
|
||||
}
|
||||
|
||||
// Distant descendant → ancestor: target is somewhere up caller's chain.
|
||||
// (e.g. Security Auditor → PM, where Security Auditor's parent is Dev Lead.)
|
||||
if caller.ParentID != nil && isAncestorOf(targetID, *caller.ParentID) {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
@ -97,9 +97,13 @@ func TestCanCommunicate_ChildToParent(t *testing.T) {
|
||||
|
||||
func TestCanCommunicate_Denied_DifferentParents(t *testing.T) {
|
||||
mock := setupMockDB(t)
|
||||
// ws-a (parent: p1) and ws-b (parent: p2) — not siblings
|
||||
// ws-a (parent: p1) and ws-b (parent: p2) — not siblings, no shared ancestor.
|
||||
expectLookup(mock, "ws-a", ptr("p1"))
|
||||
expectLookup(mock, "ws-b", ptr("p2"))
|
||||
// Walk #1: isAncestorOf(ws-a, p2) → p2 is parentless, false.
|
||||
expectLookup(mock, "p2", nil)
|
||||
// Walk #2: isAncestorOf(ws-b, p1) → p1 is parentless, false.
|
||||
expectLookup(mock, "p1", nil)
|
||||
|
||||
if CanCommunicate("ws-a", "ws-b") {
|
||||
t.Error("workspaces with different parents should NOT communicate")
|
||||
@ -108,9 +112,15 @@ func TestCanCommunicate_Denied_DifferentParents(t *testing.T) {
|
||||
|
||||
func TestCanCommunicate_Denied_CousinToRoot(t *testing.T) {
|
||||
mock := setupMockDB(t)
|
||||
// ws-child (parent: ws-mid) and ws-root (no parent, NOT ws-mid)
|
||||
// ws-child (parent: ws-mid, which has its own root ws-other-root) and
|
||||
// ws-root (a different parentless workspace).
|
||||
// The ancestor walk from ws-child should reach ws-other-root but never
|
||||
// ws-root, so communication is denied.
|
||||
expectLookup(mock, "ws-child", ptr("ws-mid"))
|
||||
expectLookup(mock, "ws-root", nil)
|
||||
// Ancestor walk: starts at *caller.ParentID = ws-mid. Walks ws-mid → ws-other-root → nil.
|
||||
expectLookup(mock, "ws-mid", ptr("ws-other-root"))
|
||||
expectLookup(mock, "ws-other-root", nil)
|
||||
|
||||
if CanCommunicate("ws-child", "ws-root") {
|
||||
t.Error("child should NOT communicate with unrelated root workspace")
|
||||
@ -136,13 +146,75 @@ func TestCanCommunicate_Denied_TargetNotFound(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestCanCommunicate_Denied_Grandchild(t *testing.T) {
|
||||
func TestCanCommunicate_Allowed_GrandparentToGrandchild(t *testing.T) {
|
||||
mock := setupMockDB(t)
|
||||
// ws-grandparent and ws-grandchild (parent: ws-mid, NOT ws-grandparent)
|
||||
expectLookup(mock, "ws-grandparent", nil)
|
||||
expectLookup(mock, "ws-grandchild", ptr("ws-mid"))
|
||||
// PM (no parent) → Backend Engineer (parent: Dev Lead, parent: PM).
|
||||
// Originally rejected ("grandparent should NOT communicate with grandchild
|
||||
// directly") — that broke audit_summary routing because Security Auditor
|
||||
// could not delegate up to PM. The hierarchy is now ancestor↔descendant.
|
||||
expectLookup(mock, "ws-pm", nil)
|
||||
expectLookup(mock, "ws-be", ptr("ws-dl"))
|
||||
// Ancestor walk: target.ParentID = ws-dl. isAncestorOf(ws-pm, ws-dl).
|
||||
// Walks ws-dl → ws-pm → match. (Walk lookup #1: ws-dl.)
|
||||
expectLookup(mock, "ws-dl", ptr("ws-pm"))
|
||||
|
||||
if CanCommunicate("ws-grandparent", "ws-grandchild") {
|
||||
t.Error("grandparent should NOT communicate with grandchild directly")
|
||||
if !CanCommunicate("ws-pm", "ws-be") {
|
||||
t.Error("PM should be able to communicate with Backend Engineer (descendant)")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCanCommunicate_Allowed_GrandchildToGrandparent(t *testing.T) {
|
||||
mock := setupMockDB(t)
|
||||
// Security Auditor (parent: Dev Lead) → PM (parent of Dev Lead).
|
||||
// This is the Security Auditor → PM audit_summary delivery path.
|
||||
expectLookup(mock, "ws-sec", ptr("ws-dl"))
|
||||
expectLookup(mock, "ws-pm", nil)
|
||||
// Direct parent → child fast path: target.ParentID = nil, skip.
|
||||
// Direct child → parent: caller.ParentID = ws-dl, target.ID = ws-pm,
|
||||
// ws-dl != ws-pm, skip.
|
||||
// Distant ancestor → descendant: target.ParentID = nil, skip.
|
||||
// Distant descendant → ancestor: caller.ParentID = ws-dl. Walks
|
||||
// isAncestorOf(ws-pm, ws-dl) → looks up ws-dl → returns ws-pm → match.
|
||||
expectLookup(mock, "ws-dl", ptr("ws-pm"))
|
||||
|
||||
if !CanCommunicate("ws-sec", "ws-pm") {
|
||||
t.Error("Security Auditor should be able to send audit_summary up to PM")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCanCommunicate_Allowed_DeepAncestor(t *testing.T) {
|
||||
mock := setupMockDB(t)
|
||||
// Four-level chain: ws-leaf (parent: ws-l3, parent: ws-l2, parent: ws-l1).
|
||||
// ws-leaf → ws-l1 should be allowed.
|
||||
expectLookup(mock, "ws-leaf", ptr("ws-l3"))
|
||||
expectLookup(mock, "ws-l1", nil)
|
||||
// Distant descendant → ancestor walk: starts at ws-l3.
|
||||
// ws-l3 → ws-l2: not ws-l1, continue.
|
||||
// ws-l2 → ws-l1: match!
|
||||
expectLookup(mock, "ws-l3", ptr("ws-l2"))
|
||||
expectLookup(mock, "ws-l2", ptr("ws-l1"))
|
||||
|
||||
if !CanCommunicate("ws-leaf", "ws-l1") {
|
||||
t.Error("4-level descendant should reach root ancestor")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCanCommunicate_Denied_UnrelatedAncestors(t *testing.T) {
|
||||
mock := setupMockDB(t)
|
||||
// Two separate org subtrees:
|
||||
// tree A: ws-a-leaf → ws-a-mid → ws-a-root
|
||||
// tree B: ws-b-leaf → ws-b-mid → ws-b-root
|
||||
// ws-a-leaf → ws-b-root must be denied even though both have parents
|
||||
// (no shared ancestor).
|
||||
expectLookup(mock, "ws-a-leaf", ptr("ws-a-mid"))
|
||||
expectLookup(mock, "ws-b-root", nil)
|
||||
// Walk: isAncestorOf(ws-b-root, ws-a-mid).
|
||||
// ws-a-mid → ws-a-root: not ws-b-root, continue.
|
||||
// ws-a-root has no parent → false.
|
||||
expectLookup(mock, "ws-a-mid", ptr("ws-a-root"))
|
||||
expectLookup(mock, "ws-a-root", nil)
|
||||
|
||||
if CanCommunicate("ws-a-leaf", "ws-b-root") {
|
||||
t.Error("workspaces in different subtrees should NOT communicate via the walk")
|
||||
}
|
||||
}
|
||||
|
||||
@ -48,22 +48,70 @@ type scheduleRow struct {
|
||||
type Scheduler struct {
|
||||
proxy A2AProxy
|
||||
broadcaster Broadcaster
|
||||
|
||||
// lastTickAt records the wall-clock time of the most recent tick
|
||||
// (whether it fired schedules or not). Read by Healthy() and the
|
||||
// /admin/scheduler/health endpoint to detect stuck-tick conditions.
|
||||
// Atomic-ish via the mutex; tick rate is 30s so contention is trivial.
|
||||
mu sync.RWMutex
|
||||
lastTickAt time.Time
|
||||
}
|
||||
|
||||
func New(proxy A2AProxy, broadcaster Broadcaster) *Scheduler {
|
||||
return &Scheduler{proxy: proxy, broadcaster: broadcaster}
|
||||
}
|
||||
|
||||
// LastTickAt returns the wall-clock time of the most recent successful tick.
|
||||
// Returns the zero Time if Start() has never been called or no tick has
|
||||
// completed since process start.
|
||||
func (s *Scheduler) LastTickAt() time.Time {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
return s.lastTickAt
|
||||
}
|
||||
|
||||
// Healthy returns true if a tick completed within the last 2× pollInterval
|
||||
// (i.e. at most 1 missed tick is tolerated). Use from /health and from
|
||||
// /admin/scheduler/health to surface scheduler liveness.
|
||||
func (s *Scheduler) Healthy() bool {
|
||||
last := s.LastTickAt()
|
||||
if last.IsZero() {
|
||||
return false
|
||||
}
|
||||
return time.Since(last) < 2*pollInterval
|
||||
}
|
||||
|
||||
// Start runs the scheduler poll loop. Blocks until ctx is cancelled.
|
||||
//
|
||||
// Defends against panics inside tick() so a single bad row / bad cron
|
||||
// expression / DB blip can't permanently kill the scheduler. Without
|
||||
// this recover the goroutine dies and the only signal to the operator
|
||||
// is "no crons firing" — which we observed as a 12+ hour silent outage
|
||||
// on 2026-04-14 (issue #85).
|
||||
func (s *Scheduler) Start(ctx context.Context) {
|
||||
ticker := time.NewTicker(pollInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
log.Printf("Scheduler: started (poll interval=%s)", pollInterval)
|
||||
|
||||
// Heartbeat before the first tick so /admin/liveness doesn't flag stale
|
||||
// during the initial 30s interval after startup.
|
||||
tickWithRecover := func() {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
log.Printf("Scheduler: PANIC in tick — recovered: %v (next tick in %s)", r, pollInterval)
|
||||
}
|
||||
}()
|
||||
s.tick(ctx)
|
||||
s.mu.Lock()
|
||||
s.lastTickAt = time.Now()
|
||||
s.mu.Unlock()
|
||||
}
|
||||
|
||||
// Heartbeat + initial lastTickAt so /admin/liveness and Healthy() both
|
||||
// pass during the first 30s interval after startup.
|
||||
supervised.Heartbeat("scheduler")
|
||||
s.mu.Lock()
|
||||
s.lastTickAt = time.Now()
|
||||
s.mu.Unlock()
|
||||
|
||||
for {
|
||||
select {
|
||||
@ -71,7 +119,7 @@ func (s *Scheduler) Start(ctx context.Context) {
|
||||
log.Println("Scheduler: stopped")
|
||||
return
|
||||
case <-ticker.C:
|
||||
s.tick(ctx)
|
||||
tickWithRecover()
|
||||
supervised.Heartbeat("scheduler")
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user