merge: resolve scheduler conflicts with main (#85 panic-recover + supervised heartbeat)
This commit is contained in:
commit
ba375e8551
3
PLAN.md
3
PLAN.md
@ -247,6 +247,9 @@ point for "what else is out there."
|
||||
- **GitHub issue #15** — Provisioner: auto-refresh `CLAUDE_CODE_OAUTH_TOKEN` from `global_secrets` on workspace restart → **DONE** via PR #64 (`SetGlobal` / `DeleteGlobal` now fan out `RestartByID` to every affected workspace).
|
||||
- **GitHub issue #19 Layer 1** — Platform-generated restart context → **DONE** via PR #65 (synthetic A2A `message/send` with `metadata.kind=restart_context`, `system:restart-context` caller prefix, 30s re-register wait). Layer 2 deferred to issue #66 (see Backlog item 15 above).
|
||||
|
||||
### Recently launched (2026-04-15 tick-9)
|
||||
- **Phase 32 Phase B.2 (image pipeline)** — PR #80 (merged `c3cc8e87`) adds `.github/workflows/publish-platform-image.yml`: on every main-merge touching `platform/**`, builds `platform/Dockerfile` and pushes `ghcr.io/molecule-ai/platform:latest` + `:sha-<commit>` to GHCR. Paired with the private `molecule-controlplane` Fly + Neon provisioner (PR #3 there, merged `2e85d5ad`) that reads `TENANT_IMAGE` env and boots tenant Fly Machines from this image. Tick-8 docs-sync PR #79 (merged `d53a1287`) also landed.
|
||||
|
||||
### Recently launched (2026-04-14 tick-8)
|
||||
- **Phase 32 PR #1** — `TenantGuard` middleware (PR #78, merged `57a05686`). Public repo's only SaaS hook: when `MOLECULE_ORG_ID` env is set, non-allowlisted requests require matching `X-Molecule-Org-Id` header or 404. Unset → passthrough (self-hosted unchanged). Allowlist is exact-match: `/health` + `/metrics`. Paired with the private `Molecule-AI/molecule-controlplane` repo scaffolded this tick (Fly Machines provisioner stub, `/cp/orgs` CRUD, subdomain→fly-replay router, migrations 001-003 for `organizations`/`org_instances`/`org_members`). +6 `TestTenantGuard_*` tests. Phase 32 plan: follow-up PRs wire real Fly provisioner, WorkOS AuthKit, Stripe, Cloudflare, signup UX — all in the private repo except the single public middleware.
|
||||
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import type { Metadata } from "next";
|
||||
import "./globals.css";
|
||||
import { AuthGate } from "@/components/AuthGate";
|
||||
|
||||
export const metadata: Metadata = {
|
||||
title: "Molecule AI",
|
||||
@ -13,7 +14,13 @@ export default function RootLayout({
|
||||
}) {
|
||||
return (
|
||||
<html lang="en">
|
||||
<body className="bg-zinc-950 text-white">{children}</body>
|
||||
<body className="bg-zinc-950 text-white">
|
||||
{/* AuthGate is a client component; it checks the session on mount
|
||||
and bounces anonymous users to the control plane's login page
|
||||
when running on a tenant subdomain. Non-SaaS hosts (localhost,
|
||||
vercel preview URL, apex) pass through unchanged. */}
|
||||
<AuthGate>{children}</AuthGate>
|
||||
</body>
|
||||
</html>
|
||||
);
|
||||
}
|
||||
|
||||
68
canvas/src/components/AuthGate.tsx
Normal file
68
canvas/src/components/AuthGate.tsx
Normal file
@ -0,0 +1,68 @@
|
||||
"use client";
|
||||
|
||||
/**
|
||||
* AuthGate wraps the canvas root so every page is gated on a valid session.
|
||||
* Anonymous users get bounced to app.moleculesai.app/cp/auth/login?return_to=<here>.
|
||||
*
|
||||
* In non-SaaS mode (no tenant slug — local dev, apex, vercel preview URL),
|
||||
* the gate is a pass-through: canvas works without auth for local dev.
|
||||
* This mirrors the control plane's "disabled provider" fallback.
|
||||
*/
|
||||
import { useEffect, useState, type ReactNode } from "react";
|
||||
import { fetchSession, redirectToLogin, type Session } from "@/lib/auth";
|
||||
import { getTenantSlug } from "@/lib/tenant";
|
||||
|
||||
export type AuthGateState =
  | { kind: "loading" }
  | { kind: "anonymous"; skipRedirect: boolean }
  | { kind: "authenticated"; session: Session };

/**
 * Gates the canvas behind a control-plane session check.
 *
 * On mount: if no tenant slug is present (local dev, vercel previews, the
 * apex host) the gate is a pure pass-through. Otherwise the session is
 * probed; a missing session (or a network error — fail closed) flips the
 * state to anonymous, which triggers a redirect to the login page.
 */
export function AuthGate({ children }: { children: ReactNode }) {
  const [state, setState] = useState<AuthGateState>({ kind: "loading" });

  useEffect(() => {
    // Non-SaaS hosts (no tenant slug) bypass the gate entirely so the
    // canvas keeps working without auth in local dev / preview setups.
    if (!getTenantSlug()) {
      setState({ kind: "anonymous", skipRedirect: true });
      return;
    }
    let disposed = false;
    (async () => {
      try {
        const session = await fetchSession();
        if (disposed) return;
        setState(
          session
            ? { kind: "authenticated", session }
            : { kind: "anonymous", skipRedirect: false },
        );
      } catch {
        // Network error — fail closed (show signin) so a transient
        // outage doesn't leak the canvas UI to an unauth'd user.
        if (!disposed) {
          setState({ kind: "anonymous", skipRedirect: false });
        }
      }
    })();
    return () => {
      disposed = true;
    };
  }, []);

  useEffect(() => {
    // Kick off the redirect as a side effect once we know the visitor
    // is anonymous on a tenant host.
    if (state.kind === "anonymous" && !state.skipRedirect) {
      redirectToLogin("sign-in");
    }
  }, [state]);

  if (state.kind === "loading") {
    // Minimal placeholder; canvas has its own loading UI downstream.
    return null;
  }
  const redirecting = state.kind === "anonymous" && !state.skipRedirect;
  if (redirecting) {
    // Redirect is already firing from the effect above; render nothing
    // in the interim to avoid a flash of unauthenticated content.
    return null;
  }
  return <>{children}</>;
}
|
||||
69
canvas/src/lib/__tests__/auth.test.ts
Normal file
69
canvas/src/lib/__tests__/auth.test.ts
Normal file
@ -0,0 +1,69 @@
|
||||
/**
|
||||
* @vitest-environment jsdom
|
||||
*/
|
||||
import { describe, it, expect, vi, afterEach } from "vitest";
|
||||
import { fetchSession, redirectToLogin } from "../auth";
|
||||
|
||||
// Reset stubbed globals (fetch, location) and all mock state after every
// test so one test's stubs can never leak into the next.
afterEach(() => {
  vi.unstubAllGlobals();
  vi.restoreAllMocks();
});
|
||||
|
||||
describe("fetchSession", () => {
|
||||
it("returns session on 200", async () => {
|
||||
vi.stubGlobal("fetch", vi.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
status: 200,
|
||||
json: async () => ({ user_id: "u1", org_id: "o1", email: "a@x.com" }),
|
||||
}));
|
||||
const s = await fetchSession();
|
||||
expect(s).toEqual({ user_id: "u1", org_id: "o1", email: "a@x.com" });
|
||||
});
|
||||
|
||||
it("returns null on 401 without throwing", async () => {
|
||||
vi.stubGlobal("fetch", vi.fn().mockResolvedValue({ ok: false, status: 401 }));
|
||||
const s = await fetchSession();
|
||||
expect(s).toBeNull();
|
||||
});
|
||||
|
||||
it("throws on 500 so transient outages aren't treated as 'anonymous'", async () => {
|
||||
vi.stubGlobal("fetch", vi.fn().mockResolvedValue({ ok: false, status: 500, statusText: "oops" }));
|
||||
await expect(fetchSession()).rejects.toThrow("500");
|
||||
});
|
||||
|
||||
it("sends credentials:include for cross-origin cookies", async () => {
|
||||
const fetchMock = vi.fn().mockResolvedValue({ ok: false, status: 401 });
|
||||
vi.stubGlobal("fetch", fetchMock);
|
||||
await fetchSession();
|
||||
expect(fetchMock).toHaveBeenCalledWith(
|
||||
expect.stringContaining("/cp/auth/me"),
|
||||
expect.objectContaining({ credentials: "include" }),
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe("redirectToLogin", () => {
|
||||
it("sets window.location to cp login URL with return_to", () => {
|
||||
const href = "https://acme.moleculesai.app/dashboard";
|
||||
Object.defineProperty(window, "location", {
|
||||
writable: true,
|
||||
value: { href },
|
||||
});
|
||||
redirectToLogin("sign-in");
|
||||
// href now holds the redirect target. encodeURIComponent(href) must
|
||||
// appear in the query.
|
||||
expect((window.location as unknown as { href: string }).href).toContain("/cp/auth/login");
|
||||
expect((window.location as unknown as { href: string }).href).toContain(
|
||||
encodeURIComponent(href),
|
||||
);
|
||||
});
|
||||
|
||||
it("uses signup path for sign-up screenHint", () => {
|
||||
Object.defineProperty(window, "location", {
|
||||
writable: true,
|
||||
value: { href: "https://acme.moleculesai.app/" },
|
||||
});
|
||||
redirectToLogin("sign-up");
|
||||
expect((window.location as unknown as { href: string }).href).toContain("/cp/auth/signup");
|
||||
});
|
||||
});
|
||||
51
canvas/src/lib/auth.ts
Normal file
51
canvas/src/lib/auth.ts
Normal file
@ -0,0 +1,51 @@
|
||||
/**
|
||||
* Canvas-side session detection. Calls /cp/auth/me on the control plane
|
||||
* (via same-origin → PLATFORM_URL) and returns the session or null.
|
||||
*
|
||||
* 401 is the "anonymous" signal and does NOT throw — the caller decides
|
||||
* whether to redirect. Network errors do throw so React error boundaries
|
||||
* can surface them.
|
||||
*/
|
||||
import { PLATFORM_URL } from "./api";
|
||||
|
||||
/**
 * Session payload returned by the control plane's /cp/auth/me endpoint.
 * Field names mirror the wire JSON (snake_case), so they are
 * intentionally not camelCase.
 */
export interface Session {
  // Stable identifier of the signed-in user.
  user_id: string;
  // Organization (tenant) this session belongs to.
  org_id: string;
  // Email address associated with the account.
  email: string;
}

// Base path prefix for auth endpoints on the control plane.
const AUTH_BASE = "/cp/auth";
|
||||
|
||||
/**
|
||||
* fetchSession probes /cp/auth/me with the session cookie (credentials:
|
||||
* include mandatory cross-origin). Returns the Session on 200, null on
|
||||
* 401 (anonymous), throws on anything else so callers don't silently
|
||||
* treat a 5xx as "not logged in".
|
||||
*/
|
||||
export async function fetchSession(): Promise<Session | null> {
|
||||
const res = await fetch(`${PLATFORM_URL}${AUTH_BASE}/me`, {
|
||||
credentials: "include",
|
||||
});
|
||||
if (res.status === 401) return null;
|
||||
if (!res.ok) {
|
||||
throw new Error(`/cp/auth/me: ${res.status} ${res.statusText}`);
|
||||
}
|
||||
return res.json();
|
||||
}
|
||||
|
||||
/**
|
||||
* redirectToLogin bounces the browser to the control plane's login page
|
||||
* with a `return_to` param so the user lands back on the current URL
|
||||
* after signup/login completes. Same-origin safety is enforced on the
|
||||
* CP side (isSafeReturnTo rejects cross-domain / http / protocol-
|
||||
* relative URLs). Uses window.location.href so the full URL including
|
||||
* query + hash survives the round trip.
|
||||
*/
|
||||
export function redirectToLogin(screenHint: "sign-up" | "sign-in" = "sign-in"): void {
|
||||
if (typeof window === "undefined") return;
|
||||
const returnTo = window.location.href;
|
||||
const path = screenHint === "sign-up" ? "signup" : "login";
|
||||
const dest = `${PLATFORM_URL}${AUTH_BASE}/${path}?return_to=${encodeURIComponent(returnTo)}`;
|
||||
window.location.href = dest;
|
||||
}
|
||||
37
docs/edit-history/2026-04-15.md
Normal file
37
docs/edit-history/2026-04-15.md
Normal file
@ -0,0 +1,37 @@
|
||||
# Edit history — 2026-04-15
|
||||
|
||||
## tick-9: Phase 32 Phase B.2 image pipeline (PR #80) + tick-8 docs sync (PR #79)
|
||||
|
||||
Two merges:
|
||||
|
||||
### PR #79 — `docs: sync documentation with 2026-04-14 tick-8 merge (#78)`
|
||||
Merge commit `d53a1287`. Tick-8 docs sync for the TenantGuard middleware.
|
||||
Pure docs; CLAUDE.md test count + PLAN.md tick-8 block + edit-history entry.
|
||||
|
||||
### PR #80 — `feat(ci): publish-platform-image → ghcr.io/molecule-ai/platform (Phase B.2)`
|
||||
Merge commit `c3cc8e87`. Noteworthy: ci-infra.
|
||||
|
||||
Adds `.github/workflows/publish-platform-image.yml`:
|
||||
- Trigger: push to main touching `platform/**`; also `workflow_dispatch`.
|
||||
- Builds `platform/Dockerfile` via `docker/build-push-action@v5`.
|
||||
- Pushes two tags per run: `ghcr.io/molecule-ai/platform:latest` (floating)
|
||||
and `:sha-<short-commit>` (immutable, pin-friendly).
|
||||
- GHA cache via `cache-from/cache-to: type=gha` for warm rebuilds.
|
||||
- Permissions: `contents:read` + `packages:write`; authenticates to GHCR
|
||||
using the built-in `GITHUB_TOKEN`, no extra secrets.
|
||||
- OCI labels propagate source URL + commit SHA for provenance.
|
||||
|
||||
Purpose: pairs with the private `molecule-controlplane` Fly + Neon
|
||||
provisioner (PR #3 there, merged `2e85d5ad`) which reads
|
||||
`TENANT_IMAGE=ghcr.io/molecule-ai/platform:<tag>` from env and spawns
|
||||
each tenant Fly Machine from this image.
|
||||
|
||||
### Deployment state (informational — not in any repo)
|
||||
- Fly apps (`molecule-cp`, `molecule-tenant`): **pending CEO** (`flyctl apps create`).
|
||||
- Fly billing card: **pending CEO**.
|
||||
- First real tenant provision: **blocked** on the two above.
|
||||
|
||||
### File deltas (public repo)
|
||||
- `.github/workflows/publish-platform-image.yml` — new.
|
||||
- `CLAUDE.md` — tick-9 block for the new CI workflow.
|
||||
- `PLAN.md` — new "Recently launched (2026-04-15 tick-9)" entry.
|
||||
@ -51,6 +51,7 @@ defaults:
|
||||
infra: [DevOps Engineer]
|
||||
qa: [QA Engineer]
|
||||
performance: [Backend Engineer]
|
||||
docs: [Documentation Specialist]
|
||||
mixed: [Dev Lead]
|
||||
|
||||
# workspace_dir: not set by default — each agent gets an isolated Docker volume
|
||||
@ -127,19 +128,73 @@ workspaces:
|
||||
4. Read /workspace/repo/docs/product/overview.md to understand the product
|
||||
5. Use commit_memory to save key product facts for later recall
|
||||
6. Wait for tasks from PM.
|
||||
schedules:
|
||||
- name: Hourly ecosystem watch
|
||||
cron_expr: "8 * * * *"
|
||||
prompt: |
|
||||
Daily survey for new agent-infra / AI-agent projects worth tracking.
|
||||
|
||||
1. Pull docs/ecosystem-watch.md to know what's already tracked.
|
||||
2. Browse the web for last 24h:
|
||||
- github.com/trending?since=daily&language=python (and typescript, go)
|
||||
- HN front page, anything about agent frameworks
|
||||
- Twitter/X mentions of new agent SDKs, MCP servers, frameworks
|
||||
3. Cross-reference: skip anything already in ecosystem-watch.md.
|
||||
4. For each genuinely new + relevant project (1-3 max per day):
|
||||
- Add an entry under "## Entries" using the existing template
|
||||
(Pitch / Shape / Overlap / Differentiation / Worth borrowing /
|
||||
Terminology collisions / Signals to react to / Last reviewed + stars)
|
||||
- Keep each entry ≤200 words.
|
||||
5. If a finding suggests a concrete improvement to plugins/, workspace-template/,
|
||||
or org-templates/, file a GH issue (`gh issue create`) with the proposal.
|
||||
6. Commit additions to a branch named chore/eco-watch-YYYY-MM-DD. PUSH it
|
||||
(per the repo "always raise PR" policy) and open a PR.
|
||||
7. Routing: delegate_task to PM with summary
|
||||
(audit_summary metadata: category=research, severity=info,
|
||||
issues=[<gh issue numbers>], top_recommendation=<one-liner>).
|
||||
8. If nothing notable today, skip the commit and PM-message a one-line "clean".
|
||||
enabled: true
|
||||
children:
|
||||
- name: Market Analyst
|
||||
role: Market sizing, trends, user research
|
||||
files_dir: market-analyst
|
||||
plugins: [browser-automation] # UNION with defaults (#71)
|
||||
plugins: [browser-automation]
|
||||
- name: Technical Researcher
|
||||
role: AI frameworks and protocol evaluation
|
||||
files_dir: technical-researcher
|
||||
plugins: [browser-automation] # UNION with defaults (#71)
|
||||
plugins: [browser-automation]
|
||||
schedules:
|
||||
- name: Hourly plugin curation
|
||||
cron_expr: "22 * * * *"
|
||||
prompt: |
|
||||
Weekly survey of `plugins/` and `workspace-template/builtin_tools/` for
|
||||
evolution opportunities. The team should keep gaining capabilities.
|
||||
|
||||
1. Inventory:
|
||||
- ls plugins/ — every plugin and its plugin.yaml description
|
||||
- ls workspace-template/builtin_tools/*.py — every builtin tool
|
||||
- cat org-templates/molecule-dev/org.yaml — see how plugins are wired
|
||||
2. Gap analysis:
|
||||
- Any builtin_tool not exposed via a plugin?
|
||||
- Any role with no plugins beyond defaults that *should* have extras?
|
||||
- Any plugin that's installed everywhere via defaults but is rarely used?
|
||||
3. External survey (use browser-automation):
|
||||
- github.com/topics/ai-agents (last week)
|
||||
- github.com/topics/mcp-server (last week)
|
||||
- claude.ai/cookbook, openai/swarm releases
|
||||
- anthropic blog, openai blog, langchain blog (last week)
|
||||
4. For 1-3 highest-value findings, file a GH issue with concrete proposal:
|
||||
- "Plugin proposal: <name> — wraps <upstream tool> for <role(s)>"
|
||||
- body: what it does, which roles benefit, integration sketch (~30 lines),
|
||||
upstream link, license check.
|
||||
5. Routing: delegate_task to PM with audit_summary metadata
|
||||
(category=plugins, issues=[…], top_recommendation=…).
|
||||
6. If nothing notable this week, PM-message a one-line "clean".
|
||||
enabled: true
|
||||
- name: Competitive Intelligence
|
||||
role: Competitor tracking and feature comparison
|
||||
files_dir: competitive-intelligence
|
||||
plugins: [browser-automation] # UNION with defaults (#71)
|
||||
plugins: [browser-automation]
|
||||
|
||||
- name: Dev Lead
|
||||
role: Engineering planning and team coordination
|
||||
@ -155,6 +210,51 @@ workspaces:
|
||||
4. Run: cd /workspace/repo && git log --oneline -5
|
||||
5. Use commit_memory to save the architecture summary and recent changes
|
||||
6. Wait for tasks from PM.
|
||||
schedules:
|
||||
- name: Hourly template fitness audit
|
||||
cron_expr: "15 * * * *"
|
||||
prompt: |
|
||||
Daily audit of `org-templates/molecule-dev/`. Catches drift, stale prompts,
|
||||
missing schedules, and gaps that block the team-runs-24/7 goal. Symptom
|
||||
of prior incident (issue #85): cron scheduler died silently for 10+ hours
|
||||
and nobody noticed because no one was watching template fitness.
|
||||
|
||||
1. CHECK SCHEDULES ARE FIRING:
|
||||
For every workspace_schedule in the platform DB:
|
||||
curl -s http://host.docker.internal:8080/workspaces/<id>/schedules
|
||||
Compare last_run_at to now() vs cron interval. Anything more than 2x
|
||||
the interval behind = STALE. File issue against platform.
|
||||
|
||||
2. CHECK SYSTEM PROMPTS ARE FRESH:
|
||||
cd /workspace/repo
|
||||
for f in org-templates/molecule-dev/*/system-prompt.md; do
|
||||
echo "$(git log -1 --format='%ar' -- "$f") $f"
|
||||
done
|
||||
Anything not touched in 30+ days might be stale relative to recent
|
||||
platform changes. Spot-check vs CLAUDE.md and recent merges.
|
||||
|
||||
3. CHECK ROLES HAVE PLUGINS THEY NEED:
|
||||
yq '.workspaces[] | (.name, .plugins)' org-templates/molecule-dev/org.yaml
|
||||
(or python+yaml). Roles inherit defaults; flag any role that should
|
||||
plausibly have role-specific extras (compare role description vs
|
||||
plugins list).
|
||||
|
||||
4. CHECK CRONS COVER THE EVOLUTION LEVERS:
|
||||
The team must keep evolving plugins, template, channels, watchlist.
|
||||
Verify schedules exist for: ecosystem-watch (Research Lead),
|
||||
plugin-curation (Technical Researcher), template-fitness (you,
|
||||
this cron), channel-expansion (DevOps).
|
||||
Any missing? File issue.
|
||||
|
||||
5. CHECK CHANNELS:
|
||||
Today only PM has telegram. Should any other role have a channel?
|
||||
(Security Auditor → email on critical findings; DevOps → Slack on
|
||||
build breaks; etc.) File issue if a channel gap is meaningful.
|
||||
|
||||
6. ROUTING: delegate_task to PM with audit_summary metadata
|
||||
(category=template, severity=…, issues=[…], top_recommendation=…).
|
||||
7. If everything is fit and current, PM-message one-line "clean".
|
||||
enabled: true
|
||||
children:
|
||||
- name: Frontend Engineer
|
||||
role: >-
|
||||
@ -227,6 +327,37 @@ workspaces:
|
||||
4. Read /workspace/repo/.github/workflows/ci.yml
|
||||
5. Use commit_memory to save CI pipeline structure
|
||||
6. Wait for tasks from Dev Lead.
|
||||
schedules:
|
||||
- name: Hourly channel expansion survey
|
||||
cron_expr: "47 * * * *"
|
||||
prompt: |
|
||||
Weekly survey of channel integrations (Telegram, Slack, Discord, email,
|
||||
webhooks). The team should grow its external comms surface where useful,
|
||||
not stay locked at "PM-only Telegram".
|
||||
|
||||
1. INVENTORY:
|
||||
yq '.workspaces[] | {name: .name, channels: .channels}' \
|
||||
org-templates/molecule-dev/org.yaml 2>/dev/null
|
||||
(or python+yaml). List which roles have which channels.
|
||||
2. PLATFORM CAPABILITY CHECK:
|
||||
grep -rE "channel|telegram|slack|discord|webhook" \
|
||||
platform/internal/handlers/ --include="*.go" -l
|
||||
What channel types does the platform actually support today?
|
||||
3. GAP ANALYSIS:
|
||||
- PM has Telegram → can the user reach OTHER roles directly?
|
||||
- Security Auditor: would email-on-critical-finding help?
|
||||
- DevOps Engineer: would Slack-on-CI-break help?
|
||||
- Any role that produces high-value asynchronous output but the
|
||||
user has to poll memory to see it?
|
||||
4. EXTERNAL: are there channel platforms we should consider adding?
|
||||
(Discord for community, GitHub Discussions for product, etc.)
|
||||
5. For the top 1-2 gaps, file a GH issue:
|
||||
- "Channel proposal: <type> for <role>" with rationale, integration
|
||||
sketch, secret requirements (e.g. SLACK_BOT_TOKEN as global secret).
|
||||
6. ROUTING: delegate_task to PM with audit_summary metadata
|
||||
(category=channels, issues=[…], top_recommendation=…).
|
||||
7. If no gap this week, PM-message a one-line "clean".
|
||||
enabled: true
|
||||
- name: Security Auditor
|
||||
role: >-
|
||||
Owns security posture across the full stack: Go/Gin handlers
|
||||
@ -488,3 +619,185 @@ workspaces:
|
||||
|
||||
d. Save to memory key 'uiux-audit-latest' as a secondary record only.
|
||||
enabled: true
|
||||
|
||||
- name: Documentation Specialist
|
||||
role: >-
|
||||
Owns end-to-end documentation across THREE Molecule AI repos:
|
||||
(1) the platform monorepo (public, Molecule-AI/molecule-monorepo) —
|
||||
internal architecture, READMEs, edit-history, public API references;
|
||||
(2) the docs site (public, Molecule-AI/docs) — Fumadocs + Next.js 15,
|
||||
deployed to doc.moleculesai.app, customer-facing;
|
||||
(3) the SaaS controlplane (PRIVATE, Molecule-AI/molecule-controlplane) —
|
||||
Go service that provisions tenants on Fly Machines, with the strict
|
||||
rule that private implementation details NEVER leak into the public
|
||||
docs site. Documents controlplane changes only in its own internal
|
||||
README and the platform monorepo's docs/saas/ section (which itself
|
||||
is gated). Public docs only describe the SaaS PRODUCT (signup, billing,
|
||||
tenant lifecycle, multi-tenant data isolation guarantees) — not the
|
||||
provisioner's internals.
|
||||
Watches PRs landing on all three repos and opens corresponding docs
|
||||
PRs whenever a public API changes, a new template/plugin/channel
|
||||
lands, a user-facing concept evolves, or an ecosystem-watch entry
|
||||
needs publishing. Holds the line on terminology consistency — every
|
||||
concept has exactly one canonical name across all three repos.
|
||||
Definition of done: every public surface has accurate, current,
|
||||
example-rich documentation; every merged PR that touches a public
|
||||
surface has a paired docs PR open within one cron tick; every stub
|
||||
page on the docs site eventually gets backfilled; controlplane
|
||||
internal docs stay current; nothing private leaks to public.
|
||||
tier: 3
|
||||
model: opus
|
||||
files_dir: documentation-specialist
|
||||
canvas: { x: 900, y: 250 }
|
||||
# Documentation Specialist needs browser-automation to crawl the live
|
||||
# docs site (visual regressions, broken links, dead anchors) plus
|
||||
# update-docs skill (already in defaults) for cross-repo docs sync.
|
||||
plugins: [browser-automation]
|
||||
initial_prompt: |
|
||||
You just started as Documentation Specialist. Set up silently — do NOT contact other agents.
|
||||
|
||||
⚠️ PRIVACY RULE (read first, never violate):
|
||||
molecule-controlplane is a PRIVATE repo. Its source code, file paths,
|
||||
internal endpoints, schema details, infra config, billing/auth
|
||||
implementation — none of that goes into the public docs site
|
||||
(Molecule-AI/docs) or the public README in molecule-monorepo. Public
|
||||
docs may describe the SaaS PRODUCT (signup, billing, tenant isolation
|
||||
guarantees) but never the provisioner's internals. When in doubt:
|
||||
don't publish.
|
||||
|
||||
1. Clone all three repos:
|
||||
git clone https://github.com/${GITHUB_REPO}.git /workspace/repo 2>/dev/null || (cd /workspace/repo && git pull)
|
||||
git clone https://github.com/Molecule-AI/docs.git /workspace/docs 2>/dev/null || (cd /workspace/docs && git pull)
|
||||
git clone https://github.com/Molecule-AI/molecule-controlplane.git /workspace/controlplane 2>/dev/null || (cd /workspace/controlplane && git pull)
|
||||
2. Read /workspace/repo/CLAUDE.md — full architecture, what's public-facing
|
||||
3. Read /configs/system-prompt.md
|
||||
4. Read /workspace/docs/README.md and /workspace/docs/content/docs/index.mdx
|
||||
5. Read /workspace/controlplane/README.md and /workspace/controlplane/PLAN.md
|
||||
— understand what the SaaS provisioner does (private) vs what users see (public)
|
||||
6. Run: cd /workspace/docs && ls content/docs/*.mdx
|
||||
— note which pages are stubs ("Coming soon" marker) vs hand-written
|
||||
7. Run: cd /workspace/repo && git log --oneline -20 -- platform/internal/handlers/ org-templates/ plugins/
|
||||
— note recent public-surface changes in the platform repo
|
||||
8. Run: cd /workspace/controlplane && git log --oneline -20
|
||||
— note recent controlplane changes (these need internal docs only)
|
||||
9. Use commit_memory to save:
|
||||
- Stubs that need backfilling (docs site)
|
||||
- Recent platform PRs that have NO docs PR yet
|
||||
- Recent controlplane PRs whose internal README needs an update
|
||||
- Public concepts that lack a canonical naming entry
|
||||
10. Wait for tasks from PM. Your owned surfaces are:
|
||||
- https://github.com/Molecule-AI/docs (customer site, Fumadocs) — PUBLIC
|
||||
- /workspace/repo/docs/ (internal architecture / edit-history) — PUBLIC
|
||||
- /workspace/repo/README.md and per-package READMEs — PUBLIC
|
||||
- /workspace/controlplane/README.md, PLAN.md, internal docs — PRIVATE
|
||||
schedules:
|
||||
- name: Daily docs sync — backfill stubs and pair recent platform PRs
|
||||
cron_expr: "0 9 * * *"
|
||||
prompt: |
|
||||
Daily documentation maintenance. Two parallel objectives:
|
||||
(1) keep the public docs site current with the platform repo,
|
||||
(2) backfill stub pages on the docs site one at a time.
|
||||
|
||||
SETUP:
|
||||
cd /workspace/repo && git pull 2>/dev/null || true
|
||||
cd /workspace/docs && git pull 2>/dev/null || true
|
||||
cd /workspace/controlplane && git pull 2>/dev/null || true
|
||||
|
||||
1a. PAIR RECENT PLATFORM PRS (last 24h):
|
||||
cd /workspace/repo
|
||||
gh pr list --repo Molecule-AI/molecule-monorepo --state merged \
|
||||
--search "merged:>$(date -u -d '24 hours ago' +%Y-%m-%dT%H:%M:%SZ)" \
|
||||
--json number,title,files
|
||||
For each merged PR that touches a public surface
|
||||
(platform/internal/handlers/, plugins/*, org-templates/*,
|
||||
docs/architecture.md, README.md, workspace-template/adapters/*):
|
||||
- Identify which docs page(s) on the public site cover that surface.
|
||||
- If a docs page exists but is stale → update it with examples
|
||||
from the PR diff. Open a PR to Molecule-AI/docs with the change.
|
||||
- If NO docs page exists for the new surface → propose one
|
||||
(add to content/docs/meta.json + new .mdx file). Open a PR.
|
||||
- Always close PRs with `Closes platform PR #N` so the link is durable.
|
||||
|
||||
1b. PAIR RECENT CONTROLPLANE PRS (last 24h):
|
||||
cd /workspace/controlplane
|
||||
gh pr list --repo Molecule-AI/molecule-controlplane --state merged \
|
||||
--search "merged:>$(date -u -d '24 hours ago' +%Y-%m-%dT%H:%M:%SZ)" \
|
||||
--json number,title,files
|
||||
⚠️ PRIVATE REPO. Two cases:
|
||||
(i) Internal-only change (handler, schema, infra, fly.toml,
|
||||
billing logic): update README.md + PLAN.md + any
|
||||
docs/internal/*.md inside molecule-controlplane itself.
|
||||
Open the PR against Molecule-AI/molecule-controlplane.
|
||||
NEVER mention these changes in /workspace/docs.
|
||||
(ii) Customer-facing change (new tier, new region, new SLA,
|
||||
pricing change, signup flow change): write a sanitized
|
||||
description for the PUBLIC docs site (e.g. "We now offer
|
||||
EU-region tenants" — NOT "controlplane reads FLY_REGION
|
||||
from env and passes it to provisioner.go:142"). Open a
|
||||
PR against Molecule-AI/docs.
|
||||
When unsure which category a change falls into: default to
|
||||
INTERNAL-only and ask PM for explicit approval before publishing.
|
||||
|
||||
2. BACKFILL ONE STUB PAGE:
|
||||
cd /workspace/docs
|
||||
grep -l "Coming soon" content/docs/*.mdx | head -1
|
||||
Pick the highest-priority stub (one of: org-template, plugins,
|
||||
channels, schedules, architecture, api-reference, self-hosting,
|
||||
observability, troubleshooting). Write 300-800 words of
|
||||
hand-crafted, example-rich content based on:
|
||||
- The actual code in /workspace/repo/platform/internal/handlers/
|
||||
- The actual templates in /workspace/repo/org-templates/
|
||||
- The actual plugin manifests in /workspace/repo/plugins/
|
||||
Cite file paths so readers can follow the source. Open a PR.
|
||||
|
||||
3. LINK + ANCHOR CHECK:
|
||||
Use the browser-automation plugin to crawl
|
||||
https://doc.moleculesai.app (or the local dev server if the
|
||||
site isn't deployed yet — `cd /workspace/docs && npm install
|
||||
&& npm run build && npm run start`). Report broken links and
|
||||
missing anchors back to PM.
|
||||
|
||||
4. ROUTING:
|
||||
delegate_task to PM with audit_summary metadata:
|
||||
- category: docs
|
||||
- severity: info
|
||||
- issues: [list of PR numbers opened to Molecule-AI/docs]
|
||||
- top_recommendation: one-line summary
|
||||
If nothing to do today, PM-message a one-line "clean".
|
||||
|
||||
5. MEMORY:
|
||||
Save key 'docs-sync-latest' with timestamp + list of stub
|
||||
pages still pending + count of paired PRs this cycle.
|
||||
enabled: true
|
||||
- name: Weekly terminology + freshness audit
|
||||
cron_expr: "0 11 * * 1"
|
||||
prompt: |
|
||||
Weekly audit of documentation freshness and terminology consistency.
|
||||
|
||||
1. STALE PAGE DETECTION:
|
||||
cd /workspace/docs && for f in content/docs/*.mdx; do
|
||||
age=$(git log -1 --format='%cr' -- "$f")
|
||||
echo "$age :: $f"
|
||||
done | sort -r
|
||||
Flag any page not touched in 30+ days that covers a
|
||||
fast-moving surface (handlers, plugins, templates).
|
||||
|
||||
2. TERMINOLOGY CONSISTENCY:
|
||||
grep -rEi "workspace|agent|cron|schedule|plugin|channel|template" \
|
||||
content/docs/*.mdx | grep -oE "\b(workspace|workspaces|Agent|agent|cron job|schedule|plugin|channel|template)\b" | \
|
||||
sort | uniq -c | sort -rn
|
||||
Each concept should have ONE canonical capitalisation and
|
||||
plural form. Open a PR fixing inconsistencies.
|
||||
|
||||
3. LINK ROT:
|
||||
grep -rE "\\[.*\\]\\(http[^)]+\\)" content/docs/*.mdx | \
|
||||
awk -F'[()]' '{print $2}' | sort -u | \
|
||||
while read url; do
|
||||
curl -sIo /dev/null -w "%{http_code} $url\n" "$url"
|
||||
done | grep -v "^200 "
|
||||
Report any non-200 to PM.
|
||||
|
||||
4. ROUTING + MEMORY:
|
||||
Same audit_summary contract as the daily cron.
|
||||
Save findings to memory key 'docs-weekly-audit'.
|
||||
enabled: true
|
||||
|
||||
@ -7,6 +7,13 @@ import (
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
)
|
||||
|
||||
// maxAncestorWalk caps the depth of the parent-chain walk in
// isAncestorOf (used by CanCommunicate). Org trees are realistically
// 3-5 deep (PM → Dev Lead → Backend Engineer is depth 3); 32 is a safety
// ceiling so a malformed cycle in the workspaces table can't loop
// forever.
const maxAncestorWalk = 32
|
||||
|
||||
type workspaceRef struct {
|
||||
ID string
|
||||
ParentID *string
|
||||
@ -26,8 +33,51 @@ func getWorkspaceRef(id string) (*workspaceRef, error) {
|
||||
return &ws, nil
|
||||
}
|
||||
|
||||
// CanCommunicate checks if two workspaces can talk to each other
|
||||
// based on the hierarchy rules: siblings, parent-child, root-level siblings.
|
||||
// isAncestorOf returns true if `ancestorID` is found anywhere on the
|
||||
// parent-chain walk starting from `childID`. Walks at most maxAncestorWalk
|
||||
// steps so a corrupt parent-cycle cannot loop forever. Returns false on any
|
||||
// DB lookup error (logged) — fail-secure.
|
||||
func isAncestorOf(ancestorID, childID string) bool {
|
||||
current := childID
|
||||
for i := 0; i < maxAncestorWalk; i++ {
|
||||
ref, err := getWorkspaceRef(current)
|
||||
if err != nil {
|
||||
log.Printf("isAncestorOf: walk lookup %s: %v", current, err)
|
||||
return false
|
||||
}
|
||||
if ref.ParentID == nil {
|
||||
return false
|
||||
}
|
||||
if *ref.ParentID == ancestorID {
|
||||
return true
|
||||
}
|
||||
current = *ref.ParentID
|
||||
}
|
||||
log.Printf("isAncestorOf: walk exceeded maxAncestorWalk=%d from %s — corrupt parent chain?",
|
||||
maxAncestorWalk, childID)
|
||||
return false
|
||||
}
|
||||
|
||||
// CanCommunicate checks if two workspaces can talk to each other based on
|
||||
// the org hierarchy. The rules:
|
||||
//
|
||||
// - self → self
|
||||
// - siblings (same parent, including both root-level)
|
||||
// - any ancestor → any descendant (e.g. PM → Backend Engineer)
|
||||
// - any descendant → any ancestor (e.g. Security Auditor → PM)
|
||||
//
|
||||
// The third and fourth rules generalise the previous "direct parent ↔
|
||||
// child" check. Originally this was strict 1-step parent/child only,
|
||||
// which broke the audit-routing contract: Security Auditor (under Dev
|
||||
// Lead, under PM) could not call delegate_task on PM to deliver an
|
||||
// audit_summary, so it fell back to delegating to Dev Lead — bypassing
|
||||
// PM's category_routing entirely.
|
||||
//
|
||||
// The relaxation preserves the hierarchy intent (no horizontal cross-team
|
||||
// chatter — Frontend Engineer cannot directly message Backend Engineer
|
||||
// unless they share a parent, which they do under Dev Lead) while
|
||||
// unblocking the leadership-chain pattern that is fundamental to how
|
||||
// audit summaries fan out across the org.
|
||||
func CanCommunicate(callerID, targetID string) bool {
|
||||
if callerID == targetID {
|
||||
return true
|
||||
@ -54,15 +104,27 @@ func CanCommunicate(callerID, targetID string) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
// Parent talking to child
|
||||
// Direct parent → child (fast path; avoids the ancestor walk)
|
||||
if target.ParentID != nil && caller.ID == *target.ParentID {
|
||||
return true
|
||||
}
|
||||
|
||||
// Child talking up to parent
|
||||
// Direct child → parent (fast path)
|
||||
if caller.ParentID != nil && target.ID == *caller.ParentID {
|
||||
return true
|
||||
}
|
||||
|
||||
// Distant ancestor → descendant: caller is somewhere up target's chain.
|
||||
// Triggers extra DB lookups, only reached when the fast paths above didn't match.
|
||||
if target.ParentID != nil && isAncestorOf(callerID, *target.ParentID) {
|
||||
return true
|
||||
}
|
||||
|
||||
// Distant descendant → ancestor: target is somewhere up caller's chain.
|
||||
// (e.g. Security Auditor → PM, where Security Auditor's parent is Dev Lead.)
|
||||
if caller.ParentID != nil && isAncestorOf(targetID, *caller.ParentID) {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
@ -97,9 +97,13 @@ func TestCanCommunicate_ChildToParent(t *testing.T) {
|
||||
|
||||
func TestCanCommunicate_Denied_DifferentParents(t *testing.T) {
|
||||
mock := setupMockDB(t)
|
||||
// ws-a (parent: p1) and ws-b (parent: p2) — not siblings
|
||||
// ws-a (parent: p1) and ws-b (parent: p2) — not siblings, no shared ancestor.
|
||||
expectLookup(mock, "ws-a", ptr("p1"))
|
||||
expectLookup(mock, "ws-b", ptr("p2"))
|
||||
// Walk #1: isAncestorOf(ws-a, p2) → p2 is parentless, false.
|
||||
expectLookup(mock, "p2", nil)
|
||||
// Walk #2: isAncestorOf(ws-b, p1) → p1 is parentless, false.
|
||||
expectLookup(mock, "p1", nil)
|
||||
|
||||
if CanCommunicate("ws-a", "ws-b") {
|
||||
t.Error("workspaces with different parents should NOT communicate")
|
||||
@ -108,9 +112,15 @@ func TestCanCommunicate_Denied_DifferentParents(t *testing.T) {
|
||||
|
||||
func TestCanCommunicate_Denied_CousinToRoot(t *testing.T) {
|
||||
mock := setupMockDB(t)
|
||||
// ws-child (parent: ws-mid) and ws-root (no parent, NOT ws-mid)
|
||||
// ws-child (parent: ws-mid, which has its own root ws-other-root) and
|
||||
// ws-root (a different parentless workspace).
|
||||
// The ancestor walk from ws-child should reach ws-other-root but never
|
||||
// ws-root, so communication is denied.
|
||||
expectLookup(mock, "ws-child", ptr("ws-mid"))
|
||||
expectLookup(mock, "ws-root", nil)
|
||||
// Ancestor walk: starts at *caller.ParentID = ws-mid. Walks ws-mid → ws-other-root → nil.
|
||||
expectLookup(mock, "ws-mid", ptr("ws-other-root"))
|
||||
expectLookup(mock, "ws-other-root", nil)
|
||||
|
||||
if CanCommunicate("ws-child", "ws-root") {
|
||||
t.Error("child should NOT communicate with unrelated root workspace")
|
||||
@ -136,13 +146,75 @@ func TestCanCommunicate_Denied_TargetNotFound(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestCanCommunicate_Denied_Grandchild(t *testing.T) {
|
||||
func TestCanCommunicate_Allowed_GrandparentToGrandchild(t *testing.T) {
|
||||
mock := setupMockDB(t)
|
||||
// ws-grandparent and ws-grandchild (parent: ws-mid, NOT ws-grandparent)
|
||||
expectLookup(mock, "ws-grandparent", nil)
|
||||
expectLookup(mock, "ws-grandchild", ptr("ws-mid"))
|
||||
// PM (no parent) → Backend Engineer (parent: Dev Lead, parent: PM).
|
||||
// Originally rejected ("grandparent should NOT communicate with grandchild
|
||||
// directly") — that broke audit_summary routing because Security Auditor
|
||||
// could not delegate up to PM. The hierarchy is now ancestor↔descendant.
|
||||
expectLookup(mock, "ws-pm", nil)
|
||||
expectLookup(mock, "ws-be", ptr("ws-dl"))
|
||||
// Ancestor walk: target.ParentID = ws-dl. isAncestorOf(ws-pm, ws-dl).
|
||||
// Walks ws-dl → ws-pm → match. (Walk lookup #1: ws-dl.)
|
||||
expectLookup(mock, "ws-dl", ptr("ws-pm"))
|
||||
|
||||
if CanCommunicate("ws-grandparent", "ws-grandchild") {
|
||||
t.Error("grandparent should NOT communicate with grandchild directly")
|
||||
if !CanCommunicate("ws-pm", "ws-be") {
|
||||
t.Error("PM should be able to communicate with Backend Engineer (descendant)")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCanCommunicate_Allowed_GrandchildToGrandparent(t *testing.T) {
|
||||
mock := setupMockDB(t)
|
||||
// Security Auditor (parent: Dev Lead) → PM (parent of Dev Lead).
|
||||
// This is the Security Auditor → PM audit_summary delivery path.
|
||||
expectLookup(mock, "ws-sec", ptr("ws-dl"))
|
||||
expectLookup(mock, "ws-pm", nil)
|
||||
// Direct parent → child fast path: target.ParentID = nil, skip.
|
||||
// Direct child → parent: caller.ParentID = ws-dl, target.ID = ws-pm,
|
||||
// ws-dl != ws-pm, skip.
|
||||
// Distant ancestor → descendant: target.ParentID = nil, skip.
|
||||
// Distant descendant → ancestor: caller.ParentID = ws-dl. Walks
|
||||
// isAncestorOf(ws-pm, ws-dl) → looks up ws-dl → returns ws-pm → match.
|
||||
expectLookup(mock, "ws-dl", ptr("ws-pm"))
|
||||
|
||||
if !CanCommunicate("ws-sec", "ws-pm") {
|
||||
t.Error("Security Auditor should be able to send audit_summary up to PM")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCanCommunicate_Allowed_DeepAncestor(t *testing.T) {
|
||||
mock := setupMockDB(t)
|
||||
// Four-level chain: ws-leaf (parent: ws-l3, parent: ws-l2, parent: ws-l1).
|
||||
// ws-leaf → ws-l1 should be allowed.
|
||||
expectLookup(mock, "ws-leaf", ptr("ws-l3"))
|
||||
expectLookup(mock, "ws-l1", nil)
|
||||
// Distant descendant → ancestor walk: starts at ws-l3.
|
||||
// ws-l3 → ws-l2: not ws-l1, continue.
|
||||
// ws-l2 → ws-l1: match!
|
||||
expectLookup(mock, "ws-l3", ptr("ws-l2"))
|
||||
expectLookup(mock, "ws-l2", ptr("ws-l1"))
|
||||
|
||||
if !CanCommunicate("ws-leaf", "ws-l1") {
|
||||
t.Error("4-level descendant should reach root ancestor")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCanCommunicate_Denied_UnrelatedAncestors(t *testing.T) {
|
||||
mock := setupMockDB(t)
|
||||
// Two separate org subtrees:
|
||||
// tree A: ws-a-leaf → ws-a-mid → ws-a-root
|
||||
// tree B: ws-b-leaf → ws-b-mid → ws-b-root
|
||||
// ws-a-leaf → ws-b-root must be denied even though both have parents
|
||||
// (no shared ancestor).
|
||||
expectLookup(mock, "ws-a-leaf", ptr("ws-a-mid"))
|
||||
expectLookup(mock, "ws-b-root", nil)
|
||||
// Walk: isAncestorOf(ws-b-root, ws-a-mid).
|
||||
// ws-a-mid → ws-a-root: not ws-b-root, continue.
|
||||
// ws-a-root has no parent → false.
|
||||
expectLookup(mock, "ws-a-mid", ptr("ws-a-root"))
|
||||
expectLookup(mock, "ws-a-root", nil)
|
||||
|
||||
if CanCommunicate("ws-a-leaf", "ws-b-root") {
|
||||
t.Error("workspaces in different subtrees should NOT communicate via the walk")
|
||||
}
|
||||
}
|
||||
|
||||
@ -48,22 +48,70 @@ type scheduleRow struct {
|
||||
type Scheduler struct {
|
||||
proxy A2AProxy
|
||||
broadcaster Broadcaster
|
||||
|
||||
// lastTickAt records the wall-clock time of the most recent tick
|
||||
// (whether it fired schedules or not). Read by Healthy() and the
|
||||
// /admin/scheduler/health endpoint to detect stuck-tick conditions.
|
||||
// Atomic-ish via the mutex; tick rate is 30s so contention is trivial.
|
||||
mu sync.RWMutex
|
||||
lastTickAt time.Time
|
||||
}
|
||||
|
||||
func New(proxy A2AProxy, broadcaster Broadcaster) *Scheduler {
|
||||
return &Scheduler{proxy: proxy, broadcaster: broadcaster}
|
||||
}
|
||||
|
||||
// LastTickAt returns the wall-clock time of the most recent successful tick.
|
||||
// Returns the zero Time if Start() has never been called or no tick has
|
||||
// completed since process start.
|
||||
func (s *Scheduler) LastTickAt() time.Time {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
return s.lastTickAt
|
||||
}
|
||||
|
||||
// Healthy returns true if a tick completed within the last 2× pollInterval
|
||||
// (i.e. at most 1 missed tick is tolerated). Use from /health and from
|
||||
// /admin/scheduler/health to surface scheduler liveness.
|
||||
func (s *Scheduler) Healthy() bool {
|
||||
last := s.LastTickAt()
|
||||
if last.IsZero() {
|
||||
return false
|
||||
}
|
||||
return time.Since(last) < 2*pollInterval
|
||||
}
|
||||
|
||||
// Start runs the scheduler poll loop. Blocks until ctx is cancelled.
|
||||
//
|
||||
// Defends against panics inside tick() so a single bad row / bad cron
|
||||
// expression / DB blip can't permanently kill the scheduler. Without
|
||||
// this recover the goroutine dies and the only signal to the operator
|
||||
// is "no crons firing" — which we observed as a 12+ hour silent outage
|
||||
// on 2026-04-14 (issue #85).
|
||||
func (s *Scheduler) Start(ctx context.Context) {
|
||||
ticker := time.NewTicker(pollInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
log.Printf("Scheduler: started (poll interval=%s)", pollInterval)
|
||||
|
||||
// Heartbeat before the first tick so /admin/liveness doesn't flag stale
|
||||
// during the initial 30s interval after startup.
|
||||
tickWithRecover := func() {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
log.Printf("Scheduler: PANIC in tick — recovered: %v (next tick in %s)", r, pollInterval)
|
||||
}
|
||||
}()
|
||||
s.tick(ctx)
|
||||
s.mu.Lock()
|
||||
s.lastTickAt = time.Now()
|
||||
s.mu.Unlock()
|
||||
}
|
||||
|
||||
// Heartbeat + initial lastTickAt so /admin/liveness and Healthy() both
|
||||
// pass during the first 30s interval after startup.
|
||||
supervised.Heartbeat("scheduler")
|
||||
s.mu.Lock()
|
||||
s.lastTickAt = time.Now()
|
||||
s.mu.Unlock()
|
||||
|
||||
for {
|
||||
select {
|
||||
@ -71,7 +119,7 @@ func (s *Scheduler) Start(ctx context.Context) {
|
||||
log.Println("Scheduler: stopped")
|
||||
return
|
||||
case <-ticker.C:
|
||||
s.tick(ctx)
|
||||
tickWithRecover()
|
||||
supervised.Heartbeat("scheduler")
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user