Renames:
- platform/ → workspace-server/ (Go module path stays as "platform" for external dep compat — will update after plugin module republish)
- workspace-template/ → workspace/

Removed (moved to separate repos or deleted):
- PLAN.md — internal roadmap (move to private project board)
- HANDOFF.md, AGENTS.md — one-time internal session docs
- .claude/ — gitignored entirely (local agent config)
- infra/cloudflare-worker/ → Molecule-AI/molecule-tenant-proxy
- org-templates/molecule-dev/ → standalone template repo
- .mcp-eval/ → molecule-mcp-server repo
- test-results/ — ephemeral, gitignored

Security scrubbing:
- Cloudflare account/zone/KV IDs → placeholders
- Real EC2 IPs → <EC2_IP> in all docs
- CF token prefix, Neon project ID, Fly app names → redacted
- Langfuse dev credentials → parameterized
- Personal runner username/machine name → generic

Community files:
- CONTRIBUTING.md — build, test, branch conventions
- CODE_OF_CONDUCT.md — Contributor Covenant 2.1

All Dockerfiles, CI workflows, docker-compose, railway.toml, render.yaml, README, CLAUDE.md updated for new directory names.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

// Package handlers — restart_context.go implements Layer 1 of issue #19:
// after a workspace is restarted and comes back online, the platform
// generates a state snapshot (timestamp, previous session end, env-var
// keys now available) and delivers it as a synthetic A2A message/send
// so the agent sees what changed across the restart boundary.
//
// Layer 2 (user-defined restart_prompt via config.yaml / org.yaml) is
// out of scope for this file — tracked as a separate follow-up issue.
package handlers

import (
	"context"
	"database/sql"
	"encoding/json"
	"fmt"
	"log"
	"sort"
	"time"

	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
	"github.com/google/uuid"
)

// restartContextOnlineTimeout bounds how long we wait for a workspace
// to re-register after restart before dropping the context message.
// The Restart HTTP handler has already returned 200 by the time this
// waiter runs, so a timeout here is purely a best-effort skip.
const restartContextOnlineTimeout = 30 * time.Second

// restartContextOnlinePollInterval is the poll cadence while waiting
// for WORKSPACE_ONLINE. 500ms keeps the typical-case latency low
// without hammering Postgres.
const restartContextOnlinePollInterval = 500 * time.Millisecond

// restartContextData captures the platform-computed snapshot that will
// be rendered into a human-readable message. Keeping it as a struct
// (rather than building the string inline) makes the builder
// unit-testable without stubbing time/DB calls.
type restartContextData struct {
	RestartAt     time.Time
	PrevSessionAt time.Time // zero value = no prior session recorded
	EnvKeys       []string  // sorted list of env-var keys (no values)
}

// buildRestartContextMessage renders the restart context into the
// exact format proposed in issue #19. Fields that have no data (e.g.
// first-ever session) are rendered with a neutral placeholder so the
// agent always sees a consistent shape.
func buildRestartContextMessage(d restartContextData) string {
	msg := "=== WORKSPACE RESTART CONTEXT ===\n"
	msg += fmt.Sprintf("Restart at: %s\n", d.RestartAt.UTC().Format(time.RFC3339))

	if d.PrevSessionAt.IsZero() {
		msg += "Previous session ended: (no prior session on record)\n"
	} else {
		delta := d.RestartAt.Sub(d.PrevSessionAt)
		msg += fmt.Sprintf("Previous session ended: %s (%s ago)\n",
			d.PrevSessionAt.UTC().Format(time.RFC3339),
			humanDuration(delta))
	}

	if len(d.EnvKeys) == 0 {
		msg += "Env vars now available: (none)\n"
	} else {
		msg += fmt.Sprintf("Env vars now available: %s\n", joinStrings(d.EnvKeys, ", "))
	}

	msg += "=== END RESTART CONTEXT ===\n"
	return msg
}
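
// For illustration only (the timestamps and key names below are made up,
// not taken from a real run), a snapshot with a prior heartbeat and two
// env-var keys renders as:
//
//	=== WORKSPACE RESTART CONTEXT ===
//	Restart at: 2024-05-01T12:00:00Z
//	Previous session ended: 2024-05-01T09:46:00Z (2h14m ago)
//	Env vars now available: DATABASE_URL, OPENAI_API_KEY
//	=== END RESTART CONTEXT ===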

// humanDuration formats a duration for display in the restart context.
// Keeps the output terse ("2h14m", "38s") without pulling in a
// humanize library. Negative/zero deltas render as "0s".
func humanDuration(d time.Duration) string {
	if d <= 0 {
		return "0s"
	}
	d = d.Round(time.Second)
	h := int(d / time.Hour)
	m := int((d % time.Hour) / time.Minute)
	s := int((d % time.Minute) / time.Second)
	switch {
	case h > 0:
		return fmt.Sprintf("%dh%dm", h, m)
	case m > 0:
		return fmt.Sprintf("%dm%ds", m, s)
	default:
		return fmt.Sprintf("%ds", s)
	}
}

// joinStrings is strings.Join — inlined to avoid an import cycle
// concern in a file that already carries a handful of stdlib deps.
func joinStrings(parts []string, sep string) string {
	switch len(parts) {
	case 0:
		return ""
	case 1:
		return parts[0]
	}
	n := len(sep) * (len(parts) - 1)
	for i := 0; i < len(parts); i++ {
		n += len(parts[i])
	}
	b := make([]byte, 0, n)
	b = append(b, parts[0]...)
	for _, p := range parts[1:] {
		b = append(b, sep...)
		b = append(b, p...)
	}
	return string(b)
}

// loadRestartContextData gathers the snapshot inputs from the DB.
// Called *before* the restart mutates workspace state so the "previous
// session ended" timestamp reflects the pre-restart heartbeat, not the
// newly-provisioning row.
func loadRestartContextData(ctx context.Context, workspaceID string) restartContextData {
	d := restartContextData{RestartAt: time.Now()}

	var lastHB sql.NullTime
	if err := db.DB.QueryRowContext(ctx,
		`SELECT last_heartbeat_at FROM workspaces WHERE id = $1`, workspaceID,
	).Scan(&lastHB); err == nil && lastHB.Valid {
		d.PrevSessionAt = lastHB.Time
	}

	// Env-var keys: union of global secrets + workspace-specific
	// secrets. Values are NEVER included — only keys — so the agent
	// can reason about "did my missing credential arrive?" without
	// the platform ever echoing secret material back into the
	// message bus.
	keySet := map[string]struct{}{}
	if rows, err := db.DB.QueryContext(ctx, `SELECT key FROM global_secrets`); err == nil {
		for rows.Next() {
			var k string
			if rows.Scan(&k) == nil {
				keySet[k] = struct{}{}
			}
		}
		rows.Close()
	}
	if rows, err := db.DB.QueryContext(ctx,
		`SELECT key FROM workspace_secrets WHERE workspace_id = $1`, workspaceID,
	); err == nil {
		for rows.Next() {
			var k string
			if rows.Scan(&k) == nil {
				keySet[k] = struct{}{}
			}
		}
		rows.Close()
	}
	for k := range keySet {
		d.EnvKeys = append(d.EnvKeys, k)
	}
	sort.Strings(d.EnvKeys)
	return d
}
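
// Taken together, the SQL in this function and in waitForWorkspaceOnline
// below assumes the following tables and columns exist (listed here as a
// summary of what the queries touch, not as an authoritative schema):
//
//	workspaces(id, status, last_heartbeat_at)
//	global_secrets(key)
//	workspace_secrets(workspace_id, key)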

// waitForWorkspaceOnline polls the workspaces table until the target
// workspace's status flips to 'online' or the deadline expires.
// Returns true on success; callers log+drop on false.
func waitForWorkspaceOnline(ctx context.Context, workspaceID string, timeout time.Duration) bool {
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		var status string
		if err := db.DB.QueryRowContext(ctx,
			`SELECT status FROM workspaces WHERE id = $1`, workspaceID,
		).Scan(&status); err == nil && status == "online" {
			return true
		}
		select {
		case <-ctx.Done():
			return false
		case <-time.After(restartContextOnlinePollInterval):
		}
	}
	return false
}

// buildRestartA2APayload wraps the rendered context string in the
// JSON-RPC 2.0 / A2A message/send shape that the proxy already knows
// how to normalize. Returns the marshalled body ready for ProxyA2ARequest.
func buildRestartA2APayload(text string) ([]byte, error) {
	payload := map[string]any{
		"jsonrpc": "2.0",
		"id":      uuid.New().String(),
		"method":  "message/send",
		"params": map[string]any{
			"message": map[string]any{
				"messageId": uuid.New().String(),
				"role":      "user",
				"parts":     []any{map[string]any{"kind": "text", "text": text}},
				"metadata": map[string]any{
					"source":          "platform",
					"kind":            "restart_context",
					"layer":           1,
					"restart_context": true,
				},
			},
		},
	}
	return json.Marshal(payload)
}
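
// The marshalled body has roughly this shape (IDs are fresh UUIDs on every
// call, the text value is truncated here, and real output orders keys
// alphabetically because encoding/json sorts map keys):
//
//	{
//	  "jsonrpc": "2.0",
//	  "id": "<uuid>",
//	  "method": "message/send",
//	  "params": {
//	    "message": {
//	      "messageId": "<uuid>",
//	      "role": "user",
//	      "parts": [{"kind": "text", "text": "=== WORKSPACE RESTART CONTEXT ===\n..."}],
//	      "metadata": {"source": "platform", "kind": "restart_context", "layer": 1, "restart_context": true}
//	    }
//	  }
//	}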

// sendRestartContext is called by the Restart handler in a background
// goroutine. It waits for the workspace to come online, then delivers
// the snapshot via the existing A2A proxy. Failures are logged and
// dropped — the restart itself is already considered successful at
// this point.
func (h *WorkspaceHandler) sendRestartContext(workspaceID string, data restartContextData) {
	// Detach from any request context — this runs after the HTTP
	// response is flushed.
	ctx, cancel := context.WithTimeout(context.Background(), restartContextOnlineTimeout+30*time.Second)
	defer cancel()

	if !waitForWorkspaceOnline(ctx, workspaceID, restartContextOnlineTimeout) {
		log.Printf("restart-context: workspace %s did not come online within %s — dropping context message", workspaceID, restartContextOnlineTimeout)
		return
	}

	text := buildRestartContextMessage(data)
	body, err := buildRestartA2APayload(text)
	if err != nil {
		log.Printf("restart-context: failed to marshal payload for %s: %v", workspaceID, err)
		return
	}

	// "system:restart-context" prefix flags this as a trusted
	// non-workspace caller — bypasses CanCommunicate and the
	// caller-token check in a2a_proxy.go.
	status, _, proxyErr := h.ProxyA2ARequest(ctx, workspaceID, body, "system:restart-context", false)
	if proxyErr != nil {
		log.Printf("restart-context: ProxyA2ARequest failed for %s (status=%d): %v", workspaceID, status, proxyErr)
		return
	}
	log.Printf("restart-context: delivered to %s (status=%d, keys=%d)", workspaceID, status, len(data.EnvKeys))
}
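
// Expected wiring from the Restart handler, shown as a sketch only (the
// handler lives in another file; the request variable r and the exact spot
// where the restart is performed are assumptions, not code from this repo):
//
//	// inside the Restart HTTP handler, before mutating workspace state:
//	data := loadRestartContextData(r.Context(), workspaceID)
//	// ...perform the restart and write the 200 response...
//	go h.sendRestartContext(workspaceID, data)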