## Symptom (cycle 6+ of #488)

Workspaces appear `online` (heartbeats fine) but every cron tick fails silently with `No conversation found with session ID: <uuid>` → `ProcessError: exit code 1` → idle loop logs HTTP 200, no actual work happens. Backend Engineer received 5 idle pulses without claiming a single one of the 6 open Hermes issues (#496-500) because the bug prevents `gh issue list` from ever firing.

## Root cause (verified live in ws-20cb8ff8-3e4 today)

claude-code stores sessions at `/root/.claude/projects/<cwd-with-/-as-->/<id>.jsonl`. When a workspace container is recreated, `self._session_id` from a prior instance references a file that no longer exists. Passing it as `resume=<id>` to ClaudeAgentOptions crashes the CLI on the very first call. The existing #75 fix only fires AFTER the first ProcessError lands, and per-cycle executor re-instantiation can reload the stale id from elsewhere — restart-with-reset_claude_session was the only working mitigation, hand-fired every cycle.

## Fix

New `_resolve_resume()` in ClaudeSDKExecutor: probes a handful of well-known session-file locations (`/root/.claude/projects/*/<id>.jsonl`, `/root/.claude/sessions/<id>.jsonl`, plus the agent-uid variants) via `glob.glob`. If no file matches the in-memory `_session_id`, it drops the id (sets it to None) AND returns None so `ClaudeAgentOptions.resume` is unset — the CLI starts a fresh session. Logged at INFO with `#488` in the message so operators can correlate.

`_build_options()` now calls `_resolve_resume()` instead of reading `self._session_id` directly.

Cheap path when no session is set: zero glob calls. Hot path (session set + file exists): one glob call, short-circuits on first match.
## Drive-by fix: stale `from X import` in 4 modules

Same regression class as #1 (the runtime release that closed it):

- `claude_sdk_executor.py:43`: `from executor_helpers import …`
- `cli_executor.py:39-40`: `from config import …`, `from executor_helpers import …`
- `main.py:28-30`: `from config import …`, `from heartbeat import …`, `from preflight import …`
- `preflight.py:7`: `from config import …`

All rewritten to absolute `from molecule_runtime.<module> import …` so they resolve outside of workspace containers (e.g. test environments where `/app` isn't on sys.path). The grep guard in `tests/test_imports.py` already covered `adapters` — extending it to all top-level imports would catch this class going forward; not in this PR to keep scope tight.

## Tests

6 new in `tests/test_session_resume_gate.py`:

- baseline (no session) → no glob, returns None
- file exists → keep id, returns id, single glob (early-exit)
- file missing → drop id (clears `_session_id`), returns None
- late-pattern match → walks all patterns until hit
- log includes session id (operator triage)
- log references #488 (debugger discoverability)

All 16 tests (10 existing + 6 new) pass.

## Release plan

- Bump version 0.1.1 → 0.1.2 (in this commit)
- After merge, push v0.1.2 tag → publish.yml auto-publishes to PyPI
- Then rebuild workspace template images locally so workspaces pick up the fix (templates pin `>=0.1.0`, will resolve to 0.1.2 on next build)
- Then mass-restart workspaces with reset_claude_session=true once to clear any DB-side stale state, and the permanent fix kicks in

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
144 lines
4.8 KiB
Python
144 lines
4.8 KiB
Python
"""Startup preflight checks for workspace runtime configs."""
|
|
|
|
import os
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
from molecule_runtime.config import WorkspaceConfig
|
|
|
|
# Runtime identifiers accepted by the preflight runtime check; any other
# value of `config.runtime` is reported as a blocking failure.
SUPPORTED_RUNTIMES = {
    "autogen",
    "claude-code",
    "codex",
    "crewai",
    "custom",
    "deepagents",
    "langgraph",
    "ollama",
    "openclaw",
}
|
|
|
|
|
|
@dataclass
class PreflightIssue:
    """One finding produced by a preflight check."""

    # Severity tag; the checks in this module use "fail" and "warn".
    severity: str
    # Short label naming the area that was checked (e.g. "Runtime").
    title: str
    # Description of what is wrong.
    detail: str
    # Optional remediation hint; empty string means no hint is available.
    fix: str = ""
|
|
|
|
|
|
@dataclass
class PreflightReport:
    """Collected outcome of a preflight run, split by severity."""

    # Non-blocking findings; they never affect `ok`.
    warnings: list[PreflightIssue] = field(default_factory=list)
    # Blocking findings; any entry here makes `ok` False.
    failures: list[PreflightIssue] = field(default_factory=list)

    @property
    def ok(self) -> bool:
        """Return True when no failures were recorded."""
        if self.failures:
            return False
        return True
|
|
|
|
|
|
def run_preflight(config: WorkspaceConfig, config_path: str) -> PreflightReport:
    """Check the workspace config for obvious startup blockers.

    Args:
        config: Workspace configuration to validate. This function reads
            ``runtime``, ``a2a.port``, ``runtime_config``, ``prompt_files``
            and ``skills`` from it.
        config_path: Directory against which relative paths (the legacy auth
            token file, prompt files and ``skills/<name>/SKILL.md``) are
            resolved.

    Returns:
        A PreflightReport whose ``failures`` hold blocking problems and whose
        ``warnings`` hold non-blocking ones.
    """
    report = PreflightReport()
    config_dir = Path(config_path)

    if config.runtime not in SUPPORTED_RUNTIMES:
        report.failures.append(
            PreflightIssue(
                severity="fail",
                title="Runtime",
                detail=f"Unsupported runtime '{config.runtime}'",
                fix="Choose one of the supported runtimes or install the matching adapter.",
            )
        )

    # A port that cannot be converted to an integer (None, "abc", ...) is
    # reported as an invalid port instead of crashing the whole preflight run.
    try:
        port_in_range = 1 <= int(config.a2a.port) <= 65535
    except (TypeError, ValueError, OverflowError):
        port_in_range = False
    if not port_in_range:
        report.failures.append(
            PreflightIssue(
                severity="fail",
                title="A2A port",
                detail=f"Invalid A2A port: {config.a2a.port}",
                fix="Set a2a.port to a value between 1 and 65535.",
            )
        )

    # Check required environment variables (e.g. CLAUDE_CODE_OAUTH_TOKEN, OPENAI_API_KEY).
    # These are declared per-runtime in config.yaml and injected via the secrets API.
    required_env = getattr(config.runtime_config, "required_env", []) or []
    for env_var in required_env:
        if not os.environ.get(env_var):
            report.failures.append(
                PreflightIssue(
                    severity="fail",
                    title="Required env",
                    detail=f"Missing required environment variable: {env_var}",
                    fix=f"Set {env_var} via the secrets API (global or workspace-level).",
                )
            )

    # Backward compat: a missing legacy auth_token_file only blocks startup
    # when the token is not available through auth_token_env or required_env.
    token_file = getattr(config.runtime_config, "auth_token_file", "")
    if token_file:
        token_path = config_dir / token_file
        if not token_path.exists():
            token_env = getattr(config.runtime_config, "auth_token_env", "")
            env_has_token = bool(token_env and os.environ.get(token_env))
            # Otherwise accept the new path: every declared required_env
            # variable must be set for the token to count as available.
            if not env_has_token and required_env:
                env_has_token = all(os.environ.get(e) for e in required_env)

            if not env_has_token:
                report.failures.append(
                    PreflightIssue(
                        severity="fail",
                        title="Auth token",
                        detail=f"Missing auth token file: {token_file}",
                        fix="Remove auth_token_file and use required_env + secrets API instead.",
                    )
                )

    # With no prompt files configured, fall back to the default file name.
    prompt_files = config.prompt_files or ["system-prompt.md"]
    for prompt_file in prompt_files:
        prompt_path = config_dir / prompt_file
        if not prompt_path.exists():
            report.warnings.append(
                PreflightIssue(
                    severity="warn",
                    title="Prompt file",
                    detail=f"Missing prompt file: {prompt_file}",
                    fix="Add the file or remove it from prompt_files.",
                )
            )

    skills_dir = config_dir / "skills"
    # Tolerate an unset skills list the same way prompt_files is tolerated.
    for skill_name in config.skills or []:
        skill_path = skills_dir / skill_name / "SKILL.md"
        if not skill_path.exists():
            report.warnings.append(
                PreflightIssue(
                    severity="warn",
                    title="Skill",
                    detail=f"Missing skill package: {skill_name}",
                    fix="Restore the skill folder or remove it from config.yaml.",
                )
            )

    return report
|
|
|
|
|
|
def render_preflight_report(report: PreflightReport) -> None:
    """Print a concise startup report.

    Nothing is printed when the report has neither failures nor warnings.
    Otherwise a header line is printed, followed by every failure and then
    every warning; an issue's fix hint is printed only when it is non-empty.
    """
    if not (report.warnings or report.failures):
        return

    print("Preflight checks:")
    # Failures are listed before warnings so blockers are seen first.
    sections = (("[FAIL]", report.failures), ("[WARN]", report.warnings))
    for tag, entries in sections:
        for entry in entries:
            print(f"{tag} {entry.title}: {entry.detail}")
            if entry.fix:
                print(f" Fix: {entry.fix}")
|