@dataclass
class ObservabilityConfig:
    """Observability settings — heartbeat cadence and log verbosity.

    Hermes-style block: groups platform-runtime knobs that operators
    typically tune together (cadence, verbosity) into one declarative
    section instead of scattering them across env vars and hard-coded
    constants. Adopting this shape unblocks per-workspace tuning without
    a code change and pre-positions the schema for tracing/event-log
    settings that will land in follow-up PRs (#119 PR-2 / PR-3).

    Status: schema-only. ``heartbeat_interval_seconds`` and ``log_level``
    are parsed and stored on the config object, but neither is wired to
    its final consumer yet; that wiring lands in PR-3 of the series.

    This dataclass performs no validation of its own. Range clamping and
    log-level case normalization are applied by the config loader when it
    builds the instance; constructing ``ObservabilityConfig`` directly
    stores the values exactly as given.

    Example config.yaml snippet::

        observability:
          heartbeat_interval_seconds: 60
          log_level: DEBUG
    """

    heartbeat_interval_seconds: int = 30
    """Seconds between heartbeats sent to the platform. Default 30 matches
    ``workspace/heartbeat.py``'s long-standing constant. Lower values
    reduce platform-side detection latency for crashed workspaces; higher
    values reduce platform write load. Bounds: the config loader clamps
    this to [5, 300] at parse time — outside that range the workspace
    either floods the platform or looks dead before the next beat."""

    log_level: str = "INFO"
    """Python ``logging`` level name for the workspace runtime. Intended
    values are the standard names (DEBUG, INFO, WARNING, ERROR, CRITICAL).
    Today the runtime reads the ``LOG_LEVEL`` env var; PR-3 of the #119
    stack switches to this field, with the env var still honored as an
    override for ops debugging."""
@@ -264,6 +301,7 @@ class WorkspaceConfig: governance: GovernanceConfig = field(default_factory=GovernanceConfig) security_scan: SecurityScanConfig = field(default_factory=SecurityScanConfig) compliance: ComplianceConfig = field(default_factory=ComplianceConfig) + observability: ObservabilityConfig = field(default_factory=ObservabilityConfig) sub_workspaces: list[dict] = field(default_factory=list) effort: str = "" """Claude output effort level for the agentic loop: low | medium | high | xhigh | max. @@ -289,6 +327,22 @@ def _derive_provider_from_model(model: str) -> str: return "" +def _clamp_heartbeat(value: object) -> int: + """Coerce raw YAML/env input into the [5, 300]-second heartbeat band. + + Outside that band the workspace either floods the platform with + sub-second beats or looks dead long before the next one — both + real failure modes seen on incidents, neither benign. Coerce here + so adapters and ``heartbeat.py`` can read the value without + re-validating. + """ + try: + n = int(value) + except (TypeError, ValueError): + return 30 + return max(5, min(300, n)) + + def load_config(config_path: Optional[str] = None) -> WorkspaceConfig: """Load config from WORKSPACE_CONFIG_PATH or the given path.""" if config_path is None: @@ -336,6 +390,7 @@ def load_config(config_path: Optional[str] = None) -> WorkspaceConfig: _ss_raw = raw.get("security_scan", {}) security_scan_raw = _ss_raw if isinstance(_ss_raw, dict) else {"mode": str(_ss_raw)} compliance_raw = raw.get("compliance", {}) + observability_raw = raw.get("observability", {}) # Resolve initial_prompt: inline string or file reference initial_prompt = raw.get("initial_prompt", "") @@ -445,6 +500,12 @@ def load_config(config_path: Optional[str] = None) -> WorkspaceConfig: max_tool_calls_per_task=int(compliance_raw.get("max_tool_calls_per_task", 50)), max_task_duration_seconds=int(compliance_raw.get("max_task_duration_seconds", 300)), ), + observability=ObservabilityConfig( + 
heartbeat_interval_seconds=_clamp_heartbeat( + observability_raw.get("heartbeat_interval_seconds", 30) + ), + log_level=str(observability_raw.get("log_level", "INFO")).upper(), + ), sub_workspaces=raw.get("sub_workspaces", []), effort=str(raw.get("effort", "")), task_budget=int(raw.get("task_budget", 0)), diff --git a/workspace/tests/test_config.py b/workspace/tests/test_config.py index bc09d638..5c790b04 100644 --- a/workspace/tests/test_config.py +++ b/workspace/tests/test_config.py @@ -9,6 +9,7 @@ from config import ( A2AConfig, ComplianceConfig, DelegationConfig, + ObservabilityConfig, SandboxConfig, WorkspaceConfig, load_config, @@ -523,3 +524,119 @@ def test_compliance_default_via_load_config(tmp_path, yaml_payload, expected_mod # prompt_injection was never overridden in any payload — must stay at # the dataclass default regardless of the mode value. assert cfg.compliance.prompt_injection == "detect" + + +# ===== Observability block (#119 PR-1) ===== +# +# Hermes-style declarative block grouping cadence + verbosity knobs into one +# place. Schema-only in this PR — wiring into heartbeat.py / main.py lands in +# PR-3. These tests pin the schema so the wiring PR can rely on the parsed +# values matching the documented contract (defaults, clamping bounds, +# log-level normalization). 
def _load_observability_from_yaml(tmp_path, payload):
    """Dump *payload* to ``config.yaml`` under *tmp_path*, load it, and
    return the parsed ``observability`` section."""
    (tmp_path / "config.yaml").write_text(yaml.dump(payload))
    return load_config(str(tmp_path)).observability


def test_observability_dataclass_default():
    """A bare ObservabilityConfig() carries the documented defaults."""
    defaults = ObservabilityConfig()
    assert defaults.heartbeat_interval_seconds == 30
    assert defaults.log_level == "INFO"


def test_observability_default_when_yaml_omits_block(tmp_path):
    """YAML without an ``observability:`` key falls back to the defaults."""
    section = _load_observability_from_yaml(tmp_path, {})
    assert section.heartbeat_interval_seconds == 30
    assert section.log_level == "INFO"


def test_observability_explicit_yaml_override(tmp_path):
    """Values given in YAML reach ObservabilityConfig via load_config."""
    overrides = {"heartbeat_interval_seconds": 60, "log_level": "DEBUG"}
    section = _load_observability_from_yaml(tmp_path, {"observability": overrides})
    assert section.heartbeat_interval_seconds == 60
    assert section.log_level == "DEBUG"


def test_observability_partial_override_keeps_other_defaults(tmp_path):
    """Overriding only the heartbeat leaves log_level at its default."""
    section = _load_observability_from_yaml(
        tmp_path, {"observability": {"heartbeat_interval_seconds": 45}}
    )
    assert section.heartbeat_interval_seconds == 45
    assert section.log_level == "INFO"


@pytest.mark.parametrize(
    "raw, expected",
    [
        # Values already inside the band are returned untouched.
        pytest.param(5, 5, id="floor_in_band"),
        pytest.param(30, 30, id="default_in_band"),
        pytest.param(300, 300, id="ceiling_in_band"),
        # Anything under the floor is raised to 5s. Sub-5s heartbeats
        # flooded the platform during incident IR-2026-03-11 (workspace
        # stuck in a tight loop emitting beats faster than the platform
        # could ack).
        pytest.param(1, 5, id="below_floor_one"),
        pytest.param(0, 5, id="below_floor_zero"),
        pytest.param(-7, 5, id="below_floor_negative"),
        # Anything over the ceiling is lowered to 300s. >5min beats let
        # crashed workspaces look healthy long enough to mask the failure.
        pytest.param(301, 300, id="above_ceiling_just"),
        pytest.param(3600, 300, id="above_ceiling_far"),
        # Values that are not integers at all yield the documented
        # default instead of crashing the workspace at boot.
        pytest.param("not-a-number", 30, id="garbage_string"),
        pytest.param(None, 30, id="null"),
    ],
)
def test_observability_heartbeat_clamp(tmp_path, raw, expected):
    """Parsing clamps heartbeat_interval_seconds into the [5, 300] band."""
    section = _load_observability_from_yaml(
        tmp_path, {"observability": {"heartbeat_interval_seconds": raw}}
    )
    assert section.heartbeat_interval_seconds == expected


def test_observability_log_level_uppercased(tmp_path):
    """A lowercase log level in YAML is normalized to upper case, the
    canonical form Python's ``logging`` module expects, so operators can
    write either ``debug`` or ``DEBUG`` without surprise."""
    section = _load_observability_from_yaml(
        tmp_path, {"observability": {"log_level": "debug"}}
    )
    assert section.log_level == "DEBUG"