Merge pull request #2450 from Molecule-AI/feat/observability-config-schema
feat(config): observability block schema (#119 PR-1 of 4)
This commit is contained in:
commit
c06c4c0f56
@ -166,6 +166,43 @@ class SecurityScanConfig:
|
||||
operators who require a CVE gate know the gate is absent. Closes #268."""
|
||||
|
||||
|
||||
@dataclass
class ObservabilityConfig:
    """Schema for the ``observability`` block: heartbeat cadence and log verbosity.

    Hermes-style declarative section. It collects the platform-runtime
    knobs that operators tend to adjust together, so they live in one
    place in ``config.yaml`` rather than being spread across env vars and
    hard-coded constants. That makes per-workspace tuning possible without
    a code change and leaves room for the tracing/event-log settings
    planned for follow-up PRs (#119 PR-2 / PR-3).

    This PR is schema-only: both fields are parsed and stored, but the
    runtime sites that will eventually read them are wired up in PR-3 of
    the series.

    The dataclass itself performs no validation; range clamping and
    case normalization are applied by ``load_config`` when it builds the
    instance from YAML.

    Example config.yaml snippet::

        observability:
          heartbeat_interval_seconds: 60
          log_level: DEBUG
    """

    heartbeat_interval_seconds: int = 30
    """Interval, in seconds, between heartbeats sent to the platform.

    The default of 30 mirrors the long-standing constant in
    ``workspace/heartbeat.py``. Smaller values let the platform notice a
    crashed workspace sooner; larger values lower platform write load.
    Values are clamped to [5, 300] at parse time, because outside that
    band the workspace either floods the platform or appears dead before
    its next beat."""

    log_level: str = "INFO"
    """Python ``logging`` level name for the workspace runtime.

    Accepts the standard names (DEBUG, INFO, WARNING, ERROR, CRITICAL).
    The runtime currently reads the ``LOG_LEVEL`` env var; PR-3 of the
    #119 stack switches to this field while still honoring the env var as
    an override for ops debugging."""
|
||||
|
||||
|
||||
@dataclass
|
||||
class ComplianceConfig:
|
||||
"""OWASP Top 10 for Agentic Applications compliance settings.
|
||||
@ -264,6 +301,7 @@ class WorkspaceConfig:
|
||||
governance: GovernanceConfig = field(default_factory=GovernanceConfig)
|
||||
security_scan: SecurityScanConfig = field(default_factory=SecurityScanConfig)
|
||||
compliance: ComplianceConfig = field(default_factory=ComplianceConfig)
|
||||
observability: ObservabilityConfig = field(default_factory=ObservabilityConfig)
|
||||
sub_workspaces: list[dict] = field(default_factory=list)
|
||||
effort: str = ""
|
||||
"""Claude output effort level for the agentic loop: low | medium | high | xhigh | max.
|
||||
@ -289,6 +327,22 @@ def _derive_provider_from_model(model: str) -> str:
|
||||
return ""
|
||||
|
||||
|
||||
def _clamp_heartbeat(value: object) -> int:
|
||||
"""Coerce raw YAML/env input into the [5, 300]-second heartbeat band.
|
||||
|
||||
Outside that band the workspace either floods the platform with
|
||||
sub-second beats or looks dead long before the next one — both
|
||||
real failure modes seen on incidents, neither benign. Coerce here
|
||||
so adapters and ``heartbeat.py`` can read the value without
|
||||
re-validating.
|
||||
"""
|
||||
try:
|
||||
n = int(value)
|
||||
except (TypeError, ValueError):
|
||||
return 30
|
||||
return max(5, min(300, n))
|
||||
|
||||
|
||||
def load_config(config_path: Optional[str] = None) -> WorkspaceConfig:
|
||||
"""Load config from WORKSPACE_CONFIG_PATH or the given path."""
|
||||
if config_path is None:
|
||||
@ -336,6 +390,7 @@ def load_config(config_path: Optional[str] = None) -> WorkspaceConfig:
|
||||
_ss_raw = raw.get("security_scan", {})
|
||||
security_scan_raw = _ss_raw if isinstance(_ss_raw, dict) else {"mode": str(_ss_raw)}
|
||||
compliance_raw = raw.get("compliance", {})
|
||||
observability_raw = raw.get("observability", {})
|
||||
|
||||
# Resolve initial_prompt: inline string or file reference
|
||||
initial_prompt = raw.get("initial_prompt", "")
|
||||
@ -445,6 +500,12 @@ def load_config(config_path: Optional[str] = None) -> WorkspaceConfig:
|
||||
max_tool_calls_per_task=int(compliance_raw.get("max_tool_calls_per_task", 50)),
|
||||
max_task_duration_seconds=int(compliance_raw.get("max_task_duration_seconds", 300)),
|
||||
),
|
||||
observability=ObservabilityConfig(
|
||||
heartbeat_interval_seconds=_clamp_heartbeat(
|
||||
observability_raw.get("heartbeat_interval_seconds", 30)
|
||||
),
|
||||
log_level=str(observability_raw.get("log_level", "INFO")).upper(),
|
||||
),
|
||||
sub_workspaces=raw.get("sub_workspaces", []),
|
||||
effort=str(raw.get("effort", "")),
|
||||
task_budget=int(raw.get("task_budget", 0)),
|
||||
|
||||
@ -9,6 +9,7 @@ from config import (
|
||||
A2AConfig,
|
||||
ComplianceConfig,
|
||||
DelegationConfig,
|
||||
ObservabilityConfig,
|
||||
SandboxConfig,
|
||||
WorkspaceConfig,
|
||||
load_config,
|
||||
@ -523,3 +524,119 @@ def test_compliance_default_via_load_config(tmp_path, yaml_payload, expected_mod
|
||||
# prompt_injection was never overridden in any payload — must stay at
|
||||
# the dataclass default regardless of the mode value.
|
||||
assert cfg.compliance.prompt_injection == "detect"
|
||||
|
||||
|
||||
# ===== Observability block (#119 PR-1) =====
|
||||
#
|
||||
# Hermes-style declarative block grouping cadence + verbosity knobs into one
|
||||
# place. Schema-only in this PR — wiring into heartbeat.py / main.py lands in
|
||||
# PR-3. These tests pin the schema so the wiring PR can rely on the parsed
|
||||
# values matching the documented contract (defaults, clamping bounds,
|
||||
# log-level normalization).
|
||||
|
||||
|
||||
def test_observability_dataclass_default():
    """A bare ``ObservabilityConfig()`` carries the documented defaults."""
    defaults = ObservabilityConfig()

    assert defaults.heartbeat_interval_seconds == 30
    assert defaults.log_level == "INFO"
|
||||
|
||||
|
||||
def test_observability_default_when_yaml_omits_block(tmp_path):
    """YAML without an ``observability:`` key loads the dataclass defaults."""
    (tmp_path / "config.yaml").write_text(yaml.dump({}))

    observability = load_config(str(tmp_path)).observability

    assert observability.heartbeat_interval_seconds == 30
    assert observability.log_level == "INFO"
|
||||
|
||||
|
||||
def test_observability_explicit_yaml_override(tmp_path):
    """Values set explicitly in YAML reach ``ObservabilityConfig`` via load_config."""
    payload = {
        "observability": {
            "heartbeat_interval_seconds": 60,
            "log_level": "DEBUG",
        }
    }
    (tmp_path / "config.yaml").write_text(yaml.dump(payload))

    observability = load_config(str(tmp_path)).observability

    assert observability.heartbeat_interval_seconds == 60
    assert observability.log_level == "DEBUG"
|
||||
|
||||
|
||||
def test_observability_partial_override_keeps_other_defaults(tmp_path):
    """Overriding only the heartbeat leaves ``log_level`` at its default."""
    payload = {"observability": {"heartbeat_interval_seconds": 45}}
    (tmp_path / "config.yaml").write_text(yaml.dump(payload))

    observability = load_config(str(tmp_path)).observability

    assert observability.heartbeat_interval_seconds == 45
    assert observability.log_level == "INFO"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "raw, expected",
    [
        # In-band values pass through unchanged.
        pytest.param(5, 5, id="floor_in_band"),
        pytest.param(30, 30, id="default_in_band"),
        pytest.param(300, 300, id="ceiling_in_band"),
        # Below floor → clamped up to 5s. Sub-5s heartbeats flooded the
        # platform during incident IR-2026-03-11 (workspace stuck in a
        # tight loop emitting beats faster than the platform could ack).
        pytest.param(1, 5, id="below_floor_one"),
        pytest.param(0, 5, id="below_floor_zero"),
        pytest.param(-7, 5, id="below_floor_negative"),
        # Above ceiling → clamped down to 300s. >5min beats let crashed
        # workspaces look healthy long enough to mask the failure.
        pytest.param(301, 300, id="above_ceiling_just"),
        pytest.param(3600, 300, id="above_ceiling_far"),
        # Non-integer YAML values fall back to the documented default
        # rather than crashing the workspace at boot.
        pytest.param("not-a-number", 30, id="garbage_string"),
        pytest.param(None, 30, id="null"),
    ],
)
def test_observability_heartbeat_clamp(tmp_path, raw, expected):
    """load_config clamps heartbeat_interval_seconds into the [5, 300] band."""
    payload = {"observability": {"heartbeat_interval_seconds": raw}}
    (tmp_path / "config.yaml").write_text(yaml.dump(payload))

    observability = load_config(str(tmp_path)).observability

    assert observability.heartbeat_interval_seconds == expected
|
||||
|
||||
|
||||
def test_observability_log_level_uppercased(tmp_path):
    """A lowercase ``log_level`` in YAML is normalized to upper case.

    Python's ``logging`` module expects the canonical upper-case names, so
    operators may write either ``debug`` or ``DEBUG`` and get the same
    parsed value.
    """
    payload = {"observability": {"log_level": "debug"}}
    (tmp_path / "config.yaml").write_text(yaml.dump(payload))

    observability = load_config(str(tmp_path)).observability

    assert observability.log_level == "DEBUG"
|
||||
|
||||
Loading…
Reference in New Issue
Block a user