From 6488ba09e7d08db499224e72d31775b0d987e377 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 4 May 2026 12:20:34 -0700 Subject: [PATCH] fix(preflight): downgrade required_env + auth_token failures to warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Preflight was hard-failing the workspace boot when required env vars or legacy auth_token_files were missing, raising SystemExit(1) before main.py's PR #2756 try/except could mount the not-configured handler. Result: codex/openclaw workspaces launched without OPENAI_API_KEY were INVISIBLE — `/.well-known/agent-card.json` never returned 200, the bench timed out at 600s, canvas had no actionable signal. PR #2756 fixed half the puzzle (decouple agent-card from adapter.setup() failure); this fixes the other half (decouple from preflight failure). Caught by bench-provision-time run 25335853189 on 2026-05-04: codex and openclaw both timed_out at 609s while claude-code (whose default model needs no env) hit 86.7s on the same AMI. Hermes hit 147s because hermes config doesn't declare top-level required_env. After this change: - Missing required_env: WARN (operator sees it in boot logs); workspace proceeds to adapter.setup() which raises with the same env-name detail; PR #2756's try/except mounts the not-configured handler; /.well-known/agent-card.json serves 200; JSON-RPC POST / returns -32603 "agent not configured" with the env-name in `error.data`. - Missing auth_token_file (legacy path): same treatment. - Other preflight failures (runtime adapter not installable, invalid A2A port) STAY as fails — those are structural, the workspace truly can't run. Updated 4 existing tests that asserted `report.ok is False` on required_env / auth_token misses to assert `report.ok is True` and check `report.warnings` instead. All 31 preflight tests pass; full suite 1664 pass + 1 unrelated flake on staging. Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace/preflight.py | 34 ++++++++++++---- workspace/tests/test_preflight.py | 65 +++++++++++++++++++++---------- 2 files changed, 71 insertions(+), 28 deletions(-) diff --git a/workspace/preflight.py b/workspace/preflight.py index e1929f3e..0f048b4b 100644 --- a/workspace/preflight.py +++ b/workspace/preflight.py @@ -204,17 +204,31 @@ def run_preflight(config: WorkspaceConfig, config_path: str) -> PreflightReport: ) ) continue - report.failures.append( + # Missing required env is a CONFIGURATION issue, not a STRUCTURAL one. + # The workspace can still bind /.well-known/agent-card.json — adapter.setup() + # raises on the missing key, main.py's PR #2756 try/except mounts the + # not-configured JSON-RPC handler, canvas surfaces a clear "agent not + # configured: " error to the user. Hard-failing preflight here + # would crash before the not-configured path even loads, leaving the + # workspace invisible (the failure mode that bit codex/openclaw bench + # 25335853189 on 2026-05-04 even after PR #2756). Warn loudly so logs + # remain actionable, but let the boot continue. + report.warnings.append( PreflightIssue( - severity="fail", + severity="warn", title="Required env", detail=f"Missing required environment variable: {env_var}", - fix=f"Set {env_var} via the secrets API (global or workspace-level).", + fix=( + f"Set {env_var} via the secrets API (global or workspace-level). " + "Workspace will boot in not-configured state until this is set; " + "JSON-RPC will return -32603 'agent not configured' on every request." + ), ) ) - # Backward compat: if legacy auth_token_file is set, warn but don't block - # if the token is available via required_env or auth_token_env. + # Backward compat: if legacy auth_token_file is set, warn — same reasoning + # as the required_env block above. The downstream auth check fires inside + # adapter.setup(), which is wrapped by main.py's try/except. token_file = getattr(config.runtime_config, "auth_token_file", "") if token_file: token_path = config_dir / token_file @@ -226,12 +240,16 @@ def run_preflight(config: WorkspaceConfig, config_path: str) -> PreflightReport: env_has_token = all(os.environ.get(e) for e in required_env) if not env_has_token: - report.failures.append( + report.warnings.append( PreflightIssue( - severity="fail", + severity="warn", title="Auth token", detail=f"Missing auth token file: {token_file}", - fix="Remove auth_token_file and use required_env + secrets API instead.", + fix=( + "Remove auth_token_file and use required_env + secrets API " + "instead. Workspace will boot in not-configured state until " + "the token is provided." + ), ) ) diff --git a/workspace/tests/test_preflight.py b/workspace/tests/test_preflight.py index 063dcb8f..d53daf71 100644 --- a/workspace/tests/test_preflight.py +++ b/workspace/tests/test_preflight.py @@ -225,8 +225,14 @@ def test_required_env_present_passes(tmp_path, monkeypatch): assert not any(issue.title == "Required env" for issue in report.failures) -def test_required_env_missing_fails(tmp_path, monkeypatch): - """When a required_env var is missing, preflight fails.""" +def test_required_env_missing_warns_does_not_fail(tmp_path, monkeypatch): + """When a required_env var is missing, preflight WARNS but does not + fail the boot. Pairs with PR #2756 (molecule-core): the workspace + binds /.well-known/agent-card.json regardless of credentials and + routes JSON-RPC to a -32603 'agent not configured' handler. Hard + failing here would crash before the not-configured path even loads, + leaving the workspace invisible — that's the failure mode that bit + codex/openclaw bench 25335853189 on 2026-05-04 even after PR #2756.""" monkeypatch.delenv("CLAUDE_CODE_OAUTH_TOKEN", raising=False) config = make_config( @@ -236,10 +242,13 @@ def test_required_env_missing_fails(tmp_path, monkeypatch): report = run_preflight(config, str(tmp_path)) - assert report.ok is False + assert report.ok is True assert any( issue.title == "Required env" and "CLAUDE_CODE_OAUTH_TOKEN" in issue.detail - for issue in report.failures + for issue in report.warnings + ) + assert not any( + issue.title == "Required env" for issue in report.failures ) @@ -257,8 +266,11 @@ def test_required_env_multiple_all_present_passes(tmp_path, monkeypatch): assert report.ok is True -def test_required_env_multiple_one_missing_fails(tmp_path, monkeypatch): - """If any required_env var is missing, preflight fails with that var named.""" +def test_required_env_multiple_one_missing_warns(tmp_path, monkeypatch): + """If any required_env var is missing, preflight warns with that var + named (and does NOT fail). The eventual setup() failure is what + actually surfaces to the user via the -32603 handler — preflight is + just a logging signal for operators inspecting boot logs.""" monkeypatch.setenv("API_KEY_A", "key-a") monkeypatch.delenv("API_KEY_B", raising=False) @@ -268,10 +280,10 @@ def test_required_env_multiple_one_missing_fails(tmp_path, monkeypatch): report = run_preflight(config, str(tmp_path)) - assert report.ok is False + assert report.ok is True assert any( issue.title == "Required env" and "API_KEY_B" in issue.detail - for issue in report.failures + for issue in report.warnings ) @@ -317,8 +329,10 @@ def test_required_env_skipped_in_smoke_mode(tmp_path, monkeypatch): ) -def test_required_env_smoke_mode_off_still_fails(tmp_path, monkeypatch): - """Sanity: smoke bypass is OFF when MOLECULE_SMOKE_MODE is unset.""" +def test_required_env_smoke_mode_off_still_warns(tmp_path, monkeypatch): + """Sanity: smoke bypass is OFF when MOLECULE_SMOKE_MODE is unset, but + the warning still fires (and preflight no longer hard-fails — see + test_required_env_missing_warns_does_not_fail for the rationale).""" monkeypatch.delenv("HERMES_API_KEY", raising=False) monkeypatch.delenv("MOLECULE_SMOKE_MODE", raising=False) @@ -328,10 +342,13 @@ def test_required_env_smoke_mode_off_still_fails(tmp_path, monkeypatch): report = run_preflight(config, str(tmp_path)) - assert report.ok is False + assert report.ok is True assert any( issue.title == "Required env" and "HERMES_API_KEY" in issue.detail - for issue in report.failures + for issue in report.warnings + ) + assert not any( + issue.title == "Required env" for issue in report.failures ) @@ -383,10 +400,12 @@ def test_top_level_required_env_used_when_no_models_declared(tmp_path, monkeypat report = run_preflight(config, str(tmp_path)) - assert report.ok is False + # Missing required_env is now a warning (workspace boots in + # not-configured state); see test_required_env_missing_warns_does_not_fail. + assert report.ok is True assert any( issue.title == "Required env" and "CLAUDE_CODE_OAUTH_TOKEN" in issue.detail - for issue in report.failures + for issue in report.warnings ) @@ -411,10 +430,10 @@ def test_top_level_used_when_picked_model_not_in_models_list(tmp_path, monkeypat report = run_preflight(config, str(tmp_path)) - assert report.ok is False + assert report.ok is True assert any( issue.title == "Required env" and "CLAUDE_CODE_OAUTH_TOKEN" in issue.detail - for issue in report.failures + for issue in report.warnings ) @@ -526,8 +545,13 @@ def test_per_model_required_env_null_treated_as_empty_no_auth(tmp_path, monkeypa # ---------- Legacy auth_token_file backward compat ---------- -def test_legacy_auth_token_file_missing_no_env_fails(tmp_path, monkeypatch): - """Legacy: missing auth_token_file with no env var should fail.""" +def test_legacy_auth_token_file_missing_no_env_warns(tmp_path, monkeypatch): + """Legacy: missing auth_token_file with no env var emits a warning, + not a hard failure. Same reasoning as + test_required_env_missing_warns_does_not_fail — adapter.setup() is + the authoritative auth check, preflight just surfaces the issue + early in the boot log. The workspace still binds /agent-card and + routes to the not-configured -32603 handler.""" monkeypatch.delenv("CLAUDE_CODE_OAUTH_TOKEN", raising=False) config = make_config( @@ -536,8 +560,9 @@ def test_legacy_auth_token_file_missing_no_env_fails(tmp_path, monkeypatch): report = run_preflight(config, str(tmp_path)) - assert report.ok is False - assert any(issue.title == "Auth token" for issue in report.failures) + assert report.ok is True + assert any(issue.title == "Auth token" for issue in report.warnings) + assert not any(issue.title == "Auth token" for issue in report.failures) def test_legacy_auth_token_file_missing_but_auth_token_env_passes(tmp_path, monkeypatch):