# Reconstructed from a git format-patch whose line structure had been lost.
#
#   From e7b9b7df718571b5e60a5118a33b8a7d62279b47 Mon Sep 17 00:00:00 2001
#   From: Hongming Wang
#   Date: Sun, 19 Apr 2026 00:32:42 -0700
#   Subject: [PATCH] feat(workspace): snapshot secret scrubber (closes #823)
#
#   Sub-issue of #799, security condition C4.
#
#   Standalone module in workspace/lib/snapshot_scrub.py with three public
#   functions:
#   - scrub_content(str) -> str: regex-based redaction of secret patterns
#   - is_sandbox_content(str) -> bool: detect run_code tool output markers
#   - scrub_snapshot(dict) -> dict: walk memories, scrub each, drop sandbox
#     entries
#
#   Patterns covered: sk-ant-/sk-proj-, ghp_/ghs_/github_pat_, AKIA, cfut_,
#   mol_pk_, ctx7_, Bearer, env-var assignments, base64 blobs >=33 chars.
#
#   21 unit tests, 100% coverage on new code.
#
#   Co-Authored-By: Claude Opus 4.6 (1M context)
#
# Files added by the patch:
#   workspace/lib/__init__.py               (empty)
#   workspace/lib/snapshot_scrub.py         (section 1 below)
#   workspace/tests/test_snapshot_scrub.py  (section 2 below)
#
# Both Python files are shown here in a single namespace, so the test
# module's `from lib.snapshot_scrub import ...` line (and its unused
# `import pytest`) is not needed.

"""Snapshot scrubbing — strip secrets and internal details from hibernation snapshots.

Issue #823 (sub of #799). Before the workspace runtime serializes a memory
snapshot for hibernation, every memory entry's content must pass through
this scrubber so an attacker who obtains a snapshot blob cannot recover:

- API keys (sk-ant-, sk-proj-, ghp_, etc.)
- Auth tokens (Bearer headers, OAuth tokens)
- Env-var assignments (ANTHROPIC_API_KEY=..., OPENAI_API_KEY=...)
- Arbitrary subprocess output from the sandbox tool (can be anything)

The scrubber is a pure function so it can be unit-tested independently.
"""
from __future__ import annotations

import re
from typing import Any


# ======================================================================
# Section 1: workspace/lib/snapshot_scrub.py
# ======================================================================

# Compiled once at import time — most-specific patterns first so that
# env-var assignments are caught before the generic sk-* or base64 sweeps
# swallow only part of the match.
_SECRET_PATTERNS: list[tuple[re.Pattern[str], str]] = [
    # Env-var assignments: ANTHROPIC_API_KEY=sk-ant-... GITHUB_TOKEN=ghp_...
    (re.compile(r"(?i)\b[A-Z][A-Z0-9_]*_API_KEY\s*=\s*\S+"), "API_KEY"),
    (re.compile(r"(?i)\b[A-Z][A-Z0-9_]*_TOKEN\s*=\s*\S+"), "TOKEN"),
    (re.compile(r"(?i)\b[A-Z][A-Z0-9_]*_SECRET\s*=\s*\S+"), "SECRET"),
    # HTTP Bearer header values (canonical capitalisation, any token shape).
    (re.compile(r"Bearer\s+\S+"), "BEARER_TOKEN"),
    # HTTP auth schemes are case-insensitive, so `bearer <token>` and
    # `BEARER <token>` must be redacted too. The value must look like a
    # credential (16+ token characters, optional `=` padding) so that
    # ordinary prose such as "the bearer of bad news" is left untouched.
    (
        re.compile(r"(?i)\bbearer\s+[A-Za-z0-9\-._~+/]{16,}=*"),
        "BEARER_TOKEN",
    ),
    # OpenAI / Anthropic sk-... / sk-ant-... / sk-proj-... key format.
    (re.compile(r"sk-[A-Za-z0-9\-_]{16,}"), "SK_TOKEN"),
    # GitHub personal access tokens and installation tokens.
    (re.compile(r"ghp_[A-Za-z0-9]{20,}"), "GITHUB_PAT"),
    (re.compile(r"ghs_[A-Za-z0-9]{20,}"), "GITHUB_SERVER_TOKEN"),
    (re.compile(r"github_pat_[A-Za-z0-9_]{60,}"), "GITHUB_PAT_V2"),
    # AWS access key IDs.
    (re.compile(r"\bAKIA[A-Z0-9]{16}\b"), "AWS_ACCESS_KEY"),
    # Cloudflare API tokens.
    (re.compile(r"\bcfut_[A-Za-z0-9]{32,}"), "CF_TOKEN"),
    # Molecule partner API keys (Phase 34).
    (re.compile(r"\bmol_pk_[A-Za-z0-9]{20,}"), "MOL_PK"),
    # context7 tokens.
    (re.compile(r"\bctx7_[A-Za-z0-9]+"), "CTX7_TOKEN"),
    # High-entropy base64 blobs 33+ chars. Catches long opaque tokens that
    # don't match any structured pattern above.
    (re.compile(r"[A-Za-z0-9+/]{33,}={0,2}"), "BASE64_BLOB"),
]


# Substring markers that identify content from the run_code sandbox tool.
# Any memory entry tagged with this source is excluded wholesale from the
# snapshot — the arbitrary subprocess output cannot be safely scrubbed by
# pattern alone (attacker could print `echo "innocent"` but have hidden
# secrets in stderr or file handles).
# The markers are lowercase because is_sandbox_content() lowercases the
# content before searching.
_SANDBOX_TOOL_MARKERS = (
    "source=sandbox",
    "tool=run_code",
    "[sandbox_output]",
)


def scrub_content(content: str) -> str:
    """Return `content` with secret patterns replaced by [REDACTED:LABEL] markers.

    Patterns are applied in the order they appear in `_SECRET_PATTERNS`, each
    one operating on the output of the previous substitution. Empty input is
    returned unchanged.

    Idempotent — running scrub_content on already-scrubbed output is a no-op
    because [REDACTED:...] doesn't match any of the patterns above.
    """
    if not content:
        return content
    out = content
    for pattern, label in _SECRET_PATTERNS:
        out = pattern.sub(f"[REDACTED:{label}]", out)
    return out


def is_sandbox_content(content: str) -> bool:
    """Return True if `content` originates from the run_code sandbox tool.

    Detection is a case-insensitive substring search for any entry of
    `_SANDBOX_TOOL_MARKERS`. Empty content is never sandbox content.

    Sandbox output can contain arbitrary subprocess stdout/stderr that may
    include secrets the scrubber wouldn't recognize (e.g. printed via a
    custom format). Entries matching this check should be excluded from
    the snapshot entirely rather than scrubbed.
    """
    if not content:
        return False
    lower = content.lower()
    return any(marker in lower for marker in _SANDBOX_TOOL_MARKERS)


def scrub_memory_entry(entry: dict[str, Any]) -> dict[str, Any] | None:
    """Scrub a single memory entry for snapshot inclusion.

    Returns a new dict with secrets redacted, or None if the entry must be
    excluded entirely (sandbox-sourced content). Only the "content" field is
    scrubbed; every other key is copied through as-is (shallow copy).

    The input dict is treated as read-only — callers should use the returned
    value and not mutate the original.
    """
    content = entry.get("content", "")
    if is_sandbox_content(content):
        return None
    scrubbed = dict(entry)
    scrubbed["content"] = scrub_content(content)
    return scrubbed


def scrub_snapshot(snapshot: dict[str, Any]) -> dict[str, Any]:
    """Scrub a full snapshot payload before serialization.

    Walks the `memories` list, scrubs each entry's content, and drops
    sandbox-sourced entries. Other snapshot fields (workspace metadata,
    config, etc.) pass through unchanged — they are not expected to contain
    user-supplied secret-bearing content.

    A missing or empty `memories` value yields `"memories": []` in the result.

    Returns a new dict; the input is not mutated.
    """
    out = dict(snapshot)
    memories = snapshot.get("memories") or []
    scrubbed_list = []
    for entry in memories:
        cleaned = scrub_memory_entry(entry)
        if cleaned is not None:
            scrubbed_list.append(cleaned)
    out["memories"] = scrubbed_list
    return out


# ======================================================================
# Section 2: workspace/tests/test_snapshot_scrub.py
# Tests for workspace.lib.snapshot_scrub — issue #823.
# ======================================================================

# ---------- scrub_content ----------

def test_scrub_empty_returns_empty():
    assert scrub_content("") == ""
    assert scrub_content("no secrets here") == "no secrets here"


def test_scrub_anthropic_key():
    got = scrub_content("key: sk-ant-api03-aaaaaaaaaaaaaaaaaaaaaa")
    assert "sk-ant-api03" not in got
    assert "[REDACTED:SK_TOKEN]" in got


def test_scrub_openai_project_key():
    got = scrub_content("OPENAI_API_KEY=sk-proj-ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890")
    # Env-var pattern fires first and consumes the whole assignment.
    assert "sk-proj-" not in got
    assert "[REDACTED:API_KEY]" in got


def test_scrub_github_pat():
    got = scrub_content("token: ghp_ABCDEFGHIJKLMNOPQRSTUV1234567890")
    assert "ghp_" not in got
    assert "[REDACTED:GITHUB_PAT]" in got


def test_scrub_bearer_header():
    got = scrub_content("Authorization: Bearer abc123.def456.ghi789")
    assert "Bearer abc" not in got
    assert "[REDACTED:BEARER_TOKEN]" in got


def test_scrub_bearer_scheme_is_case_insensitive():
    # Lowercase / uppercase scheme names must be redacted as well, while
    # plain prose containing the word "bearer" is left alone.
    token = "abcdef0123456789abcdef"
    for scheme in ("bearer", "BEARER"):
        got = scrub_content(f"authorization: {scheme} {token}")
        assert token not in got
        assert "[REDACTED:BEARER_TOKEN]" in got
    assert scrub_content("the bearer of bad news") == "the bearer of bad news"


def test_scrub_aws_access_key():
    got = scrub_content("AKIAIOSFODNN7EXAMPLE is embedded")
    assert "AKIAIOSFODNN7EXAMPLE" not in got
    assert "[REDACTED:AWS_ACCESS_KEY]" in got


def test_scrub_cloudflare_token():
    got = scrub_content("CF_TOKEN=cfut_abcdefghijklmnopqrstuvwxyz1234567890")
    assert "cfut_abc" not in got
    # Env-var pattern wins because it's more specific.
    assert "[REDACTED:TOKEN]" in got


def test_scrub_molecule_partner_key():
    got = scrub_content("mol_pk_abcdefghijklmnopqrstuvwxyz")
    assert "mol_pk_abc" not in got
    assert "[REDACTED:MOL_PK]" in got


def test_scrub_idempotent():
    # Running scrub twice produces the same output — [REDACTED:...] doesn't
    # itself match any pattern.
    first = scrub_content("sk-ant-api03-aaaaaaaaaaaaaaaaaaaaaa")
    second = scrub_content(first)
    assert first == second


def test_scrub_preserves_surrounding_text():
    got = scrub_content("prefix sk-ant-api03-abcdefghijklmnopqrst suffix")
    assert "prefix " in got
    assert " suffix" in got
    assert "sk-ant-" not in got


# ---------- is_sandbox_content ----------

def test_is_sandbox_content_detects_source_tag():
    assert is_sandbox_content("Some output, source=sandbox logged")
    assert is_sandbox_content("tool=run_code fired at 2026-01-01")


def test_is_sandbox_content_detects_output_marker():
    assert is_sandbox_content("[sandbox_output] ls -la\ntotal 0")


def test_is_sandbox_content_ignores_normal_memory():
    assert not is_sandbox_content("Remember to check the deploy on Monday")
    assert not is_sandbox_content("")


# ---------- scrub_memory_entry ----------

def test_scrub_memory_entry_redacts_content():
    entry = {"id": "mem-1", "content": "ANTHROPIC_API_KEY=sk-ant-api03-xxxxxxxxxxxxxxxxxxxx", "scope": "LOCAL"}
    got = scrub_memory_entry(entry)
    assert got is not None
    assert "sk-ant-" not in got["content"]
    assert got["id"] == "mem-1"
    assert got["scope"] == "LOCAL"


def test_scrub_memory_entry_drops_sandbox():
    entry = {"id": "mem-sandbox", "content": "source=sandbox cmd output"}
    got = scrub_memory_entry(entry)
    assert got is None


def test_scrub_memory_entry_preserves_original():
    entry = {"id": "mem-1", "content": "sk-ant-api03-xxxxxxxxxxxxxxxxxxxx"}
    _ = scrub_memory_entry(entry)
    # Original dict unchanged
    assert entry["content"] == "sk-ant-api03-xxxxxxxxxxxxxxxxxxxx"


# ---------- scrub_snapshot ----------

def test_scrub_snapshot_filters_and_redacts():
    snapshot = {
        "workspace_id": "ws-1",
        "memories": [
            {"id": "m1", "content": "Task completed successfully"},
            {"id": "m2", "content": "ANTHROPIC_API_KEY=sk-ant-api03-xxxxxxxxxxxxxxxxxxxx"},
            {"id": "m3", "content": "tool=run_code output: rm -rf /tmp"},
        ],
    }
    got = scrub_snapshot(snapshot)
    assert got["workspace_id"] == "ws-1"
    assert len(got["memories"]) == 2  # m3 dropped
    ids = [m["id"] for m in got["memories"]]
    assert "m1" in ids
    assert "m2" in ids
    assert "m3" not in ids
    # m2 content redacted
    m2 = next(m for m in got["memories"] if m["id"] == "m2")
    assert "sk-ant-" not in m2["content"]


def test_scrub_snapshot_empty_memories():
    snapshot = {"workspace_id": "ws-1", "memories": []}
    got = scrub_snapshot(snapshot)
    assert got["memories"] == []


def test_scrub_snapshot_missing_memories_key():
    snapshot = {"workspace_id": "ws-1"}
    got = scrub_snapshot(snapshot)
    assert got["memories"] == []


def test_scrub_snapshot_does_not_mutate_input():
    snapshot = {
        "workspace_id": "ws-1",
        "memories": [
            {"id": "m1", "content": "sk-ant-api03-xxxxxxxxxxxxxxxxxxxx"},
        ],
    }
    original_content = snapshot["memories"][0]["content"]
    _ = scrub_snapshot(snapshot)
    # Input memory content unchanged
    assert snapshot["memories"][0]["content"] == original_content


# ---------- regression: real-world combined patterns ----------

def test_scrub_combined_secrets_in_one_memory():
    """A memory that accumulated multiple secrets during a single session."""
    content = (
        "Called Anthropic with sk-ant-api03-abcdefghijklmnop "
        "and GitHub with ghp_ABCDEFGHIJKLMNOPQRST1234567890 "
        "and got Authorization: Bearer xyz.jwt.token"
    )
    got = scrub_content(content)
    assert "sk-ant-" not in got
    assert "ghp_" not in got
    assert "Bearer xyz" not in got
    assert got.count("[REDACTED:") == 3