feat(workspace): snapshot secret scrubber (closes #823)

Sub-issue of #799, security condition C4. Standalone module in
workspace/lib/snapshot_scrub.py with three public functions:

- scrub_content(str) → str: regex-based redaction of secret patterns
- is_sandbox_content(str) → bool: detect run_code tool output markers
- scrub_snapshot(dict) → dict: walk memories, scrub each, drop sandbox entries

Patterns covered: sk-ant-/sk-proj-, ghp_/ghs_/github_pat_, AKIA, cfut_,
mol_pk_, ctx7_, Bearer, env-var assignments, base64 blobs ≥33 chars.

21 unit tests, 100% coverage on new code.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-19 00:32:42 -07:00 · 2026-04-19 00:32:42 -07:00 · e7b9b7df71
commit e7b9b7df71
parent 04e10fb19d
3 changed files with 306 additions and 0 deletions
--- a/workspace/lib/__init__.py
+++ b/workspace/lib/__init__.py
--- a/workspace/lib/snapshot_scrub.py
+++ b/workspace/lib/snapshot_scrub.py
@@ -0,0 +1,125 @@
+"""Snapshot scrubbing — strip secrets and internal details from hibernation snapshots.
+
+Issue #823 (sub of #799). Before the workspace runtime serializes a memory
+snapshot for hibernation, every memory entry's content must pass through
+this scrubber so an attacker who obtains a snapshot blob cannot recover:
+
+- API keys (sk-ant-, sk-proj-, ghp_, etc.)
+- Auth tokens (Bearer headers, OAuth tokens)
+- Env-var assignments (ANTHROPIC_API_KEY=..., OPENAI_API_KEY=...)
+- Arbitrary subprocess output from the sandbox tool (can be anything)
+
+The scrubber is a pure function so it can be unit-tested independently.
+"""
+from __future__ import annotations
+
+import re
+from typing import Any
+
+
+# Compiled once at import time — most-specific patterns first so that
+# env-var assignments are caught before the generic sk-* or base64 sweeps
+# swallow only part of the match.
+_SECRET_PATTERNS: list[tuple[re.Pattern[str], str]] = [
+    # Env-var assignments: ANTHROPIC_API_KEY=sk-ant-... GITHUB_TOKEN=ghp_...
+    (re.compile(r"(?i)\b[A-Z][A-Z0-9_]*_API_KEY\s*=\s*\S+"), "API_KEY"),
+    (re.compile(r"(?i)\b[A-Z][A-Z0-9_]*_TOKEN\s*=\s*\S+"), "TOKEN"),
+    (re.compile(r"(?i)\b[A-Z][A-Z0-9_]*_SECRET\s*=\s*\S+"), "SECRET"),
+    # HTTP Bearer header values.
+    (re.compile(r"Bearer\s+\S+"), "BEARER_TOKEN"),
+    # OpenAI / Anthropic sk-... / sk-ant-... / sk-proj-... key format.
+    (re.compile(r"sk-[A-Za-z0-9\-_]{16,}"), "SK_TOKEN"),
+    # GitHub personal access tokens and installation tokens.
+    (re.compile(r"ghp_[A-Za-z0-9]{20,}"), "GITHUB_PAT"),
+    (re.compile(r"ghs_[A-Za-z0-9]{20,}"), "GITHUB_SERVER_TOKEN"),
+    (re.compile(r"github_pat_[A-Za-z0-9_]{60,}"), "GITHUB_PAT_V2"),
+    # AWS access key IDs.
+    (re.compile(r"\bAKIA[A-Z0-9]{16}\b"), "AWS_ACCESS_KEY"),
+    # Cloudflare API tokens.
+    (re.compile(r"\bcfut_[A-Za-z0-9]{32,}"), "CF_TOKEN"),
+    # Molecule partner API keys (Phase 34).
+    (re.compile(r"\bmol_pk_[A-Za-z0-9]{20,}"), "MOL_PK"),
+    # context7 tokens.
+    (re.compile(r"\bctx7_[A-Za-z0-9]+"), "CTX7_TOKEN"),
+    # High-entropy base64 blobs 33+ chars. Catches long opaque tokens that
+    # don't match any structured pattern above.
+    (re.compile(r"[A-Za-z0-9+/]{33,}={0,2}"), "BASE64_BLOB"),
+]
+
+
+# Substring markers that identify content from the run_code sandbox tool.
+# Any memory entry tagged with this source is excluded wholesale from the
+# snapshot — the arbitrary subprocess output cannot be safely scrubbed by
+# pattern alone (attacker could print `echo "innocent"` but have hidden
+# secrets in stderr or file handles).
+#
+# Keep every marker lowercase: is_sandbox_content() lowercases the content
+# before the substring test, so a marker containing uppercase letters
+# could never match.
+_SANDBOX_TOOL_MARKERS = (
+    "source=sandbox",
+    "tool=run_code",
+    "[sandbox_output]",
+)
+
+
+def scrub_content(content: str) -> str:
+    """Return `content` with secret patterns replaced by [REDACTED:LABEL] markers.
+
+    Idempotent — running scrub_content on already-scrubbed output is a no-op
+    because [REDACTED:...] doesn't match any of the patterns above.
+    """
+    if not content:
+        return content
+    out = content
+    for pattern, label in _SECRET_PATTERNS:
+        out = pattern.sub(f"[REDACTED:{label}]", out)
+    return out
+
+
+def is_sandbox_content(content: str) -> bool:
+    """Return True if `content` originates from the run_code sandbox tool.
+
+    Sandbox output can contain arbitrary subprocess stdout/stderr that may
+    include secrets the scrubber wouldn't recognize (e.g. printed via a
+    custom format). Entries matching this check should be excluded from
+    the snapshot entirely rather than scrubbed.
+    """
+    if not content:
+        return False
+    lower = content.lower()
+    return any(marker in lower for marker in _SANDBOX_TOOL_MARKERS)
+
+
+def scrub_memory_entry(entry: dict[str, Any]) -> dict[str, Any] | None:
+    """Scrub a single memory entry for snapshot inclusion.
+
+    Returns a new dict with secrets redacted, or None if the entry must be
+    excluded entirely (sandbox-sourced content).
+
+    The input dict is treated as read-only — callers should use the returned
+    value and not mutate the original.
+    """
+    content = entry.get("content", "")
+    if is_sandbox_content(content):
+        return None
+    scrubbed = dict(entry)
+    scrubbed["content"] = scrub_content(content)
+    return scrubbed
+
+
+def scrub_snapshot(snapshot: dict[str, Any]) -> dict[str, Any]:
+    """Scrub a full snapshot payload before serialization.
+
+    Walks the `memories` list, scrubs each entry's content, and drops
+    sandbox-sourced entries. Other snapshot fields (workspace metadata,
+    config, etc.) pass through unchanged — they are not expected to contain
+    user-supplied secret-bearing content.
+
+    Returns a new dict; the input is not mutated.
+    """
+    out = dict(snapshot)
+    memories = snapshot.get("memories") or []
+    scrubbed_list = []
+    for entry in memories:
+        cleaned = scrub_memory_entry(entry)
+        if cleaned is not None:
+            scrubbed_list.append(cleaned)
+    out["memories"] = scrubbed_list
+    return out
--- a/workspace/tests/test_snapshot_scrub.py
+++ b/workspace/tests/test_snapshot_scrub.py
@@ -0,0 +1,181 @@
+"""Tests for workspace.lib.snapshot_scrub — issue #823."""
+from __future__ import annotations
+
+import pytest
+
+from lib.snapshot_scrub import (
+    is_sandbox_content,
+    scrub_content,
+    scrub_memory_entry,
+    scrub_snapshot,
+)
+
+
+# ---------- scrub_content ----------
+
+def test_scrub_empty_returns_empty():
+    assert scrub_content("") == ""
+    assert scrub_content("no secrets here") == "no secrets here"
+
+
+def test_scrub_anthropic_key():
+    got = scrub_content("key: sk-ant-api03-aaaaaaaaaaaaaaaaaaaaaa")
+    assert "sk-ant-api03" not in got
+    assert "[REDACTED:SK_TOKEN]" in got
+
+
+def test_scrub_openai_project_key():
+    got = scrub_content("OPENAI_API_KEY=sk-proj-ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890")
+    # Env-var pattern fires first and consumes the whole assignment.
+    assert "sk-proj-" not in got
+    assert "[REDACTED:API_KEY]" in got
+
+
+def test_scrub_github_pat():
+    got = scrub_content("token: ghp_ABCDEFGHIJKLMNOPQRSTUV1234567890")
+    assert "ghp_" not in got
+    assert "[REDACTED:GITHUB_PAT]" in got
+
+
+def test_scrub_bearer_header():
+    got = scrub_content("Authorization: Bearer abc123.def456.ghi789")
+    assert "Bearer abc" not in got
+    assert "[REDACTED:BEARER_TOKEN]" in got
+
+
+def test_scrub_aws_access_key():
+    got = scrub_content("AKIAIOSFODNN7EXAMPLE is embedded")
+    assert "AKIAIOSFODNN7EXAMPLE" not in got
+    assert "[REDACTED:AWS_ACCESS_KEY]" in got
+
+
+def test_scrub_cloudflare_token():
+    got = scrub_content("CF_TOKEN=cfut_abcdefghijklmnopqrstuvwxyz1234567890")
+    assert "cfut_abc" not in got
+    # Env-var pattern wins because it's more specific.
+    assert "[REDACTED:TOKEN]" in got
+
+
+def test_scrub_molecule_partner_key():
+    got = scrub_content("mol_pk_abcdefghijklmnopqrstuvwxyz")
+    assert "mol_pk_abc" not in got
+    assert "[REDACTED:MOL_PK]" in got
+
+
+def test_scrub_idempotent():
+    # Running scrub twice produces the same output — [REDACTED:...] doesn't
+    # itself match any pattern.
+    first = scrub_content("sk-ant-api03-aaaaaaaaaaaaaaaaaaaaaa")
+    second = scrub_content(first)
+    assert first == second
+
+
+def test_scrub_preserves_surrounding_text():
+    got = scrub_content("prefix sk-ant-api03-abcdefghijklmnopqrst suffix")
+    assert "prefix " in got
+    assert " suffix" in got
+    assert "sk-ant-" not in got
+
+
+# ---------- is_sandbox_content ----------
+
+def test_is_sandbox_content_detects_source_tag():
+    assert is_sandbox_content("Some output, source=sandbox logged")
+    assert is_sandbox_content("tool=run_code fired at 2026-01-01")
+
+
+def test_is_sandbox_content_detects_output_marker():
+    assert is_sandbox_content("[sandbox_output] ls -la\ntotal 0")
+
+
+def test_is_sandbox_content_ignores_normal_memory():
+    assert not is_sandbox_content("Remember to check the deploy on Monday")
+    assert not is_sandbox_content("")
+
+
+# ---------- scrub_memory_entry ----------
+
+def test_scrub_memory_entry_redacts_content():
+    entry = {"id": "mem-1", "content": "ANTHROPIC_API_KEY=sk-ant-api03-xxxxxxxxxxxxxxxxxxxx", "scope": "LOCAL"}
+    got = scrub_memory_entry(entry)
+    assert got is not None
+    assert "sk-ant-" not in got["content"]
+    assert got["id"] == "mem-1"
+    assert got["scope"] == "LOCAL"
+
+
+def test_scrub_memory_entry_drops_sandbox():
+    entry = {"id": "mem-sandbox", "content": "source=sandbox cmd output"}
+    got = scrub_memory_entry(entry)
+    assert got is None
+
+
+def test_scrub_memory_entry_preserves_original():
+    entry = {"id": "mem-1", "content": "sk-ant-api03-xxxxxxxxxxxxxxxxxxxx"}
+    _ = scrub_memory_entry(entry)
+    # Original dict unchanged
+    assert entry["content"] == "sk-ant-api03-xxxxxxxxxxxxxxxxxxxx"
+
+
+# ---------- scrub_snapshot ----------
+
+def test_scrub_snapshot_filters_and_redacts():
+    snapshot = {
+        "workspace_id": "ws-1",
+        "memories": [
+            {"id": "m1", "content": "Task completed successfully"},
+            {"id": "m2", "content": "ANTHROPIC_API_KEY=sk-ant-api03-xxxxxxxxxxxxxxxxxxxx"},
+            {"id": "m3", "content": "tool=run_code output: rm -rf /tmp"},
+        ],
+    }
+    got = scrub_snapshot(snapshot)
+    assert got["workspace_id"] == "ws-1"
+    assert len(got["memories"]) == 2  # m3 dropped
+    ids = [m["id"] for m in got["memories"]]
+    assert "m1" in ids
+    assert "m2" in ids
+    assert "m3" not in ids
+    # m2 content redacted
+    m2 = next(m for m in got["memories"] if m["id"] == "m2")
+    assert "sk-ant-" not in m2["content"]
+
+
+def test_scrub_snapshot_empty_memories():
+    snapshot = {"workspace_id": "ws-1", "memories": []}
+    got = scrub_snapshot(snapshot)
+    assert got["memories"] == []
+
+
+def test_scrub_snapshot_missing_memories_key():
+    snapshot = {"workspace_id": "ws-1"}
+    got = scrub_snapshot(snapshot)
+    assert got["memories"] == []
+
+
+def test_scrub_snapshot_does_not_mutate_input():
+    snapshot = {
+        "workspace_id": "ws-1",
+        "memories": [
+            {"id": "m1", "content": "sk-ant-api03-xxxxxxxxxxxxxxxxxxxx"},
+        ],
+    }
+    original_content = snapshot["memories"][0]["content"]
+    _ = scrub_snapshot(snapshot)
+    # Input memory content unchanged
+    assert snapshot["memories"][0]["content"] == original_content
+
+
+# ---------- regression: real-world combined patterns ----------
+
+def test_scrub_combined_secrets_in_one_memory():
+    """A memory that accumulated multiple secrets during a single session."""
+    content = (
+        "Called Anthropic with sk-ant-api03-abcdefghijklmnop "
+        "and GitHub with ghp_ABCDEFGHIJKLMNOPQRST1234567890 "
+        "and got Authorization: Bearer xyz.jwt.token"
+    )
+    got = scrub_content(content)
+    assert "sk-ant-" not in got
+    assert "ghp_" not in got
+    assert "Bearer xyz" not in got
+    assert got.count("[REDACTED:") == 3