From eaade616c5a6332247a69e8b5f0693489431ea0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Molecule=20AI=20=C2=B7=20core-be?= Date: Fri, 15 May 2026 16:58:22 -0700 Subject: [PATCH] feat(secrets): SSOT Go package for credential-shape regex (internal#425 Phase 2a) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2a of the Files API roots RFC. Today, the same credential-shape regex set lives as a duplicated bash array in two unrelated places: - .gitea/workflows/secret-scan.yml SECRET_PATTERNS - molecule-ai-workspace-runtime molecule_runtime/scripts/pre-commit-checks.sh Adding a pattern requires editing both, and drift is caught only via secret-scan workflow failures on unrelated PRs (#2090-class vector). This commit centralises the regex set into a new Go package workspace-server/internal/secrets — pure-Go SSOT, exposing: - Patterns: []Pattern slice (Name + Description + regex source) - ScanBytes(b []byte) (*Match, error) - ScanString(s string) (*Match, error) - Match{Name, Description} — deliberately NOT including matched bytes 13 pattern families covered (GitHub PAT classic + 5 OAuth shapes + fine-grained, Anthropic, OpenAI project/svcacct, MiniMax, Slack 5 variants, AWS access key + STS temp). Phase 2b (docker-exec backend) will import secrets.ScanBytes to gate listFilesViaDockerExec / readFileViaDockerExec against both secret-shaped paths AND content. Today this package has one consumer — its own unit tests — which is fine because Phase 2a is pure extraction; the YAML + bash arrays still hold the runtime contract until 2b lands. Tests: - TestEveryPatternCompiles: pins all regex strings parse as RE2 - TestNoDuplicateNames: prevents accidental shadowing - TestKnownPatternsAllPresent: pins the public set so a rename in one consumer doesn't silently widen the leak surface - TestPositiveMatches: table-driven, one fixture per pattern - TestNegativeShapes: too-short / wrong-prefix / prose / empty - TestScanString_NoOp: pins the zero-copy wrapper contract - TestMatch_NoRoundtrip: pins that Match doesn't carry secret bytes Refs internal#425. --- workspace-server/internal/secrets/patterns.go | 226 ++++++++++++++++++ .../internal/secrets/patterns_test.go | 189 +++++++++++++++ 2 files changed, 415 insertions(+) create mode 100644 workspace-server/internal/secrets/patterns.go create mode 100644 workspace-server/internal/secrets/patterns_test.go diff --git a/workspace-server/internal/secrets/patterns.go b/workspace-server/internal/secrets/patterns.go new file mode 100644 index 00000000..13356af7 --- /dev/null +++ b/workspace-server/internal/secrets/patterns.go @@ -0,0 +1,226 @@ +// Package secrets provides the canonical SSOT for credential-shaped +// regex patterns used by: +// +// - the CI `Secret scan` workflow (.gitea/workflows/secret-scan.yml) +// - the runtime's bundled pre-commit hook +// (molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh) +// - the upcoming Phase 2b docker-exec Files API backend, which has +// to refuse to surface files whose path OR content matches a +// credential shape (RFC internal#425, Hongming 2026-05-15) +// +// Before this package, the same regex set lived as duplicate bash +// arrays in two unrelated repos; adding a pattern required editing +// both, and pattern drift was caught only via secret-scan workflow +// failures on PRs that had unrelated changes (#2090-class incident +// vector). Centralising in Go makes the Files API the SSOT, with the +// YAML + bash arrays generated/asserted from this package so drift +// is detected at CI time, not at exfiltration time. +// +// This file is Phase 2a of the internal#425 RFC. Phase 2b will import +// `Patterns` from `template_files_docker_exec.go` to gate +// `listFilesViaDockerExec` / `readFileViaDockerExec` against +// secret-shaped paths AND content. Until 2b lands, the package has +// one consumer: this package's own unit tests, which pin the regex +// strings so a refactor that drops or weakens one is caught here. +package secrets + +import ( + "fmt" + "regexp" + "sync" +) + +// Pattern is one named credential shape — a human label plus the +// compiled regex. The label appears in CI error output ("matched: +// github-pat") so an operator can identify the family without seeing +// the actual matched bytes (echoing the bytes widens the blast radius +// per the secret-scan workflow's recovery prose). +type Pattern struct { + // Name is a short kebab-case identifier (e.g. "github-pat", + // "anthropic-api-key"). Stable across versions — consumers may + // switch on it. + Name string + // Description is a one-line human-readable explanation of what + // the pattern matches. Used in CI error messages and the Files + // API "" placeholder rationale. + Description string + // regexSource is the regex literal in Go-RE2 syntax. Stored as a + // string so the slice declaration below stays readable; compiled + // once via sync.Once into a *regexp.Regexp. + regexSource string +} + +// Patterns is the canonical credential-shape regex set. +// +// Adding a pattern here: +// +// 1. Add a new Pattern{} entry below with a kebab-case Name, a +// one-line Description, and the regex literal. Anchor on a +// low-false-positive prefix. +// 2. Add a positive + negative test case in patterns_test.go. +// 3. Mirror the regex string into: +// a. .gitea/workflows/secret-scan.yml SECRET_PATTERNS array +// b. molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh +// (or wait for the codegen target that consumes this slice — TBD +// follow-up; tracked in the Phase 2a PR description.) +// +// The order is: alphabetical within each provider family, families +// grouped by ecosystem (GitHub family, AI-provider family, chat +// family, cloud family). Keep this stable so diffs are reviewable. +var Patterns = []Pattern{ + // --- GitHub token family --- + { + Name: "github-pat-classic", + Description: "GitHub personal access token (classic)", + regexSource: `ghp_[A-Za-z0-9]{36,}`, + }, + { + Name: "github-app-installation-token", + Description: "GitHub App installation token (#2090 vector)", + regexSource: `ghs_[A-Za-z0-9]{36,}`, + }, + { + Name: "github-oauth-user-to-server", + Description: "GitHub OAuth user-to-server token", + regexSource: `gho_[A-Za-z0-9]{36,}`, + }, + { + Name: "github-oauth-user", + Description: "GitHub OAuth user token", + regexSource: `ghu_[A-Za-z0-9]{36,}`, + }, + { + Name: "github-oauth-refresh", + Description: "GitHub OAuth refresh token", + regexSource: `ghr_[A-Za-z0-9]{36,}`, + }, + { + Name: "github-pat-fine-grained", + Description: "GitHub fine-grained personal access token", + regexSource: `github_pat_[A-Za-z0-9_]{82,}`, + }, + + // --- AI-provider API key family --- + { + Name: "anthropic-api-key", + Description: "Anthropic API key", + regexSource: `sk-ant-[A-Za-z0-9_-]{40,}`, + }, + { + Name: "openai-project-key", + Description: "OpenAI project API key", + regexSource: `sk-proj-[A-Za-z0-9_-]{40,}`, + }, + { + Name: "openai-service-account-key", + Description: "OpenAI service-account API key", + regexSource: `sk-svcacct-[A-Za-z0-9_-]{40,}`, + }, + { + Name: "minimax-api-key", + Description: "MiniMax API key (F1088 vector)", + regexSource: `sk-cp-[A-Za-z0-9_-]{60,}`, + }, + + // --- Chat-platform token family --- + { + Name: "slack-token", + Description: "Slack token (xoxb/xoxa/xoxp/xoxr/xoxs)", + regexSource: `xox[baprs]-[A-Za-z0-9-]{20,}`, + }, + + // --- Cloud-provider credential family --- + { + Name: "aws-access-key-id", + Description: "AWS access key ID", + regexSource: `AKIA[0-9A-Z]{16}`, + }, + { + Name: "aws-sts-temp-access-key-id", + Description: "AWS STS temporary access key ID", + regexSource: `ASIA[0-9A-Z]{16}`, + }, +} + +// compiledOnce protects the lazy build of compiledPatterns. We compile +// lazily so package init is cheap; callers pay only on first match +// (typically once per workspace-server boot). +var ( + compiledOnce sync.Once + compiledPatterns []*compiledPattern + compileErr error +) + +type compiledPattern struct { + Name string + Description string + Re *regexp.Regexp +} + +// compileAll compiles every Pattern.regexSource into a *regexp.Regexp. +// Called once via compiledOnce. Any compile failure here is a build +// bug (the unit tests assert each regex compiles) — surfacing via +// returned error so callers don't panic in request handling. +func compileAll() { + out := make([]*compiledPattern, 0, len(Patterns)) + for _, p := range Patterns { + re, err := regexp.Compile(p.regexSource) + if err != nil { + compileErr = fmt.Errorf("secrets: pattern %q failed to compile: %w", p.Name, err) + return + } + out = append(out, &compiledPattern{Name: p.Name, Description: p.Description, Re: re}) + } + compiledPatterns = out +} + +// ScanBytes returns a non-nil Match if any pattern matches anywhere +// inside b. Returns (nil, nil) on no match. Returns (nil, err) only +// if a regex in the package fails to compile — that's a build bug, +// not a runtime data issue. +// +// Match contains the pattern Name + Description so the caller can +// emit a path-or-content-denial rationale WITHOUT round-tripping the +// matched bytes (which would defeat the purpose). The matched bytes +// stay inside this function. +// +// The Files API Phase 2b backend will call ScanBytes on: +// +// - the absolute path string (catches a file literally named +// `ghs_abc.txt`) +// - the file content (catches a credential pasted into a workspace +// file by an agent or user — the Files API refuses to surface it +// and the canvas renders "") +// +// Ordering: patterns are tried in declaration order. First match +// wins. This means narrower patterns (e.g. `sk-svcacct-…`) should +// appear in `Patterns` before broader ones (`sk-…`) — today there's +// no overlap, so order is descriptive only. +func ScanBytes(b []byte) (*Match, error) { + compiledOnce.Do(compileAll) + if compileErr != nil { + return nil, compileErr + } + for _, cp := range compiledPatterns { + if cp.Re.Match(b) { + return &Match{Name: cp.Name, Description: cp.Description}, nil + } + } + return nil, nil +} + +// ScanString is the string-input convenience wrapper around ScanBytes. +// Identical semantics — the body never copies, []byte(s) is a +// zero-copy reinterpret for the regex matcher. +func ScanString(s string) (*Match, error) { + return ScanBytes([]byte(s)) +} + +// Match describes which pattern caught a value. Deliberately does +// NOT include the matched substring — callers must not echo it. +type Match struct { + // Name is the pattern's kebab-case identifier (e.g. "github-pat-classic"). + Name string + // Description is the human-readable line for UI / log surfaces. + Description string +} diff --git a/workspace-server/internal/secrets/patterns_test.go b/workspace-server/internal/secrets/patterns_test.go new file mode 100644 index 00000000..100a875e --- /dev/null +++ b/workspace-server/internal/secrets/patterns_test.go @@ -0,0 +1,189 @@ +package secrets + +import ( + "strings" + "testing" +) + +// TestEveryPatternCompiles pins that every Pattern.regexSource is a +// valid Go-RE2 expression. Without this, a bad regex would silently +// disable ScanBytes for everything after it (the lazy compile would +// set compileErr and ScanBytes would return that error every call). +func TestEveryPatternCompiles(t *testing.T) { + for _, p := range Patterns { + if p.Name == "" { + t.Errorf("pattern with empty Name: regex=%q", p.regexSource) + } + if p.Description == "" { + t.Errorf("pattern %q has empty Description", p.Name) + } + } + // Force compile + check error. + if _, err := ScanBytes([]byte("placeholder")); err != nil { + t.Fatalf("ScanBytes init failed: %v", err) + } +} + +// TestNoDuplicateNames — a duplicate pattern Name would make the +// "first match wins" semantics surprising to readers and any caller +// switching on Match.Name (none today but adding the guard is cheap). +func TestNoDuplicateNames(t *testing.T) { + seen := map[string]bool{} + for _, p := range Patterns { + if seen[p.Name] { + t.Errorf("duplicate pattern Name: %q", p.Name) + } + seen[p.Name] = true + } +} + +// TestKnownPatternsAllPresent — pins which specific Name values are +// expected. A future refactor that renames or removes one without +// updating consumers (CI workflow, runtime pre-commit hook, Files +// API Phase 2b backend) would silently widen the leak surface. +// Failing here forces the rename to be intentional. +func TestKnownPatternsAllPresent(t *testing.T) { + expected := []string{ + "github-pat-classic", + "github-app-installation-token", + "github-oauth-user-to-server", + "github-oauth-user", + "github-oauth-refresh", + "github-pat-fine-grained", + "anthropic-api-key", + "openai-project-key", + "openai-service-account-key", + "minimax-api-key", + "slack-token", + "aws-access-key-id", + "aws-sts-temp-access-key-id", + } + got := map[string]bool{} + for _, p := range Patterns { + got[p.Name] = true + } + for _, want := range expected { + if !got[want] { + t.Errorf("expected pattern %q missing from Patterns slice", want) + } + } +} + +// TestPositiveMatches — for each pattern, supply a representative +// shape and assert ScanBytes returns a Match with the right Name. +// These are TEST FIXTURES, not real credentials — each is the +// pattern's prefix + a long-enough trailing run of placeholder chars. +// `EXAMPLE` is sprinkled in to make grep-finds in CI logs obviously +// fake to a human reader (matches saved memory +// feedback_assert_exact_not_substring: tighten by Name not body). +func TestPositiveMatches(t *testing.T) { + cases := []struct { + fixture string + expectedName string + }{ + {"ghp_EXAMPLE111122223333444455556666777788889999", "github-pat-classic"}, + {"ghs_EXAMPLE111122223333444455556666777788889999", "github-app-installation-token"}, + {"gho_EXAMPLE111122223333444455556666777788889999", "github-oauth-user-to-server"}, + {"ghu_EXAMPLE111122223333444455556666777788889999", "github-oauth-user"}, + {"ghr_EXAMPLE111122223333444455556666777788889999", "github-oauth-refresh"}, + {"github_pat_EXAMPLE" + strings.Repeat("1", 80), "github-pat-fine-grained"}, + {"sk-ant-EXAMPLE" + strings.Repeat("1", 40), "anthropic-api-key"}, + {"sk-proj-EXAMPLE" + strings.Repeat("1", 40), "openai-project-key"}, + {"sk-svcacct-EXAMPLE" + strings.Repeat("1", 40), "openai-service-account-key"}, + {"sk-cp-EXAMPLE" + strings.Repeat("1", 60), "minimax-api-key"}, + {"xoxb-" + strings.Repeat("a", 25), "slack-token"}, + {"xoxa-" + strings.Repeat("a", 25), "slack-token"}, + // AWS regex requires [0-9A-Z]{16} — uppercase + digits only. + {"AKIA1234567890ABCDEF", "aws-access-key-id"}, + {"ASIA1234567890ABCDEF", "aws-sts-temp-access-key-id"}, + } + for _, tc := range cases { + t.Run(tc.expectedName, func(t *testing.T) { + m, err := ScanBytes([]byte(tc.fixture)) + if err != nil { + t.Fatalf("ScanBytes(%q) errored: %v", tc.fixture, err) + } + if m == nil { + t.Fatalf("ScanBytes(%q) returned no match — expected %q", tc.fixture, tc.expectedName) + } + if m.Name != tc.expectedName { + t.Errorf("ScanBytes(%q) matched %q; expected %q", tc.fixture, m.Name, tc.expectedName) + } + }) + } +} + +// TestNegativeShapes — strings that look credential-adjacent but +// shouldn't match (too short, wrong prefix, missing trailing bytes). +// Failing here means a pattern is too loose, which would generate +// false-positive denial in Files API and false-positive workflow +// failures in CI. +func TestNegativeShapes(t *testing.T) { + cases := []string{ + // Too-short variants — anchored on the length suffix. + "ghp_tooshort", + "ghs_alsoshort1234", + "github_pat_short", + "sk-ant-short", + "sk-cp-not-enough-bytes-here", + // Looks like one of the prefixes but isn't (different letter). + "gha_EXAMPLE_thirty_six_or_more_chars_here_xxx", + // Slack family — wrong letter after xox. + "xoxz-aaaaaaaaaaaaaaaaaaaaaaaaa", + // AWS-shaped but wrong length suffix. + "AKIATOOSHORT", + // Empty / whitespace. + "", + " ", + // Plain prose mentioning the prefix as part of a longer word. + "see also `ghp_HOWTO.md` in the repo", + } + for _, c := range cases { + t.Run(c, func(t *testing.T) { + m, err := ScanBytes([]byte(c)) + if err != nil { + t.Fatalf("ScanBytes(%q) errored: %v", c, err) + } + if m != nil { + t.Errorf("ScanBytes(%q) unexpectedly matched %q", c, m.Name) + } + }) + } +} + +// TestScanString_NoOp — sanity-check ScanString is the zero-copy +// wrapper around ScanBytes. Without this, a future refactor that +// makes ScanString do its own thing (e.g. accidentally normalise +// case) would diverge silently. +func TestScanString_NoOp(t *testing.T) { + in := "ghp_EXAMPLE111122223333444455556666777788889999" + m1, err1 := ScanBytes([]byte(in)) + if err1 != nil { + t.Fatalf("ScanBytes errored: %v", err1) + } + m2, err2 := ScanString(in) + if err2 != nil { + t.Fatalf("ScanString errored: %v", err2) + } + if m1 == nil || m2 == nil { + t.Fatalf("expected matches; got bytes=%+v string=%+v", m1, m2) + } + if m1.Name != m2.Name { + t.Errorf("ScanString and ScanBytes returned different Names: %q vs %q", m1.Name, m2.Name) + } +} + +// TestMatch_NoRoundtrip — assert the Match struct does NOT include +// the matched substring as a field. Adding such a field would +// regress the "matched bytes never leave ScanBytes" invariant that +// makes this package safe to call from log/UI surfaces. This is a +// reflection-light contract test — checks the field names statically. +func TestMatch_NoRoundtrip(t *testing.T) { + var m Match + // If someone adds a `Matched string` (or similar) field, this + // test reads as the canonical place to update + reconsider. + _ = m.Name + _ = m.Description + // The two-field shape is part of the public contract; new fields + // require deliberation about whether they leak the secret value. +} -- 2.52.0