Merge pull request 'feat(secrets): SSOT Go package for credential-shape regex (internal#425 Phase 2a)' (#1255) from feat/secrets-patterns-ssot-internal-425-phase-2a into staging
Some checks failed
Block internal-flavored paths / Block forbidden paths (push) Waiting to run
CI / Detect changes (push) Waiting to run
CI / Platform (Go) (push) Waiting to run
CI / Canvas (Next.js) (push) Waiting to run
CI / Shellcheck (E2E scripts) (push) Blocked by required conditions
CI / Canvas Deploy Reminder (push) Blocked by required conditions
CI / Python Lint & Test (push) Blocked by required conditions
CI / all-required (push) Blocked by required conditions
Handlers Postgres Integration / detect-changes (push) Waiting to run
Handlers Postgres Integration / Handlers Postgres Integration (push) Blocked by required conditions
E2E Chat / detect-changes (push) Successful in 1m31s
E2E API Smoke Test / detect-changes (push) Successful in 1m48s
Harness Replays / detect-changes (push) Successful in 33s
Secret scan / Scan diff for credential-shaped strings (push) Failing after 39s
Runtime PR-Built Compatibility / detect-changes (push) Successful in 2m10s
E2E API Smoke Test / E2E API Smoke Test (push) Has been cancelled
Harness Replays / Harness Replays (push) Successful in 21s
E2E Chat / E2E Chat (push) Has been cancelled
Runtime PR-Built Compatibility / PR-built wheel + import smoke (push) Has been cancelled

This commit is contained in:
devops-engineer 2026-05-16 00:43:51 +00:00
commit 5ace10fd14
2 changed files with 415 additions and 0 deletions

View File

@ -0,0 +1,226 @@
// Package secrets provides the canonical SSOT for credential-shaped
// regex patterns used by:
//
// - the CI `Secret scan` workflow (.gitea/workflows/secret-scan.yml)
// - the runtime's bundled pre-commit hook
// (molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh)
// - the upcoming Phase 2b docker-exec Files API backend, which has
// to refuse to surface files whose path OR content matches a
// credential shape (RFC internal#425, Hongming 2026-05-15)
//
// Before this package, the same regex set lived as duplicate bash
// arrays in two unrelated repos; adding a pattern required editing
// both, and pattern drift was caught only via secret-scan workflow
// failures on PRs that had unrelated changes (#2090-class incident
// vector). Centralising in Go makes the Files API the SSOT, with the
// YAML + bash arrays generated/asserted from this package so drift
// is detected at CI time, not at exfiltration time.
//
// This file is Phase 2a of the internal#425 RFC. Phase 2b will import
// `Patterns` from `template_files_docker_exec.go` to gate
// `listFilesViaDockerExec` / `readFileViaDockerExec` against
// secret-shaped paths AND content. Until 2b lands, the package has
// one consumer: this package's own unit tests, which pin the regex
// strings so a refactor that drops or weakens one is caught here.
package secrets
import (
"fmt"
"regexp"
"sync"
)
// Pattern is one named credential shape — a human label plus the
// compiled regex. The label appears in CI error output ("matched:
// github-pat") so an operator can identify the family without seeing
// the actual matched bytes (echoing the bytes widens the blast radius
// per the secret-scan workflow's recovery prose).
type Pattern struct {
// Name is a short kebab-case identifier (e.g. "github-pat",
// "anthropic-api-key"). Stable across versions — consumers may
// switch on it.
Name string
// Description is a one-line human-readable explanation of what
// the pattern matches. Used in CI error messages and the Files
// API "<denied: secret-shape>" placeholder rationale.
Description string
// regexSource is the regex literal in Go-RE2 syntax. Stored as a
// string so the slice declaration below stays readable; compiled
// once via sync.Once into a *regexp.Regexp.
regexSource string
}
// Patterns is the canonical credential-shape regex set.
//
// Adding a pattern here:
//
// 1. Add a new Pattern{} entry below with a kebab-case Name, a
// one-line Description, and the regex literal. Anchor on a
// low-false-positive prefix.
// 2. Add a positive + negative test case in patterns_test.go.
// 3. Mirror the regex string into:
// a. .gitea/workflows/secret-scan.yml SECRET_PATTERNS array
// b. molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh
// (or wait for the codegen target that consumes this slice — TBD
// follow-up; tracked in the Phase 2a PR description.)
//
// The order is: alphabetical within each provider family, families
// grouped by ecosystem (GitHub family, AI-provider family, chat
// family, cloud family). Keep this stable so diffs are reviewable.
var Patterns = []Pattern{
// --- GitHub token family ---
{
Name: "github-pat-classic",
Description: "GitHub personal access token (classic)",
regexSource: `ghp_[A-Za-z0-9]{36,}`,
},
{
Name: "github-app-installation-token",
Description: "GitHub App installation token (#2090 vector)",
regexSource: `ghs_[A-Za-z0-9]{36,}`,
},
{
Name: "github-oauth-user-to-server",
Description: "GitHub OAuth user-to-server token",
regexSource: `gho_[A-Za-z0-9]{36,}`,
},
{
Name: "github-oauth-user",
Description: "GitHub OAuth user token",
regexSource: `ghu_[A-Za-z0-9]{36,}`,
},
{
Name: "github-oauth-refresh",
Description: "GitHub OAuth refresh token",
regexSource: `ghr_[A-Za-z0-9]{36,}`,
},
{
Name: "github-pat-fine-grained",
Description: "GitHub fine-grained personal access token",
regexSource: `github_pat_[A-Za-z0-9_]{82,}`,
},
// --- AI-provider API key family ---
{
Name: "anthropic-api-key",
Description: "Anthropic API key",
regexSource: `sk-ant-[A-Za-z0-9_-]{40,}`,
},
{
Name: "openai-project-key",
Description: "OpenAI project API key",
regexSource: `sk-proj-[A-Za-z0-9_-]{40,}`,
},
{
Name: "openai-service-account-key",
Description: "OpenAI service-account API key",
regexSource: `sk-svcacct-[A-Za-z0-9_-]{40,}`,
},
{
Name: "minimax-api-key",
Description: "MiniMax API key (F1088 vector)",
regexSource: `sk-cp-[A-Za-z0-9_-]{60,}`,
},
// --- Chat-platform token family ---
{
Name: "slack-token",
Description: "Slack token (xoxb/xoxa/xoxp/xoxr/xoxs)",
regexSource: `xox[baprs]-[A-Za-z0-9-]{20,}`,
},
// --- Cloud-provider credential family ---
{
Name: "aws-access-key-id",
Description: "AWS access key ID",
regexSource: `AKIA[0-9A-Z]{16}`,
},
{
Name: "aws-sts-temp-access-key-id",
Description: "AWS STS temporary access key ID",
regexSource: `ASIA[0-9A-Z]{16}`,
},
}
// compiledOnce protects the lazy build of compiledPatterns. We compile
// lazily so package init is cheap; callers pay only on first match
// (typically once per workspace-server boot).
var (
compiledOnce sync.Once
compiledPatterns []*compiledPattern
compileErr error
)
type compiledPattern struct {
Name string
Description string
Re *regexp.Regexp
}
// compileAll compiles every Pattern.regexSource into a *regexp.Regexp.
// Called once via compiledOnce. Any compile failure here is a build
// bug (the unit tests assert each regex compiles) — surfacing via
// returned error so callers don't panic in request handling.
func compileAll() {
out := make([]*compiledPattern, 0, len(Patterns))
for _, p := range Patterns {
re, err := regexp.Compile(p.regexSource)
if err != nil {
compileErr = fmt.Errorf("secrets: pattern %q failed to compile: %w", p.Name, err)
return
}
out = append(out, &compiledPattern{Name: p.Name, Description: p.Description, Re: re})
}
compiledPatterns = out
}
// ScanBytes returns a non-nil Match if any pattern matches anywhere
// inside b. Returns (nil, nil) on no match. Returns (nil, err) only
// if a regex in the package fails to compile — that's a build bug,
// not a runtime data issue.
//
// Match contains the pattern Name + Description so the caller can
// emit a path-or-content-denial rationale WITHOUT round-tripping the
// matched bytes (which would defeat the purpose). The matched bytes
// stay inside this function.
//
// The Files API Phase 2b backend will call ScanBytes on:
//
// - the absolute path string (catches a file literally named
// `ghs_abc.txt`)
// - the file content (catches a credential pasted into a workspace
// file by an agent or user — the Files API refuses to surface it
// and the canvas renders "<denied: secret-shape>")
//
// Ordering: patterns are tried in declaration order. First match
// wins. This means narrower patterns (e.g. `sk-svcacct-…`) should
// appear in `Patterns` before broader ones (`sk-…`) — today there's
// no overlap, so order is descriptive only.
func ScanBytes(b []byte) (*Match, error) {
compiledOnce.Do(compileAll)
if compileErr != nil {
return nil, compileErr
}
for _, cp := range compiledPatterns {
if cp.Re.Match(b) {
return &Match{Name: cp.Name, Description: cp.Description}, nil
}
}
return nil, nil
}
// ScanString is the string-input convenience wrapper around ScanBytes.
// Identical semantics — the body never copies, []byte(s) is a
// zero-copy reinterpret for the regex matcher.
func ScanString(s string) (*Match, error) {
return ScanBytes([]byte(s))
}
// Match describes which pattern caught a value. Deliberately does
// NOT include the matched substring — callers must not echo it.
type Match struct {
// Name is the pattern's kebab-case identifier (e.g. "github-pat-classic").
Name string
// Description is the human-readable line for UI / log surfaces.
Description string
}

View File

@ -0,0 +1,189 @@
package secrets
import (
"strings"
"testing"
)
// TestEveryPatternCompiles pins that every Pattern.regexSource is a
// valid Go-RE2 expression. Without this, a bad regex would silently
// disable ScanBytes for everything after it (the lazy compile would
// set compileErr and ScanBytes would return that error every call).
func TestEveryPatternCompiles(t *testing.T) {
for _, p := range Patterns {
if p.Name == "" {
t.Errorf("pattern with empty Name: regex=%q", p.regexSource)
}
if p.Description == "" {
t.Errorf("pattern %q has empty Description", p.Name)
}
}
// Force compile + check error.
if _, err := ScanBytes([]byte("placeholder")); err != nil {
t.Fatalf("ScanBytes init failed: %v", err)
}
}
// TestNoDuplicateNames — a duplicate pattern Name would make the
// "first match wins" semantics surprising to readers and any caller
// switching on Match.Name (none today but adding the guard is cheap).
func TestNoDuplicateNames(t *testing.T) {
seen := map[string]bool{}
for _, p := range Patterns {
if seen[p.Name] {
t.Errorf("duplicate pattern Name: %q", p.Name)
}
seen[p.Name] = true
}
}
// TestKnownPatternsAllPresent — pins which specific Name values are
// expected. A future refactor that renames or removes one without
// updating consumers (CI workflow, runtime pre-commit hook, Files
// API Phase 2b backend) would silently widen the leak surface.
// Failing here forces the rename to be intentional.
func TestKnownPatternsAllPresent(t *testing.T) {
expected := []string{
"github-pat-classic",
"github-app-installation-token",
"github-oauth-user-to-server",
"github-oauth-user",
"github-oauth-refresh",
"github-pat-fine-grained",
"anthropic-api-key",
"openai-project-key",
"openai-service-account-key",
"minimax-api-key",
"slack-token",
"aws-access-key-id",
"aws-sts-temp-access-key-id",
}
got := map[string]bool{}
for _, p := range Patterns {
got[p.Name] = true
}
for _, want := range expected {
if !got[want] {
t.Errorf("expected pattern %q missing from Patterns slice", want)
}
}
}
// TestPositiveMatches — for each pattern, supply a representative
// shape and assert ScanBytes returns a Match with the right Name.
// These are TEST FIXTURES, not real credentials — each is the
// pattern's prefix + a long-enough trailing run of placeholder chars.
// `EXAMPLE` is sprinkled in to make grep-finds in CI logs obviously
// fake to a human reader (matches saved memory
// feedback_assert_exact_not_substring: tighten by Name not body).
func TestPositiveMatches(t *testing.T) {
cases := []struct {
fixture string
expectedName string
}{
{"ghp_EXAMPLE111122223333444455556666777788889999", "github-pat-classic"},
{"ghs_EXAMPLE111122223333444455556666777788889999", "github-app-installation-token"},
{"gho_EXAMPLE111122223333444455556666777788889999", "github-oauth-user-to-server"},
{"ghu_EXAMPLE111122223333444455556666777788889999", "github-oauth-user"},
{"ghr_EXAMPLE111122223333444455556666777788889999", "github-oauth-refresh"},
{"github_pat_EXAMPLE" + strings.Repeat("1", 80), "github-pat-fine-grained"},
{"sk-ant-EXAMPLE" + strings.Repeat("1", 40), "anthropic-api-key"},
{"sk-proj-EXAMPLE" + strings.Repeat("1", 40), "openai-project-key"},
{"sk-svcacct-EXAMPLE" + strings.Repeat("1", 40), "openai-service-account-key"},
{"sk-cp-EXAMPLE" + strings.Repeat("1", 60), "minimax-api-key"},
{"xoxb-" + strings.Repeat("a", 25), "slack-token"},
{"xoxa-" + strings.Repeat("a", 25), "slack-token"},
// AWS regex requires [0-9A-Z]{16} — uppercase + digits only.
{"AKIA1234567890ABCDEF", "aws-access-key-id"},
{"ASIA1234567890ABCDEF", "aws-sts-temp-access-key-id"},
}
for _, tc := range cases {
t.Run(tc.expectedName, func(t *testing.T) {
m, err := ScanBytes([]byte(tc.fixture))
if err != nil {
t.Fatalf("ScanBytes(%q) errored: %v", tc.fixture, err)
}
if m == nil {
t.Fatalf("ScanBytes(%q) returned no match — expected %q", tc.fixture, tc.expectedName)
}
if m.Name != tc.expectedName {
t.Errorf("ScanBytes(%q) matched %q; expected %q", tc.fixture, m.Name, tc.expectedName)
}
})
}
}
// TestNegativeShapes — strings that look credential-adjacent but
// shouldn't match (too short, wrong prefix, missing trailing bytes).
// Failing here means a pattern is too loose, which would generate
// false-positive denial in Files API and false-positive workflow
// failures in CI.
func TestNegativeShapes(t *testing.T) {
cases := []string{
// Too-short variants — anchored on the length suffix.
"ghp_tooshort",
"ghs_alsoshort1234",
"github_pat_short",
"sk-ant-short",
"sk-cp-not-enough-bytes-here",
// Looks like one of the prefixes but isn't (different letter).
"gha_EXAMPLE_thirty_six_or_more_chars_here_xxx",
// Slack family — wrong letter after xox.
"xoxz-aaaaaaaaaaaaaaaaaaaaaaaaa",
// AWS-shaped but wrong length suffix.
"AKIATOOSHORT",
// Empty / whitespace.
"",
" ",
// Plain prose mentioning the prefix as part of a longer word.
"see also `ghp_HOWTO.md` in the repo",
}
for _, c := range cases {
t.Run(c, func(t *testing.T) {
m, err := ScanBytes([]byte(c))
if err != nil {
t.Fatalf("ScanBytes(%q) errored: %v", c, err)
}
if m != nil {
t.Errorf("ScanBytes(%q) unexpectedly matched %q", c, m.Name)
}
})
}
}
// TestScanString_NoOp — sanity-check ScanString is the zero-copy
// wrapper around ScanBytes. Without this, a future refactor that
// makes ScanString do its own thing (e.g. accidentally normalise
// case) would diverge silently.
func TestScanString_NoOp(t *testing.T) {
in := "ghp_EXAMPLE111122223333444455556666777788889999"
m1, err1 := ScanBytes([]byte(in))
if err1 != nil {
t.Fatalf("ScanBytes errored: %v", err1)
}
m2, err2 := ScanString(in)
if err2 != nil {
t.Fatalf("ScanString errored: %v", err2)
}
if m1 == nil || m2 == nil {
t.Fatalf("expected matches; got bytes=%+v string=%+v", m1, m2)
}
if m1.Name != m2.Name {
t.Errorf("ScanString and ScanBytes returned different Names: %q vs %q", m1.Name, m2.Name)
}
}
// TestMatch_NoRoundtrip — assert the Match struct does NOT include
// the matched substring as a field. Adding such a field would
// regress the "matched bytes never leave ScanBytes" invariant that
// makes this package safe to call from log/UI surfaces. This is a
// reflection-light contract test — checks the field names statically.
func TestMatch_NoRoundtrip(t *testing.T) {
var m Match
// If someone adds a `Matched string` (or similar) field, this
// test reads as the canonical place to update + reconsider.
_ = m.Name
_ = m.Description
// The two-field shape is part of the public contract; new fields
// require deliberation about whether they leak the secret value.
}