From 061716f3c24d7a979683211e57422a5f49debded Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Sun, 7 Jun 2026 19:17:24 +0000 Subject: [PATCH 1/2] security(scripts): use GIT_ASKPASS instead of embedding token in git clone URL (runtime#86) Replace URL-embedded-token pattern in check_consumer_runtime_drift.py and check_platform_comm_contract.py with a temporary GIT_ASKPASS script. The token no longer appears in: - subprocess argv (visible via ps, /proc/*/cmdline) - git remote URL (visible via git remote -v) - git diagnostics or logs Both scripts now clone via plain HTTPS URL + GIT_ASKPASS env var. The askpass script echoes x-access-token for username prompts and the real token for password prompts, then is immediately unlinked. Add regression test test_clone_consumers_never_puts_token_in_argv. 12/12 tests pass. Fixes runtime#86. Co-Authored-By: Claude Opus 4.8 --- scripts/check_consumer_runtime_drift.py | 41 ++++++++++++++------- scripts/check_platform_comm_contract.py | 42 +++++++++++++++------- tests/test_consumer_runtime_drift_guard.py | 25 +++++++++++++ 3 files changed, 84 insertions(+), 24 deletions(-) diff --git a/scripts/check_consumer_runtime_drift.py b/scripts/check_consumer_runtime_drift.py index cb5068d..681c2a0 100644 --- a/scripts/check_consumer_runtime_drift.py +++ b/scripts/check_consumer_runtime_drift.py @@ -112,6 +112,32 @@ def find_runtime_drift(repo_name: str, repo_path: Path, runtime_root: Path | Non return findings +def _git_clone_with_token(dest: Path, url: str, token: str) -> subprocess.CompletedProcess[str]: + """Clone using GIT_ASKPASS so the token never appears in argv or remote URL.""" + import shlex + + with tempfile.NamedTemporaryFile(mode="w", suffix=".sh", delete=False) as f: + f.write("#!/bin/sh\n") + f.write('case "$1" in\n') + f.write(' *Username*) echo "x-access-token" ;;\n') + f.write(f' *Password*) echo {shlex.quote(token)} ;;\n') + f.write("esac\n") + askpass = f.name + os.chmod(askpass, 0o700) + try: + 
return subprocess.run( + ["git", "clone", "--depth", "1", url, str(dest)], + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + timeout=30, + env={**os.environ, "GIT_ASKPASS": askpass}, + ) + finally: + os.unlink(askpass) + + def clone_consumers( workdir: Path, repos: tuple[str, ...], @@ -126,28 +152,19 @@ def clone_consumers( parsed_url = urlsplit(gitea_url) if parsed_url.scheme not in {"http", "https"} or not parsed_url.netloc: raise RuntimeError(f"invalid Gitea URL: {gitea_url}") - safe_token = quote(token, safe="") - base_url = f"{parsed_url.scheme}://x-access-token:{safe_token}@{parsed_url.netloc}" + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" for repo in repos: dest = workdir / repo clone_url = f"{base_url}/molecule-ai/{repo}.git" - last_exc: Exception | None = None for attempt in range(1, 4): - result = subprocess.run( - ["git", "clone", "--depth", "1", clone_url, str(dest)], - check=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - timeout=30, - ) + result = _git_clone_with_token(dest, clone_url, token) if result.returncode == 0: paths[repo] = dest break if attempt < 3: time.sleep(2 ** (attempt - 1)) continue - stderr = result.stderr.replace(token, "").replace(safe_token, "") + stderr = result.stderr.replace(token, "") raise RuntimeError(f"failed to clone {repo} after 3 attempts: {stderr.strip()}") return paths diff --git a/scripts/check_platform_comm_contract.py b/scripts/check_platform_comm_contract.py index 8ee8f9f..7f67e8d 100755 --- a/scripts/check_platform_comm_contract.py +++ b/scripts/check_platform_comm_contract.py @@ -278,6 +278,32 @@ def find_platform_comm_drift(repo_name: str, repo_path: Path) -> list[ContractFi return [] +def _git_clone_with_token(dest: Path, url: str, token: str) -> subprocess.CompletedProcess[str]: + """Clone using GIT_ASKPASS so the token never appears in argv or remote URL.""" + import shlex + + with tempfile.NamedTemporaryFile(mode="w", suffix=".sh", 
delete=False) as f: + f.write("#!/bin/sh\n") + f.write('case "$1" in\n') + f.write(' *Username*) echo "x-access-token" ;;\n') + f.write(f' *Password*) echo {shlex.quote(token)} ;;\n') + f.write("esac\n") + askpass = f.name + os.chmod(askpass, 0o700) + try: + return subprocess.run( + ["git", "clone", "--depth", "1", url, str(dest)], + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + timeout=30, + env={**os.environ, "GIT_ASKPASS": askpass}, + ) + finally: + os.unlink(askpass) + + def clone_repos(workdir: Path, repos: tuple[str, ...], *, gitea_url: str, token: str) -> dict[str, Path]: if not token: raise RuntimeError("GITEA_TOKEN is required when --root is not provided") @@ -286,28 +312,20 @@ def clone_repos(workdir: Path, repos: tuple[str, ...], *, gitea_url: str, token: if parsed_url.scheme not in {"http", "https"} or not parsed_url.netloc: raise RuntimeError(f"invalid Gitea URL: {gitea_url}") - safe_token = quote(token, safe="") - base_url = f"{parsed_url.scheme}://x-access-token:{safe_token}@{parsed_url.netloc}" + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" paths: dict[str, Path] = {} for repo in repos: dest = workdir / repo - last_exc: Exception | None = None + clone_url = f"{base_url}/molecule-ai/{repo}.git" for attempt in range(1, 4): - result = subprocess.run( - ["git", "clone", "--depth", "1", f"{base_url}/molecule-ai/{repo}.git", str(dest)], - check=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - timeout=30, - ) + result = _git_clone_with_token(dest, clone_url, token) if result.returncode == 0: paths[repo] = dest break if attempt < 3: time.sleep(2 ** (attempt - 1)) continue - stderr = result.stderr.replace(token, "").replace(safe_token, "") + stderr = result.stderr.replace(token, "") raise RuntimeError(f"failed to clone {repo} after 3 attempts: {stderr.strip()}") return paths diff --git a/tests/test_consumer_runtime_drift_guard.py b/tests/test_consumer_runtime_drift_guard.py index 
cfe660f..66c283c 100644 --- a/tests/test_consumer_runtime_drift_guard.py +++ b/tests/test_consumer_runtime_drift_guard.py @@ -104,3 +104,28 @@ def test_clone_consumers_retries_on_transient_failure(monkeypatch: pytest.Monkey import check_consumer_runtime_drift as guard guard.clone_consumers(workdir, ("molecule-core",), gitea_url="https://git.moleculesai.app", token="fake-token") assert call_count == 3, f"expected 3 attempts, got {call_count}" + + +def test_clone_consumers_never_puts_token_in_argv(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + """GIT_ASKPASS path: token must not appear in git clone argv or remote URL (runtime#86).""" + import subprocess + + captured: list[tuple[tuple[object, ...], dict[str, object]]] = [] + + def capture_run(*args: object, **kwargs: object) -> object: + captured.append((args, kwargs)) + return type("Result", (), {"returncode": 0, "stdout": "", "stderr": ""})() + + monkeypatch.setattr(subprocess, "run", capture_run) + workdir = tmp_path / "wd" + workdir.mkdir() + import check_consumer_runtime_drift as guard + guard.clone_consumers(workdir, ("molecule-core",), gitea_url="https://git.moleculesai.app", token="s3cr3t-t0k3n") + + assert len(captured) == 1 + cmd = captured[0][0][0] + env = captured[0][1].get("env") or {} + cmd_str = " ".join(str(c) for c in cmd) + assert "s3cr3t-t0k3n" not in cmd_str, "token leaked into subprocess argv" + assert "x-access-token" not in cmd_str, "username leaked into subprocess argv" + assert env.get("GIT_ASKPASS") is not None, "GIT_ASKPASS not set in clone env" -- 2.52.0 From 21d9ce04d08469d61696fced009948ecedbd25b3 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Tue, 23 Jun 2026 07:44:29 +0000 Subject: [PATCH 2/2] test(runtime#86): gate token-in-URL regressions in workflows + scripts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the remaining acceptance-criteria gap from runtime#86: a regression test that fails the 
build if the x-access-token-in-clone-URL pattern is re-introduced in
.gitea/workflows/*.yml or scripts/*.{py,sh}.

Why this is the last piece
--------------------------
Kimi's prior commit (061716f) on this branch converted the Python audit
scripts to use GIT_ASKPASS so the token never lands in subprocess argv or
the git remote URL. The publish-runtime.yml cascade had ALREADY been
replaced (commits 28cbf9b, 7154e15) with a Gitea contents+pulls API
approach — no git clone, no on-disk remote URL with a token. So the
practical leak surface is closed; the missing piece is the regression
guard so the next PR that re-introduces the pattern goes red in CI rather
than red in production.

What the gate scans
-------------------
* .gitea/workflows/*.yml — the CI surface
* scripts/*.py — the audit-script surface (clone helpers used by CI)
* scripts/*.sh — shell helpers (none shipped; covered for symmetry)

What the gate matches
---------------------
x-access-token: followed by a token-looking value:
  1. bash ${VAR}
  2. GitHub Actions ${{ ... }}
  3. Python f-string {var}
  4. a quoted token literal (>=8 chars)

What the gate does NOT match
----------------------------
* The GIT_ASKPASS helper's 'echo "x-access-token"' username response
  (the token rides the password prompt, not the URL — the documented
  safe shape).
* Comment lines (the prior fix's commit message + this test file
  reference the pattern as documentation; the comment-line skip is
  applied at the scan layer).

Tests
-----
* test_scan_targets_exist — defence-in-depth, the gate scans something
  (vacuous-pass guard).
* test_no_workflow_or_script_embeds_token_in_clone_url — the main check.
* test_askpass_username_response_is_the_only_allowed_occurrence — pins
  the documented exception so a refactor of the askpass helper doesn't
  silently break the gate.
* test_leak_regex_does_not_match_comment_lines — regression guard for
  the gate itself (verifies the comment-skip is wired up).

Positive control (manual): injecting - run: git clone "https://x-access-token:${SECRET}@github.com/x/y" into .gitea/workflows/publish-runtime.yml and re-running the test produces a clear, file:line-cited failure: Forbidden token-in-URL pattern detected (runtime#86 regression). ... Offenders: .gitea/workflows/publish-runtime.yml:161: - run: git clone ... Existing tests still pass: tests/test_llm_auth.py — 35/35 tests/test_consumer_runtime_drift_guard.py — 6/6 tests/test_platform_comm_contract_guard.py — 6/6 tests/test_workflow_no_token_in_url.py — 4/4 Total 51/51. Closes: runtime#86 Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/test_workflow_no_token_in_url.py | 235 +++++++++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 tests/test_workflow_no_token_in_url.py diff --git a/tests/test_workflow_no_token_in_url.py b/tests/test_workflow_no_token_in_url.py new file mode 100644 index 0000000..fade8a7 --- /dev/null +++ b/tests/test_workflow_no_token_in_url.py @@ -0,0 +1,235 @@ +"""Regression gate (runtime#86): forbid token-in-URL patterns in workflow + scripts. + +The 2026-05-28 family of bugs included ``x-access-token:${DISPATCH_TOKEN}`` +embedded in a ``git clone`` URL — the token ended up in subprocess argv +(visible via ``ps``/``/proc/*/cmdline``), the git remote URL (visible via +``git remote -v``), and git diagnostics/logs. + +The fix is GIT_ASKPASS / Gitea-API rather than URL-embedded tokens. This +gate makes the regression observable: a CI red on the next PR that +re-introduces the pattern. + +Scope +----- +* Walks ``.gitea/workflows/*.yml`` — the CI surface. +* Walks ``scripts/*.py`` — the audit-script surface (clone helpers used by + workflows). +* Walks ``scripts/*.sh`` — shell helpers (none shipped today, but covered + for symmetry). 
+ +What the gate matches +--------------------- +A *leak* is ``x-access-token:`` followed by something that *looks like* a +token or a variable that evaluates to a token — bash ``${...}``, +GitHub Actions ``${{ ... }}``, Python f-string ``{...}``, or a quoted +literal token. + +What the gate does NOT match +---------------------------- +* ``echo "x-access-token"`` — the GIT_ASKPASS script's *username* response. + This is the EXPECTED, safe occurrence (the token is fed via the + *password* prompt, never the URL). +* Comment text describing the pattern (e.g. ``# fix: x-access-token:${...}``). + These use a leading ``#`` so are skipped. + +Failure shape +------------- +A red gate prints every offending (file, line-no, line) tuple so the +offending PR author can fix or justify each in review. +""" +from __future__ import annotations + +import re +from pathlib import Path + +import pytest + + +REPO_ROOT = Path(__file__).resolve().parents[1] + +# Paths the gate scans. Resolved relative to REPO_ROOT and the existence +# of each is asserted at collection time so a misconfigured env surfaces +# a clear failure rather than a vacuous pass. +SCAN_TARGETS = ( + REPO_ROOT / ".gitea" / "workflows", + REPO_ROOT / "scripts", +) + +# Leak pattern: x-access-token: followed by a token-looking value. +# The non-capturing group below covers the four "looks-like-a-token" +# shapes we care about: +# 1. bash ${VAR} +# 2. GitHub Actions ${{ ... }} +# 3. Python f-string {var} (with optional !r/!s and format spec) +# 4. A bare quoted token literal, e.g. "ghs_xxx..." or "abc123..." +# Comment-line filtering is applied at the file-scan layer +# (see _COMMENT_LINE_RE) rather than in the regex — a negative-lookbehind +# for "#" only catches the char immediately before, which is whitespace +# (not "#") on a real comment line, so the regex alone cannot tell. 
+_LEAK_RE = re.compile( + r""" + x-access-token # the forbidden prefix + \s*:\s* # the colon separator + (?: + \$\{[^}]+\} # bash ${VAR} + | \$\{\{[^}]+\}\} # GitHub Actions ${{ expr }} + | \{[a-zA-Z_][a-zA-Z0-9_]*[!:>.<}]? # Python f-string {var} + | ["'][A-Za-z0-9_\-]{8,}["'] # quoted token literal (>=8 chars) + ) + """, + re.VERBOSE, +) + +# Comment-line detector: lines whose first non-whitespace is '#'. +_COMMENT_LINE_RE = re.compile(r"^\s*#") + + +def _iter_candidate_files() -> list[Path]: + """Enumerate files the gate scans. Excludes the test file itself.""" + files: list[Path] = [] + for target in SCAN_TARGETS: + assert target.exists(), f"scan target missing: {target}" + if target.is_file(): + files.append(target) + continue + for path in sorted(target.rglob("*")): + if not path.is_file(): + continue + if path.suffix in {".yml", ".yaml", ".py", ".sh"}: + files.append(path) + # Exclude the test file itself (its docstring + tests reference the + # pattern; gating the gate would loop). + return [p for p in files if p != Path(__file__)] + + +def _scan_file(path: Path) -> list[tuple[Path, int, str]]: + """Return list of (path, line-no, line) for every leak match in path.""" + try: + text = path.read_text(encoding="utf-8") + except UnicodeDecodeError: + return [] + return _scan_file_for_text(text, source_name=str(path)) + + +def _scan_file_for_text( + text: str, source_name: str = "" +) -> list[tuple[Path, int, str]]: + """Return list of (path, line-no, line) for every leak match in text. + + ``source_name`` is just a label for the returned tuples; the line + number is 1-based. Split out from ``_scan_file`` so the + ``test_leak_regex_does_not_match_comment_lines`` regression guard can + drive the same scan pipeline against inline text. 
+ """ + findings: list[tuple[Path, int, str]] = [] + label = Path(source_name) + for lineno, line in enumerate(text.splitlines(), start=1): + if _COMMENT_LINE_RE.match(line): + continue + if _LEAK_RE.search(line): + findings.append((label, lineno, line)) + return findings + + +def test_scan_targets_exist() -> None: + """The gate's scan targets are present (so a vacuous pass is impossible).""" + for target in SCAN_TARGETS: + assert target.exists(), ( + f"scan target {target} must exist; otherwise the gate is a " + f"vacuous pass — fix the gate, not the test" + ) + + +def test_no_workflow_or_script_embeds_token_in_clone_url() -> None: + """No ``.gitea/workflows/*.yml`` or script may embed a token in a URL. + + Runtime#86: the original bug was ``x-access-token:${DISPATCH_TOKEN}`` in + a ``git clone`` URL inside the publish-runtime workflow. The fix moved + to GIT_ASKPASS / Gitea-API. This regression gate makes the fix durable. + """ + files = _iter_candidate_files() + # Defence-in-depth: the gate must actually be scanning something. + assert files, "no candidate files found — gate is vacuous" + + all_findings: list[tuple[Path, int, str]] = [] + for path in files: + all_findings.extend(_scan_file(path)) + + if all_findings: + # Pretty-print all offenders so the failing PR author gets a clear + # diff-shaped report (rather than a single AssertionError blob). + report = "\n".join( + f" {p.relative_to(REPO_ROOT)}:{ln}: {line.strip()}" + for p, ln, line in all_findings + ) + pytest.fail( + "Forbidden token-in-URL pattern detected (runtime#86 regression). " + "Use GIT_ASKPASS or the Gitea contents+pulls API instead — never " + "embed a token in a URL passed to git / curl. Offenders:\n" + f"{report}" + ) + + +def test_askpass_username_response_is_the_only_allowed_occurrence() -> None: + """The GIT_ASKPASS helper legitimately writes ``x-access-token`` as the + *username* response (the token rides the *password* prompt). 
This test + pins that pattern so any future re-introduction of the URL-embedding + pattern is *visible* in a diff but does not falsely trip the gate. + + It does NOT relax the gate — it just guards the documented exception + shape so a refactor of the askpass helper can keep working. + """ + expected_substring = 'echo "x-access-token"' + matches: list[Path] = [] + for path in _iter_candidate_files(): + if path.suffix != ".py": + continue + try: + text = path.read_text(encoding="utf-8") + except UnicodeDecodeError: + continue + if expected_substring in text: + matches.append(path) + + assert matches, ( + "GIT_ASKPASS helpers no longer write the expected `x-access-token` " + "username response. If the askpass mechanism was intentionally " + "replaced, update this test to match the new shape — do not silently " + "let the gate pass on no occurrences (vacuous protection)." + ) + # The expected set: both consumer-drift + platform-comm-contract scripts. + expected_paths = { + REPO_ROOT / "scripts" / "check_consumer_runtime_drift.py", + REPO_ROOT / "scripts" / "check_platform_comm_contract.py", + } + assert set(matches) == expected_paths, ( + f"GIT_ASKPASS helper moved/added/removed: got {sorted(p.name for p in matches)}, " + f"expected {sorted(p.name for p in expected_paths)}" + ) + + +def test_leak_regex_does_not_match_comment_lines() -> None: + """The gate's scan pipeline must skip comment lines so documentation + referencing the pattern (commit messages, this file, etc.) does not + false-positive. + + Regression guard for the gate itself: if someone refactors + ``_scan_file`` and drops the ``_COMMENT_LINE_RE`` short-circuit, every + doc line referencing the pattern would trip the gate. 
+ """ + sample = ( + "# The prior bug was: x-access-token:${DISPATCH_TOKEN}@github.com/...\n" + 'x-access-token:"sk_live_actual_leak"\n' + ) + # Drive the full scan pipeline (per-line comment skip + regex) rather + # than the regex in isolation, so the test exercises the *real* guard + # contract — not just the regex. + findings = _scan_file_for_text(sample, source_name="") + assert len(findings) == 1, ( + f"expected exactly 1 leak finding (the non-comment line), got " + f"{findings!r}" + ) + _path, _lineno, line = findings[0] + assert "sk_live_actual_leak" in line, ( + f"the leak match should be the actual token line, got {line!r}" + ) -- 2.52.0