diff --git a/.gitea/scripts/lint-curl-status-capture.py b/.gitea/scripts/lint-curl-status-capture.py new file mode 100644 index 00000000..73cbbab5 --- /dev/null +++ b/.gitea/scripts/lint-curl-status-capture.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +"""Lint workflow bash for curl status-code capture pollution. + +The bad shape is: + + HTTP_CODE=$(curl ... -w '%{http_code}' ... || echo "000") + +`curl -w` writes the HTTP code to stdout before returning non-zero, so +fallback output inside the same command substitution appends another code. +""" +from __future__ import annotations + +import argparse +import glob +import re +import sys +from pathlib import Path +from typing import NamedTuple + + +SELF = ".gitea/workflows/lint-curl-status-capture.yml" + + +class Finding(NamedTuple): + path: str + snippet: str + + +BAD_STATUS_CAPTURE = re.compile( + r""" + \$\(\s* + curl\b + [^)]* + -w\s*['"]%\{http_code\}['"] + [^)]* + \|\|\s* + (?: + echo\s+['"]?000['"]? + | + printf\s+['"]000['"] + ) + \s*\) + """, + re.DOTALL | re.VERBOSE, +) + + +def _logical_shell(content: str) -> str: + """Collapse bash line continuations so one curl command is one string.""" + return re.sub(r"\\\s*\n\s*", " ", content) + + +def scan_content(path: str, content: str) -> list[Finding]: + flat = _logical_shell(content) + return [ + Finding(path=path, snippet=re.sub(r"\s+", " ", match.group(0)).strip()[:160]) + for match in BAD_STATUS_CAPTURE.finditer(flat) + ] + + +def scan_paths(paths: list[str]) -> list[Finding]: + findings: list[Finding] = [] + for path in paths: + if path == SELF: + continue + content = Path(path).read_text(encoding="utf-8") + findings.extend(scan_content(path, content)) + return findings + + +def default_paths() -> list[str]: + return sorted(glob.glob(".gitea/workflows/*.yml")) + + +def print_report(findings: list[Finding]) -> None: + if not findings: + print("OK No curl-status-capture pollution patterns detected") + return + + print(f"::error::Found {len(findings)} curl-status-capture pollution site(s):") + for finding in findings: + print( + f"::error file={finding.path}::Curl status-capture pollution: " + "'|| echo/printf 000' inside a $(curl ... -w '%{http_code}' ...) " + "subshell. On non-2xx or connection failure, curl's -w writes a " + "status, then exits non-zero, then the fallback appends another " + "status. Fix: route -w into a tempfile so the exit code cannot " + "pollute stdout." + ) + print(f" matched: {finding.snippet}...") + + print() + print("Fix template:") + print(" set +e") + print(" curl ... -w '%{http_code}' >code.txt 2>/dev/null") + print(" set -e") + print(' HTTP_CODE=$(cat code.txt 2>/dev/null)') + print(' [ -z "$HTTP_CODE" ] && HTTP_CODE="000"') + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser() + parser.add_argument("paths", nargs="*", help="workflow files to scan") + args = parser.parse_args(argv) + + paths = args.paths or default_paths() + findings = scan_paths(paths) + print_report(findings) + return 1 if findings else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/.gitea/workflows/lint-curl-status-capture.yml b/.gitea/workflows/lint-curl-status-capture.yml index 99f3f4c0..4f8b0550 100644 --- a/.gitea/workflows/lint-curl-status-capture.yml +++ b/.gitea/workflows/lint-curl-status-capture.yml @@ -30,10 +30,16 @@ name: Lint curl status-code capture on: pull_request: - paths: ['.gitea/workflows/**'] + paths: + - '.gitea/workflows/**' + - '.gitea/scripts/lint-curl-status-capture.py' + - 'tests/test_lint_curl_status_capture.py' push: branches: [main, staging] - paths: ['.gitea/workflows/**'] + paths: + - '.gitea/workflows/**' + - '.gitea/scripts/lint-curl-status-capture.py' + - 'tests/test_lint_curl_status_capture.py' env: GITHUB_SERVER_URL: https://git.moleculesai.app @@ -50,55 +56,4 @@ jobs: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Find curl ... -w '%{http_code}' ... || echo "000" subshells run: | - set -uo pipefail - # Multi-line aware: look for `$(curl ... -w '%{http_code}' ... || echo "000")` - # subshell where the entire command-substitution wraps a curl that - # ends with `|| echo "000"`. Must distinguish from the SAFE shape - # `$(cat tempfile 2>/dev/null || echo "000")` — `cat` with a missing - # tempfile produces empty stdout, no pollution. - python3 <<'PY' - import os, re, sys, glob - - BAD_FILES = [] - - # Match the buggy substitution across newlines: $(curl ... -w '%{http_code}' ... || echo "000") - # The `\\n` is the bash line-continuation that lets curl flags span lines. - # We collapse continuation lines first, then look for the single-line bad pattern. - PATTERN = re.compile( - r'\$\(\s*curl\b[^)]*-w\s*[\'"]%\{http_code\}[\'"][^)]*\|\|\s*echo\s+"000"\s*\)', - re.DOTALL, - ) - - # Self-skip: this lint workflow contains the literal anti-pattern in - # its own docstring — that's intentional, not a bug. - SELF = ".gitea/workflows/lint-curl-status-capture.yml" - - for f in sorted(glob.glob(".gitea/workflows/*.yml")): - if f == SELF: - continue - with open(f) as fh: - content = fh.read() - # Collapse bash line-continuations (\\\n + leading whitespace) - # into a single logical line so the regex can see the full - # curl invocation as one chunk. - flat = re.sub(r'\\\s*\n\s*', ' ', content) - for m in PATTERN.finditer(flat): - BAD_FILES.append((f, m.group(0)[:120])) - - if not BAD_FILES: - print("OK No curl-status-capture pollution patterns detected") - sys.exit(0) - - print(f"::error::Found {len(BAD_FILES)} curl-status-capture pollution site(s):") - for f, snippet in BAD_FILES: - print(f"::error file={f}::Curl status-capture pollution: '|| echo \"000\"' inside a $(curl ... -w '%{{http_code}}' ...) subshell. On non-2xx or connection failure, curl's -w writes a status, then exits non-zero, then the || echo appends another '000' — producing 'HTTP 000000' or '409000' that fails comparisons silently. Fix: route -w into a tempfile so the exit code can't pollute stdout. See memory feedback_curl_status_capture_pollution.md.") - print(f" matched: {snippet}...") - print() - print("Fix template:") - print(' set +e') - print(' curl ... -w \'%{http_code}\' >code.txt 2>/dev/null') - print(' set -e') - print(' HTTP_CODE=$(cat code.txt 2>/dev/null)') - print(' [ -z "$HTTP_CODE" ] && HTTP_CODE="000"') - sys.exit(1) - PY + python3 .gitea/scripts/lint-curl-status-capture.py diff --git a/tests/test_lint_curl_status_capture.py b/tests/test_lint_curl_status_capture.py new file mode 100644 index 00000000..e3a9a3f8 --- /dev/null +++ b/tests/test_lint_curl_status_capture.py @@ -0,0 +1,88 @@ +"""Tests for `.gitea/scripts/lint-curl-status-capture.py`. + +Run: + python3 -m pytest tests/test_lint_curl_status_capture.py -v +""" +from __future__ import annotations + +import importlib.util +from pathlib import Path + + +SCRIPT_PATH = ( + Path(__file__).resolve().parent.parent + / ".gitea" + / "scripts" + / "lint-curl-status-capture.py" +) + + +def _load_module(): + spec = importlib.util.spec_from_file_location("lint_curl_status_capture", SCRIPT_PATH) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def test_finds_quoted_echo_fallback_pollution(): + lint = _load_module() + content = """ + HTTP_CODE=$(curl -sS -o /tmp/body -w "%{http_code}" https://example.test || echo "000") + """ + + findings = lint.scan_content("workflow.yml", content) + + assert len(findings) == 1 + assert "echo" in findings[0].snippet + + +def test_finds_unquoted_echo_fallback_pollution(): + lint = _load_module() + content = """ + HTTP_CODE=$(curl -sS -o /tmp/body -w '%{http_code}' https://example.test || echo 000) + """ + + findings = lint.scan_content("workflow.yml", content) + + assert len(findings) == 1 + assert "echo" in findings[0].snippet + + +def test_finds_printf_fallback_pollution(): + lint = _load_module() + content = """ + HTTP_CODE=$(curl -sS -o /tmp/body -w '%{http_code}' https://example.test || printf '000') + """ + + findings = lint.scan_content("workflow.yml", content) + + assert len(findings) == 1 + assert "printf" in findings[0].snippet + + +def test_ignores_tempfile_fallback_after_curl(): + lint = _load_module() + content = """ + set +e + curl -sS -o /tmp/body -w '%{http_code}' https://example.test >/tmp/code + rc=$? + set -e + HTTP_CODE=$(cat /tmp/code 2>/dev/null || echo "000") + [ -z "$HTTP_CODE" ] && HTTP_CODE="000" + """ + + assert lint.scan_content("workflow.yml", content) == [] + + +def test_collapses_bash_line_continuations(): + lint = _load_module() + content = """ + HTTP_CODE=$(curl -sS -o /tmp/body \\ + -w "%{http_code}" \\ + https://example.test \\ + || echo "000") + """ + + findings = lint.scan_content("workflow.yml", content) + + assert len(findings) == 1