test curl status capture workflow lint #764

Merged
hongming merged 1 commits from chore/curl-status-lint-script into main 2026-05-13 04:29:44 +00:00
3 changed files with 210 additions and 54 deletions

View File

@ -0,0 +1,113 @@
#!/usr/bin/env python3
"""Lint workflow bash for curl status-code capture pollution.
The bad shape is:
HTTP_CODE=$(curl ... -w '%{http_code}' ... || echo "000")
`curl -w` writes the HTTP code to stdout before returning non-zero, so
fallback output inside the same command substitution appends another code.
"""
from __future__ import annotations
import argparse
import glob
import re
import sys
from pathlib import Path
from typing import NamedTuple
SELF = ".gitea/workflows/lint-curl-status-capture.yml"
class Finding(NamedTuple):
path: str
snippet: str
BAD_STATUS_CAPTURE = re.compile(
r"""
\$\(\s*
curl\b
[^)]*
-w\s*['"]%\{http_code\}['"]
[^)]*
\|\|\s*
(?:
echo\s+['"]?000['"]?
|
printf\s+['"]000['"]
)
\s*\)
""",
re.DOTALL | re.VERBOSE,
)
def _logical_shell(content: str) -> str:
"""Collapse bash line continuations so one curl command is one string."""
return re.sub(r"\\\s*\n\s*", " ", content)
def scan_content(path: str, content: str) -> list[Finding]:
flat = _logical_shell(content)
return [
Finding(path=path, snippet=re.sub(r"\s+", " ", match.group(0)).strip()[:160])
for match in BAD_STATUS_CAPTURE.finditer(flat)
]
def scan_paths(paths: list[str]) -> list[Finding]:
findings: list[Finding] = []
for path in paths:
if path == SELF:
continue
content = Path(path).read_text(encoding="utf-8")
findings.extend(scan_content(path, content))
return findings
def default_paths() -> list[str]:
return sorted(glob.glob(".gitea/workflows/*.yml"))
def print_report(findings: list[Finding]) -> None:
if not findings:
print("OK No curl-status-capture pollution patterns detected")
return
print(f"::error::Found {len(findings)} curl-status-capture pollution site(s):")
for finding in findings:
print(
f"::error file={finding.path}::Curl status-capture pollution: "
"'|| echo/printf 000' inside a $(curl ... -w '%{http_code}' ...) "
"subshell. On non-2xx or connection failure, curl's -w writes a "
"status, then exits non-zero, then the fallback appends another "
"status. Fix: route -w into a tempfile so the exit code cannot "
"pollute stdout."
)
print(f" matched: {finding.snippet}...")
print()
print("Fix template:")
print(" set +e")
print(" curl ... -w '%{http_code}' >code.txt 2>/dev/null")
print(" set -e")
print(' HTTP_CODE=$(cat code.txt 2>/dev/null)')
print(' [ -z "$HTTP_CODE" ] && HTTP_CODE="000"')
def main(argv: list[str] | None = None) -> int:
parser = argparse.ArgumentParser()
parser.add_argument("paths", nargs="*", help="workflow files to scan")
args = parser.parse_args(argv)
paths = args.paths or default_paths()
findings = scan_paths(paths)
print_report(findings)
return 1 if findings else 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@ -30,10 +30,16 @@ name: Lint curl status-code capture
on:
pull_request:
paths: ['.gitea/workflows/**']
paths:
- '.gitea/workflows/**'
- '.gitea/scripts/lint-curl-status-capture.py'
- 'tests/test_lint_curl_status_capture.py'
push:
branches: [main, staging]
paths: ['.gitea/workflows/**']
paths:
- '.gitea/workflows/**'
- '.gitea/scripts/lint-curl-status-capture.py'
- 'tests/test_lint_curl_status_capture.py'
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
@ -50,55 +56,4 @@ jobs:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Find curl ... -w '%{http_code}' ... || echo "000" subshells
run: |
set -uo pipefail
# Multi-line aware: look for `$(curl ... -w '%{http_code}' ... || echo "000")`
# subshell where the entire command-substitution wraps a curl that
# ends with `|| echo "000"`. Must distinguish from the SAFE shape
# `$(cat tempfile 2>/dev/null || echo "000")` — `cat` with a missing
# tempfile produces empty stdout, no pollution.
python3 <<'PY'
import os, re, sys, glob
BAD_FILES = []
# Match the buggy substitution across newlines: $(curl ... -w '%{http_code}' ... || echo "000")
# The `\\n` is the bash line-continuation that lets curl flags span lines.
# We collapse continuation lines first, then look for the single-line bad pattern.
PATTERN = re.compile(
r'\$\(\s*curl\b[^)]*-w\s*[\'"]%\{http_code\}[\'"][^)]*\|\|\s*echo\s+"000"\s*\)',
re.DOTALL,
)
# Self-skip: this lint workflow contains the literal anti-pattern in
# its own docstring — that's intentional, not a bug.
SELF = ".gitea/workflows/lint-curl-status-capture.yml"
for f in sorted(glob.glob(".gitea/workflows/*.yml")):
if f == SELF:
continue
with open(f) as fh:
content = fh.read()
# Collapse bash line-continuations (\\\n + leading whitespace)
# into a single logical line so the regex can see the full
# curl invocation as one chunk.
flat = re.sub(r'\\\s*\n\s*', ' ', content)
for m in PATTERN.finditer(flat):
BAD_FILES.append((f, m.group(0)[:120]))
if not BAD_FILES:
print("OK No curl-status-capture pollution patterns detected")
sys.exit(0)
print(f"::error::Found {len(BAD_FILES)} curl-status-capture pollution site(s):")
for f, snippet in BAD_FILES:
print(f"::error file={f}::Curl status-capture pollution: '|| echo \"000\"' inside a $(curl ... -w '%{{http_code}}' ...) subshell. On non-2xx or connection failure, curl's -w writes a status, then exits non-zero, then the || echo appends another '000' — producing 'HTTP 000000' or '409000' that fails comparisons silently. Fix: route -w into a tempfile so the exit code can't pollute stdout. See memory feedback_curl_status_capture_pollution.md.")
print(f" matched: {snippet}...")
print()
print("Fix template:")
print(' set +e')
print(' curl ... -w \'%{http_code}\' >code.txt 2>/dev/null')
print(' set -e')
print(' HTTP_CODE=$(cat code.txt 2>/dev/null)')
print(' [ -z "$HTTP_CODE" ] && HTTP_CODE="000"')
sys.exit(1)
PY
python3 .gitea/scripts/lint-curl-status-capture.py

View File

@ -0,0 +1,88 @@
"""Tests for `.gitea/scripts/lint-curl-status-capture.py`.
Run:
python3 -m pytest tests/test_lint_curl_status_capture.py -v
"""
from __future__ import annotations
import importlib.util
from pathlib import Path
SCRIPT_PATH = (
Path(__file__).resolve().parent.parent
/ ".gitea"
/ "scripts"
/ "lint-curl-status-capture.py"
)
def _load_module():
spec = importlib.util.spec_from_file_location("lint_curl_status_capture", SCRIPT_PATH)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module
def test_finds_quoted_echo_fallback_pollution():
lint = _load_module()
content = """
HTTP_CODE=$(curl -sS -o /tmp/body -w "%{http_code}" https://example.test || echo "000")
"""
findings = lint.scan_content("workflow.yml", content)
assert len(findings) == 1
assert "echo" in findings[0].snippet
def test_finds_unquoted_echo_fallback_pollution():
lint = _load_module()
content = """
HTTP_CODE=$(curl -sS -o /tmp/body -w '%{http_code}' https://example.test || echo 000)
"""
findings = lint.scan_content("workflow.yml", content)
assert len(findings) == 1
assert "echo" in findings[0].snippet
def test_finds_printf_fallback_pollution():
lint = _load_module()
content = """
HTTP_CODE=$(curl -sS -o /tmp/body -w '%{http_code}' https://example.test || printf '000')
"""
findings = lint.scan_content("workflow.yml", content)
assert len(findings) == 1
assert "printf" in findings[0].snippet
def test_ignores_tempfile_fallback_after_curl():
lint = _load_module()
content = """
set +e
curl -sS -o /tmp/body -w '%{http_code}' https://example.test >/tmp/code
rc=$?
set -e
HTTP_CODE=$(cat /tmp/code 2>/dev/null || echo "000")
[ -z "$HTTP_CODE" ] && HTTP_CODE="000"
"""
assert lint.scan_content("workflow.yml", content) == []
def test_collapses_bash_line_continuations():
lint = _load_module()
content = """
HTTP_CODE=$(curl -sS -o /tmp/body \\
-w "%{http_code}" \\
https://example.test \\
|| echo "000")
"""
findings = lint.scan_content("workflow.yml", content)
assert len(findings) == 1