Merge pull request 'fix(sop-checklist): stream comment pagination + RLIMIT_AS guardrail + na-fallback (task #369)' (#1610) from fix/sop-checklist-stream-pagination-oom into main
Block internal-flavored paths / Block forbidden paths (push) Waiting to run
CI / Detect changes (push) Waiting to run
CI / Platform (Go) (push) Waiting to run
CI / Canvas (Next.js) (push) Waiting to run
CI / Shellcheck (E2E scripts) (push) Waiting to run
CI / Canvas Deploy Reminder (push) Blocked by required conditions
CI / Python Lint & Test (push) Waiting to run
CI / all-required (push) Waiting to run
E2E API Smoke Test / detect-changes (push) Waiting to run
E2E API Smoke Test / E2E API Smoke Test (push) Blocked by required conditions
E2E Chat / detect-changes (push) Waiting to run
E2E Chat / E2E Chat (push) Blocked by required conditions
E2E Staging Canvas (Playwright) / detect-changes (push) Waiting to run
E2E Staging Canvas (Playwright) / Canvas tabs E2E (push) Blocked by required conditions
Handlers Postgres Integration / detect-changes (push) Waiting to run
Handlers Postgres Integration / Handlers Postgres Integration (push) Blocked by required conditions
Lint forbidden tenant-env keys / Scan workspace_secrets writers for forbidden env keys (push) Waiting to run
Lint no tenant GITEA or GITHUB token write / Scan for repo-host token write into tenant workspace surface (push) Waiting to run
Lint shellcheck (arm64 pilot) / shellcheck-arm64 (pilot) (push) Waiting to run
publish-workspace-server-image / Production auto-deploy (push) Blocked by required conditions
Runtime PR-Built Compatibility / detect-changes (push) Waiting to run
Runtime PR-Built Compatibility / PR-built wheel + import smoke (push) Blocked by required conditions
Secret scan / Scan diff for credential-shaped strings (push) Waiting to run
publish-workspace-server-image / build-and-push (push) Has been cancelled
Ops Scripts Tests / Ops scripts (unittest) (push) Successful in 1m6s

This commit was merged in pull request #1610.
This commit is contained in:
2026-05-20 13:03:49 +00:00
2 changed files with 338 additions and 12 deletions
+149 -12
View File
@@ -64,11 +64,41 @@ import argparse
import json
import os
import re
import resource
import sys
import urllib.error
import urllib.parse
import urllib.request
from typing import Any, Callable
from typing import Any, Callable, Iterator
# ---------------------------------------------------------------------------
# Address-space guardrail (RFC#369 / task #369 follow-up to mc#1242-class OOM).
#
# `get_issue_comments` paginates the full comment history of a PR. On
# bot-relay-heavy PRs (e.g. mc#291, mc#1242) this can balloon past the
# runner's cgroup memory limit and 137 the job. Cap virtual-address-space
# at 2 GiB so the script OOMs as a `MemoryError` (catchable / surfaceable)
# rather than a SIGKILL we can't post a status for.
#
# 2 GiB is generous — a 5000-comment PR with 1 KiB minimal-dicts (see
# get_issue_comments below) fits in ~10 MiB, leaving plenty of headroom
# for the Python runtime + urllib + json buffers.
#
# Skipped under pytest / dry-run where RLIMIT_AS would interfere with
# test runner memory needs (set SOP_CHECKLIST_NO_RLIMIT=1 to opt out).
if not os.environ.get("SOP_CHECKLIST_NO_RLIMIT"):
try:
resource.setrlimit(resource.RLIMIT_AS, (2 * 1024**3, 2 * 1024**3))
except (ValueError, OSError):
# macOS sometimes refuses RLIMIT_AS; not fatal — the Linux runner
# is the only place this matters for the OOM-prevention goal.
pass
# Per-comment body cap (task #369). The directive parser walks the body
# line-by-line looking for ^/sop-ack ^/sop-revoke ^/sop-n/a markers — only
# the first few KiB matter for that. Cap each comment body so a single
# pasted-log comment can't push us past the cgroup limit.
_MAX_BODY_BYTES = int(os.environ.get("SOP_CHECKLIST_MAX_BODY_BYTES") or 8 * 1024)
# ---------------------------------------------------------------------------
@@ -460,16 +490,35 @@ class GiteaClient:
raise RuntimeError(f"GET pulls/{pr} → HTTP {code}: {data!r}")
return data
def get_issue_comments(
self, owner: str, repo: str, issue: int
) -> list[dict[str, Any]]:
# Paginate. Gitea default page size 50.
out: list[dict[str, Any]] = []
def iter_issue_comments(
self, owner: str, repo: str, issue: int, page_size: int = 50
) -> Iterator[dict[str, Any]]:
"""Stream comments page-by-page, yielding ONE minimal-dict per comment.
Each yielded comment carries ONLY the fields the gate actually reads
— `{"user": {"login": str}, "body": str}` — and DROPS the much
larger Gitea-API extras (html_url, pull_request_url, issue_url,
assets, created_at, updated_at, id, original_author_*).
Memory motivation (task #369 / mc#1242-class OOM): full Gitea
comment dicts are ~2 KiB median + ~3 KiB p95. On PRs with several
thousand bot-relay comments the eager `list[full_dict]` shape used
previously pushed runner anon-rss past the cgroup limit. The
minimal-dict shape is ~10-20x smaller (typically ~50-100B Python
overhead + the body string).
The two downstream consumers (`compute_ack_state`,
`compute_na_state`) each iterate the comment list exactly once and
read only `body` + `user.login`, so dropping every other field is
safe. They still receive `list[dict[str, Any]]`-shaped objects so
the test fixtures (which already used the minimal shape) keep
working with no fixture changes.
"""
page = 1
while True:
code, data = self._req(
"GET",
f"/repos/{owner}/{repo}/issues/{issue}/comments?limit=50&page={page}",
f"/repos/{owner}/{repo}/issues/{issue}/comments?limit={page_size}&page={page}",
)
if code != 200:
raise RuntimeError(
@@ -477,10 +526,41 @@ class GiteaClient:
)
if not data:
break
out.extend(data)
if len(data) < 50:
for c in data:
# Minimal projection — drop ALL fields the gate doesn't read.
user_login = ((c.get("user") or {}).get("login") or "") if isinstance(c, dict) else ""
body = (c.get("body") if isinstance(c, dict) else "") or ""
# Body-size guardrail: huge comments (e.g. pasted CI logs) can
# individually be MiBs. The directive parser only needs the
# first ~8 KiB to find /sop-ack /sop-revoke /sop-n/a markers
# — anything past that is filler. Truncate at 8 KiB so a
# single oversized comment can't OOM the runner.
if len(body) > _MAX_BODY_BYTES:
body = body[:_MAX_BODY_BYTES]
yield {"user": {"login": user_login}, "body": body}
if len(data) < page_size:
break
page += 1
def get_issue_comments(
self,
owner: str,
repo: str,
issue: int,
max_comments: int | None = None,
) -> list[dict[str, Any]]:
"""Paginate + collect minimal comment dicts. See `iter_issue_comments`
for the per-comment shape and the OOM-prevention rationale.
`max_comments` (optional, default unbounded): hard cap. When the cap
is hit we stop fetching further pages and the caller surfaces a
soft 'skipping due to volume' status (see main()).
"""
out: list[dict[str, Any]] = []
for c in self.iter_issue_comments(owner, repo, issue):
out.append(c)
if max_comments is not None and len(out) >= max_comments:
break
return out
def resolve_team_id(self, org: str, team_name: str) -> int | None:
@@ -832,6 +912,17 @@ def main(argv: list[str] | None = None) -> int:
"thing BP sees is the POSTed status. Useful for local debugging."
),
)
p.add_argument(
"--max-comments",
type=int,
default=int(os.environ.get("SOP_CHECKLIST_MAX_COMMENTS") or 5000),
help=(
"Hard cap on comments fetched from the PR. Above this we post "
"a SOFT-pending status with a 'skipping due to volume' note "
"instead of OOM'ing the runner (task #369). Override with the "
"SOP_CHECKLIST_MAX_COMMENTS env var. Set 0 to disable the cap."
),
)
args = p.parse_args(argv)
token = os.environ.get("GITEA_TOKEN", "")
@@ -865,7 +956,18 @@ def main(argv: list[str] | None = None) -> int:
print("::error::PR payload missing user.login or head.sha", file=sys.stderr)
return 1
comments = client.get_issue_comments(args.owner, args.repo, args.pr)
max_comments_cap = args.max_comments if args.max_comments and args.max_comments > 0 else None
comments = client.get_issue_comments(
args.owner, args.repo, args.pr, max_comments=max_comments_cap
)
# Volume short-circuit: PRs with thousands of bot-relay comments
# (the mc#1242-class OOM source) get a soft 'volume-skipped' status
# so the gate doesn't churn the runner; reviewers can re-trigger by
# editing the PR or filing a fresh PR with the housekeeping comments
# split off. Cap-hit means we couldn't see the WHOLE history, so we
# can't fairly post failure — pending is the safe default.
volume_skipped = bool(max_comments_cap and len(comments) >= max_comments_cap)
# High-risk classification (RFC#450 Option C, governance fix for
# internal#442). Computed ONCE per PR — used by both the probe
@@ -879,8 +981,34 @@ def main(argv: list[str] | None = None) -> int:
team_member_cache: dict[tuple[str, int], bool | None] = {}
def probe(slug: str, users: list[str]) -> list[str]:
item = items_by_slug[slug]
team_names: list[str] = resolve_required_teams(item, high_risk)
# `slug` may be either an items-key (compute_ack_state caller) OR
# an n/a-gate key (compute_na_state caller). Previously this hard
# KeyError'd on the n/a-gate path when slug was e.g. "security-review"
# — that's a config gate, not an item — so the gate would crash
# instead of falling back to the gate's own required_teams. Fix
# task #369 follow-up to issue #355.
if slug in items_by_slug:
item = items_by_slug[slug]
team_names: list[str] = resolve_required_teams(item, high_risk)
elif slug in na_gates:
# n/a-gate configs carry `required_teams` directly (see
# sop-checklist-config.yaml: n/a_gates.<gate>.required_teams).
gate_cfg = na_gates[slug] or {}
team_names = list(gate_cfg.get("required_teams") or [])
if not team_names:
print(
f"::warning::n/a-gate '{slug}' has no required_teams; "
"fail-closed (no users will be approved)",
file=sys.stderr,
)
else:
# Unknown slug — fail closed, log so we can find config drift.
print(
f"::warning::probe() called with slug '{slug}' which is "
f"neither an items entry nor an n/a-gate; fail-closed",
file=sys.stderr,
)
return []
# Resolve names → ids. NOTE: orgs/{org}/teams/search may not be
# available — fall back to the list endpoint.
team_ids: list[int] = []
@@ -938,6 +1066,15 @@ def main(argv: list[str] | None = None) -> int:
# were not required (vs a tier:medium+ PR that truly passed all acks).
state = "success"
description = f"[info tier:low] {description}"
if volume_skipped:
# Above the comment-cap — we may have a partial view. Soft-pend
# so neither BP nor the author gets stuck; surface the cap so
# reviewers know what's up. No-block at the gate level.
state = "pending"
description = (
f"[volume-skipped] comment-cap={max_comments_cap} hit; please file "
f"a fresh PR with bot-relay history split off (#369). {description}"
)
# Diagnostics to job log.
print(
+189
View File
@@ -815,3 +815,192 @@ class TestHighRiskClassUsesElevatedListInConfig(unittest.TestCase):
sop.resolve_required_teams(items[slug], high_risk=True),
f"item {slug} should not be affected by risk-class",
)
# ---------------------------------------------------------------------------
# get_issue_comments — streaming + minimal-dict shape (task #369 / OOM fix)
# ---------------------------------------------------------------------------
class _FakeReq:
"""Stand-in for GiteaClient._req that serves canned pages."""
def __init__(self, pages):
# pages: list[list[dict]]; one page per call, exhausted in order.
self._pages = list(pages)
self.calls = []
def __call__(self, method, path, body=None, ok_codes=(200, 201, 204)):
self.calls.append((method, path))
if not self._pages:
return 200, []
return 200, self._pages.pop(0)
class TestGetIssueCommentsStreaming(unittest.TestCase):
"""Verify the OOM-fix invariants — minimal-dict shape + page break."""
def _client_with_pages(self, pages):
client = sop.GiteaClient("git.example.com", "tok")
client._req = _FakeReq(pages) # type: ignore[method-assign]
return client
def test_minimal_dict_shape_drops_large_fields(self):
"""get_issue_comments must DROP html_url/assets/timestamps/etc. and
keep ONLY {user.login, body} — that's the whole OOM-prevention."""
full_page = [
{
"id": 1234,
"html_url": "https://example.com/some-huge-url",
"pull_request_url": "https://example.com/some-other-huge-url",
"issue_url": "https://example.com/yet-another-url",
"user": {"login": "bob", "avatar_url": "x" * 4000, "id": 99},
"original_author": "",
"original_author_id": 0,
"body": "/sop-ack comprehensive-testing\n\nlooks good",
"assets": ["x" * 1000, "y" * 1000],
"created_at": "2026-05-19T01:02:03Z",
"updated_at": "2026-05-19T01:02:03Z",
}
]
client = self._client_with_pages([full_page])
out = client.get_issue_comments("o", "r", 1)
self.assertEqual(len(out), 1)
# Only the two whitelisted keys + nested user.login
self.assertEqual(set(out[0].keys()), {"user", "body"})
self.assertEqual(set(out[0]["user"].keys()), {"login"})
self.assertEqual(out[0]["user"]["login"], "bob")
self.assertEqual(out[0]["body"], "/sop-ack comprehensive-testing\n\nlooks good")
# Critical: avatar/assets/timestamps/etc. must be gone (~4KB+ each).
self.assertNotIn("html_url", out[0])
self.assertNotIn("assets", out[0])
self.assertNotIn("created_at", out[0])
def test_pagination_break_on_short_page(self):
# Page-size 50; a page of <50 means no more pages.
page1 = [{"user": {"login": "u"}, "body": "x"}] * 7
client = self._client_with_pages([page1])
out = client.get_issue_comments("o", "r", 2)
self.assertEqual(len(out), 7)
# Should have made exactly 1 _req call (no page-2 probe).
self.assertEqual(len(client._req.calls), 1)
def test_pagination_continues_until_empty(self):
# Two full pages + one short page.
page1 = [{"user": {"login": "u"}, "body": "x"}] * 50
page2 = [{"user": {"login": "u"}, "body": "y"}] * 50
page3 = [{"user": {"login": "u"}, "body": "z"}] * 3
client = self._client_with_pages([page1, page2, page3])
out = client.get_issue_comments("o", "r", 3)
self.assertEqual(len(out), 103)
self.assertEqual(len(client._req.calls), 3)
def test_max_comments_caps_collection(self):
page1 = [{"user": {"login": "u"}, "body": "x"}] * 50
page2 = [{"user": {"login": "u"}, "body": "y"}] * 50
page3 = [{"user": {"login": "u"}, "body": "z"}] * 50
client = self._client_with_pages([page1, page2, page3])
out = client.get_issue_comments("o", "r", 4, max_comments=75)
self.assertEqual(len(out), 75)
# Stops short: shouldn't have requested page-3.
self.assertLessEqual(len(client._req.calls), 2)
def test_oversized_body_truncated(self):
# An individual comment with a multi-MiB body (e.g. pasted CI log)
# must NOT pull the whole thing into memory. The directive parser
# only needs the first ~8 KiB to find /sop-* markers.
huge_body = "/sop-ack comprehensive-testing\n" + ("X" * (4 * 1024 * 1024))
page = [{"user": {"login": "bob"}, "body": huge_body}]
client = self._client_with_pages([page])
out = client.get_issue_comments("o", "r", 99)
self.assertEqual(len(out), 1)
# Cap is 8 KiB; comment body must be <= 8 KiB after streaming.
self.assertLessEqual(len(out[0]["body"]), 8 * 1024)
# Marker still discoverable at the start.
self.assertTrue(out[0]["body"].startswith("/sop-ack comprehensive-testing"))
def test_iter_handles_missing_user_or_body(self):
# Defensive: Gitea has been seen to return user=null on deleted users.
page = [
{"user": None, "body": "abandoned-author"},
{"user": {"login": "alice"}, "body": None},
{"body": "no-user-key"},
{"user": {"login": "bob"}, "body": "ok"},
]
client = self._client_with_pages([page])
out = client.get_issue_comments("o", "r", 5)
self.assertEqual(len(out), 4)
self.assertEqual(out[0]["user"]["login"], "")
self.assertEqual(out[0]["body"], "abandoned-author")
self.assertEqual(out[1]["user"]["login"], "alice")
self.assertEqual(out[1]["body"], "")
self.assertEqual(out[2]["user"]["login"], "")
self.assertEqual(out[3]["user"]["login"], "bob")
def test_minimal_dicts_work_with_compute_ack_state(self):
"""Round-trip: minimal dicts feed back through compute_ack_state."""
page = [{"user": {"login": "bob"}, "body": "/sop-ack comprehensive-testing"}]
client = self._client_with_pages([page])
comments = client.get_issue_comments("o", "r", 6)
items = _items_by_slug()
aliases = _numeric_aliases()
state = sop.compute_ack_state(
comments, "alice", items, aliases, lambda slug, users: list(users)
)
self.assertEqual(state["comprehensive-testing"]["ackers"], ["bob"])
# ---------------------------------------------------------------------------
# probe() na-gate fallback — fix for #355-class KeyError 'security-review'
# ---------------------------------------------------------------------------
class TestComputeNaStateAcceptsGateNotInItems(unittest.TestCase):
"""compute_na_state passes the gate NAME to probe(); when the gate is
NOT also an items entry (the common case for `security-review`,
`qa-review`), probe must fall back to the gate's own required_teams
instead of KeyError'ing on items_by_slug[slug].
This test exercises the public surface (compute_na_state) rather than
the inline `probe` closure, because the closure is built inside main().
We simulate the fallback by passing a probe that mirrors the production
contract — slug may be either an item OR an n/a-gate key, both are valid.
"""
def test_na_gate_with_required_teams_resolves_without_keyerror(self):
na_gates = {
"security-review": {
"required_teams": ["security", "managers", "ceo"],
"description": "security N/A",
},
}
comments = [
{"user": {"login": "carol"}, "body": "/sop-n/a security-review docs-only"},
]
# Probe approves any user in the security team; importantly it does
# NOT try items_by_slug[slug] for the gate name.
called_with = []
def probe(slug, users):
called_with.append(slug)
# production probe accepts gate-name OR item-slug; for this test
# we just approve everyone.
return list(users)
na_state = sop.compute_na_state(comments, "alice", na_gates, probe)
self.assertTrue(na_state["security-review"]["declared"])
self.assertEqual(na_state["security-review"]["decl_ackers"], ["carol"])
# probe must have been called with the GATE name, not an item slug.
self.assertEqual(called_with, ["security-review"])
def test_na_gate_self_declaration_rejected(self):
# Author cannot self-declare N/A — pre-existing invariant; pin it
# so the new probe-fallback doesn't regress this.
na_gates = {"security-review": {"required_teams": ["security"]}}
comments = [
{"user": {"login": "alice"}, "body": "/sop-n/a security-review"},
]
na_state = sop.compute_na_state(
comments, "alice", na_gates, lambda *_: ["alice"]
)
self.assertFalse(na_state["security-review"]["declared"])