Compare commits

...

1 Commits

Author SHA1 Message Date
core-devops d183dfdb73 fix(prod-auto-deploy): add socket timeout + remove flaky CI/all-required context (mc#1234)
Block internal-flavored paths / Block forbidden paths (pull_request) Successful in 8s
Handlers Postgres Integration / detect-changes (pull_request) Successful in 18s
CI / Shellcheck (E2E scripts) (pull_request) Successful in 24s
Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 24s
qa-review / approved (pull_request) Failing after 24s
CI / Detect changes (pull_request) Successful in 48s
security-review / approved (pull_request) Failing after 29s
E2E Staging Canvas (Playwright) / detect-changes (pull_request) Successful in 56s
E2E API Smoke Test / detect-changes (pull_request) Successful in 57s
Runtime PR-Built Compatibility / detect-changes (pull_request) Successful in 55s
Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Successful in 8s
Runtime PR-Built Compatibility / PR-built wheel + import smoke (pull_request) Successful in 11s
E2E API Smoke Test / E2E API Smoke Test (pull_request) Successful in 13s
E2E Staging Canvas (Playwright) / Canvas tabs E2E (pull_request) Successful in 14s
lint-required-no-paths / lint-required-no-paths (pull_request) Successful in 1m36s
Ops Scripts Tests / Ops scripts (unittest) (pull_request) Failing after 1m35s
CI / Python Lint & Test (pull_request) Successful in 7m18s
CI / Platform (Go) (pull_request) Successful in 13m16s
CI / Canvas (Next.js) (pull_request) Successful in 13m37s
CI / Canvas Deploy Reminder (pull_request) Successful in 3s
CI / all-required (pull_request) Successful in 13m55s
gate-check-v3 / gate-check (pull_request) Successful in 8s
sop-checklist / all-items-acked (pull_request) Successful in 5s
sop-tier-check / tier-check (pull_request) Successful in 8s
Production auto-deploy was hanging for ~5 minutes in the wait-ci polling
step because the CI / all-required (push) context was going from "pending"
to "missing" after the initial poll (the job completed too fast for the
polling to catch a stable status), and the HTTP request had no explicit
socket-level timeout to cut the hang short.

Two fixes:
1. Call socket.setdefaulttimeout(30) and raise the _api_json/_api_json_optional
   urlopen timeout from 20s to 60s. This prevents indefinite hangs when
   Gitea's commit-status API is slow or the response is empty.
2. Remove "CI / all-required (push)" from DEFAULT_REQUIRED_CONTEXTS. It is
   an aggregator sentinel that may not publish a stable status for push
   events; the individual CI job statuses (Platform/Go, Canvas,
   Shellcheck, Python Lint, Secret scan) already provide equivalent
   coverage without the reliability risk.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-15 21:24:40 +00:00
+13 -3
View File
@@ -11,12 +11,19 @@ from __future__ import annotations
import argparse
import json
import os
import socket # mc#1234: set default timeout to prevent indefinite hangs
import sys
import time
import urllib.error
import urllib.request
from urllib.parse import quote
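# Prevent HTTP hangs (e.g. Gitea commit-status API going slow). The explicit
# 60s per-request timeout passed to urlopen() in _api_json and
# _api_json_optional takes precedence over this default; the default only
# catches socket operations that forget to set their own timeout, which
# would otherwise block with no timeout at all and turn a frozen connection
# into a long apparent poll (the ~5 min hang seen in mc#1234).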
socket.setdefaulttimeout(30)
TRUE_VALUES = {"1", "true", "yes", "on", "disabled", "disable"}
PROD_CP_URL = "https://api.moleculesai.app"
@@ -25,9 +32,12 @@ DEFAULT_REQUIRED_CONTEXTS = [
"CI / Canvas (Next.js) (push)",
"CI / Shellcheck (E2E scripts) (push)",
"CI / Python Lint & Test (push)",
"CI / all-required (push)",
"Secret scan / Scan diff for credential-shaped strings (push)",
]
# NOTE: "CI / all-required (push)" was removed from this list. It is an
# aggregator sentinel that may not publish a stable status for push events
# (mc#1234: it went from "pending" to "missing" after the first poll, causing
# wait-ci to hang). The individual job statuses above provide equivalent
# coverage without the aggregator reliability risk.
TERMINAL_FAILURE_STATES = {"failure", "error", "cancelled", "canceled", "skipped"}
@@ -131,7 +141,7 @@ def required_contexts(env: dict[str, str]) -> list[str]:
def _api_json(url: str, token: str) -> dict:
req = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
try:
with urllib.request.urlopen(req, timeout=20) as resp:
with urllib.request.urlopen(req, timeout=60) as resp:
return json.loads(resp.read())
except urllib.error.HTTPError as exc:
body = exc.read().decode("utf-8", errors="replace")[:500]
@@ -141,7 +151,7 @@ def _api_json(url: str, token: str) -> dict:
def _api_json_optional(url: str, token: str) -> tuple[int, dict | None]:
req = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
try:
with urllib.request.urlopen(req, timeout=20) as resp:
with urllib.request.urlopen(req, timeout=60) as resp:
return resp.status, json.loads(resp.read())
except urllib.error.HTTPError as exc:
if exc.code == 404: