#!/usr/bin/env bash # codex_auth_refresh.sh — proactive OAuth refresh watchdog for the # ChatGPT/Codex-subscription credential. # # Why this exists (RFC internal#569): # Codex CLI 0.130.0 ONLY refreshes auth.json on the demand path # `AuthManager::auth().await` (codex-rs/login/src/auth/manager.rs:1411). # In our prod-Reviewer / prod-Researcher topology the workspace can # sit idle (or wedge upstream in executor.py) for >8 days between # turns, so `auth()` is never invoked end-to-end and refresh is never # armed. id_token expires within hours; access_token within ~198h; # eventually refresh_token rolls past its own window and the workspace # becomes hard-broken (no recovery path, requires a CTO re-login). # # Strategy: # Background bash loop, started by start.sh as `gosu agent`. Every # $REFRESH_INTERVAL seconds (default 6h, RFC §"Proposed fix" item 2) # it inspects $CODEX_HOME/auth.json. If either: # - tokens.access_token JWT `exp` is within $SAFETY_MARGIN seconds # of now (default 4h), OR # - last_refresh ISO timestamp is older than $STALE_AFTER seconds # (default 7 days — one day below the CLI's TOKEN_REFRESH_INTERVAL # of 8d so we refresh before the CLI would mark it stale) # then it POSTs the refresh_token to # `https://auth.openai.com/oauth/token` (the endpoint baked into the # CLI — codex-rs/login/src/auth/manager.rs:94, const REFRESH_TOKEN_URL) # with the same CLIENT_ID (`app_EMoamEEZ73f0CkXaXp7hrann`, # codex-rs/login/src/auth/manager.rs:928 / v0.130.0:921) and # grant_type=refresh_token, and atomically rewrites auth.json in place # (write to .tmp + chmod + rename). The CLI tolerates an externally # refreshed file: `reload()` is called inside `auth()` whenever the # file mtime changes. # # Vendor contract source — all field names, endpoint URL, client id, # response shape verified against `openai/codex@rust-v0.130.0` # (the exact CLI version pinned in the Dockerfile, line 166): # - REFRESH_TOKEN_URL manager.rs:94 # - CLIENT_ID manager.rs:921 # - request shape: { client_id, grant_type:"refresh_token", # refresh_token } manager.rs:913-918 # - response shape: { id_token?, access_token?, # refresh_token? } manager.rs:920-925 # - persist semantics: update tokens.{id_token,access_token, # refresh_token} only when present, set # last_refresh = now (manager.rs:787-810) # - stale predicate: access_token exp <= now OR # last_refresh < now - 8d # manager.rs:1786-1808 # # Constraints honored (RFC §"Constraints honored"): # - NEVER echo token values. Filenames + JWT field NAMES + status # codes + ISO timestamps only. # - No mutation of auth.json outside the atomic-rename path. # - Inert when $CODEX_HOME/auth.json is absent OR auth_mode != chatgpt # OR refresh_token is empty (the API-key / MiniMax paths don't have # refresh tokens; this watchdog is subscription-only by design). # # Health sidecar (RFC §"Surface a health metric"): # On every loop iteration, write the parsed `last_refresh` ISO and # age-in-seconds to $CODEX_HOME/auth_refresh_status.json (mode 0600, # agent-owned). The molecule-runtime obs sidecar can scrape this # file instead of the auth.json itself — keeps tokens out of the # metrics path entirely. set -uo pipefail CODEX_HOME="${CODEX_HOME:-/home/agent/.codex}" AUTH_JSON="${CODEX_HOME}/auth.json" STATUS_FILE="${CODEX_HOME}/auth_refresh_status.json" # Resolve python3 portably. PR#19 (internal#569) hardcoded # /opt/molecule-venv/bin/python3, but the codex image is built FROM # python:3.11-slim with python3 at /usr/local/bin/python3 — no # /opt/molecule-venv exists. Every helper invocation exited 127 → # OAuth refresh never fired → id_token expired silently and Researcher # wedged upstream of stdout (ae2c3012 diagnosis). Override via # CODEX_PYTHON for test/dev rigs that need a custom interpreter. PYTHON_BIN="${CODEX_PYTHON:-$(command -v python3 || true)}" if [ -z "$PYTHON_BIN" ] || [ ! -x "$PYTHON_BIN" ]; then printf '[codex_auth_refresh %s] FATAL: python3 not found on PATH (CODEX_PYTHON=%s); the watchdog cannot run.\n' \ "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "${CODEX_PYTHON:-}" >&2 exit 127 fi # Tunable via env. Defaults come from RFC §"Proposed fix" + CLI source: # - REFRESH_INTERVAL: how often the loop wakes (6h) # - SAFETY_MARGIN: refresh if access_token exp - now < this (4h) # - STALE_AFTER: refresh if last_refresh older than this (7d, # chosen to fire before the CLI's 8d cliff) REFRESH_INTERVAL="${CODEX_AUTH_REFRESH_INTERVAL_SECONDS:-21600}" SAFETY_MARGIN="${CODEX_AUTH_SAFETY_MARGIN_SECONDS:-14400}" STALE_AFTER="${CODEX_AUTH_STALE_AFTER_SECONDS:-604800}" # Refresh endpoint — keep the CLI's env override honored so a test rig # can point us at a mock server. REFRESH_URL="${CODEX_REFRESH_TOKEN_URL_OVERRIDE:-https://auth.openai.com/oauth/token}" CLIENT_ID="app_EMoamEEZ73f0CkXaXp7hrann" log() { # Stamped, prefixed; never includes token contents. printf '[codex_auth_refresh %s] %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*" >&2 } # Run a one-shot python helper to inspect auth.json and decide whether # to refresh. Python gives us reliable JSON + JWT decoding without # pulling in jq (which is not in the image). We deliberately keep the # helper inline so the script is a single self-contained file. needs_refresh_and_payload() { # Emits one of: # SKIP # READY # Note: is on stdout for the caller to consume in a # variable. Caller MUST NOT echo it. The wrapper below redacts it # before any logging. "$PYTHON_BIN" - "$AUTH_JSON" "$SAFETY_MARGIN" "$STALE_AFTER" <<'PY' import base64, json, sys, time from datetime import datetime, timezone path = sys.argv[1] safety_margin = int(sys.argv[2]) stale_after = int(sys.argv[3]) def emit(line): print(line, flush=True) try: with open(path, "r", encoding="utf-8") as f: blob = json.load(f) except FileNotFoundError: emit("SKIP no_auth_json") sys.exit(0) except (OSError, json.JSONDecodeError) as exc: emit(f"SKIP unreadable:{type(exc).__name__}") sys.exit(0) # Subscription-only: API-key / MiniMax paths have no refresh token. auth_mode = (blob.get("auth_mode") or "").lower() if auth_mode not in ("chatgpt", "chatgpt_auth_tokens"): emit(f"SKIP auth_mode={auth_mode or 'none'}") sys.exit(0) tokens = blob.get("tokens") or {} refresh_token = tokens.get("refresh_token") or "" access_token = tokens.get("access_token") or "" if not refresh_token: emit("SKIP no_refresh_token") sys.exit(0) # Parse last_refresh ISO. Tolerate trailing Z (codex emits +00:00 in # newer versions, Z in 0.130.0 era). last_refresh_iso = blob.get("last_refresh") or "" now = datetime.now(timezone.utc) age_seconds = -1 if last_refresh_iso: try: iso = last_refresh_iso.replace("Z", "+00:00") last_refresh_dt = datetime.fromisoformat(iso) if last_refresh_dt.tzinfo is None: last_refresh_dt = last_refresh_dt.replace(tzinfo=timezone.utc) age_seconds = int((now - last_refresh_dt).total_seconds()) except ValueError: age_seconds = -1 # Decode the access_token JWT `exp` claim (RFC matches the CLI: # parse_jwt_expiration in codex-rs/login/src/auth/manager.rs). exp_seconds_remaining = None if access_token and access_token.count(".") >= 2: try: payload_b64 = access_token.split(".")[1] padded = payload_b64 + "=" * (-len(payload_b64) % 4) claims = json.loads(base64.urlsafe_b64decode(padded.encode())) exp = claims.get("exp") if isinstance(exp, (int, float)): exp_seconds_remaining = int(exp - time.time()) except (ValueError, json.JSONDecodeError): exp_seconds_remaining = None needs = False reasons = [] if exp_seconds_remaining is not None and exp_seconds_remaining <= safety_margin: needs = True reasons.append(f"exp_in={exp_seconds_remaining}s<={safety_margin}s") if age_seconds >= 0 and age_seconds >= stale_after: needs = True reasons.append(f"last_refresh_age={age_seconds}s>={stale_after}s") if exp_seconds_remaining is None and age_seconds == -1: # Defensive: if we can't parse anything, refresh anyway. The CLI's # own staleness path would do the same. needs = True reasons.append("unparseable_metadata") if not needs: emit( "SKIP fresh exp_in=" + (str(exp_seconds_remaining) if exp_seconds_remaining is not None else "unknown") + "s age=" + (str(age_seconds) if age_seconds >= 0 else "unknown") + "s" ) sys.exit(0) # Caller consumes refresh_token from the next line. Reasons echoed # after for logging. emit(f"READY {refresh_token}") print(f"REASONS {','.join(reasons)}", file=sys.stderr) PY } # Apply a refresh response (file path passed as $2) to auth.json # atomically. We pass the response as a FILE PATH (not stdin) because # the python helper's body is itself a heredoc on stdin — using stdin # for the response would collide with the heredoc. apply_refresh_response() { local response_file="$1" "$PYTHON_BIN" - "$AUTH_JSON" "$response_file" <<'PY' import json, os, stat, sys, tempfile from datetime import datetime, timezone path = sys.argv[1] response_path = sys.argv[2] try: with open(path, "r", encoding="utf-8") as f: blob = json.load(f) except (OSError, json.JSONDecodeError) as exc: print(f"FAIL load_auth:{type(exc).__name__}", file=sys.stderr) sys.exit(2) try: with open(response_path, "r", encoding="utf-8") as f: response = json.load(f) except (OSError, json.JSONDecodeError) as exc: print(f"FAIL bad_response_json:{exc.__class__.__name__}", file=sys.stderr) sys.exit(2) if not isinstance(response, dict): print("FAIL response_not_object", file=sys.stderr) sys.exit(2) tokens = blob.get("tokens") or {} # Per manager.rs:787-810: persist only what the response contains. new_id = response.get("id_token") new_access = response.get("access_token") new_refresh = response.get("refresh_token") if new_id: tokens["id_token"] = new_id if new_access: tokens["access_token"] = new_access if new_refresh: tokens["refresh_token"] = new_refresh blob["tokens"] = tokens blob["last_refresh"] = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") # Atomic rename, preserve mode 0600. dir_path = os.path.dirname(path) or "." fd, tmp_path = tempfile.mkstemp(prefix=".auth.", suffix=".tmp", dir=dir_path) try: with os.fdopen(fd, "w", encoding="utf-8") as f: json.dump(blob, f, indent=2) f.flush() os.fsync(f.fileno()) os.chmod(tmp_path, stat.S_IRUSR | stat.S_IWUSR) # 0600 os.replace(tmp_path, path) except Exception: try: os.unlink(tmp_path) except OSError: pass raise # Summary line — token VALUES are not printed, only WHICH fields were # rotated. fields = [] if new_id: fields.append("id_token") if new_access: fields.append("access_token") if new_refresh: fields.append("refresh_token") print("OK rotated=" + ",".join(fields) if fields else "OK rotated=") PY } # Write the status sidecar (mode 0600). Never includes token contents. write_status() { local last_iso="$1" age="$2" outcome="$3" "$PYTHON_BIN" - "$STATUS_FILE" "$last_iso" "$age" "$outcome" <<'PY' import json, os, stat, sys path, last_iso, age, outcome = sys.argv[1:5] payload = { "last_refresh": last_iso, "last_refresh_age_seconds": int(age) if age.lstrip("-").isdigit() else None, "watchdog_last_outcome": outcome, "watchdog_last_run_iso": __import__("datetime").datetime.now( __import__("datetime").timezone.utc ).isoformat().replace("+00:00", "Z"), } with open(path + ".tmp", "w", encoding="utf-8") as f: json.dump(payload, f, indent=2) os.chmod(path + ".tmp", stat.S_IRUSR | stat.S_IWUSR) os.replace(path + ".tmp", path) PY } # One refresh attempt. Returns 0 on refreshed, 1 on skipped, 2 on # transient failure (will retry next loop), 3 on permanent failure # (e.g. refresh_token_expired — operator intervention needed). attempt_refresh_once() { if [ ! -s "$AUTH_JSON" ]; then log "skip: $AUTH_JSON absent or empty (no chatgpt-subscription auth in this workspace)" write_status "" -1 "skip:no_auth_json" return 1 fi local decision rc=0 # Use a temp file so the multi-line decision output is fully captured. local decision_file decision_file="$(mktemp)" needs_refresh_and_payload >"$decision_file" 2>>/tmp/codex_auth_refresh.errlog || rc=$? if [ "$rc" -ne 0 ]; then log "decision: helper exited $rc; see /tmp/codex_auth_refresh.errlog" rm -f "$decision_file" write_status "" -1 "skip:helper_error" return 2 fi local verdict verdict="$(head -n1 "$decision_file")" case "$verdict" in SKIP*) log "no-op: ${verdict#SKIP }" rm -f "$decision_file" # Re-parse just to refresh the sidecar — separate, doesn't see # the token because no refresh body is emitted. local last_iso age read -r last_iso age <<<"$("$PYTHON_BIN" - "$AUTH_JSON" <<'PY' import json, sys from datetime import datetime, timezone try: blob = json.load(open(sys.argv[1])) except Exception: print(' -1'); sys.exit(0) iso = (blob.get('last_refresh') or '').replace('Z', '+00:00') age = -1 if iso: try: dt = datetime.fromisoformat(iso) if dt.tzinfo is None: dt = dt.replace(tzinfo=timezone.utc) age = int((datetime.now(timezone.utc) - dt).total_seconds()) except ValueError: pass print((blob.get('last_refresh') or ''), age) PY )" write_status "${last_iso:-}" "${age:--1}" "skip" return 1 ;; READY*) : ;; *) log "decision: unrecognized verdict line (length=${#verdict}); aborting attempt" rm -f "$decision_file" write_status "" -1 "skip:bad_verdict" return 2 ;; esac # READY local refresh_token refresh_token="${verdict#READY }" rm -f "$decision_file" if [ -z "$refresh_token" ]; then log "decision READY but refresh_token empty; aborting" write_status "" -1 "skip:empty_refresh_token" return 2 fi log "refresh: starting (endpoint=${REFRESH_URL%%\?*})" # Build the request body via python to avoid quoting the token in # the shell — token value never appears in argv or env. Send body # to curl on stdin via @- ; the shell never sees the token in # process arguments. local response_file http_code response_file="$(mktemp)" http_code="$( "$PYTHON_BIN" -c "import json,sys; sys.stdout.write(json.dumps({'client_id': '$CLIENT_ID', 'grant_type': 'refresh_token', 'refresh_token': sys.argv[1]}))" "$refresh_token" \ | curl -sS --max-time 30 \ -H "Content-Type: application/json" \ -X POST -d @- \ -o "$response_file" \ -w "%{http_code}" \ "$REFRESH_URL" \ || echo "000" )" unset refresh_token case "$http_code" in 2*) if apply_refresh_response "$response_file" >>/tmp/codex_auth_refresh.errlog 2>&1; then rm -f "$response_file" log "refresh: ok (http=$http_code)" # Re-read the freshly written file for the sidecar. local last_iso age read -r last_iso age <<<"$("$PYTHON_BIN" - "$AUTH_JSON" <<'PY' import json, sys from datetime import datetime, timezone blob = json.load(open(sys.argv[1])) iso = (blob.get('last_refresh') or '').replace('Z', '+00:00') age = -1 if iso: dt = datetime.fromisoformat(iso) if dt.tzinfo is None: dt = dt.replace(tzinfo=timezone.utc) age = int((datetime.now(timezone.utc) - dt).total_seconds()) print((blob.get('last_refresh') or ''), age) PY )" write_status "${last_iso:-}" "${age:--1}" "refreshed" return 0 else log "refresh: apply_refresh_response failed (see /tmp/codex_auth_refresh.errlog)" rm -f "$response_file" write_status "" -1 "fail:apply_response" return 2 fi ;; 401) # Per manager.rs:846-848 — 401 is a PERMANENT failure (refresh # token expired / reused / revoked). Operator must re-login on # the host running the CTO ChatGPT account. log "refresh: PERMANENT FAILURE (http=401) — refresh_token rejected. Operator must re-login the CTO ChatGPT subscription and re-inject CODEX_AUTH_JSON." rm -f "$response_file" write_status "" -1 "fail:permanent_401" return 3 ;; *) # Anything else is treated as transient (network glitch, 5xx). # Per manager.rs:849-854, the CLI does the same: keep the old # tokens and retry later. log "refresh: transient failure (http=$http_code); will retry next loop" rm -f "$response_file" write_status "" -1 "fail:transient_$http_code" return 2 ;; esac } main_loop() { log "watchdog: starting (interval=${REFRESH_INTERVAL}s safety_margin=${SAFETY_MARGIN}s stale_after=${STALE_AFTER}s codex_home=${CODEX_HOME})" while true; do attempt_refresh_once || true sleep "$REFRESH_INTERVAL" done } # Allow `--once` for manual probe + the boot-time priming probe in # start.sh. Default is the long-running loop. case "${1:-}" in --once) attempt_refresh_once exit $? ;; --help|-h) grep '^#' "$0" | sed 's/^# \{0,1\}//' exit 0 ;; *) main_loop ;; esac