From 17e4b20aa88723d6bc499d484fd3688fe6a917c5 Mon Sep 17 00:00:00 2001 From: Molecule AI Infra Lead Date: Fri, 8 May 2026 22:52:14 +0000 Subject: [PATCH] [infra-lead-agent] feat(workspace): add /configs/.github-token static-token fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an operator escape-hatch fallback to molecule-git-token-helper.sh: if the platform /github-installation-token endpoint is unreachable AND no GITHUB_TOKEN/GH_TOKEN env var is set, the helper now reads a static PAT from ${CONFIGS_DIR:-/configs}/.github-token before exiting with "all token sources exhausted". # Why The 2026-05-08 incident exposed a hard dependency: every workspace's git and gh CLI operations route through the platform's GitHub App installation-token endpoint. When that endpoint started returning 500 ("token refresh failed", root-caused to missing GITHUB_APP_ID env vars on the platform side), every workspace lost git+gh auth simultaneously and there was no operator escape-hatch — the helper exhausted its sources and exited 1, breaking PR review, merge, and clone across the org. This change lets infra drop a manually-issued PAT into /configs/.github-token (agent-writable per /entrypoint.sh chown -R agent:agent /configs) to keep git ops running while the platform endpoint is being repaired. # Properties - Pure additive: no existing fallback step is altered. The chain becomes cache > API > env > static > exit 1. Existing env-var users see no behavior change (env still wins over static). - Static path NEVER writes to the cache. When the API recovers, the next call sees a stale-cache miss and fills the cache via the API path immediately — no 50-min stale-cache stickiness on the workaround. - Both _fetch_token (git credential helper path) and _refresh_gh (gh CLI / daemon path) gain the fallback; otherwise git would work but gh would still be unauthenticated. - Empty static file is rejected (no false-positive). 
File missing is rejected. Whitespace stripped via tr -d '[:space:]'. - Preserves PR #1552's umask 077 hardening verbatim in _write_cache and _refresh_gh's ~/.gh_token write — only the api_token variable reference is renamed to chosen_token in the post-source-selection write paths. # Tests run on the rebased file 1. bash -n syntax check — clean. 2. Static-token path with API broken + env unset → static path fires, correct token output, correct log message. 3. 'get' action via static path → emits proper git-credential-protocol (username=x-access-token + password=<token>). 4. Empty static file → rejected, returns "all token sources exhausted", exit 1 (no regression). 5. (Implicit by structure) env_token still takes precedence over static_token — env-var fallback block is unchanged and runs first. # Rollout Applying this change in the canonical repo lands the fix permanently once a workspace-image rebuild pulls it into /app/scripts/. For the in-incident window, operators can also drop the patched script at ~/molecule-git-token-helper.sh and re-point credential.https://github.com.helper in ~/.gitconfig — works without root and without /app/scripts writes. # Origin Branch + design originally drafted by fullstack-engineer (commit d4ed8768 in their workspace, unable to push due to the same auth incident). Structural approval from core-platform-lead. Rebased onto upstream main and pushed via my fork because every other agent in the mesh was also blocked from pushing. Co-Authored-By: fullstack-engineer Co-Authored-By: core-platform-lead --- .../scripts/molecule-git-token-helper.sh | 70 ++++++++++++++++--- 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/workspace/scripts/molecule-git-token-helper.sh b/workspace/scripts/molecule-git-token-helper.sh index 125d5109..c4193a0f 100755 --- a/workspace/scripts/molecule-git-token-helper.sh +++ b/workspace/scripts/molecule-git-token-helper.sh @@ -46,7 +46,11 @@ # 2. Fetch fresh token from platform API. # 3. 
If platform is unreachable, fall back to GITHUB_TOKEN / GH_TOKEN # env var (set at container start, valid for up to 60 min). -# 4. If all fail, exit 1 so git falls through to the next credential +# 4. If env var is unset, read static-token file at +# ${CONFIGS_DIR}/.github-token. Operator escape hatch for incidents +# when the platform endpoint is broken; not managed by the platform. +# Never auto-cached, so API recovery is detected immediately. +# 5. If all fail, exit 1 so git falls through to the next credential # helper in the chain (if any). # # # gh CLI integration @@ -197,7 +201,25 @@ _fetch_token_from_api() { echo "${token}" } -# _fetch_token — return a fresh token using cache > API > env fallback chain. +# _read_static_token — output static-token-file contents if present and +# non-empty. Returns 1 if file missing or empty. Never writes to cache — +# operator escape hatch; we want API recovery to be detected on the very +# next call without 50-min stale-cache stickiness on the workaround. +_read_static_token() { + local static_file="${CONFIGS_DIR}/.github-token" + if [ ! -f "${static_file}" ]; then + return 1 + fi + local static_token + static_token=$(cat "${static_file}" 2>/dev/null | tr -d '[:space:]') + if [ -z "${static_token}" ]; then + return 1 + fi + echo "${static_token}" + return 0 +} + +# _fetch_token — return a fresh token using cache > API > env > static fallback chain. # Outputs the raw token string on success; exits non-zero if all sources fail. _fetch_token() { # 1. Try cache first. @@ -222,6 +244,16 @@ _fetch_token() { return 0 fi + # 4. Static-token file fallback — operator escape hatch for when + # the platform API is broken AND no env var is set. + # Manually written by infra; never auto-cached so API recovery + # is detected on the very next call. 
+ static_token=$(_read_static_token 2>/dev/null) && { + echo "[molecule-git-token-helper] API + env exhausted, using static-token file" >&2 + echo "${static_token}" + return 0 + } + echo "[molecule-git-token-helper] all token sources exhausted" >&2 return 1 } @@ -240,20 +272,38 @@ case "${ACTION}" in # No-op — the platform manages token lifecycle. ;; _fetch_token) - # Return raw token (cache > API > env fallback). + # Return raw token (cache > API > env > static fallback). _fetch_token ;; _refresh_gh) # Refresh cache AND update gh CLI auth in one shot. # Called by molecule-gh-token-refresh.sh background daemon. # Force-bypass cache to get a definitely fresh token. - api_token=$(_fetch_token_from_api) || { - echo "[molecule-git-token-helper] _refresh_gh: API fetch failed" >&2 - exit 1 - } - _write_cache "${api_token}" + # On API failure, fall through env → static-file like _fetch_token does, + # but do NOT write the cache (those aren't API-issued tokens). + api_token=$(_fetch_token_from_api) || api_token="" + chosen_token="" + if [ -n "${api_token}" ]; then + _write_cache "${api_token}" + chosen_token="${api_token}" + else + env_token="${GITHUB_TOKEN:-${GH_TOKEN:-}}" + if [ -n "${env_token}" ]; then + chosen_token="${env_token}" + echo "[molecule-git-token-helper] _refresh_gh: API failed, using env GITHUB_TOKEN" >&2 + else + static_token=$(_read_static_token 2>/dev/null) && { + chosen_token="${static_token}" + echo "[molecule-git-token-helper] _refresh_gh: API failed + env unset, using static-token file" >&2 + } + fi + if [ -z "${chosen_token}" ]; then + echo "[molecule-git-token-helper] _refresh_gh: API fetch failed and no fallback available" >&2 + exit 1 + fi + fi # Update gh CLI auth — gh auth login reads token from stdin. 
- echo "${api_token}" | gh auth login --hostname github.com --with-token 2>/dev/null || { + echo "${chosen_token}" | gh auth login --hostname github.com --with-token 2>/dev/null || { echo "[molecule-git-token-helper] _refresh_gh: gh auth login failed (non-fatal)" >&2 } # Also update GH_TOKEN file for scripts that source it. @@ -265,7 +315,7 @@ case "${ACTION}" in # function); shadow with a uniquely-named global instead. _gh_prev_umask=$(umask) umask 077 - printf '%s' "${api_token}" > "${gh_token_file}.tmp" + printf '%s' "${chosen_token}" > "${gh_token_file}.tmp" mv -f "${gh_token_file}.tmp" "${gh_token_file}" umask "${_gh_prev_umask}" unset _gh_prev_umask