From d4ab584debb2972ad2e49ce227316be882b60026 Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Thu, 23 Apr 2026 17:57:30 -0700 Subject: [PATCH] =?UTF-8?q?fix:=20wire=20up=20GitHub=20App=20token=20refre?= =?UTF-8?q?sh=20=E2=80=94=20fixes=20#1933?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symptoms before this PR: - After ~60 min of workspace uptime, every git push/clone returns 401 - PMM, DevRel, Social Media Brand and other content agents infinite-loop status reports back to PMs ("I tried, GH_TOKEN dead") - PM A2A queues overflow with retry-status messages (depth 27 on Marketing Lead, 18 on Dev Lead, 11 on Core Platform Lead at peak) Root cause: - GH_TOKEN/GITHUB_TOKEN injected at provision time has a ~60 min TTL (GitHub App installation tokens cap at one hour) - Workspace env is frozen at container start — no in-process mechanism to refresh after expiry - The credential-helper architecture exists in the codebase but was never wired up at template boot. Specifically the claude-code template: - did not COPY the helper scripts into the image - did not configure git credential.helper at boot - did not start the background refresh daemon - did not run initial gh auth login Fix: 1. Dockerfile COPYs scripts/molecule-git-token-helper.sh and scripts/molecule-gh-token-refresh.sh into /app/scripts/ 2. entrypoint.sh (root half) configures git credential helper for github.com and creates the per-user token cache directory 3. entrypoint.sh (agent half) starts the refresh daemon under a respawn loop and runs initial `gh auth login --with-token` The helper hits the platform's /admin/github-installation-token endpoint (fallback to env-var GH_TOKEN when platform unreachable). The refresh daemon calls _refresh_gh every ~45 min ± 2 min jitter so cli auth and helper cache stay warm even when no git operation triggers a refresh. 
Acceptance: - After this image deploys, `gh api /user` from inside a workspace should keep returning 200 even after >60 min uptime - Marketing Lead / Dev Lead a2a queues should drain to <5 within one cycle of the new image rolling Follow-up issues to file (not in this PR): - Replicate this wiring in the other 7 template repos (autogen, crewai, deepagents, gemini-cli, hermes, langgraph, openclaw) - Lift the wiring into the molecule-runtime PyPI package so future templates inherit it instead of re-implementing Co-Authored-By: Claude Opus 4.7 (1M context) --- Dockerfile | 13 ++ entrypoint.sh | 43 +++++ scripts/molecule-gh-token-refresh.sh | 57 +++++++ scripts/molecule-git-token-helper.sh | 247 +++++++++++++++++++++++++++ 4 files changed, 360 insertions(+) create mode 100644 scripts/molecule-gh-token-refresh.sh create mode 100644 scripts/molecule-git-token-helper.sh diff --git a/Dockerfile b/Dockerfile index eb30921..c58a995 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,6 +32,19 @@ COPY __init__.py . # Set the adapter module for runtime discovery ENV ADAPTER_MODULE=adapter +# Git credential helper + background refresh daemon — fix for #1933 / #1866 / #547. +# Without these, GH_TOKEN injected at provision time expires after ~60 min +# and every subsequent git push/clone returns 401, causing agents to +# infinite-loop status reports back to PMs and overflow A2A queues. +# +# The helper hits the platform's /admin/github-installation-token endpoint +# (and falls back to env-var GH_TOKEN when platform is unreachable). The +# refresh daemon calls _refresh_gh every ~45 min so `gh` CLI auth and the +# helper cache stay warm even when no git operation triggers a refresh. 
+COPY scripts/molecule-git-token-helper.sh /app/scripts/molecule-git-token-helper.sh +COPY scripts/molecule-gh-token-refresh.sh /app/scripts/molecule-gh-token-refresh.sh +RUN chmod +x /app/scripts/molecule-git-token-helper.sh /app/scripts/molecule-gh-token-refresh.sh + # Drop-priv entrypoint — claude-code refuses --dangerously-skip-permissions # as root, so we run molecule-runtime as the agent user (uid 1000). # The script handles volume-ownership fix + session-dir symlink before diff --git a/entrypoint.sh b/entrypoint.sh index fd9feef..bd19445 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -34,8 +34,51 @@ if [ "$(id -u)" = "0" ]; then chown -R agent:agent /root/.claude /home/agent/.claude 2>/dev/null ln -sfn /root/.claude/sessions /home/agent/.claude/sessions fi + + # GitHub credential helper setup (fix #1933 / #1866 / #547). + # Runs as root so the global gitconfig is written before we drop to agent. + # The helper fetches fresh GitHub App installation tokens from the + # platform API on every git push/clone, with caching + env-var fallback. + if [ -x /app/scripts/molecule-git-token-helper.sh ]; then + git config --global "credential.https://github.com.helper" \ + "!/app/scripts/molecule-git-token-helper.sh" + git config --global "credential.https://github.com.useHttpPath" true + if [ -f /root/.gitconfig ]; then + cp /root/.gitconfig /home/agent/.gitconfig + chown agent:agent /home/agent/.gitconfig + fi + fi + mkdir -p /home/agent/.molecule-token-cache + chown agent:agent /home/agent/.molecule-token-cache + chmod 700 /home/agent/.molecule-token-cache + exec gosu agent "$0" "$@" fi # Now running as agent (uid 1000) + +# Background token refresh daemon — keeps `gh` CLI auth + credential helper +# cache warm across the ~60 min GitHub App installation token TTL. Wrapped +# in a respawn loop so a daemon crash doesn't silently leave the workspace +# stuck on an expired token (which is exactly how #1933 was discovered). 
+if [ -x /app/scripts/molecule-gh-token-refresh.sh ]; then + nohup bash -c ' + while true; do + /app/scripts/molecule-gh-token-refresh.sh + rc=$? + echo "[molecule-gh-token-refresh] daemon exited rc=$rc — respawning in 30s" >&2 + sleep 30 + done + ' > /home/agent/.gh-token-refresh.log 2>&1 & +fi + +# Initial gh auth — primes the CLI with whatever GH_TOKEN/GITHUB_TOKEN was +# injected at provision time, so commands work in the ~60s window before the +# background daemon's first refresh fires. +if [ -n "${GITHUB_TOKEN:-}" ]; then + echo "${GITHUB_TOKEN}" | gh auth login --hostname github.com --with-token 2>/dev/null || true +elif [ -n "${GH_TOKEN:-}" ]; then + echo "${GH_TOKEN}" | gh auth login --hostname github.com --with-token 2>/dev/null || true +fi + exec molecule-runtime "$@" diff --git a/scripts/molecule-gh-token-refresh.sh b/scripts/molecule-gh-token-refresh.sh new file mode 100644 index 0000000..e7f4587 --- /dev/null +++ b/scripts/molecule-gh-token-refresh.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# molecule-gh-token-refresh.sh — background daemon that keeps GitHub +# credentials fresh inside Molecule AI workspace containers. +# +# Started by entrypoint.sh under a respawn wrapper. Every +# REFRESH_INTERVAL_SEC + jitter (default 45 min ± 2 min) it calls the +# credential helper's _refresh_gh action. +# +# # Jitter +# A 0..120s random offset prevents 39 containers from synchronizing +# their refresh requests against /workspaces/:id/github-installation-token. +# +# # Security +# - This daemon NEVER prints token values. Failures log the helper's +# exit code only, not its stderr, so token bytes can't leak via the +# docker log pipeline. +# - The helper script is responsible for chmod 600 on cache files. 
+# +set -uo pipefail + +HELPER_SCRIPT="${TOKEN_HELPER_SCRIPT:-/app/scripts/molecule-git-token-helper.sh}" +REFRESH_INTERVAL_SEC="${TOKEN_REFRESH_INTERVAL_SEC:-2700}" # 45 min +JITTER_MAX_SEC="${TOKEN_REFRESH_JITTER_SEC:-120}" +INITIAL_DELAY_SEC="${TOKEN_REFRESH_INITIAL_DELAY_SEC:-60}" + +log() { + echo "[molecule-gh-token-refresh] $(date -u '+%Y-%m-%dT%H:%M:%SZ') $*" >&2 +} + +jittered_sleep() { + local base="$1" + local jitter=$((RANDOM % (JITTER_MAX_SEC + 1))) + sleep $((base + jitter)) +} + +log "starting (interval=${REFRESH_INTERVAL_SEC}s ± ${JITTER_MAX_SEC}s, initial_delay=${INITIAL_DELAY_SEC}s)" +sleep "${INITIAL_DELAY_SEC}" + +# Initial refresh — prime the cache + gh auth immediately after boot. +# Discard helper output to /dev/null so token can't leak via docker logs. +log "initial token refresh" +if bash "${HELPER_SCRIPT}" _refresh_gh >/dev/null 2>&1; then + log "initial refresh succeeded" +else + log "initial refresh failed (rc=$?) — will retry in ~${REFRESH_INTERVAL_SEC}s" +fi + +# Steady-state loop. +while true; do + jittered_sleep "${REFRESH_INTERVAL_SEC}" + log "periodic token refresh" + if bash "${HELPER_SCRIPT}" _refresh_gh >/dev/null 2>&1; then + log "refresh succeeded" + else + log "refresh failed (rc=$?) — will retry in ~${REFRESH_INTERVAL_SEC}s" + fi +done diff --git a/scripts/molecule-git-token-helper.sh b/scripts/molecule-git-token-helper.sh new file mode 100644 index 0000000..0faab0f --- /dev/null +++ b/scripts/molecule-git-token-helper.sh @@ -0,0 +1,247 @@ +#!/bin/bash +# molecule-git-token-helper.sh — git credential helper for GitHub App tokens +# +# Fetches a fresh GitHub App installation token from the Molecule AI +# platform endpoint and caches it locally (~50 min), so workspace +# containers never use an expired GH_TOKEN after the ~60 min GitHub App +# token TTL. The cache avoids hitting the platform API on every git +# operation (push/fetch/clone). 
+# +# # Setup (called once at container boot by entrypoint.sh) +# +# git config --global \ +# "credential.https://github.com.helper" \ +# "!/app/scripts/molecule-git-token-helper.sh" +# +# # How git calls this helper +# +# git passes the action as the first positional arg. The protocol is: +# get → output credentials on stdout (we handle this) +# store → persist credentials (no-op — we never cache via git) +# erase → revoke credentials (no-op — platform manages lifecycle) +# +# On `get`, git reads key=value pairs terminated by an empty line. +# We must emit at minimum: +# username=x-access-token +# password=INSTALLATION_TOKEN +# (blank line) +# +# # Auth +# +# The platform endpoint requires a valid workspace bearer token. The +# token is stored at ${CONFIGS_DIR}/.auth_token (written by platform_auth.py +# on first /registry/register). Workspace env var PLATFORM_URL defaults +# to http://host.docker.internal:8080. +# +# # Caching +# +# Tokens are cached at ${CACHE_DIR}/gh_installation_token with a +# companion ${CACHE_DIR}/gh_installation_token_expiry file containing +# the epoch-seconds expiry. Cache TTL is ~50 min (TOKEN_CACHE_TTL_SEC). +# If the cache is fresh, we return immediately without calling the API. +# +# # Fallback chain +# +# 1. Return cached token if not expired. +# 2. Fetch fresh token from platform API. +# 3. If platform is unreachable, fall back to GITHUB_TOKEN / GH_TOKEN +# env var (set at container start, valid for up to 60 min). +# 4. If all fail, exit 1 so git falls through to the next credential +# helper in the chain (if any). +# +# # gh CLI integration +# +# Use the _refresh_gh action to refresh both the cache and gh CLI auth +# in a single call: +# +# bash /app/scripts/molecule-git-token-helper.sh _refresh_gh +# +# This is called by molecule-gh-token-refresh.sh (the background daemon) +# every 45 min. 
#
set -euo pipefail

PLATFORM_URL="${PLATFORM_URL:-http://host.docker.internal:8080}"
CONFIGS_DIR="${CONFIGS_DIR:-/configs}"
TOKEN_FILE="${CONFIGS_DIR}/.auth_token"

# Cache location — writable by agent user
CACHE_DIR="${HOME:=/home/agent}/.molecule-token-cache"
CACHE_TOKEN_FILE="${CACHE_DIR}/gh_installation_token"
CACHE_EXPIRY_FILE="${CACHE_DIR}/gh_installation_token_expiry"

# Cache lifetime: 50 min = 3000 sec. Installation tokens last ~60 min;
# 50 min gives a 10-min safety margin for clock skew + in-flight ops.
TOKEN_CACHE_TTL_SEC=3000

# #1068: use workspace-scoped path (WorkspaceAuth) instead of admin path
# (AdminAuth rejects workspace bearer tokens since PR #729).
WORKSPACE_ID="${WORKSPACE_ID:-}"
if [ -n "$WORKSPACE_ID" ]; then
  ENDPOINT="${PLATFORM_URL}/workspaces/${WORKSPACE_ID}/github-installation-token"
else
  ENDPOINT="${PLATFORM_URL}/admin/github-installation-token"
fi

# _now_epoch — print the current time as epoch seconds (plain `date +%s`).
_now_epoch() {
  date +%s
}

# _read_cache — print the cached token if it is still valid.
#
# Globals:  CACHE_TOKEN_FILE, CACHE_EXPIRY_FILE (read)
# Outputs:  the token on stdout (whitespace stripped)
# Returns:  0 on a cache hit; 1 when either file is missing, the expiry is
#           empty / not a plain decimal number / already reached, or the
#           token file is empty.
_read_cache() {
  local expiry now token

  [ -f "${CACHE_TOKEN_FILE}" ] && [ -f "${CACHE_EXPIRY_FILE}" ] || return 1

  expiry=$(tr -d '[:space:]' 2>/dev/null < "${CACHE_EXPIRY_FILE}") || return 1
  # Fail closed on a corrupt expiry. Without this check a non-numeric value
  # makes the `[ -lt ]` test below error out, and an erroring test inside a
  # plain `if` reads as "not expired", so the stale token would be served.
  case "${expiry}" in
    '' | *[!0-9]*) return 1 ;;
  esac

  now=$(_now_epoch)
  # `|| return 1` also covers values too large for the shell to compare.
  [ "${now}" -lt "${expiry}" ] 2>/dev/null || return 1

  token=$(tr -d '[:space:]' 2>/dev/null < "${CACHE_TOKEN_FILE}") || return 1
  [ -n "${token}" ] || return 1

  printf '%s\n' "${token}"
}

# _write_cache — persist a token together with its expiry timestamp.
#
# Arguments: $1 - token to cache
# Globals:   CACHE_DIR, CACHE_TOKEN_FILE, CACHE_EXPIRY_FILE,
#            TOKEN_CACHE_TTL_SEC (read)
# Returns:   0 on success, non-zero if a temp file cannot be created or a
#            later step fails.
#
# Each file is written to a uniquely named temp file in the cache directory
# and renamed into place, so a reader never sees a half-written file and two
# concurrent invocations (a git `get` and the refresh daemon) cannot clobber
# each other's temp file. mktemp creates the files owner-only, so the token
# is never on disk with wider permissions.
_write_cache() {
  local token="$1"
  local now expiry tmp_token tmp_expiry

  mkdir -p "${CACHE_DIR}"
  chmod 700 "${CACHE_DIR}" 2>/dev/null || true

  now=$(_now_epoch)
  expiry=$((now + TOKEN_CACHE_TTL_SEC))

  tmp_token=$(mktemp "${CACHE_TOKEN_FILE}.XXXXXX") || return 1
  tmp_expiry=$(mktemp "${CACHE_EXPIRY_FILE}.XXXXXX") || {
    rm -f -- "${tmp_token}"
    return 1
  }

  printf '%s' "${token}" > "${tmp_token}"
  printf '%s' "${expiry}" > "${tmp_expiry}"

  # Token first, expiry second: a reader racing between the two renames
  # pairs the new token with the OLD (earlier) expiry, which is the safe side.
  mv -f -- "${tmp_token}" "${CACHE_TOKEN_FILE}"
  mv -f -- "${tmp_expiry}" "${CACHE_EXPIRY_FILE}"
  chmod 600 "${CACHE_TOKEN_FILE}" "${CACHE_EXPIRY_FILE}" 2>/dev/null || true
}

# _fetch_token_from_api — request a fresh installation token from ENDPOINT.
#
# Globals:  TOKEN_FILE (workspace bearer token, read), ENDPOINT (read)
# Outputs:  the raw token string on stdout; diagnostics on stderr
# Returns:  0 on success; 1 if the bearer token is missing or empty, the
#           request fails, or the response has no "token" string field.
_fetch_token_from_api() {
  local bearer response token err_file err_msg curl_rc

  if [ ! -f "${TOKEN_FILE}" ]; then
    echo "[molecule-git-token-helper] .auth_token not found at ${TOKEN_FILE}" >&2
    return 1
  fi

  bearer=$(tr -d '[:space:]' < "${TOKEN_FILE}") || bearer=""
  if [ -z "${bearer}" ]; then
    echo "[molecule-git-token-helper] .auth_token is empty" >&2
    return 1
  fi

  # NOTE: curl's stderr goes to a temp file (NOT $response) so the response
  # body — which contains the token on success — never lands in error
  # log lines via $response interpolation.
  err_file=$(mktemp) || {
    echo "[molecule-git-token-helper] could not create temp file for curl diagnostics" >&2
    return 1
  }

  # -S keeps curl's error text even though -s silences progress output;
  # with -s alone the captured diagnostic is always empty.
  curl_rc=0
  response=$(curl -sSf \
    -H "Authorization: Bearer ${bearer}" \
    -H "Accept: application/json" \
    --max-time 10 \
    "${ENDPOINT}" 2>"${err_file}") || curl_rc=$?
  err_msg=$(cat "${err_file}" 2>/dev/null) || err_msg=""
  rm -f -- "${err_file}"

  if [ "${curl_rc}" -ne 0 ]; then
    echo "[molecule-git-token-helper] platform request failed (curl rc=${curl_rc}): ${err_msg}" >&2
    return 1
  fi

  # Parse {"token":"ghs_...","expires_at":"..."} with sed (no jq dependency).
  # Whitespace around the colon is accepted so both compact and
  # pretty-printed JSON encoders are handled.
  token=$(printf '%s\n' "${response}" \
    | sed -n 's/.*"token"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p') || token=""
  if [ -z "${token}" ]; then
    # SECURITY: the response body MAY contain a token under a different
    # JSON key name. Never include $response in this error message —
    # log only the size as a coarse debugging signal.
    echo "[molecule-git-token-helper] empty token in platform response (body=${#response} bytes)" >&2
    return 1
  fi

  printf '%s\n' "${token}"
}

# _fetch_token — return a fresh token using cache > API > env fallback chain.
# Outputs the raw token string on success; exits non-zero if all sources fail.
#
# Globals:  GITHUB_TOKEN, GH_TOKEN (read, last-resort fallback)
# Returns:  0 with the token on stdout, 1 when cache, API and env all fail.
_fetch_token() {
  local cached api_token env_token

  # 1. Try cache first.
  if cached=$(_read_cache); then
    printf '%s\n' "${cached}"
    return 0
  fi

  # 2. Fetch from platform API. Its diagnostics are discarded here, as
  #    before; the env fallback below emits its own message.
  if api_token=$(_fetch_token_from_api 2>/dev/null); then
    # Caching is best-effort: a valid token was fetched, so a failed cache
    # write must not stop it from being returned.
    if ! _write_cache "${api_token}"; then
      echo "[molecule-git-token-helper] could not update token cache (continuing)" >&2
    fi
    printf '%s\n' "${api_token}"
    return 0
  fi

  # 3. Fall back to env var (set at container start, may be stale but
  #    better than nothing for the first ~60 min of container life).
  env_token="${GITHUB_TOKEN:-${GH_TOKEN:-}}"
  if [ -n "${env_token}" ]; then
    echo "[molecule-git-token-helper] API unreachable, falling back to env GITHUB_TOKEN" >&2
    printf '%s\n' "${env_token}"
    return 0
  fi

  echo "[molecule-git-token-helper] all token sources exhausted" >&2
  return 1
}

# Action dispatch. git passes the credential action as $1; the underscore
# actions are internal entry points (refresh daemon, manual invalidation).
ACTION="${1:-get}"

case "${ACTION}" in
  get)
    token=$(_fetch_token) || exit 1
    # Emit git credential protocol response.
    printf 'username=x-access-token\n'
    printf 'password=%s\n' "${token}"
    printf '\n'
    ;;
  store | erase)
    # No-op — the platform manages token lifecycle.
    ;;
  _fetch_token)
    # Return raw token (cache > API > env fallback).
    _fetch_token
    ;;
  _refresh_gh)
    # Refresh cache AND update gh CLI auth in one shot.
    # Called by molecule-gh-token-refresh.sh background daemon.
    # Force-bypass cache to get a definitely fresh token.
    api_token=$(_fetch_token_from_api) || {
      echo "[molecule-git-token-helper] _refresh_gh: API fetch failed" >&2
      exit 1
    }
    _write_cache "${api_token}"

    # Update gh CLI auth — gh auth login reads the token from stdin.
    # gh refuses to store credentials while GH_TOKEN / GITHUB_TOKEN is set
    # in its environment, and the provision-time token is exported to this
    # process. So the login runs in a subshell with both variables removed;
    # otherwise this step fails on every refresh.
    # NOTE(review): gh also prefers GH_TOKEN / GITHUB_TOKEN over stored
    # credentials at use time, so any process that still has the boot-time
    # variable exported keeps using that value — confirm how the runtime
    # environment is meant to handle this.
    if ! (
      unset GH_TOKEN GITHUB_TOKEN
      printf '%s\n' "${api_token}" \
        | gh auth login --hostname github.com --with-token
    ) 2>/dev/null; then
      echo "[molecule-git-token-helper] _refresh_gh: gh auth login failed (non-fatal)" >&2
    fi

    # Also publish the raw token (no KEY= prefix) at ~/.gh_token for scripts
    # that read it. A uniquely named, owner-only temp file is renamed into
    # place so the token is never on disk with wider permissions.
    gh_token_file="${HOME}/.gh_token"
    gh_token_tmp=$(mktemp "${gh_token_file}.XXXXXX")
    printf '%s' "${api_token}" > "${gh_token_tmp}"
    mv -f -- "${gh_token_tmp}" "${gh_token_file}"
    chmod 600 "${gh_token_file}" 2>/dev/null || true
    echo "[molecule-git-token-helper] _refresh_gh: token refreshed successfully" >&2
    ;;
  _invalidate_cache)
    # Force next call to hit the API (useful after a 401).
    rm -f "${CACHE_TOKEN_FILE}" "${CACHE_EXPIRY_FILE}" 2>/dev/null
    ;;
  *)
    echo "[molecule-git-token-helper] unknown action: ${ACTION}" >&2
    exit 1
    ;;
esac