Merge pull request #44 from Molecule-AI/feat/inline-credential-helper

feat: ship GitHub credential-helper inline in runtime (fixes #1933 class)
This commit is contained in:
Hongming Wang 2026-04-24 00:42:32 -07:00 committed by GitHub
commit f334872d56
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 530 additions and 2 deletions

View File

@ -0,0 +1,214 @@
"""Inline GitHub credential-helper installer.
Lifts the per-template wiring (Dockerfile COPY + entrypoint.sh git config +
nohup daemon launch) into the Python runtime. Templates that depend on
``molecule-ai-workspace-runtime`` get the behavior automatically — they no
longer need to maintain their own copy of the helper scripts or remember to
write the right git config in their entrypoint.
Background — fix for the #1933 cascade
======================================
GitHub App installation tokens (``ghs_``) expire ~60 min after issue.
Workspaces inject the token at provision time as ``GH_TOKEN`` /
``GITHUB_TOKEN`` env vars; once the container has been alive >60 min,
every git push and gh CLI call returns 401. The platform exposes
``GET /admin/github-installation-token`` for live refresh, but the
workspace side has to (a) install a credential helper that hits that
endpoint, (b) configure git to call it, and (c) run a periodic refresh
daemon to keep ``gh auth login --with-token`` warm.
Before this module the wiring lived in each template's ``entrypoint.sh``
+ ``Dockerfile``. The ``claude-code-default`` template shipped without
it (cycle 62-66 incident: 39 workspaces lost their tokens, three PMs'
A2A queues filled with retry-status messages, manual fleet restart
required). Now any template that ``pip install molecule-ai-workspace-runtime``
+ calls :func:`install_credential_helper` early in startup gets the
behavior — the bug becomes structurally impossible.
What it does
============
On import / call:
1. Extracts the bundled helper scripts from package data to
``~/.molecule-runtime/scripts/`` (writable by agent user).
2. ``git config --global credential.https://github.com.helper`` the
extracted helper script. Idempotent.
3. Creates ``~/.molecule-token-cache/`` with mode 0700 (helper writes
token cache files there).
4. Spawns the refresh daemon as a detached subprocess under a respawn
loop. PID written to ``~/.molecule-runtime/refresh-daemon.pid`` so a
restart of the runtime can detect + skip if already alive.
5. Runs initial ``gh auth login --with-token`` using whatever ``GH_TOKEN``
env was injected at provision so commands work in the ~60s window
before the daemon's first refresh fires.
Failures fail-soft (log + continue). The runtime starting is more
important than the credential helper being perfect — without it agents
still work for the first ~50 minutes, which is enough for the operator
to notice a log warning and restart.
"""
from __future__ import annotations
import logging
import os
import shutil
import stat
import subprocess
from importlib import resources
from pathlib import Path
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Everything this module writes lives under the agent user's HOME so no
# root access is needed. Templates that mount /tmp as tmpfs are fine —
# this is per-process, not per-container, scope.
_HOME = Path(os.environ.get("HOME", "/home/agent"))
_RUNTIME_DIR = _HOME / ".molecule-runtime"

# Directory the bundled helper scripts are extracted into.
_INSTALL_DIR = _RUNTIME_DIR / "scripts"
# Directory created (mode 0700) for the helper's token cache files.
_TOKEN_CACHE_DIR = _HOME / ".molecule-token-cache"
# PID and log files of the detached refresh-daemon wrapper.
_DAEMON_PID_FILE = _RUNTIME_DIR / "refresh-daemon.pid"
_DAEMON_LOG_FILE = _RUNTIME_DIR / "refresh-daemon.log"

# File names of the two bundled shell scripts.
_HELPER_SCRIPT = "molecule-git-token-helper.sh"
_DAEMON_SCRIPT = "molecule-gh-token-refresh.sh"
def _extract_scripts() -> Path:
    """Copy the bundled ``.sh`` files from package data to a writable dir.

    Idempotent: a script whose on-disk bytes already match the packaged
    bytes is not rewritten. The executable bits are (re)applied on every
    call, so a copy that has the right content but lost its ``+x`` bit
    (e.g. a previous run died between the write and the chmod) is repaired
    instead of being skipped forever.

    Returns:
        The install directory containing the extracted scripts.

    Raises:
        OSError: the install dir or a script could not be written.
        ModuleNotFoundError: the ``molecule_runtime`` package is not importable.
    """
    _INSTALL_DIR.mkdir(parents=True, exist_ok=True)
    exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH

    # importlib.resources.files() returns a Traversable that handles both
    # zipped wheels and editable installs. Iterate the bundled scripts/
    # subdir of this package.
    pkg_scripts = resources.files("molecule_runtime").joinpath("scripts")
    for entry in pkg_scripts.iterdir():
        if not entry.name.endswith(".sh"):
            continue
        target = _INSTALL_DIR / entry.name
        # Read source bytes via the Traversable interface (works for zips).
        src_bytes = entry.read_bytes()
        if not (target.exists() and target.read_bytes() == src_bytes):
            target.write_bytes(src_bytes)
        # chmod +x so the script can be exec'd directly (git invokes the
        # helper by path). Only touch the mode when a bit is missing.
        mode = target.stat().st_mode
        if mode & exec_bits != exec_bits:
            target.chmod(mode | exec_bits)
    return _INSTALL_DIR
def _configure_git_credential_helper(helper_path: Path) -> None:
    """Point git's credential helper for github.com at the extracted script.

    Writes two ``--global`` git config entries:

    * ``credential.https://github.com.helper`` — ``!<helper_path>``. The
      leading ``!`` tells git the value is a shell command, not a builtin
      helper name. Because git hands that string to a shell, the path is
      shell-quoted so a HOME containing spaces or metacharacters still
      resolves to the script (ordinary paths are left unchanged by quoting).
    * ``credential.https://github.com.useHttpPath`` — ``true`` so the
      credential lookup includes the repo path; relevant if a workspace ever
      fetches multiple repos under different scopes.

    Args:
        helper_path: Location of the extracted helper script.

    Raises:
        subprocess.CalledProcessError: a ``git config`` call exited non-zero.
    """
    import shlex  # local import: only needed here

    settings = (
        ("credential.https://github.com.helper", f"!{shlex.quote(str(helper_path))}"),
        ("credential.https://github.com.useHttpPath", "true"),
    )
    for key, value in settings:
        subprocess.run(
            ["git", "config", "--global", key, value],
            check=True, capture_output=True,
        )
def _pid_runs_daemon(pid: int, daemon_path: Path) -> bool:
    """Return whether live process *pid* looks like our daemon wrapper.

    Guards against PID reuse: the PID file can outlive the process it names,
    and the same number may later belong to an unrelated process. The
    wrapper's command line always contains the daemon script path, so its
    file name is looked up in ``/proc/<pid>/cmdline``. When that file cannot
    be read (e.g. no procfs), the signal-0 probe done by the caller is
    trusted as-is.
    """
    try:
        cmdline = Path(f"/proc/{pid}/cmdline").read_bytes()
    except OSError:
        return True
    return os.fsencode(daemon_path.name) in cmdline


def _start_refresh_daemon(daemon_path: Path) -> None:
    """Spawn the refresh daemon as a detached child if not already running.

    A previous daemon is considered alive when the PID file holds a positive
    PID that answers signal 0 and (where /proc is readable) whose command
    line mentions the daemon script. Otherwise the PID file is treated as
    stale and a new wrapper is started and recorded.

    Args:
        daemon_path: Location of the extracted refresh-daemon script.

    Raises:
        OSError: the runtime dir, log file, PID file, or the ``bash``
            process could not be created.
    """
    try:
        old_pid = int(_DAEMON_PID_FILE.read_text().strip())
        if old_pid <= 0:
            # kill(0, ...) / kill(-n, ...) address process groups, not a PID.
            raise ValueError(f"implausible pid {old_pid}")
        os.kill(old_pid, 0)  # signal 0 = check existence, no actual signal
    except (ValueError, OverflowError, OSError):
        # No PID file, garbled contents, or process gone / not ours.
        # Fall through to respawn.
        pass
    else:
        if _pid_runs_daemon(old_pid, daemon_path):
            log.info("credential_helper: refresh daemon already running pid=%d", old_pid)
            return

    _DAEMON_PID_FILE.parent.mkdir(parents=True, exist_ok=True)

    # Wrap the daemon in a respawn loop so a single crash doesn't leave
    # the workspace stuck on an expired token (which is exactly how #1933
    # was discovered). The script path is passed as "$0" instead of being
    # spliced into the shell text, so it needs no quoting.
    wrapper = (
        'while true; do '
        'bash "$0"; '
        'echo "[molecule-gh-token-refresh] daemon exited rc=$? — respawning in 30s" >&2; '
        'sleep 30; '
        'done'
    )
    # The child gets its own copy of the log descriptor; closing ours on
    # exit from the `with` avoids leaking a handle in the runtime process.
    with open(_DAEMON_LOG_FILE, "ab") as log_handle:
        proc = subprocess.Popen(
            ["bash", "-c", wrapper, str(daemon_path)],
            stdout=log_handle, stderr=log_handle,
            # Detach: new session so the daemon survives the runtime exiting.
            start_new_session=True,
            env={**os.environ, "TOKEN_HELPER_SCRIPT": str(_INSTALL_DIR / _HELPER_SCRIPT)},
        )
    _DAEMON_PID_FILE.write_text(str(proc.pid))
    log.info("credential_helper: refresh daemon spawned pid=%d", proc.pid)
def _initial_gh_auth() -> None:
    """Prime gh CLI with the provision-time token so commands work immediately.

    Reads the token from ``GITHUB_TOKEN`` (preferred) or ``GH_TOKEN`` and
    feeds it to ``gh auth login --with-token`` on stdin. Skips quietly when
    no token is set or ``gh`` is not on PATH. Never raises: any failure to
    run ``gh`` is logged as a warning.

    ``gh auth login`` declines to store credentials while ``GH_TOKEN`` /
    ``GITHUB_TOKEN`` is present in its own environment, so the child process
    is started with both variables removed. The runtime's own environment is
    left untouched.
    """
    token = os.environ.get("GITHUB_TOKEN") or os.environ.get("GH_TOKEN")
    if not token:
        log.info("credential_helper: no GH_TOKEN/GITHUB_TOKEN at startup — skip initial gh auth")
        return
    if not shutil.which("gh"):
        log.info("credential_helper: no gh CLI on PATH — skip initial gh auth (workspace will rely on git credential helper only)")
        return

    gh_env = {
        key: value
        for key, value in os.environ.items()
        if key not in ("GH_TOKEN", "GITHUB_TOKEN")
    }
    try:
        subprocess.run(
            ["gh", "auth", "login", "--hostname", "github.com", "--with-token"],
            input=token, text=True,
            check=True, capture_output=True, timeout=10,
            env=gh_env,
        )
    except (subprocess.SubprocessError, OSError) as exc:
        # Non-fatal — refresh daemon will retry within ~60s. OSError covers
        # gh disappearing / not being executable between which() and exec.
        log.warning("credential_helper: initial gh auth login failed (non-fatal): %s", exc)
    else:
        log.info("credential_helper: initial gh auth login succeeded")
def install_credential_helper() -> None:
    """Install, configure and start the GitHub credential-helper machinery.

    Runs, in order: script extraction, token-cache directory creation,
    git credential-helper configuration, refresh-daemon launch, and the
    initial ``gh`` login. Only a failed extraction aborts the sequence;
    the later steps log their handled errors and let the next step run.

    Meant to be called once, early in workspace runtime startup, before
    anything invokes git or gh. Calling it again is safe.
    """
    # Without the scripts on disk none of the later steps can work.
    try:
        scripts_dir = _extract_scripts()
    except (OSError, ModuleNotFoundError) as err:
        log.warning("credential_helper: cannot extract scripts (%s) — skipping all setup", err)
        return

    # Private cache directory for the helper's token files.
    try:
        _TOKEN_CACHE_DIR.mkdir(parents=True, exist_ok=True)
        _TOKEN_CACHE_DIR.chmod(0o700)
    except OSError as err:
        log.warning("credential_helper: cannot create token cache dir (%s) — helper will fail-open to env", err)

    # Wire git to the helper, if git is installed at all.
    if not shutil.which("git"):
        log.info("credential_helper: no git binary on PATH — skipping git config")
    else:
        try:
            _configure_git_credential_helper(scripts_dir / _HELPER_SCRIPT)
        except subprocess.SubprocessError as err:
            log.warning("credential_helper: git config failed (%s) — git will use env-based auth only", err)

    # Background refresh loop.
    try:
        _start_refresh_daemon(scripts_dir / _DAEMON_SCRIPT)
    except OSError as err:
        log.warning("credential_helper: refresh daemon failed to start (%s) — token will go stale after ~60min", err)

    # Make gh usable before the daemon's first refresh.
    _initial_gh_auth()

View File

@ -74,6 +74,16 @@ async def main(): # pragma: no cover
from molecule_runtime.llm_auth import normalise_llm_env
print(normalise_llm_env().summary())
# 0.1 GitHub credential helper installer — extracts bundled .sh scripts,
# configures git, starts refresh daemon, primes gh CLI. Eliminates the
# per-template wiring that caused #1933 (claude-code-default template
# shipped without the wiring; 39 workspaces lost their tokens after the
# ~60min installation-token TTL). Fails-soft so a missing git/gh binary
# doesn't block runtime startup. See credential_helper.py for the full
# rationale.
from molecule_runtime.credential_helper import install_credential_helper
install_credential_helper()
# 0.5 Initialise OpenTelemetry (no-op if packages not installed)
setup_telemetry(service_name=workspace_id)

View File

@ -0,0 +1,57 @@
#!/bin/bash
# molecule-gh-token-refresh.sh — background daemon that keeps GitHub
# credentials fresh inside Molecule AI workspace containers.
#
# Started under a respawn wrapper (by the runtime's credential_helper.py,
# or by a template's entrypoint.sh). Every REFRESH_INTERVAL_SEC plus jitter
# (default 45 min + 0..2 min) it calls the credential helper's _refresh_gh
# action.
#
# # Jitter
# A 0..120s random offset prevents 39 containers from synchronizing
# their refresh requests against /workspaces/:id/github-installation-token.
#
# # Security
# - This daemon NEVER prints token values. Failures log the helper's
# exit code only, not its stderr, so token bytes can't leak via the
# docker log pipeline.
# - The helper script is responsible for chmod 600 on cache files.
#
# -u: unset variables are errors; pipefail: a pipeline fails if any stage
# fails. No -e — a failed refresh must not terminate the daemon.
set -uo pipefail

# Tunables, each overridable through the environment.
# Path of the credential helper script to invoke.
HELPER_SCRIPT=${TOKEN_HELPER_SCRIPT:-/app/scripts/molecule-git-token-helper.sh}
# Base delay between periodic refreshes (2700 s = 45 min).
REFRESH_INTERVAL_SEC=${TOKEN_REFRESH_INTERVAL_SEC:-2700}
# Upper bound (inclusive) of the random delay added to each interval.
JITTER_MAX_SEC=${TOKEN_REFRESH_JITTER_SEC:-120}
# Delay before the very first refresh.
INITIAL_DELAY_SEC=${TOKEN_REFRESH_INITIAL_DELAY_SEC:-60}
# log MESSAGE... — write a tagged, UTC-timestamped line to stderr.
log() {
    local stamp
    stamp=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
    echo "[molecule-gh-token-refresh] ${stamp} $*" >&2
}
# jittered_sleep BASE_SEC — sleep BASE_SEC plus a random 0..JITTER_MAX_SEC
# seconds, so many containers don't refresh in lockstep.
jittered_sleep() {
    local base_sec="$1"
    local extra_sec=$(( RANDOM % (JITTER_MAX_SEC + 1) ))
    sleep $(( base_sec + extra_sec ))
}
# run_refresh ANNOUNCE LABEL — log ANNOUNCE, invoke the helper's _refresh_gh
# action once, then log "LABEL succeeded" or "LABEL failed (rc=…)".
# Helper output is discarded to /dev/null so a token can't leak via docker
# logs; only the exit code is reported.
run_refresh() {
    local announce="$1"
    local label="$2"
    local rc
    log "${announce}"
    if bash "${HELPER_SCRIPT}" _refresh_gh >/dev/null 2>&1; then
        log "${label} succeeded"
    else
        rc=$?
        log "${label} failed (rc=${rc}) — will retry in ~${REFRESH_INTERVAL_SEC}s"
    fi
}

log "starting (interval=${REFRESH_INTERVAL_SEC}s ± ${JITTER_MAX_SEC}s, initial_delay=${INITIAL_DELAY_SEC}s)"
sleep "${INITIAL_DELAY_SEC}"

# First refresh — primes the cache + gh auth once the initial delay is over.
run_refresh "initial token refresh" "initial refresh"

# Steady state: wait one jittered interval, refresh, repeat forever.
while true; do
    jittered_sleep "${REFRESH_INTERVAL_SEC}"
    run_refresh "periodic token refresh" "refresh"
done

View File

@ -0,0 +1,247 @@
#!/bin/bash
# molecule-git-token-helper.sh — git credential helper for GitHub App tokens
#
# Fetches a fresh GitHub App installation token from the Molecule AI
# platform endpoint and caches it locally (~50 min), so workspace
# containers never use an expired GH_TOKEN after the ~60 min GitHub App
# token TTL. The cache avoids hitting the platform API on every git
# operation (push/fetch/clone).
#
# # Setup (called once at container boot by entrypoint.sh)
#
# git config --global \
# "credential.https://github.com.helper" \
# "!/app/scripts/molecule-git-token-helper.sh"
#
# # How git calls this helper
#
# git passes the action as the first positional arg. The protocol is:
# get → output credentials on stdout (we handle this)
# store → persist credentials (no-op — we never cache via git)
# erase → revoke credentials (no-op — platform manages lifecycle)
#
# On `get`, git reads key=value pairs terminated by an empty line.
# We must emit at minimum:
# username=x-access-token
# password=<token>
# (blank line)
#
# # Auth
#
# The platform endpoint requires a valid workspace bearer token. The
# token is stored at ${CONFIGS_DIR}/.auth_token (written by platform_auth.py
# on first /registry/register). Workspace env var PLATFORM_URL defaults
# to http://host.docker.internal:8080.
#
# # Caching
#
# Tokens are cached at ${CACHE_DIR}/gh_installation_token with a
# companion ${CACHE_DIR}/gh_installation_token_expiry file containing
# the epoch-seconds expiry. Cache TTL is ~50 min (TOKEN_CACHE_TTL_SEC).
# If the cache is fresh, we return immediately without calling the API.
#
# # Fallback chain
#
# 1. Return cached token if not expired.
# 2. Fetch fresh token from platform API.
# 3. If platform is unreachable, fall back to GITHUB_TOKEN / GH_TOKEN
# env var (set at container start, valid for up to 60 min).
# 4. If all fail, exit 1 so git falls through to the next credential
# helper in the chain (if any).
#
# # gh CLI integration
#
# Use the _refresh_gh action to atomically refresh both the cache and
# gh CLI auth:
#
# bash /app/scripts/molecule-git-token-helper.sh _refresh_gh
#
# This is called by molecule-gh-token-refresh.sh (the background daemon)
# every 45 min.
#
set -euo pipefail

# Platform base URL and the directory holding the workspace bearer token;
# both may be overridden through the environment.
: "${PLATFORM_URL:=http://host.docker.internal:8080}"
: "${CONFIGS_DIR:=/configs}"
TOKEN_FILE="${CONFIGS_DIR}/.auth_token"

# Cache location — writable by agent user. HOME falls back to /home/agent
# when unset or empty.
: "${HOME:=/home/agent}"
CACHE_DIR="${HOME}/.molecule-token-cache"
CACHE_TOKEN_FILE="${CACHE_DIR}/gh_installation_token"
CACHE_EXPIRY_FILE="${CACHE_DIR}/gh_installation_token_expiry"

# Cache lifetime: 50 min = 3000 sec. Installation tokens last ~60 min;
# 50 min gives a 10-min safety margin for clock skew + in-flight ops.
TOKEN_CACHE_TTL_SEC=3000

# #1068: use workspace-scoped path (WorkspaceAuth) instead of admin path
# (AdminAuth rejects workspace bearer tokens since PR #729). The admin
# path is only used when WORKSPACE_ID is not set.
: "${WORKSPACE_ID:=}"
ENDPOINT="${PLATFORM_URL}/admin/github-installation-token"
if [ -n "${WORKSPACE_ID}" ]; then
    ENDPOINT="${PLATFORM_URL}/workspaces/${WORKSPACE_ID}/github-installation-token"
fi
# _now_epoch — portable epoch-seconds (works on both GNU and BusyBox date).
# Prints the current time as seconds since the Unix epoch.
_now_epoch() {
    date '+%s'
}
# _read_cache — output cached token if still valid; return 1 if stale/missing.
# Prints the cached token on stdout and returns 0 when the cache is usable.
# Returns 1 when either cache file is missing, the expiry is empty or not a
# plain non-negative integer, the expiry has passed, or the token is empty.
_read_cache() {
    local expiry now token
    if [ ! -f "${CACHE_TOKEN_FILE}" ] || [ ! -f "${CACHE_EXPIRY_FILE}" ]; then
        return 1
    fi
    expiry=$(cat "${CACHE_EXPIRY_FILE}" 2>/dev/null | tr -d '[:space:]')
    # Reject empty or non-numeric expiry. Without this check a corrupted
    # expiry file makes the `-ge` test below error out, which reads as
    # "not expired" and would serve the stale token indefinitely.
    case "${expiry}" in
        ''|*[!0-9]*) return 1 ;;
    esac
    now=$(_now_epoch)
    if [ "${now}" -ge "${expiry}" ]; then
        return 1
    fi
    token=$(cat "${CACHE_TOKEN_FILE}" 2>/dev/null | tr -d '[:space:]')
    if [ -z "${token}" ]; then
        return 1
    fi
    echo "${token}"
    return 0
}
# _write_cache — atomically persist token + expiry.
# _write_cache TOKEN — persist TOKEN plus its expiry (now + TOKEN_CACHE_TTL_SEC).
# Each file is written to a .tmp sibling and renamed into place so readers
# never see a partially written file.
_write_cache() {
    local token="$1"
    local now expiry
    mkdir -p "${CACHE_DIR}"
    chmod 700 "${CACHE_DIR}" 2>/dev/null || true
    now=$(_now_epoch)
    expiry=$((now + TOKEN_CACHE_TTL_SEC))
    # Create the temp files under a restrictive umask (scoped to a subshell)
    # so the token is never on disk with wider permissions, even if the
    # chmod of the directory above did not take effect.
    (
        umask 077
        printf '%s' "${token}" > "${CACHE_TOKEN_FILE}.tmp"
        printf '%s' "${expiry}" > "${CACHE_EXPIRY_FILE}.tmp"
    )
    # Token first, expiry second: if interrupted in between, the new token
    # is paired with the OLD (earlier) expiry, which errs toward refetching.
    mv -f "${CACHE_TOKEN_FILE}.tmp" "${CACHE_TOKEN_FILE}"
    mv -f "${CACHE_EXPIRY_FILE}.tmp" "${CACHE_EXPIRY_FILE}"
    chmod 600 "${CACHE_TOKEN_FILE}" "${CACHE_EXPIRY_FILE}" 2>/dev/null || true
}
# _fetch_token_from_api — hit the platform endpoint.
# Outputs the raw token string on success; returns non-zero on failure.
# Requests a token from ${ENDPOINT}, authenticating with the bearer token
# stored in ${TOKEN_FILE}.
# Prints the raw token string on stdout and returns 0 on success. Returns 1
# (with a diagnostic on stderr) when the bearer file is missing or empty,
# the HTTP request fails, or no "token" value is found in the response.
_fetch_token_from_api() {
    local bearer response token
    local _err_file _curl_rc _err_msg
    if [ ! -f "${TOKEN_FILE}" ]; then
        echo "[molecule-git-token-helper] .auth_token not found at ${TOKEN_FILE}" >&2
        return 1
    fi
    bearer=$(cat "${TOKEN_FILE}" | tr -d '[:space:]')
    if [ -z "${bearer}" ]; then
        echo "[molecule-git-token-helper] .auth_token is empty" >&2
        return 1
    fi
    # NOTE: capture stderr to a tmp file (NOT $response) so the response
    # body — which contains the token on success — never lands in error
    # log lines via $response interpolation.
    _err_file=$(mktemp)
    # -S re-enables curl's error message under -s; without it the captured
    # stderr is always empty and the diagnostic below carries no detail.
    response=$(curl -sSf \
        -H "Authorization: Bearer ${bearer}" \
        -H "Accept: application/json" \
        --max-time 10 \
        "${ENDPOINT}" 2>"${_err_file}") || {
        _curl_rc=$?
        _err_msg=$(cat "${_err_file}")
        rm -f "${_err_file}"
        echo "[molecule-git-token-helper] platform request failed (curl rc=${_curl_rc}): ${_err_msg}" >&2
        return 1
    }
    rm -f "${_err_file}"
    # Parse {"token":"ghs_...","expires_at":"..."} with sed (no jq dependency).
    # Whitespace around the colon is tolerated so a differently formatted
    # (e.g. pretty-printed) response still parses.
    token=$(printf '%s\n' "${response}" \
        | sed -n 's/.*"token"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p')
    if [ -z "${token}" ]; then
        # SECURITY: the response body MAY contain a token under a different
        # JSON key name. Never include $response in this error message —
        # log only the size as a coarse debugging signal.
        echo "[molecule-git-token-helper] empty token in platform response (body=${#response} bytes)" >&2
        return 1
    fi
    echo "${token}"
}
# _fetch_token — return a fresh token using cache > API > env fallback chain.
# Outputs the raw token string on success; exits non-zero if all sources fail.
# Resolves a token by trying, in order: the local cache, the platform API,
# then the GITHUB_TOKEN / GH_TOKEN environment variables.
# Prints the raw token on stdout and returns 0 on the first source that
# yields one; returns 1 when every source comes up empty.
_fetch_token() {
    # Source 1: cache.
    if cached=$(_read_cache); then
        echo "${cached}"
        return 0
    fi

    # Source 2: platform API (its diagnostics are suppressed here). A
    # freshly fetched token is written back to the cache.
    if api_token=$(_fetch_token_from_api 2>/dev/null); then
        _write_cache "${api_token}"
        echo "${api_token}"
        return 0
    fi

    # Source 3: provision-time env var — may be stale, but better than
    # nothing for the first ~60 min of container life.
    env_token="${GITHUB_TOKEN:-${GH_TOKEN:-}}"
    if [ -z "${env_token}" ]; then
        echo "[molecule-git-token-helper] all token sources exhausted" >&2
        return 1
    fi
    echo "[molecule-git-token-helper] API unreachable, falling back to env GITHUB_TOKEN" >&2
    echo "${env_token}"
    return 0
}
# Action dispatch. The first positional argument selects the action and
# defaults to `get`.
ACTION="${1:-get}"
case "${ACTION}" in
    get)
        token=$(_fetch_token) || exit 1
        # Emit git credential protocol response.
        printf 'username=x-access-token\n'
        printf 'password=%s\n' "${token}"
        printf '\n'
        ;;
    store|erase)
        # No-op — the platform manages token lifecycle.
        ;;
    _fetch_token)
        # Return raw token (cache > API > env fallback).
        _fetch_token
        ;;
    _refresh_gh)
        # Refresh cache AND update gh CLI auth in one shot.
        # Called by molecule-gh-token-refresh.sh background daemon.
        # Force-bypass cache to get a definitely fresh token.
        api_token=$(_fetch_token_from_api) || {
            echo "[molecule-git-token-helper] _refresh_gh: API fetch failed" >&2
            exit 1
        }
        # Best-effort: an unwritable cache dir must not stop the gh refresh
        # below (under `set -e` a failing write would otherwise exit here).
        _write_cache "${api_token}" || true
        # Update gh CLI auth — gh auth login reads token from stdin.
        # gh declines to store credentials while GH_TOKEN / GITHUB_TOKEN is
        # set in its environment, so both are cleared in a subshell for this
        # one call; the caller's environment is untouched.
        printf '%s\n' "${api_token}" | (
            unset GH_TOKEN GITHUB_TOKEN
            gh auth login --hostname github.com --with-token 2>/dev/null
        ) || {
            echo "[molecule-git-token-helper] _refresh_gh: gh auth login failed (non-fatal)" >&2
        }
        # Also update GH_TOKEN file for scripts that source it. The temp
        # file is created under umask 077 so the token is never readable by
        # other users, even before the chmod below.
        gh_token_file="${HOME}/.gh_token"
        (
            umask 077
            printf '%s' "${api_token}" > "${gh_token_file}.tmp"
        )
        mv -f "${gh_token_file}.tmp" "${gh_token_file}"
        chmod 600 "${gh_token_file}" 2>/dev/null || true
        echo "[molecule-git-token-helper] _refresh_gh: token refreshed successfully" >&2
        ;;
    _invalidate_cache)
        # Force next call to hit the API (useful after a 401).
        rm -f "${CACHE_TOKEN_FILE}" "${CACHE_EXPIRY_FILE}" 2>/dev/null
        ;;
    *)
        echo "[molecule-git-token-helper] unknown action: ${ACTION}" >&2
        exit 1
        ;;
esac

View File

@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "molecule-ai-workspace-runtime"
version = "0.1.9"
version = "0.1.10"
description = "Molecule AI workspace runtime — shared infrastructure for all agent adapters"
requires-python = ">=3.11"
@ -36,4 +36,4 @@ where = ["."]
include = ["molecule_runtime*"]
[tool.setuptools.package-data]
"molecule_runtime" = ["py.typed"]
"molecule_runtime" = ["py.typed", "scripts/*.sh"]