Files
molecule-ai-workspace-templ…/entrypoint.sh
hongming 91008a54f1
CI / Template validation (static) (push) Successful in 18s
publish-image / Resolve runtime version (push) Successful in 7s
Secret scan / Scan diff for credential-shaped strings (push) Successful in 7s
CI / Adapter unit tests (push) Successful in 22s
publish-image / Build & push workspace-template-claude-code image (push) Successful in 10m24s
CI / Template validation (runtime) (push) Successful in 10m31s
CI / T4 tier-4 conformance (live) (push) Successful in 9m57s
CI / validate (push) Successful in 2s
tier:high feat(restore): cp#326 Option D — rsync from secondary volume on first boot (#68)
Co-authored-by: hongming <hongmingwang@moleculesai.app>
Co-committed-by: hongming <hongmingwang@moleculesai.app>
2026-05-27 01:36:16 +00:00

317 lines
16 KiB
Bash

#!/bin/sh
# Drop privileges to the agent user before exec'ing molecule-runtime.
# claude-code refuses --dangerously-skip-permissions when running as
# root/sudo for safety. Without this entrypoint, every cron tick fails
# with `ProcessError: Command failed with exit code 1` and the agent
# logs `--dangerously-skip-permissions cannot be used with root/sudo
# privileges for security reasons`.
#
# Pattern matches the legacy monorepo workspace-template/entrypoint.sh:
# fix volume ownership as root, then re-exec via gosu as agent (uid 1000).
# Boot-context snapshot — emitted on EVERY container start, including
# every restart of a crash-loop. Lets `docker logs` answer "what env
# was actually present?" without having to docker exec into a dying
# container. Logs NAMES of auth-relevant env vars, never VALUES. Fires
# twice (once as root pre-gosu, once as agent post-gosu) so an operator
# can see whether a value was lost across the privilege drop.
# Keep the env-name list in sync with adapter.py's _AUTH_ENV_AUDIT —
# the same set of vendors should be audited from both sides.
log_boot_context() {
  # Print a boot-context snapshot to stdout: timestamp, identity, host,
  # the state of /configs and /workspace, and for each auth-relevant env
  # var whether it is set. Only variable NAMES are printed, never values.
  #
  # `local` is not strictly POSIX, but this file already relies on it in
  # restore_from_secondary_volume. Keeping `val` local matters: it
  # briefly holds a secret value and must not linger in the global scope.
  local var val configs_listing

  echo "----- entrypoint boot $(date -u +%Y-%m-%dT%H:%M:%SZ) -----"
  echo "uid=$(id -u) gid=$(id -g) user=$(id -un 2>/dev/null || echo unknown)"
  echo "hostname=$(hostname) workspace_id=${WORKSPACE_ID:-<unset>}"
  echo "platform_url=${PLATFORM_URL:-<unset>}"
  echo "configs_dir: $(ls -ld /configs 2>/dev/null || echo MISSING)"

  # Test ls's own exit status. The previous form,
  #   ls /configs | tr '\n' ' ' || echo MISSING
  # took the pipeline status from tr (always 0), so MISSING was never
  # printed when /configs was absent or unreadable.
  if configs_listing=$(ls /configs 2>/dev/null); then
    echo "configs_contents: $(printf '%s' "$configs_listing" | tr '\n' ' ')"
  else
    echo "configs_contents: MISSING"
  fi

  echo "workspace_dir: $(ls -ld /workspace 2>/dev/null || echo MISSING)"

  # Auth env presence (NAMES + set/unset only — never the values).
  # Mirror of _AUTH_ENV_AUDIT in adapter.py — keep in sync if you add a vendor.
  for var in CLAUDE_CODE_OAUTH_TOKEN ANTHROPIC_API_KEY ANTHROPIC_AUTH_TOKEN ANTHROPIC_BASE_URL MINIMAX_API_KEY GLM_API_KEY KIMI_API_KEY DEEPSEEK_API_KEY; do
    # POSIX sh has no ${!var}; eval is safe here because $var only ever
    # comes from the fixed literal list above.
    eval "val=\$$var"
    if [ -n "$val" ]; then
      echo "env $var=set"
    else
      echo "env $var=unset"
    fi
  done
  echo "------------------------------------------------"
}
log_boot_context
# ---------------------------------------------------------------------
# Restore-on-recreate from secondary EBS volume (cp#326 Option D).
#
# Contract with CP: when ProvisionWorkspace finds a non-expired backup
# snapshot for this WorkspaceID, it attaches the snapshot as a SECONDARY
# EBS volume at /dev/xvdb at launch (DeleteOnTermination=true). This
# function mounts that volume on first container boot and rsyncs the
# restore set (/configs, /workspace, /home/agent/.claude) from it back
# into the root filesystem, then drops a marker so subsequent container
# restarts (within the same EC2's lifetime) skip the restore.
#
# Why cp#326 needs this: AWS rejects ANY SnapshotId on the ROOT device
# at RunInstances time with "InvalidBlockDeviceMapping: snapshotId
# cannot be modified on root device". The cp#301 architecture (override
# the AMI's root snapshot) is impossible per AWS spec. Option D works
# WITH AWS's model — secondary volumes accept SnapshotId — and rsync
# bridges the data-plane gap.
#
# Operational contract:
# - Idempotent: a marker at /configs/.restore-completed gates re-runs.
#   If only the container restarts on the same EC2, the marker is still
#   present and the restore is skipped. The secondary volume is attached
#   with DeleteOnTermination=true (DOT), so it survives container
#   restarts but not EC2 termination. If the EC2 is terminated and
#   replaced, /configs is fresh and the marker is gone — the restore
#   runs on the new EC2.
# - Best-effort: any failure (volume absent, fs unreadable, rsync
# error) is LOGGED with MOLECULE-RESTORE: prefix but does NOT abort
# the boot. The workspace comes up with empty state — the explicit
# no-restore branch the user already accepts on first-time provision.
# - Read-only mount on the secondary at /mnt/restore so a defective
# filesystem can't corrupt our root.
# - All log lines prefixed `MOLECULE-RESTORE:` so `docker logs <id>
# 2>&1 | grep MOLECULE-RESTORE` is the operator's one-liner debug.
#
# Path allowlist (NOT a blanket /mnt/restore -> / rsync — that would
# also restore /etc/passwd, /var/lib/docker, etc. which are container-
# managed):
# - /configs/ (config.yaml, .auth_token, skills/, memory)
# - /workspace/ (the shared codebase + agent's working files)
# - /home/agent/.claude/ (Claude SDK session state, settings.json)
#
# If a future template adds another persistent path (e.g. /home/agent/.cache),
# add it to RESTORE_PATHS below AND ensure the corresponding source path
# exists in the snapshot. Keep the list narrow on purpose — the alternative
# (full / rsync with exclusions) trades blast-radius safety for convenience.
restore_from_secondary_volume() {
  # Best-effort restore of the allowlisted paths from the secondary
  # volume. Every exit path returns 0 so a failed restore never aborts
  # the boot. All output is prefixed `MOLECULE-RESTORE:`.
  local SECONDARY_DEV="/dev/xvdb"
  local MOUNT_POINT="/mnt/restore"
  local MARKER="/configs/.restore-completed"
  # Paths relative to both the snapshot root and the container root.
  local RESTORE_PATHS="configs workspace home/agent/.claude"
  local FSTYPE mount_out rel SRC DST rsync_out rsync_rc
  local rsync_failed=0

  # Marker present = restore already done for this EC2's lifetime.
  # Cheapest possible idempotency check; runs before any blockdev probe.
  if [ -f "$MARKER" ]; then
    echo "MOLECULE-RESTORE: marker $MARKER present — skipping (already restored on this EC2)"
    return 0
  fi

  # No secondary device = nothing to restore (first-time provision or
  # no backup snapshot existed). NOT an error.
  if [ ! -b "$SECONDARY_DEV" ]; then
    echo "MOLECULE-RESTORE: no $SECONDARY_DEV — first-time provision or no backup snapshot, skipping"
    return 0
  fi
  echo "MOLECULE-RESTORE: $SECONDARY_DEV detected — attempting restore"

  # Probe filesystem type. An empty result (blkid failed, blkid missing,
  # or no filesystem signature on the device) means we skip.
  # NOTE(review): blkid reports no TYPE for a whole-disk device that
  # carries a partition table. If the snapshot is of a partitioned root
  # volume, this branch will always skip — confirm the snapshot layout.
  FSTYPE=$(blkid -s TYPE -o value "$SECONDARY_DEV" 2>/dev/null || echo "")
  if [ -z "$FSTYPE" ]; then
    echo "MOLECULE-RESTORE: WARN no fs detected on $SECONDARY_DEV (raw/unformatted) — skipping"
    return 0
  fi
  echo "MOLECULE-RESTORE: $SECONDARY_DEV fstype=$FSTYPE"

  # Mount read-only so nothing in this script (including an rsync
  # mistake) can write to the source through the mount.
  # Caveat: per mount(8), `ro` alone does NOT stop ext3/ext4 from
  # replaying a dirty journal onto the device. That is presumably what
  # we want for a snapshot taken from a live volume; add `noload` only
  # if the device must stay bit-for-bit untouched.
  mkdir -p "$MOUNT_POINT"
  # Capture mount's output so it can be prefixed. mountpoint(1) below is
  # the authority on whether the mount succeeded.
  mount_out=$(mount -o ro "$SECONDARY_DEV" "$MOUNT_POINT" 2>&1)
  if [ -n "$mount_out" ]; then
    printf '%s\n' "$mount_out" | sed 's/^/MOLECULE-RESTORE: mount: /'
  fi
  if ! mountpoint -q "$MOUNT_POINT"; then
    echo "MOLECULE-RESTORE: WARN mount of $SECONDARY_DEV failed — skipping restore"
    return 0
  fi
  echo "MOLECULE-RESTORE: mounted $SECONDARY_DEV at $MOUNT_POINT (ro)"

  # rsync the allowlist. Flags actually passed:
  #   -a             perms/owner/times/symlinks/devices, recursive
  #   -H -A -X       hard links, ACLs, extended attributes
  #   --delete       restore is authoritative: a file absent from the
  #                  snapshot is removed from the destination
  #   --numeric-ids  keep uid/gid numbers as stored, no name mapping
  # -x (--one-file-system) is NOT passed; an earlier comment confused
  # it with -X.
  #
  # Source paths on the snapshot must match the prod root layout, so
  # /configs lives at $MOUNT_POINT/configs and so on.
  for rel in $RESTORE_PATHS; do
    SRC="$MOUNT_POINT/$rel"
    DST="/$rel"
    if [ ! -d "$SRC" ]; then
      echo "MOLECULE-RESTORE: source $SRC absent — skipping (likely the prior workspace never wrote it)"
      continue
    fi
    # Ensure the destination's parent exists before rsync creates the
    # destination directory itself.
    mkdir -p "$(dirname "$DST")"
    echo "MOLECULE-RESTORE: rsync $SRC/ -> $DST/"
    # Capture rsync's REAL exit code. `rsync ... | sed` would report
    # sed's status and /bin/sh has no PIPESTATUS, so capture the output
    # in a variable: the assignment's status is rsync's. This also
    # avoids writing a predictable /tmp file name as root.
    rsync_out=$(rsync -aHAX --delete --numeric-ids "$SRC/" "$DST/" 2>&1)
    rsync_rc=$?
    if [ -n "$rsync_out" ]; then
      printf '%s\n' "$rsync_out" | sed 's/^/MOLECULE-RESTORE: /'
    fi
    if [ "$rsync_rc" -eq 0 ]; then
      echo "MOLECULE-RESTORE: ok $DST"
    else
      echo "MOLECULE-RESTORE: WARN rsync to $DST exited $rsync_rc — workspace may be partially restored"
      rsync_failed=1
    fi
  done

  # The mount is deliberately left in place: it serves as operator audit
  # evidence, and unmounting would add an "is the volume actually
  # attached?" failure mode for no operational gain.
  #
  # Drop the marker even after partial rsync failures: the source is
  # unchanged, so an automatic re-run on every restart would not recover
  # anything. The operator sees the WARN and can rm the marker + restart
  # to retry.
  : > "$MARKER"
  if [ "$rsync_failed" -eq 0 ]; then
    echo "MOLECULE-RESTORE: complete — marker $MARKER dropped"
  else
    echo "MOLECULE-RESTORE: complete with WARNINGS — marker $MARKER dropped; rm marker + restart for retry"
  fi
  return 0
}
if [ "$(id -u)" = "0" ]; then
  # ---- Root stage: fix ownership, then re-exec this script as agent. ----

  # Restore-on-recreate runs FIRST, before any chown, so the ownership
  # fixes below are applied on top of whatever rsync restored (rsync
  # keeps the snapshot's numeric uids/gids). The chowns are also needed
  # for the no-restore case and for paths rsync did not touch.
  restore_from_secondary_volume

  # Configs volume is created by Docker as root; agent needs write access
  # for plugin installs, memory writes, .auth_token rotation, etc.
  #
  # T4 atomic-co-sequencing contract (RFC internal#456 §10): the T4
  # escalation leg (sudo NOPASSWD + docker group, baked in the
  # Dockerfile) is ADDITIVE. The agent still runs uid-1000 and
  # /configs/.auth_token MUST remain agent-owned — escalation must
  # NOT regress the Hermes list_peers-401 token-ownership class.
  # This chown -R is the agent-ownership half of that contract; the
  # Layer-3 conformance gate asserts owner_uid==1000 on the running
  # container alongside the host-root-reach assertion.
  chown -R agent:agent /configs 2>/dev/null

  # /workspace handling — the top-level dir is always chowned; the
  # recursive chown only runs when the first entry found is root-owned
  # (typical on Docker Desktop on Windows where host uid maps to 0).
  # On Linux Docker with matching uids the recursive chown is skipped
  # to keep startup fast.
  chown agent:agent /workspace 2>/dev/null || true
  if [ -d /workspace ]; then
    first_entry=$(find /workspace -mindepth 1 -maxdepth 1 -print -quit 2>/dev/null)
    if [ -n "$first_entry" ] && [ "$(stat -c '%u' "$first_entry" 2>/dev/null)" = "0" ]; then
      chown -R agent:agent /workspace 2>/dev/null
    fi
    # Pre-create /workspace/.molecule/chat-uploads so the upload handler
    # never has to mkdir as agent inside a root-owned tree (otherwise
    # the first upload after a fresh provision can fail with "failed to
    # prepare uploads dir"). Idempotent across restarts.
    mkdir -p /workspace/.molecule/chat-uploads 2>/dev/null || true
    chown -R agent:agent /workspace/.molecule 2>/dev/null || true
  fi

  # Claude Code state under /home/agent/.claude.
  #
  # The platform provisioner mounts the session directory at the
  # hardcoded path /root/.claude/sessions; it is symlinked into agent's
  # home below so the SDK finds it when running as agent.
  #
  # T4 perms regression: subdirs/files here can end up root-owned across
  # boots, which makes the uid-1000 agent EPERM on settings/session
  # writes. So the well-known subdirs are created idempotently and the
  # chown runs unconditionally.
  #
  # The sessions directory is only created as a REAL directory when
  # there is no platform mount to link to. Creating it unconditionally
  # broke the symlink step: `ln -sfn TARGET DIR` with DIR an existing
  # real directory creates DIR/sessions inside it instead of replacing
  # DIR (-n only applies when the link name is already a symlink).
  mkdir -p /home/agent/.claude/session-env
  if [ ! -d /root/.claude/sessions ]; then
    mkdir -p /home/agent/.claude/sessions
  fi

  # Stub settings.json so `cat ~/.claude/settings.json` succeeds and
  # shows the operating mode; the mode itself is set programmatically
  # by claude_sdk_executor.py. An existing file is never overwritten.
  if [ ! -f /home/agent/.claude/settings.json ]; then
    cat > /home/agent/.claude/settings.json <<'EOF'
{
"permissions": {"defaultMode": "bypassPermissions"},
"_note": "Mode is also set programmatically by claude_sdk_executor.py (permission_mode='bypassPermissions'); this file is informational and lets `cat ~/.claude/settings.json` succeed."
}
EOF
  fi
  chown -R agent:agent /home/agent/.claude 2>/dev/null

  if [ -d /root/.claude/sessions ]; then
    chown -R agent:agent /root/.claude 2>/dev/null
    # An empty real directory left by an earlier boot is replaced by the
    # symlink. rmdir refuses non-empty directories, so session data is
    # never deleted here.
    if [ -d /home/agent/.claude/sessions ] && [ ! -L /home/agent/.claude/sessions ]; then
      rmdir /home/agent/.claude/sessions 2>/dev/null || true
    fi
    if [ -L /home/agent/.claude/sessions ] || [ ! -e /home/agent/.claude/sessions ]; then
      # NOTE(review): the agent user must be able to traverse /root for
      # this link to resolve — confirm the image makes /root searchable.
      ln -sfn /root/.claude/sessions /home/agent/.claude/sessions
    else
      echo "entrypoint: WARN /home/agent/.claude/sessions is a non-empty directory — leaving it in place, not linking to /root/.claude/sessions"
    fi
  fi

  # Optional GitHub mirror credential helper setup.
  # GitHub is mirror-only for Molecule; keep this disabled unless an
  # operator explicitly opts a workspace into mirror credentials.
  if [ "${ENABLE_GITHUB_MIRROR_CREDENTIALS:-false}" = "true" ] && [ -x /app/scripts/molecule-git-token-helper.sh ]; then
    git config --global "credential.https://github.com.helper" \
      "!/app/scripts/molecule-git-token-helper.sh"
    git config --global "credential.https://github.com.useHttpPath" true
    # --global wrote to root's gitconfig; copy it so the agent user
    # gets the same credential helper configuration.
    if [ -f /root/.gitconfig ]; then
      cp /root/.gitconfig /home/agent/.gitconfig
      chown agent:agent /home/agent/.gitconfig
    fi
  fi

  # Agent-owned scratch directories; the token cache is private (0700).
  mkdir -p /home/agent/.molecule-token-cache
  chown agent:agent /home/agent/.molecule-token-cache
  chmod 700 /home/agent/.molecule-token-cache
  mkdir -p /home/agent/Downloads
  chown agent:agent /home/agent/Downloads

  # Re-run this same script as agent; the uid check above is then false
  # and execution continues below the root stage.
  exec gosu agent "$0" "$@"
fi
# ---- Agent stage: reached only when not running as uid 0. ----

# Optional background token refresh daemon for GitHub mirror credentials.
# Started only when the operator opted in AND the refresh script is
# executable. The loop reruns the script 30s after each exit; all of its
# output is redirected to a log file in agent's home.
gh_refresh_script=/app/scripts/molecule-gh-token-refresh.sh
if [ "${ENABLE_GITHUB_MIRROR_CREDENTIALS:-false}" = "true" ]; then
  if [ -x "$gh_refresh_script" ]; then
    nohup bash -c '
while true; do
/app/scripts/molecule-gh-token-refresh.sh
rc=$?
echo "[molecule-gh-token-refresh] daemon exited rc=$rc — respawning in 30s" >&2
sleep 30
done
' > /home/agent/.gh-token-refresh.log 2>&1 &
  fi
fi

# Third-party provider routing is handled by adapter.py at boot: it reads
# the `providers:` registry from /configs/config.yaml and sets
# ANTHROPIC_BASE_URL based on the picked MODEL. Adding a new provider is
# a one-line YAML edit (see config.yaml's `providers:` section).
# An operator-set ANTHROPIC_BASE_URL still wins as the escape hatch for
# regional endpoints (e.g. Xiaomi's token-plan-sgp.*, MiniMax's
# api.minimaxi.com China endpoint).
#
# Replace this shell with the runtime, forwarding the container arguments.
exec molecule-runtime "$@"