91008a54f1
CI / Template validation (static) (push) Successful in 18s
publish-image / Resolve runtime version (push) Successful in 7s
Secret scan / Scan diff for credential-shaped strings (push) Successful in 7s
CI / Adapter unit tests (push) Successful in 22s
publish-image / Build & push workspace-template-claude-code image (push) Successful in 10m24s
CI / Template validation (runtime) (push) Successful in 10m31s
CI / T4 tier-4 conformance (live) (push) Successful in 9m57s
CI / validate (push) Successful in 2s
Co-authored-by: hongming <hongmingwang@moleculesai.app> Co-committed-by: hongming <hongmingwang@moleculesai.app>
317 lines
16 KiB
Bash
317 lines
16 KiB
Bash
#!/bin/sh
|
|
# Drop privileges to the agent user before exec'ing molecule-runtime.
|
|
# claude-code refuses --dangerously-skip-permissions when running as
|
|
# root/sudo for safety. Without this entrypoint, every cron tick fails
|
|
# with `ProcessError: Command failed with exit code 1` and the agent
|
|
# logs `--dangerously-skip-permissions cannot be used with root/sudo
|
|
# privileges for security reasons`.
|
|
#
|
|
# Pattern matches the legacy monorepo workspace-template/entrypoint.sh:
|
|
# fix volume ownership as root, then re-exec via gosu as agent (uid 1000).
|
|
|
|
# Boot-context snapshot — emitted on EVERY container start, including
|
|
# every restart of a crash-loop. Lets `docker logs` answer "what env
|
|
# was actually present?" without having to docker exec into a dying
|
|
# container. Logs NAMES of auth-relevant env vars, never VALUES. Fires
|
|
# twice (once as root pre-gosu, once as agent post-gosu) so an operator
|
|
# can see whether a value was lost across the privilege drop.
|
|
# Keep the env-name list in sync with adapter.py's _AUTH_ENV_AUDIT —
|
|
# the same set of vendors should be audited from both sides.
|
|
# Print a boot-context snapshot to stdout.
#
# Globals read: WORKSPACE_ID, PLATFORM_URL and the auth-related env vars
#               listed in the loop below.
# Arguments:    none.
# Outputs:      identity, hostname, /configs and /workspace listings, and
#               one "env NAME=set|unset" line per audited variable. Only
#               the NAME and set/unset state are printed, never the value.
# Returns:      status of the final echo (effectively always 0).
log_boot_context() {
  # Keep the loop variables out of global scope so the last credential
  # value read into `val` does not linger after the function returns.
  local var val

  echo "----- entrypoint boot $(date -u +%Y-%m-%dT%H:%M:%SZ) -----"
  echo "uid=$(id -u) gid=$(id -g) user=$(id -un 2>/dev/null || echo unknown)"
  echo "hostname=$(hostname) workspace_id=${WORKSPACE_ID:-<unset>}"
  echo "platform_url=${PLATFORM_URL:-<unset>}"
  echo "configs_dir: $(ls -ld /configs 2>/dev/null || echo MISSING)"

  # A pipeline's exit status is that of its LAST command, so
  # `ls /configs | tr ... || echo MISSING` never reaches the fallback:
  # tr succeeds even when ls fails. Probe ls on its own first.
  if ls /configs >/dev/null 2>&1; then
    echo "configs_contents: $(ls /configs 2>/dev/null | tr '\n' ' ')"
  else
    echo "configs_contents: MISSING"
  fi

  echo "workspace_dir: $(ls -ld /workspace 2>/dev/null || echo MISSING)"

  # Auth env presence (NAMES + set/unset only — never the values).
  # Mirror of _AUTH_ENV_AUDIT in adapter.py — keep in sync if you add a vendor.
  for var in CLAUDE_CODE_OAUTH_TOKEN ANTHROPIC_API_KEY ANTHROPIC_AUTH_TOKEN ANTHROPIC_BASE_URL MINIMAX_API_KEY GLM_API_KEY KIMI_API_KEY DEEPSEEK_API_KEY; do
    # POSIX sh has no ${!var}; eval is safe here because $var only ever
    # takes values from the hard-coded list above.
    eval "val=\$$var"
    if [ -n "$val" ]; then
      echo "env $var=set"
    else
      echo "env $var=unset"
    fi
  done

  echo "------------------------------------------------"
}
|
|
# Emit the snapshot on every run of this script. The root branch below
# re-execs "$0" via gosu, so this line runs again after the privilege drop.
log_boot_context
|
|
|
|
# ---------------------------------------------------------------------
|
|
# Restore-on-recreate from secondary EBS volume (cp#326 Option D).
|
|
#
|
|
# Contract with CP: when ProvisionWorkspace finds a non-expired backup
|
|
# snapshot for this WorkspaceID, it attaches the snapshot as a SECONDARY
|
|
# EBS volume at /dev/xvdb at launch (DeleteOnTermination=true). This
|
|
# function mounts that volume on first container boot and rsyncs the
|
|
# restore set (/configs, /workspace, /home/agent/.claude) from it back
|
|
# into the root filesystem, then drops a marker so subsequent container
|
|
# restarts (within the same EC2's lifetime) skip the restore.
|
|
#
|
|
# Why cp#326 needs this: AWS rejects ANY SnapshotId on the ROOT device
|
|
# at RunInstances time with "InvalidBlockDeviceMapping: snapshotId
|
|
# cannot be modified on root device". The cp#301 architecture (override
|
|
# the AMI's root snapshot) is impossible per AWS spec. Option D works
|
|
# WITH AWS's model — secondary volumes accept SnapshotId — and rsync
|
|
# bridges the data-plane gap.
|
|
#
|
|
# Operational contract:
|
|
# - Idempotent: a marker at /configs/.restore-completed gates re-runs.
|
|
# If the container restarts in the same EC2 (DOT=true so the volume
|
|
# persists across container restarts but NOT EC2 terminate), the
|
|
# restore skips. If the EC2 is terminated + replaced, /configs is
|
|
# fresh and the marker is gone — restore runs on the new EC2.
|
|
# - Best-effort: any failure (volume absent, fs unreadable, rsync
|
|
# error) is LOGGED with MOLECULE-RESTORE: prefix but does NOT abort
|
|
# the boot. The workspace comes up with empty state — the explicit
|
|
# no-restore branch the user already accepts on first-time provision.
|
|
# - Read-only mount on the secondary at /mnt/restore so a defective
|
|
# filesystem can't corrupt our root.
|
|
# - All log lines prefixed `MOLECULE-RESTORE:` so `docker logs <id>
|
|
# 2>&1 | grep MOLECULE-RESTORE` is the operator's one-liner debug.
|
|
#
|
|
# Path allowlist (NOT a blanket /mnt/restore -> / rsync — that would
|
|
# also restore /etc/passwd, /var/lib/docker, etc. which are container-
|
|
# managed):
|
|
# - /configs/ (config.yaml, .auth_token, skills/, memory)
|
|
# - /workspace/ (the shared codebase + agent's working files)
|
|
# - /home/agent/.claude/ (Claude SDK session state, settings.json)
|
|
#
|
|
# If a future template adds another persistent path (e.g. /home/agent/.cache),
|
|
# add it to RESTORE_PATHS below AND ensure the corresponding source path
|
|
# exists in the snapshot. Keep the list narrow on purpose — the alternative
|
|
# (full / rsync with exclusions) trades blast-radius safety for convenience.
|
|
# Restore the allowlisted paths from the secondary volume, once.
#
# Arguments: none.
# Outputs:   progress and warnings on stdout, every line prefixed with
#            "MOLECULE-RESTORE:".
# Returns:   0 on every path handled here (skip, mount failure, rsync
#            failure, marker failure); a failed restore must never abort
#            the boot.
restore_from_secondary_volume() {
  local SECONDARY_DEV="/dev/xvdb"
  local MOUNT_POINT="/mnt/restore"
  local MARKER="/configs/.restore-completed"
  # Narrow allowlist, relative to both the snapshot root and our root.
  local RESTORE_PATHS="configs workspace home/agent/.claude"
  local FSTYPE mount_out rel SRC DST rsync_out rsync_rc
  local rsync_failed=0

  # Marker present = restore already done for this EC2's lifetime.
  # Cheapest possible idempotency check; runs before any blockdev probe.
  if [ -f "$MARKER" ]; then
    echo "MOLECULE-RESTORE: marker $MARKER present — skipping (already restored on this EC2)"
    return 0
  fi

  # No secondary device = nothing to restore (first-time provision or
  # no backup snapshot existed). NOT an error.
  if [ ! -b "$SECONDARY_DEV" ]; then
    echo "MOLECULE-RESTORE: no $SECONDARY_DEV — first-time provision or no backup snapshot, skipping"
    return 0
  fi

  echo "MOLECULE-RESTORE: $SECONDARY_DEV detected — attempting restore"

  # Probe filesystem type. An empty result (blkid failed or the volume
  # is raw/unformatted) means there is nothing we can mount.
  FSTYPE=$(blkid -s TYPE -o value "$SECONDARY_DEV" 2>/dev/null || echo "")
  if [ -z "$FSTYPE" ]; then
    echo "MOLECULE-RESTORE: WARN no fs detected on $SECONDARY_DEV (raw/unformatted) — skipping"
    return 0
  fi
  echo "MOLECULE-RESTORE: $SECONDARY_DEV fstype=$FSTYPE"

  # Mount read-only so nothing in the restore can write to the source.
  # NOTE(review): some filesystems may still replay their journal on a
  # read-only mount — confirm if strict non-modification matters.
  mkdir -p "$MOUNT_POINT"
  # Capture mount's diagnostics for prefixed logging. Its exit status is
  # deliberately not trusted; `mountpoint -q` below is the real check.
  mount_out=$(mount -o ro "$SECONDARY_DEV" "$MOUNT_POINT" 2>&1)
  if [ -n "$mount_out" ]; then
    printf '%s\n' "$mount_out" | sed 's/^/MOLECULE-RESTORE: mount: /'
  fi
  if ! mountpoint -q "$MOUNT_POINT"; then
    echo "MOLECULE-RESTORE: WARN mount of $SECONDARY_DEV failed — skipping restore"
    return 0
  fi
  echo "MOLECULE-RESTORE: mounted $SECONDARY_DEV at $MOUNT_POINT (ro)"

  # rsync the allowlist. -a preserves perms/owner/times/symlinks; -H, -A
  # and -X additionally carry hard links, ACLs and extended attributes;
  # --numeric-ids copies uid/gid numbers as-is instead of mapping by
  # name; --delete makes the restore authoritative (a file removed from
  # the prior workspace is also removed from the new one).
  #
  # Source paths on the snapshot must match the root layout, so
  # /configs lives at $MOUNT_POINT/configs and so on.
  for rel in $RESTORE_PATHS; do
    SRC="$MOUNT_POINT/$rel"
    DST="/$rel"

    if [ ! -d "$SRC" ]; then
      echo "MOLECULE-RESTORE: source $SRC absent — skipping (likely the prior workspace never wrote it)"
      continue
    fi

    # Ensure the destination's parent exists before rsync creates $DST.
    mkdir -p "$(dirname "$DST")"

    echo "MOLECULE-RESTORE: rsync $SRC/ -> $DST/"
    # Capture rsync's REAL exit code. A `rsync ... | sed` pipeline would
    # report sed's status, and #!/bin/sh has no PIPESTATUS. A plain
    # assignment from a command substitution keeps the command's status
    # in $? and needs no predictable, root-written file under /tmp.
    rsync_out=$(rsync -aHAX --delete --numeric-ids "$SRC/" "$DST/" 2>&1)
    rsync_rc=$?
    if [ -n "$rsync_out" ]; then
      printf '%s\n' "$rsync_out" | sed 's/^/MOLECULE-RESTORE: /'
    fi

    if [ "$rsync_rc" -eq 0 ]; then
      echo "MOLECULE-RESTORE: ok $DST"
    else
      echo "MOLECULE-RESTORE: WARN rsync to $DST exited $rsync_rc — workspace may be partially restored"
      rsync_failed=1
    fi
  done

  # The mount is intentionally left in place (operator audit evidence).

  # Drop the marker so subsequent container restarts skip, even after
  # partial rsync failures: the source would be the same on a re-run, so
  # retrying on every restart only costs time. The operator can rm the
  # marker and restart to force a retry.
  #
  # `touch` is used instead of `: > "$MARKER"` because `:` is a special
  # built-in, and POSIX specifies that a redirection error on a special
  # built-in makes a non-interactive shell exit. A missing or read-only
  # /configs would then kill the entrypoint.
  if ! touch "$MARKER" 2>/dev/null; then
    echo "MOLECULE-RESTORE: WARN could not write marker $MARKER — restore will run again on the next container restart"
    return 0
  fi

  if [ "$rsync_failed" -eq 0 ]; then
    echo "MOLECULE-RESTORE: complete — marker $MARKER dropped"
  else
    echo "MOLECULE-RESTORE: complete with WARNINGS — marker $MARKER dropped; rm marker + restart for retry"
  fi
}
|
|
|
|
# Root phase: runs only when the script was started as uid 0. It restores
# state, fixes ownership so the agent user can write, wires up the session
# directory and optional git credentials, then re-execs this same script
# as `agent` via gosu. Nothing after the `exec` in this branch runs.
if [ "$(id -u)" = "0" ]; then
  # Restore-on-recreate runs FIRST in the root branch — before any
  # chown — so the agent-ownership steps below also apply to whatever
  # the restore just wrote. See restore_from_secondary_volume() for
  # the contract.
  restore_from_secondary_volume

  # Configs volume is created by Docker as root; agent needs write access
  # for plugin installs, memory writes, .auth_token rotation, etc.
  #
  # T4 atomic-co-sequencing contract (RFC internal#456 §10): the T4
  # escalation leg (sudo NOPASSWD + docker group, baked in the
  # Dockerfile) is ADDITIVE. The agent still runs uid-1000 and
  # /configs/.auth_token MUST remain agent-owned. This chown -R is the
  # agent-ownership half of that contract.
  chown -R agent:agent /configs 2>/dev/null

  # /workspace handling — the top-level dir is always chowned, but the
  # recursive chown only runs when the first entry found is root-owned
  # (typical on Docker Desktop on Windows where host uid maps to 0).
  # Skipping it otherwise keeps startup fast on large trees.
  chown agent:agent /workspace 2>/dev/null || true
  if [ -d /workspace ]; then
    first_entry=$(find /workspace -mindepth 1 -maxdepth 1 -print -quit 2>/dev/null)
    if [ -n "$first_entry" ] && [ "$(stat -c '%u' "$first_entry" 2>/dev/null)" = "0" ]; then
      chown -R agent:agent /workspace 2>/dev/null
    fi

    # Pre-create /workspace/.molecule/chat-uploads so the upload handler
    # never has to mkdir as agent inside a root-owned tree. Idempotent:
    # a re-run finds the dir already there and just re-applies ownership.
    mkdir -p /workspace/.molecule/chat-uploads 2>/dev/null || true
    chown -R agent:agent /workspace/.molecule 2>/dev/null || true
  fi

  # Claude Code state directory. Create the well-known subdirs
  # idempotently and chown unconditionally, so files written as root on
  # an earlier boot cannot leave the uid-1000 agent unable to write its
  # settings/session files.
  mkdir -p /home/agent/.claude/sessions /home/agent/.claude/session-env

  # Stub settings.json so `cat ~/.claude/settings.json` succeeds; an
  # existing file is never overwritten.
  if [ ! -f /home/agent/.claude/settings.json ]; then
    cat > /home/agent/.claude/settings.json <<'EOF'
{
  "permissions": {"defaultMode": "bypassPermissions"},
  "_note": "Mode is also set programmatically by claude_sdk_executor.py (permission_mode='bypassPermissions'); this file is informational and lets `cat ~/.claude/settings.json` succeed."
}
EOF
  fi
  chown -R agent:agent /home/agent/.claude 2>/dev/null

  # The platform provisioner mounts the session directory at
  # /root/.claude/sessions; link it into agent's home so the SDK finds
  # it when running as agent.
  if [ -d /root/.claude/sessions ]; then
    chown -R agent:agent /root/.claude 2>/dev/null

    # `ln -sfn TARGET DEST` only replaces DEST when it is a symlink (or
    # absent). When DEST is a REAL directory — which the mkdir -p above
    # guarantees on a fresh home — ln creates the link INSIDE it as
    # sessions/sessions and the SDK never sees the mounted sessions.
    # Clear the placeholder first: drop a stray nested link left by an
    # earlier boot, then remove the directory if it is empty.
    if [ -d /home/agent/.claude/sessions ] && [ ! -L /home/agent/.claude/sessions ]; then
      if [ -L /home/agent/.claude/sessions/sessions ]; then
        rm -f /home/agent/.claude/sessions/sessions
      fi
      rmdir /home/agent/.claude/sessions 2>/dev/null
    fi

    if [ -d /home/agent/.claude/sessions ] && [ ! -L /home/agent/.claude/sessions ]; then
      # Still a real directory, so it holds data. Never delete session
      # files to make room for the link; surface it for the operator.
      echo "entrypoint: WARN /home/agent/.claude/sessions is a non-empty directory — not linking /root/.claude/sessions over it"
    else
      ln -sfn /root/.claude/sessions /home/agent/.claude/sessions
    fi
  fi

  # Optional GitHub mirror credential helper setup.
  # GitHub is mirror-only for Molecule; keep this disabled unless an
  # operator explicitly opts a workspace into mirror credentials.
  if [ "${ENABLE_GITHUB_MIRROR_CREDENTIALS:-false}" = "true" ] && [ -x /app/scripts/molecule-git-token-helper.sh ]; then
    git config --global "credential.https://github.com.helper" \
      "!/app/scripts/molecule-git-token-helper.sh"
    git config --global "credential.https://github.com.useHttpPath" true
    # --global wrote root's config here; copy it across so the agent
    # user gets the same credential helper.
    if [ -f /root/.gitconfig ]; then
      cp /root/.gitconfig /home/agent/.gitconfig
      chown agent:agent /home/agent/.gitconfig
    fi
  fi

  # Agent-owned scratch directories; the token cache is owner-only (0700).
  mkdir -p /home/agent/.molecule-token-cache
  chown agent:agent /home/agent/.molecule-token-cache
  chmod 700 /home/agent/.molecule-token-cache
  mkdir -p /home/agent/Downloads
  chown agent:agent /home/agent/Downloads

  # Re-run this script as agent with the original arguments. The re-run
  # fails the uid check above and falls through to the agent phase.
  exec gosu agent "$0" "$@"
fi
|
|
|
|
# Now running as agent (uid 1000)
|
|
|
|
# GitHub mirror credentials are opt-in. When the operator enables them and
# the refresh script is executable, keep it running in the background: a
# detached bash loop restarts the script 30 seconds after each exit and
# sends all of its output to a log file in agent's home.
if [ "${ENABLE_GITHUB_MIRROR_CREDENTIALS:-false}" = "true" ]; then
  if [ -x /app/scripts/molecule-gh-token-refresh.sh ]; then
    nohup bash -c '
      while true; do
        /app/scripts/molecule-gh-token-refresh.sh
        rc=$?
        echo "[molecule-gh-token-refresh] daemon exited rc=$rc — respawning in 30s" >&2
        sleep 30
      done
    ' > /home/agent/.gh-token-refresh.log 2>&1 &
  fi
fi
|
|
|
|
# Third-party provider routing is now handled by adapter.py at boot —
|
|
# it reads the `providers:` registry from /configs/config.yaml and sets
|
|
# ANTHROPIC_BASE_URL based on the picked MODEL. Adding a new provider
|
|
# is a one-line YAML edit (see config.yaml's `providers:` section).
|
|
# Operator-set ANTHROPIC_BASE_URL still wins as the escape hatch for
|
|
# regional endpoints (e.g. Xiaomi's token-plan-sgp.*, MiniMax's
|
|
# api.minimaxi.com China endpoint).
|
|
|
|
# Replace this shell with molecule-runtime, forwarding the script's
# arguments unchanged. Nothing after this line would run.
exec molecule-runtime "$@"
|