diff --git a/.gitea/scripts/lint-e2e-ki013-container-names.sh b/.gitea/scripts/lint-e2e-ki013-container-names.sh new file mode 100755 index 000000000..2fcfeaa82 --- /dev/null +++ b/.gitea/scripts/lint-e2e-ki013-container-names.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# Drift-prevention guard: SEV #2499 class (KI-013 container/volume naming). +# +# KI-013 removed 12-char UUID truncation from container/volume names. +# E2E scripts must use FULL workspace IDs (ws-${WSID}) when referencing +# containers and volumes. Any ${VAR:0:12} truncation in a ws-* context +# is a regression risk. +# +# Run: bash .gitea/scripts/lint-e2e-ki013-container-names.sh +set -euo pipefail + +PAT=':0:12([^0-9]|$)' +ERR=0 + +for f in tests/e2e/*.sh; do + # Allow :0:12 when it is NOT inside a ws-* container/volume reference. + # The grep looks for ws- followed anywhere on the same line by ${*:0:12. + MATCHES=$(grep -nE "$PAT" "$f" 2>/dev/null || true) + if [ -n "$MATCHES" ]; then + echo "::error::SEV-2499 drift guard: truncated workspace ID in container/volume name" + echo "::error::file=$f" + echo "$MATCHES" | while read -r line; do + echo "::error:: $line" + done + ERR=1 + fi +done + +if [ "$ERR" -ne 0 ]; then + echo "" + echo "FAIL: E2E scripts reference containers/volumes with 12-char truncated IDs." + echo " KI-013 requires FULL workspace IDs. Update the flagged lines." + exit 1 +fi + +echo "PASS: No truncated workspace IDs in E2E container/volume references." diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 6555b80b6..d2e18494e 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -404,6 +404,14 @@ jobs: run: | bash scripts/test-promote-tenant-image.sh + - if: ${{ needs.changes.outputs.scripts == 'true' }} + name: Drift guard — KI-013 container/volume naming (SEV #2499) + # KI-013 removed 12-char UUID truncation from container/volume names. + # E2E scripts must use FULL workspace IDs. This fail-closed guard + # prevents regressions where a new/modified script reintroduces the + # old truncated-name pattern (the root cause of SEV #2499). + run: bash .gitea/scripts/lint-e2e-ki013-container-names.sh + - if: ${{ needs.changes.outputs.scripts == 'true' }} name: Shellcheck promote-tenant-image script # scripts/ is excluded from the bulk shellcheck pass above (legacy diff --git a/tests/e2e/test_chat_attachments_e2e.sh b/tests/e2e/test_chat_attachments_e2e.sh index 3d352f053..98bf3ad9b 100755 --- a/tests/e2e/test_chat_attachments_e2e.sh +++ b/tests/e2e/test_chat_attachments_e2e.sh @@ -76,7 +76,7 @@ fi log "Step 3 — Seed a file inside /workspace and ask agent to reference it" # Relies on /workspace being writable by the platform (we copy as root via # docker exec, mimicking the path a real agent would use through its tools). -CONTAINER=$(docker ps --format '{{.Names}}' | grep -E "^ws-${WSID:0:12}" | head -1) +CONTAINER=$(docker ps --format '{{.Names}}' | grep -E "^ws-${WSID}" | head -1) [ -n "$CONTAINER" ] || { echo "container not found"; exit 1; } docker exec "$CONTAINER" sh -c 'echo "E2E report body $(date -u +%s)" > /workspace/e2e-report.txt' diff --git a/tests/e2e/test_chat_attachments_multiruntime_e2e.sh b/tests/e2e/test_chat_attachments_multiruntime_e2e.sh index 3d93e1860..a17b70de9 100755 --- a/tests/e2e/test_chat_attachments_multiruntime_e2e.sh +++ b/tests/e2e/test_chat_attachments_multiruntime_e2e.sh @@ -145,7 +145,7 @@ check_runtime() { fails=$((fails + 1)); return fi local container - container=$(docker ps --format '{{.Names}}' | grep -E "^ws-${wsid:0:12}" | head -1) + container=$(docker ps --format '{{.Names}}' | grep -E "^ws-${wsid}" | head -1) [ -z "$container" ] && { echo "FAIL $label: container not found"; fails=$((fails + 1)); return; } has_patch_in_container "$container" || { echo "FAIL $label: platform helpers missing"; fails=$((fails + 1)); return; } diff --git a/tests/e2e/test_claude_code_e2e.sh b/tests/e2e/test_claude_code_e2e.sh index 30ec73ded..49ab5708f 100755 --- a/tests/e2e/test_claude_code_e2e.sh +++ b/tests/e2e/test_claude_code_e2e.sh @@ -94,8 +94,8 @@ check_contains "Upload child prompt" "replaced" "$CHILD_UPLOAD" # Verify prompts in containers sleep 2 -ROOT_CONTAINER=$(docker ps --filter "name=ws-${ROOT:0:12}" -q | head -1) -CHILD_CONTAINER=$(docker ps --filter "name=ws-${CHILD:0:12}" -q | head -1) +ROOT_CONTAINER=$(docker ps --filter "name=ws-${ROOT}" -q | head -1) +CHILD_CONTAINER=$(docker ps --filter "name=ws-${CHILD}" -q | head -1) ROOT_HAS_PROMPT=$(docker exec $ROOT_CONTAINER cat /configs/system-prompt.md 2>/dev/null | head -1) check_contains "Root container has prompt" "Root Agent" "$ROOT_HAS_PROMPT" diff --git a/tests/e2e/test_comprehensive_e2e.sh b/tests/e2e/test_comprehensive_e2e.sh index 96370e26c..0c57ae260 100755 --- a/tests/e2e/test_comprehensive_e2e.sh +++ b/tests/e2e/test_comprehensive_e2e.sh @@ -153,19 +153,17 @@ RT_HM_ID=$(echo "$R" | jq_extract "['id']") # Wait for containers to start (poll up to 30s for first one to appear) if command -v docker &>/dev/null; then - short_cc="${RT_CC_ID:0:12}" for _ in 1 2 3 4 5 6; do sleep 5 - if docker inspect "ws-${short_cc}" >/dev/null 2>&1; then break; fi + if docker inspect "ws-${RT_CC_ID}" >/dev/null 2>&1; then break; fi done _check_image() { local ws_id="$1" expected_tag="$2" label="$3" - local short_id="${ws_id:0:12}" # Poll up to 30s for image to appear local actual_image="NOT_FOUND" for _ in 1 2 3 4 5 6; do - actual_image=$(docker inspect "ws-${short_id}" --format '{{.Config.Image}}' 2>/dev/null || echo "NOT_FOUND") + actual_image=$(docker inspect "ws-${ws_id}" --format '{{.Config.Image}}' 2>/dev/null || echo "NOT_FOUND") if echo "$actual_image" | grep -qF "$expected_tag"; then break; fi sleep 5 done @@ -216,10 +214,9 @@ if echo "$R" | grep -qF "saved"; then curl -s -X POST "$BASE/workspaces/$RT_CX_ID/restart" > /dev/null 2>&1 # Poll up to 30s for the new container image to appear (restart can take a while) if command -v docker &>/dev/null; then - short_id="${RT_CX_ID:0:12}" for _ in 1 2 3 4 5 6; do sleep 5 - actual=$(docker inspect "ws-${short_id}" --format '{{.Config.Image}}' 2>/dev/null || echo "") + actual=$(docker inspect "ws-${RT_CX_ID}" --format '{{.Config.Image}}' 2>/dev/null || echo "") if echo "$actual" | grep -qF "openclaw"; then break; fi done _check_image "$RT_CX_ID" "openclaw" "Runtime change codex to openclaw on restart" diff --git a/tests/e2e/test_local_provision_lifecycle_e2e.sh b/tests/e2e/test_local_provision_lifecycle_e2e.sh index dc985d4cf..1fa325799 100755 --- a/tests/e2e/test_local_provision_lifecycle_e2e.sh +++ b/tests/e2e/test_local_provision_lifecycle_e2e.sh @@ -191,8 +191,29 @@ except Exception: } container_running() { # container_running -> echoes name if running - local short="${1:0:12}" - docker ps --filter "name=ws-${short}" --filter "status=running" --format '{{.Names}}' 2>/dev/null | head -1 + docker ps --filter "name=ws-${1}" --filter "status=running" --format '{{.Names}}' 2>/dev/null | head -1 +} + +diagnose_provision() { + local wsid="${1:-}" + local container + container=$(container_running "$wsid") + echo "--- DIAGNOSE provisioning for $wsid ---" + echo "last_sample_error: ${LAST:-}" + echo "container_running: ${container:-}" + if [ -n "$container" ]; then + echo "--- container logs ($container) ---" + docker logs "$container" 2>&1 | tail -n 60 || true + echo "--- container env ---" + docker inspect "$container" --format '{{json .Config.Env}}' 2>&1 || true + echo "--- container reachability test ---" + docker exec "$container" sh -c 'echo "platform_url=$PLATFORM_URL"; wget -qO- "$PLATFORM_URL/health" 2>&1 || true' || true + fi + echo "--- all ws-* containers ---" + docker ps --filter "name=ws-" --format '{{.Names}} {{.Status}}' 2>/dev/null || true + echo "--- all ws-* volumes ---" + docker volume ls -q 2>/dev/null | grep '^ws-' || true + echo "--- end diagnose ---" } cleanup() { @@ -203,16 +224,11 @@ cleanup() { # SCOPED teardown — only the workspace this test created. Never a blanket # sweep (other dev workspaces may be live on this shared daemon). e2e_delete_workspace "$WSID" "" >/dev/null 2>&1 || true - local short="${WSID:0:12}" - docker rm -f "ws-${short}" >/dev/null 2>&1 || true - # Volume naming is split in the provisioner: configs + claude-sessions use the - # 12-char short id (ConfigVolumeName/ClaudeSessionVolumeName), but the - # /workspace volume uses the FULL UUID (buildWorkspaceMount: ws--workspace). - # Remove BOTH forms so neither leaks. + docker rm -f "ws-${WSID}" >/dev/null 2>&1 || true docker volume rm -f \ - "ws-${short}-configs" "ws-${short}-claude-sessions" \ - "ws-${short}-workspace" "ws-${WSID}-workspace" >/dev/null 2>&1 || true - echo "cleaned workspace $WSID + ws-${short} container/volumes" + "ws-${WSID}-configs" "ws-${WSID}-claude-sessions" \ + "ws-${WSID}-workspace" >/dev/null 2>&1 || true + echo "cleaned workspace $WSID + ws-${WSID} container/volumes" fi # Restore the cache tag to whatever it pointed at before we retagged it, so a # stub run doesn't leave the real claude-code tag aliased to the stub. @@ -331,8 +347,7 @@ if [ -z "$WSID" ]; then exit 1 fi pass "workspace created: $WSID" -SHORT="${WSID:0:12}" -CONFIG_VOL="ws-${SHORT}-configs" +CONFIG_VOL="ws-${WSID}-configs" # Mint a workspace bearer for the WorkspaceAuth-gated secret + /restart calls. WTOKEN=$(e2e_mint_workspace_token "$WSID" || true) @@ -436,8 +451,9 @@ for _ in $(seq 1 "$ONLINE_TIMEOUT"); do sleep 1 done check "workspace reached online (status=$STATUS)" "online" "$STATUS" +if [ "$FAIL" -gt 0 ]; then diagnose_provision "$WSID"; echo "=== Results: $PASS passed, $FAIL failed ==="; exit 1; fi RUN=$(container_running "$WSID") -if [ -n "$RUN" ]; then pass "container running: $RUN"; else fail "no running ws-${WSID:0:12} container" "docker ps shows none"; fi +if [ -n "$RUN" ]; then pass "container running: $RUN"; else fail "no running ws-${WSID} container" "docker ps shows none"; fi echo "" # ---------------------------------------------------------------------------- @@ -473,6 +489,7 @@ else sleep 1 done check "workspace back online after restart (status=$STATUS)" "online" "$STATUS" + if [ "$FAIL" -gt 0 ]; then diagnose_provision "$WSID"; echo "=== Results: $PASS passed, $FAIL failed ==="; exit 1; fi # Explicit negative on the exact bug signature. if echo "$LAST" | grep -qiF "config volume is empty"; then fail "restart hit 'config volume is empty' — restart-survival REGRESSION" "$LAST" diff --git a/workspace-server/internal/handlers/platform_agent.go b/workspace-server/internal/handlers/platform_agent.go index 07412ecf3..23ea4cefd 100644 --- a/workspace-server/internal/handlers/platform_agent.go +++ b/workspace-server/internal/handlers/platform_agent.go @@ -33,6 +33,7 @@ import ( "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db" "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models" + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provisioner" "github.com/gin-gonic/gin" "github.com/google/uuid" ) @@ -239,7 +240,7 @@ func (h *WorkspaceHandler) applyConciergeProvisionConfig( } } if len(base) == 0 && h.provisioner != nil { - if b, err := h.provisioner.ExecRead(ctx, configDirName(workspaceID), "/configs/config.yaml"); err == nil { + if b, err := h.provisioner.ExecRead(ctx, provisioner.ContainerName(workspaceID), "/configs/config.yaml"); err == nil { base = b } } @@ -399,7 +400,7 @@ func conciergeIdentityPresent(ctx context.Context, prov localProvisionerIsRunnin // that doesn't expose ExecRead. return true } - body, err := reader.ExecRead(ctx, configDirName(id), "/configs/system-prompt.md") + body, err := reader.ExecRead(ctx, provisioner.ContainerName(id), "/configs/system-prompt.md") if err != nil { return false } diff --git a/workspace-server/internal/handlers/workspace_restart.go b/workspace-server/internal/handlers/workspace_restart.go index 4006912ed..10ae642bb 100644 --- a/workspace-server/internal/handlers/workspace_restart.go +++ b/workspace-server/internal/handlers/workspace_restart.go @@ -15,6 +15,7 @@ import ( "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db" "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/events" "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models" + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provisioner" "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provlog" "github.com/gin-gonic/gin" ) @@ -393,7 +394,7 @@ func (h *WorkspaceHandler) restartRuntimeFromConfig(ctx context.Context, id, wsN return dbRuntime } containerRuntime := dbRuntime - containerName := configDirName(id) // ws-{id[:12]} + containerName := provisioner.ContainerName(id) // ws-{id} (KI-013 full UUID) if cfgBytes, readErr := h.provisioner.ExecRead(ctx, containerName, "/configs/config.yaml"); readErr == nil { for _, line := range strings.Split(string(cfgBytes), "\n") { line = strings.TrimSpace(line)