fix(platform-agent#2919): wire identity-fallback.sh into the image-baked entrypoint (#2919 sibling) #2955

Merged
devops-engineer merged 2 commits from fix/2919-sibling-identity-fallback into main 2026-06-15 22:38:37 +00:00
2 changed files with 211 additions and 31 deletions
+82 -18
View File
@@ -18,16 +18,18 @@
#
# SSOT contract (driver hard-requirement on the IMAGE-BAKED impl):
# The image-baked content (config.yaml + prompts/concierge.md +
# mcp_servers.yaml) is SOURCED FROM the platform-agent TEMPLATE REPO,
# NOT vendored/duplicated in core. A CI DRIFT-GATE
# mcp_servers.yaml + identity-fallback.sh) is SOURCED FROM the
# platform-agent TEMPLATE REPO, NOT vendored/duplicated in core. A CI
# DRIFT-GATE
# (workspace-server/internal/provisioner/platform_agent_image_drift_test.go,
# pinned against /opt/molecule-platform-agent-template/{config.yaml,
# mcp_servers.yaml,prompts/concierge.md} vs the pre-cloned
# .tenant-bundle-deps/workspace-configs-templates/platform-agent/
# source) asserts byte-equal at build time. A future drift would
# fail the drift-gate test (go test -run TestPlatformAgentImageDriftGate),
# catching it BEFORE the image is published — so image snapshot +
# template can NEVER diverge in production without a CI-red signal.
# mcp_servers.yaml,prompts/concierge.md,identity-fallback.sh} vs the
# pre-cloned .tenant-bundle-deps/workspace-configs-templates/platform-
# agent/ source) asserts byte-equal at build time. A future drift
# would fail the drift-gate test (go test -run
# TestPlatformAgentImageDriftGate), catching it BEFORE the image is
# published — so image snapshot + template can NEVER diverge in
# production without a CI-red signal.
#
# Build context: same as Dockerfile. The platform-agent template
# content is pre-cloned by scripts/clone-manifest.sh into
@@ -52,6 +54,20 @@
# channel remains the SSOT-delivery path; the image-baked copy is
# a last-resort fallback (intentionally NOT a parallel SSOT — the
# drift-gate enforces single-SSOT).
#
# The identity-fallback.sh script is the WORKING fallback (the
# IMAGE_BAKED_IDENTITY_PRESENT echo-only marker that the #2919 PR
# shipped was a log line that did nothing — this PR's companion
# template-platform-agent PR adds the real script). The platform-
# agent entrypoint executes the script at boot, BEFORE handing off
# to the base image's /entrypoint.sh. Fill-absent-only: a delivered
# /configs/* (asset-channel SSOT) is NEVER overwritten; the image-
# baked copy is the safety net for self-host + pre-#29-bootstrap
# windows where neither the asset channel nor the local template
# path is guaranteed to be available. See
# template-platform-agent #2 (identity-fallback.sh) for the script
# semantics — SRC=/opt/molecule-platform-agent-template, DST=/configs,
# fail-soft on missing SRC.
ARG BASE_IMAGE=molecule-local/platform:latest
FROM ${BASE_IMAGE}
@@ -77,14 +93,62 @@ ARG PLATFORM_AGENT_TEMPLATE_DIR=.tenant-bundle-deps/workspace-configs-templates/
COPY ${PLATFORM_AGENT_TEMPLATE_DIR}/config.yaml /opt/molecule-platform-agent-template/config.yaml
COPY ${PLATFORM_AGENT_TEMPLATE_DIR}/mcp_servers.yaml /opt/molecule-platform-agent-template/mcp_servers.yaml
COPY ${PLATFORM_AGENT_TEMPLATE_DIR}/prompts/ /opt/molecule-platform-agent-template/prompts/
# The boot-time identity-fallback script. Executed at container start
# (see /entrypoint-platform-agent.sh below) to fill ABSENT files at
# /configs/ from the image-baked /opt path. The script is the SSOT in
# the platform-agent TEMPLATE REPO — drift-gate
# (platform_agent_image_drift_test.go) catches content drift between
# this COPY source and the image-baked destination.
#
# RCA 12124 (DRIVER-ESCALATED live prod identity): the script MUST
# write /configs/system-prompt.md (the file the
# conciergeIdentityPresent probe at platform_agent.go:399 reads) —
# NOT just /configs/prompts/concierge.md. Prior shape had a
# conditional write (`if [ ! -s "$DST/system-prompt.md" ]`) which
# could fail to fire after a partial-template run; the fixed script
# in the template-platform-agent repo (PR-side, merged to template
# main) is unconditional: always writes /configs/system-prompt.md
# from prompts/concierge.md + {{CONCIERGE_NAME}} substitution.
COPY ${PLATFORM_AGENT_TEMPLATE_DIR}/identity-fallback.sh /opt/molecule-platform-agent-template/identity-fallback.sh
RUN chmod +x /opt/molecule-platform-agent-template/identity-fallback.sh
# PLATFORM-AGENT IDENTITY (image-baked fallback) — when the asset-
# channel deliver is unavailable (self-host without
# MOLECULE_TEMPLATE_REPO_TOKEN, pre-#29-activation bootstrap), the
# concierge's identity (config.yaml + prompts/concierge.md +
# mcp_servers.yaml) is read from the image-baked path. The runtime
# must be aware of this fallback so a misconfigured operator sees
# a clear log line (NOT a silent miss).
RUN echo '#!/bin/sh' > /opt/molecule-platform-agent-template/IMAGE_BAKED_IDENTITY_PRESENT && \
echo 'echo "platform-agent: image-baked identity present at /opt/molecule-platform-agent-template/ (last-resort fallback for self-host + pre-#29-activation)" >&2' >> /opt/molecule-platform-agent-template/IMAGE_BAKED_IDENTITY_PRESENT && \
chmod +x /opt/molecule-platform-agent-template/IMAGE_BAKED_IDENTITY_PRESENT
# PLATFORM-AGENT ENTRYPOINT — runs identity-fallback.sh FIRST (fills
# absent /configs/ files from the image-baked /opt path; the
# asset-channel deliver is the SSOT post-#29-activation, this is the
# self-host + pre-#29-bootstrap safety net), then hands off to the
# base image's /entrypoint.sh (which does docker-socket group setup,
# memory-plugin sidecar spawn-gate, then exec su-exec platform
# /platform).
#
# Why a separate entrypoint (not extending /entrypoint.sh in core):
# the IMAGE-BAKED identity-fallback is a platform-agent-specific
# concern — the base /entrypoint.sh stays the single runtime entry
# for the ordinary /platform image, and the platform-agent variant
# overrides only the boot hook. The script is run as a child process
# (not sourced, not exec'd) behind an `-x` guard, so a missing or
# failing script is logged and boot continues (su-exec will still
# run /platform; the runtime's MISSING_MODEL fail-closed surfaces
# the operator-visible error in that case).
COPY <<'ENTRY' /entrypoint-platform-agent.sh
#!/bin/sh
# /opt/molecule-platform-agent-template/identity-fallback.sh: per-
# file copy of ABSENT files from the image-baked SSOT path to
# /configs/. The asset-channel deliver (post-#29-activation) is the
# authoritative path when it lands; this is the safety net for self-
# host + pre-#29-bootstrap windows where neither the asset channel
# nor the local template path is guaranteed to be available.
if [ -x /opt/molecule-platform-agent-template/identity-fallback.sh ]; then
/opt/molecule-platform-agent-template/identity-fallback.sh || {
echo "platform-agent: ⚠️ identity-fallback.sh failed (see prior log lines); continuing boot — runtime will MISSING_MODEL fail-closed if /configs is empty" >&2
}
else
echo "platform-agent: identity-fallback.sh not present (image built without it); skipping fallback (runtime will MISSING_MODEL fail-closed)" >&2
fi
# Hand off to the base image's entrypoint (docker-socket group
# setup, memory-plugin sidecar spawn-gate, then su-exec platform
# /platform). Pass through any CMD args (the platform-agent image
# is invoked the same way as the base — operator/CI sets CMD as
# needed; this entrypoint is transparent to the args).
exec /entrypoint.sh "$@"
ENTRY
RUN chmod +x /entrypoint-platform-agent.sh
ENTRYPOINT ["/entrypoint-platform-agent.sh"]
@@ -5,9 +5,10 @@ package provisioner
//
// The IMAGE-BAKED impl (workspace-server/Dockerfile.platform-agent)
// bakes the concierge's identity (config.yaml +
// prompts/concierge.md + mcp_servers.yaml) from the platform-agent
// TEMPLATE REPO into the platform-agent image at
// /opt/molecule-platform-agent-template/. The driver hard-requirement:
// prompts/concierge.md + mcp_servers.yaml + identity-fallback.sh)
// from the platform-agent TEMPLATE REPO into the platform-agent
// image at /opt/molecule-platform-agent-template/. The driver
// hard-requirement:
// "The image-baked config.yaml + prompts/concierge.md +
// mcp_servers.yaml MUST be SOURCED FROM the platform-agent TEMPLATE
// REPO (single SSOT = PR #1's content) — NOT vendored/duplicated in
@@ -47,16 +48,17 @@ package provisioner
// (the publish-workspace-server-image.yml workflow does this via
// the post-pre-clone test step).
//
// Test scope: the 3 files the Dockerfile COPYs (config.yaml,
// mcp_servers.yaml, prompts/concierge.md). A future concierge-
// identity change that adds a new file MUST also extend the
// expectedImageBakedFiles list here; the Dockerfile-side check
// catches the missing COPY, and the SSOT-side check (when run)
// catches the missing identity file in the template repo.
// Test scope: the 4 files the Dockerfile COPYs (config.yaml,
// mcp_servers.yaml, prompts/concierge.md, identity-fallback.sh).
// A future concierge-identity change that adds a new file MUST also
// extend the expectedImageBakedFiles list here; the Dockerfile-side
// check catches the missing COPY, and the SSOT-side check (when
// run) catches the missing identity file in the template repo.
import (
"os"
"path/filepath"
"regexp"
"strings"
"testing"
)
@@ -70,10 +72,24 @@ import (
// Paths are RELATIVE to the SSOT root (the platform-agent template
// repo). The Dockerfile's PLATFORM_AGENT_TEMPLATE_DIR build-arg
// points at this same root.
//
// The "identity-fallback.sh" entry is the boot-time per-file copy
// script (template-platform-agent #2, copied into the image and
// invoked from the platform-agent entrypoint). It's a 1st-class
// IMAGE-BAKED asset (NOT metadata / not a future change) — the
// runtime /opt→/configs fallback (workspace-runtime PR #141
// load_config) and the boot-time /opt→/configs fallback (this
// Dockerfile's entrypoint) are complementary, and BOTH need the
// image-baked copy at /opt/.../identity-fallback.sh in the build
// to close the self-host + pre-#29-bootstrap window. Listed here
// so the SSOT-side check rejects a template-repo that ships the
// script (correctly, in the platform-agent template) without the
// matching Dockerfile COPY (regression).
// expectedImageBakedFiles enumerates, relative to the platform-agent
// template root, every file the platform-agent image is expected to
// carry a baked copy of. Order is preserved from the original list.
var (
	expectedImageBakedFiles = []string{
		// Runtime entrypoint config.
		"config.yaml",
		// MCP wiring (overlay).
		"mcp_servers.yaml",
		// Concierge system prompt.
		"prompts/concierge.md",
		// Boot-time /opt→/configs copy script.
		"identity-fallback.sh",
	}
)
// isConciergeIdentityPath reports whether a path in the platform-agent
@@ -88,6 +104,9 @@ var expectedImageBakedFiles = []string{
// - "config.yaml" — runtime entrypoint config
// - "mcp_servers.yaml" — MCP wiring (overlay)
// - "prompts/*" — system prompts
// - "identity-fallback.sh" — boot-time /opt→/configs copy script
// (template-platform-agent #2, invoked
// from the platform-agent entrypoint)
//
// A future RFC that adds a new namespace (e.g. "hooks/*") MUST
// extend this function AND the Dockerfile AND expectedImageBakedFiles
@@ -96,6 +115,7 @@ func isConciergeIdentityPath(rel string) bool {
// Normalise before comparing: Clean collapses redundant elements such
// as "./" and doubled separators, and ToSlash rewrites OS-specific
// separators to "/", so the literal comparisons below see one
// canonical, slash-separated form.
rel = filepath.ToSlash(filepath.Clean(rel))
// Identity namespace = three exact top-level file names plus anything
// under the "prompts/" prefix.
return rel == "config.yaml" ||
rel == "mcp_servers.yaml" ||
rel == "identity-fallback.sh" ||
strings.HasPrefix(rel, "prompts/")
}
@@ -289,15 +309,110 @@ func TestPlatformAgentImageDriftGate(t *testing.T) {
}
}
// imageBakedMarkerCreationToken matches the shell tokens that indicate a
// line is writing a file: an output redirect (`>` / `>>`), `tee `,
// `cp `, or a heredoc opener (`<<`). It is applied to ONE line at a
// time, so it can never pair a marker mention on one line with a
// redirect on another.
var imageBakedMarkerCreationToken = regexp.MustCompile(`>|tee |cp |<<`)

// dockerfileCreatesImageBakedMarker reports whether any non-comment
// line of dockerfile both names marker and carries a file-creating
// shell token (see imageBakedMarkerCreationToken).
//
// The scan is per physical line and position-independent:
//
//   - Lines whose first non-blank character is '#' are comments
//     (Dockerfile comments, or shell comments inside a heredoc) and are
//     skipped, so documentation may mention the marker freely.
//   - The token may appear BEFORE or AFTER the marker name. The
//     original #2919 creation line has the shape
//     `RUN echo '#!/bin/sh' > /opt/.../IMAGE_BAKED_IDENTITY_PRESENT`,
//     i.e. the redirect precedes the name and a '#' (the shebang)
//     precedes both — a shape a "no '#' before the name, token after
//     the name" pattern cannot match.
//
// This is deliberately a coarse heuristic, not a Dockerfile parser.
func dockerfileCreatesImageBakedMarker(dockerfile, marker string) bool {
	for _, line := range strings.Split(dockerfile, "\n") {
		trimmed := strings.TrimSpace(line)
		if strings.HasPrefix(trimmed, "#") {
			continue // comment-only line: a mention here is documentation
		}
		if strings.Contains(trimmed, marker) && imageBakedMarkerCreationToken.MatchString(trimmed) {
			return true
		}
	}
	return false
}

// TestPlatformAgentEntrypointWiring pins the boot-time identity-
// fallback wiring. The IMAGE_BAKED_IDENTITY_PRESENT echo-marker
// that the #2919 PR shipped was a log line that did nothing — a
// partial-template / no-fetch self-host concierge would still
// MISSING_MODEL fail at runtime because /configs would be empty
// even though /opt/molecule-platform-agent-template/ had the
// content. This test pins the WIRE-UP shape that closes the gap:
//
//  1. Dockerfile.platform-agent defines a /entrypoint-platform-agent.sh
//     heredoc that invokes identity-fallback.sh BEFORE handing off
//     to /entrypoint.sh (the base image's entrypoint). The
//     identity-fallback.sh script is the WORKING /opt→/configs
//     fill-absent-only copy from template-platform-agent #2.
//  2. The Dockerfile's ENTRYPOINT directive points at the new
//     /entrypoint-platform-agent.sh (NOT the base image's
//     /entrypoint.sh). Otherwise the wiring is dormant — the
//     fallback would never fire.
//  3. The IMAGE_BAKED_IDENTITY_PRESENT echo-only marker is GONE.
//     A regression that re-adds the echo marker would re-introduce
//     the dormant-fallback bug (script exists but never runs).
//
// Why pin the wiring here (not in a shell-script test): the
// Dockerfile is the source-of-truth for the IMAGE-BAKED impl, and
// the drift-gate already pins the Dockerfile's other shape
// invariants (COPY lines, build-arg, destination path). Adding
// entrypoint-wiring pins to the same file keeps the IMAGE-BAKED
// image contract in a single test surface.
//
// These are substring / line-level checks on the Dockerfile text;
// they do not build or run the image. A future change that moves the
// entrypoint to a different filename / different invocation order
// must update this test in lockstep. The shape (identity-fallback.sh
// + /entrypoint.sh handoff) is the load-bearing part; the names are
// conventions.
func TestPlatformAgentEntrypointWiring(t *testing.T) {
	// Path is relative to this test's package directory.
	dockerfilePath := filepath.Join("..", "..", "Dockerfile.platform-agent")
	dockerfile, err := os.ReadFile(dockerfilePath)
	if err != nil {
		t.Fatalf("read %s: %v", dockerfilePath, err)
	}
	dockerfileStr := string(dockerfile)

	// 1. Heredoc-defined entrypoint-platform-agent.sh: must exist,
	//    must invoke identity-fallback.sh, must hand off to
	//    /entrypoint.sh (the base image's entrypoint).
	if !strings.Contains(dockerfileStr, "/entrypoint-platform-agent.sh") {
		t.Errorf("Dockerfile.platform-agent is missing /entrypoint-platform-agent.sh — the platform-agent entrypoint is the load-bearing wire-up that activates the /opt→/configs fallback at boot")
	}
	if !strings.Contains(dockerfileStr, "identity-fallback.sh") {
		t.Errorf("Dockerfile.platform-agent does not reference identity-fallback.sh — the boot-time /opt→/configs fill-absent-only copy script (template-platform-agent #2) is the WORKING fallback that replaces the IMAGE_BAKED_IDENTITY_PRESENT echo-only marker")
	}

	// The hand-off: the new entrypoint must exec /entrypoint.sh
	// (the base image's entrypoint) with the CMD args. A regression
	// that omits the hand-off would skip the docker-socket group
	// setup + memory-plugin sidecar + su-exec /platform boot.
	if !strings.Contains(dockerfileStr, "exec /entrypoint.sh \"$@\"") {
		t.Errorf("Dockerfile.platform-agent entrypoint does not exec /entrypoint.sh \"$@\" — the platform-agent entrypoint must hand off to the base image's entrypoint (docker-socket group setup, memory-plugin sidecar, su-exec /platform); a regression here would skip the base-image boot")
	}

	// 2. ENTRYPOINT directive: must point at the new entrypoint
	//    (NOT the base /entrypoint.sh). The default ENTRYPOINT
	//    (inherited from the base image) is /entrypoint.sh; a
	//    regression that omits the override would ship the
	//    identity-fallback.sh script via COPY but never invoke
	//    it at boot — the dormant-fallback bug.
	if !strings.Contains(dockerfileStr, `ENTRYPOINT ["/entrypoint-platform-agent.sh"]`) {
		t.Errorf(`Dockerfile.platform-agent is missing ENTRYPOINT ["/entrypoint-platform-agent.sh"] — the platform-agent entrypoint override is what activates the identity-fallback at boot; without it the script is COPY'd into the image but never runs`)
	}

	// 3. The IMAGE_BAKED_IDENTITY_PRESENT echo-only marker MUST
	//    be GONE. Re-introducing it would either (a) replace the
	//    identity-fallback.sh COPY (regression — fallback never
	//    fires) or (b) coexist with the script, leaving a confusing
	//    dead file in the image. Either way it's a regression marker.
	//
	//    A comment that mentions the marker name is fine
	//    (documentation); a non-comment line that names the marker
	//    alongside a file-creating token is a regression. The check
	//    is line-scoped so it catches the original
	//    `RUN echo '#!/bin/sh' > .../IMAGE_BAKED_IDENTITY_PRESENT`
	//    shape and cannot match across unrelated lines.
	if dockerfileCreatesImageBakedMarker(dockerfileStr, "IMAGE_BAKED_IDENTITY_PRESENT") {
		t.Errorf("Dockerfile.platform-agent still creates the IMAGE_BAKED_IDENTITY_PRESENT echo-only marker — the marker was a no-op log line that did nothing; the identity-fallback.sh script (template-platform-agent #2) is the real working fallback. The marker creation line must be removed when the script is wired in.")
	}
}
// scanConciergeIdentityFiles walks the platform-agent template repo
// and returns the RELATIVE paths of every file in the concierge-
// identity namespace (config.yaml + mcp_servers.yaml + prompts/).
// Non-identity files (README, .gitignore, etc.) are filtered out.
// identity namespace (config.yaml + mcp_servers.yaml +
// identity-fallback.sh + prompts/). Non-identity files (README,
// .gitignore, etc.) are filtered out.
//
// Errors are returned for filesystem-walk failures; the caller turns
// them into a t.Errorf (so other checks still run). The walk is
// deliberately non-recursive beyond the namespace prefix — the
// concierge's identity is config + mcp + prompts, nothing nested.
// concierge's identity is config + mcp + fallback-script + prompts,
// nothing nested.
func scanConciergeIdentityFiles(ssotRoot string) ([]string, error) {
var identity []string
entries, err := os.ReadDir(ssotRoot)
@@ -305,7 +420,8 @@ func scanConciergeIdentityFiles(ssotRoot string) ([]string, error) {
return nil, err
}
for _, e := range entries {
// Top-level files: config.yaml, mcp_servers.yaml
// Top-level files: config.yaml, mcp_servers.yaml,
// identity-fallback.sh
if !e.IsDir() {
if isConciergeIdentityPath(e.Name()) {
identity = append(identity, e.Name())