Merge pull request 'feat(plugins): atomic install — stage→snapshot→swap→marker (docker path)' (#120) from feat/plugin-atomic-install into main

This commit is contained in:
claude-ceo-assistant 2026-05-08 15:23:31 +00:00
commit 3e96184d6f
4 changed files with 477 additions and 1 deletions

View File

@ -0,0 +1,207 @@
package handlers
// plugins_atomic.go — atomic install pattern for plugin delivery into a
// running workspace container. Closes molecule-core#114.
//
// Replaces the prior "tar + docker.CopyToContainer to /configs/plugins/<name>"
// single-step write (no atomicity, no marker, no rollback) with a 4-step
// dance:
//
// 1. STAGE — extract tar into /configs/plugins/.staging/<name>.<ts>/
// 2. SNAPSHOT — if /configs/plugins/<name>/ exists, mv to .previous/<name>.<ts>/
// 3. SWAP — mv /configs/plugins/.staging/<name>.<ts>/ → /configs/plugins/<name>/
// 4. MARKER — touch /configs/plugins/<name>/.complete
//
// On any post-snapshot failure we attempt a best-effort rollback by mv-ing
// the previous snapshot back into place. The .complete marker is the
// canonical "this install is fully landed" signal — workspace-side plugin
// loaders should refuse to load a plugin dir without it.
//
// Scope: docker path only (workspace running as a local container). The
// SaaS path (deliverViaEIC, SSH-into-EC2) is unchanged in this PR; tracked
// as a follow-up. The same stage-then-swap shape applies but the exec
// primitives differ (ssh vs docker exec), and shipping both paths in one
// PR doubles the test surface.
import (
"bytes"
"context"
"fmt"
"path"
"strings"
"time"
"github.com/docker/docker/api/types/container"
)
const (
pluginsRoot = "/configs/plugins"
pluginsStagingDir = "/configs/plugins/.staging"
pluginsPrevDir = "/configs/plugins/.previous"
completeMarker = ".complete"
)
// installVersion identifies one install attempt — the plugin name plus a
// monotonic-ish UTC timestamp suffix. Used to namespace the staging dir
// and any snapshot of the previous version, so a reinstall mid-flight
// can't collide with a concurrent reinstall.
type installVersion struct {
plugin string
stamp string // e.g. 20260508T141530Z
}
func newInstallVersion(plugin string) installVersion {
return installVersion{
plugin: plugin,
stamp: time.Now().UTC().Format("20060102T150405Z"),
}
}
// stagedPath is the container path where the new content lands during fetch.
// e.g. /configs/plugins/.staging/molecule-skill-foo.20260508T141530Z
func (v installVersion) stagedPath() string {
return path.Join(pluginsStagingDir, v.plugin+"."+v.stamp)
}
// previousPath is where the prior live version is moved before swap.
// e.g. /configs/plugins/.previous/molecule-skill-foo.20260508T141530Z
func (v installVersion) previousPath() string {
return path.Join(pluginsPrevDir, v.plugin+"."+v.stamp)
}
// livePath is the destination after swap.
// e.g. /configs/plugins/molecule-skill-foo
func (v installVersion) livePath() string {
return path.Join(pluginsRoot, v.plugin)
}
// markerPath is the .complete file inside the live dir written last.
func (v installVersion) markerPath() string {
return path.Join(v.livePath(), completeMarker)
}
// atomicCopyToContainer does a stage→snapshot→swap→marker install of a
// host-side staged plugin tree into a running container's
// /configs/plugins/<name>/. Returns nil on success.
//
// On post-snapshot failure (swap or marker write), best-effort rollback
// restores the previous snapshot to the live path. Returns the original
// error wrapped — the caller should surface it; rollback success is
// logged separately.
func (h *PluginsHandler) atomicCopyToContainer(
ctx context.Context, containerName, hostDir, pluginName string,
) error {
v := newInstallVersion(pluginName)
// Step 0a: ensure staging + previous root dirs exist (idempotent).
if _, err := h.execAsRoot(ctx, containerName, []string{
"mkdir", "-p", pluginsStagingDir, pluginsPrevDir,
}); err != nil {
return fmt.Errorf("atomic install: mkdir staging/previous: %w", err)
}
// Step 0b: tar the host content with a path prefix that lands it in the
// staging dir — NOT directly into the live name. The prefix has no
// leading "/" because docker.CopyToContainer extracts paths relative
// to the dstPath argument we pass below.
stagedRel := strings.TrimPrefix(v.stagedPath(), "/")
tarBuf, err := tarHostDirWithPrefix(hostDir, stagedRel)
if err != nil {
return fmt.Errorf("atomic install: tar host dir: %w", err)
}
// Step 1: STAGE — extract tar into /configs/plugins/.staging/<name>.<ts>/
if err := h.docker.CopyToContainer(ctx, containerName, "/", &tarBuf,
container.CopyToContainerOptions{}); err != nil {
// Best-effort: clean up any partial staging extract before returning.
_, _ = h.execAsRoot(ctx, containerName, []string{
"rm", "-rf", v.stagedPath(),
})
return fmt.Errorf("atomic install: copy to container: %w", err)
}
// Step 2: SNAPSHOT — if a live version exists, move it aside.
// `test -d` exits 0 if the dir exists, non-zero otherwise; the helper
// returns a non-nil error in the non-zero case which we treat as
// "no previous version" rather than a real failure.
snapshotted := false
if _, err := h.execAsRoot(ctx, containerName, []string{
"test", "-d", v.livePath(),
}); err == nil {
if _, err := h.execAsRoot(ctx, containerName, []string{
"mv", v.livePath(), v.previousPath(),
}); err != nil {
// Snapshot failure: roll back the staged extract before failing.
_, _ = h.execAsRoot(ctx, containerName, []string{
"rm", "-rf", v.stagedPath(),
})
return fmt.Errorf("atomic install: snapshot previous version: %w", err)
}
snapshotted = true
}
// Step 3: SWAP — atomic rename of the staged dir into the live name.
// `mv` on the same filesystem is a single rename(2), atomic at the FS level.
if _, err := h.execAsRoot(ctx, containerName, []string{
"mv", v.stagedPath(), v.livePath(),
}); err != nil {
// Swap failure: roll back if we had a snapshot.
if snapshotted {
if _, rbErr := h.execAsRoot(ctx, containerName, []string{
"mv", v.previousPath(), v.livePath(),
}); rbErr != nil {
return fmt.Errorf("atomic install: swap failed AND rollback failed: swap=%w, rollback=%v", err, rbErr)
}
}
// Best-effort cleanup of the still-staged dir.
_, _ = h.execAsRoot(ctx, containerName, []string{
"rm", "-rf", v.stagedPath(),
})
return fmt.Errorf("atomic install: swap to live path: %w", err)
}
// Step 4: MARKER — touch .complete inside the live dir as the last write.
// Workspace-side plugin loaders treat a plugin dir without this marker
// as half-installed and skip it (or surface a clear error to the
// operator instead of loading a possibly-partial tree).
if _, err := h.execAsRoot(ctx, containerName, []string{
"touch", v.markerPath(),
}); err != nil {
// Marker write failure with the new content already in place is a
// weird state — content is fine on disk, but the plugin loader
// will refuse to use it. Log loudly; do NOT roll back, since the
// content is the latest, just unmarked. Operator can manually
// `touch <plugin>/.complete` to recover.
return fmt.Errorf("atomic install: write .complete marker (content landed but unmarked, manual recovery: touch %s): %w", v.markerPath(), err)
}
// Step 5: GC — best-effort delete the previous snapshot. Failures here
// just leave a directory; not load-bearing for correctness, the next
// install or a separate sweeper will reclaim the space.
if snapshotted {
_, _ = h.execAsRoot(ctx, containerName, []string{
"rm", "-rf", v.previousPath(),
})
}
return nil
}
// tarHostDirWithPrefix walks hostDir and writes a tar to a buffer with
// every entry's name prefixed by `prefix`. Mirrors the prior streaming
// shape used in copyPluginToContainer but with a configurable prefix
// (the prior version hardcoded "plugins/<name>/"; we use a full
// staging path so the extracted layout is the staging dir directly).
//
// Symlinks are skipped — same posture as streamDirAsTar elsewhere in
// this file. Skipping prevents a hostile plugin from injecting a
// symlink that, post-extract, points outside the plugin's own dir.
func tarHostDirWithPrefix(hostDir, prefix string) (bytes.Buffer, error) {
var buf bytes.Buffer
tw := newTarWriter(&buf)
defer tw.Close()
if err := tarWalk(hostDir, prefix, tw); err != nil {
return bytes.Buffer{}, err
}
return buf, nil
}

View File

@ -0,0 +1,70 @@
package handlers
// plugins_atomic_tar.go — tar-walk helpers split out so the main atomic
// install flow stays readable. The prefix argument lets the caller
// arrange where the tar's contents land at extract time.
import (
"archive/tar"
"io"
"os"
"path/filepath"
)
// newTarWriter is a thin wrapper so atomic_test.go can swap the writer
// destination if it needs to.
func newTarWriter(w io.Writer) *tar.Writer {
return tar.NewWriter(w)
}
// tarWalk walks hostDir and writes every regular file + dir to the tar
// writer with paths of the form `<prefix>/<relative>`. Symlinks are
// skipped — same posture as streamDirAsTar in plugins_install_pipeline.go.
//
// The trailing-slash on prefix is normalized away: prefix "foo" and
// prefix "foo/" produce identical archives.
func tarWalk(hostDir, prefix string, tw *tar.Writer) error {
prefix = filepath.Clean(prefix)
return filepath.Walk(hostDir, func(p string, info os.FileInfo, err error) error {
if err != nil {
return err
}
if info.Mode()&os.ModeSymlink != 0 {
return nil // skip symlinks; see doc above
}
rel, err := filepath.Rel(hostDir, p)
if err != nil {
return err
}
if rel == "." {
// Emit the prefix dir itself once, with the source dir's mode.
hdr, err := tar.FileInfoHeader(info, "")
if err != nil {
return err
}
hdr.Name = prefix + "/"
return tw.WriteHeader(hdr)
}
hdr, err := tar.FileInfoHeader(info, "")
if err != nil {
return err
}
hdr.Name = filepath.Join(prefix, rel)
if info.IsDir() {
hdr.Name += "/"
}
if err := tw.WriteHeader(hdr); err != nil {
return err
}
if !info.Mode().IsRegular() {
return nil
}
f, err := os.Open(p)
if err != nil {
return err
}
defer f.Close()
_, err = io.Copy(tw, f)
return err
})
}

View File

@ -0,0 +1,193 @@
package handlers
import (
"archive/tar"
"bytes"
"io"
"os"
"path/filepath"
"sort"
"strings"
"testing"
"time"
)
// TestInstallVersion_Paths: the path helpers must produce a stable shape
// the in-container exec calls depend on. Pinning the layout here
// catches a future refactor that accidentally changes where staging /
// previous / live dirs live, which would break the swap atomicity.
func TestInstallVersion_Paths(t *testing.T) {
v := installVersion{plugin: "molecule-skill-foo", stamp: "20260508T141530Z"}
if got, want := v.stagedPath(), "/configs/plugins/.staging/molecule-skill-foo.20260508T141530Z"; got != want {
t.Errorf("stagedPath = %q; want %q", got, want)
}
if got, want := v.previousPath(), "/configs/plugins/.previous/molecule-skill-foo.20260508T141530Z"; got != want {
t.Errorf("previousPath = %q; want %q", got, want)
}
if got, want := v.livePath(), "/configs/plugins/molecule-skill-foo"; got != want {
t.Errorf("livePath = %q; want %q", got, want)
}
if got, want := v.markerPath(), "/configs/plugins/molecule-skill-foo/.complete"; got != want {
t.Errorf("markerPath = %q; want %q", got, want)
}
}
// TestInstallVersion_StampUniqueness: two newInstallVersion calls within
// the same second produce the same stamp (we use second precision); the
// caller relies on the mv-rename being atomic, so collision-free
// stamping is NOT a correctness requirement — but a regression that
// changes stamp shape (e.g. RFC3339 with colons) would break the path
// helpers since path.Join treats a colon as a regular char but ssh +
// docker exec generally don't. Pin the no-colon shape.
func TestInstallVersion_StampShape(t *testing.T) {
v := newInstallVersion("anything")
if strings.Contains(v.stamp, ":") {
t.Errorf("stamp must not contain colons (breaks shell-quoting in exec): %q", v.stamp)
}
if strings.Contains(v.stamp, " ") {
t.Errorf("stamp must not contain spaces: %q", v.stamp)
}
// Sanity: stamp parses as the documented format.
if _, err := time.Parse("20060102T150405Z", v.stamp); err != nil {
t.Errorf("stamp %q does not parse as 20060102T150405Z: %v", v.stamp, err)
}
}
// TestTarHostDirWithPrefix_HappyPath: walks a host dir, builds a tar with
// the configured prefix, verifies every entry's name is rooted under
// the prefix, and the file contents survive round-trip.
func TestTarHostDirWithPrefix_HappyPath(t *testing.T) {
hostDir := t.TempDir()
// Plant: <host>/plugin.yaml + <host>/skills/foo/SKILL.md + <host>/.complete
files := map[string]string{
"plugin.yaml": "name: foo\nversion: 1.0.0\n",
"skills/foo/SKILL.md": "# Foo skill\n",
".complete": "", // upstream may already have a marker
}
for rel, body := range files {
full := filepath.Join(hostDir, rel)
if err := os.MkdirAll(filepath.Dir(full), 0o755); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(full, []byte(body), 0o644); err != nil {
t.Fatal(err)
}
}
prefix := "configs/plugins/.staging/foo.20260508T141530Z"
buf, err := tarHostDirWithPrefix(hostDir, prefix)
if err != nil {
t.Fatalf("tar: %v", err)
}
// Read back the tar; collect names + body for regular files.
got := map[string]string{}
tr := tar.NewReader(&buf)
for {
hdr, err := tr.Next()
if err == io.EOF {
break
}
if err != nil {
t.Fatalf("tar reader: %v", err)
}
// Every entry must start with the prefix
if !strings.HasPrefix(hdr.Name, prefix) {
t.Errorf("entry %q does not start with prefix %q", hdr.Name, prefix)
}
if hdr.Typeflag == tar.TypeReg {
body, err := io.ReadAll(tr)
if err != nil {
t.Fatal(err)
}
rel := strings.TrimPrefix(hdr.Name, prefix+"/")
got[rel] = string(body)
}
}
for rel, want := range files {
if got[rel] != want {
t.Errorf("body[%q] = %q; want %q", rel, got[rel], want)
}
}
}
// TestTarHostDirWithPrefix_SkipsSymlinks: a hostile plugin shouldn't be
// able to ship a symlink that, post-extract, points outside its own
// dir. The walker silently skips symlinks (same posture as
// streamDirAsTar). Verify a planted symlink doesn't appear in the tar.
func TestTarHostDirWithPrefix_SkipsSymlinks(t *testing.T) {
hostDir := t.TempDir()
// Plant a real file + a symlink pointing outside hostDir.
if err := os.WriteFile(filepath.Join(hostDir, "real.txt"), []byte("ok"), 0o644); err != nil {
t.Fatal(err)
}
target := filepath.Join(t.TempDir(), "outside")
if err := os.WriteFile(target, []byte("SHOULD NOT APPEAR"), 0o644); err != nil {
t.Fatal(err)
}
if err := os.Symlink(target, filepath.Join(hostDir, "evil")); err != nil {
t.Fatal(err)
}
buf, err := tarHostDirWithPrefix(hostDir, "p")
if err != nil {
t.Fatal(err)
}
names := []string{}
tr := tar.NewReader(&buf)
for {
hdr, err := tr.Next()
if err == io.EOF {
break
}
if err != nil {
t.Fatal(err)
}
names = append(names, hdr.Name)
}
sort.Strings(names)
for _, n := range names {
if strings.Contains(n, "evil") {
t.Errorf("symlink leaked into tar: %q", n)
}
}
// real.txt should be present
found := false
for _, n := range names {
if strings.HasSuffix(n, "real.txt") {
found = true
break
}
}
if !found {
t.Errorf("real.txt missing from tar; got names: %v", names)
}
}
// TestTarHostDirWithPrefix_PrefixNormalization: trailing slash on prefix
// should not change the archive shape. Pinning this so a future caller
// passing "foo/" instead of "foo" doesn't double-slash entry names.
func TestTarHostDirWithPrefix_PrefixNormalization(t *testing.T) {
hostDir := t.TempDir()
if err := os.WriteFile(filepath.Join(hostDir, "x"), []byte("y"), 0o644); err != nil {
t.Fatal(err)
}
a, err := tarHostDirWithPrefix(hostDir, "foo")
if err != nil {
t.Fatal(err)
}
b, err := tarHostDirWithPrefix(hostDir, "foo/")
if err != nil {
t.Fatal(err)
}
if !bytes.Equal(a.Bytes(), b.Bytes()) {
t.Errorf("trailing-slash on prefix changed archive shape; tarHostDirWithPrefix should be slash-insensitive")
}
}

View File

@ -276,7 +276,13 @@ func (h *PluginsHandler) resolveAndStage(ctx context.Context, req installRequest
// using NewPluginsHandler without a DB; production wires it in router.go.
func (h *PluginsHandler) deliverToContainer(ctx context.Context, workspaceID string, r *stageResult) error {
if containerName := h.findRunningContainer(ctx, workspaceID); containerName != "" {
if err := h.copyPluginToContainer(ctx, containerName, r.StagedDir, r.PluginName); err != nil {
// Atomic stage→snapshot→swap→marker (molecule-core#114).
// Replaces the prior single docker.CopyToContainer write that
// left a partially-extracted tree on mid-install failure with
// no rollback path. atomicCopyToContainer writes a .complete
// marker as the last step; workspace-side plugin loaders should
// refuse to load a plugin dir without it.
if err := h.atomicCopyToContainer(ctx, containerName, r.StagedDir, r.PluginName); err != nil {
log.Printf("Plugin install: failed to copy %s to %s: %v", r.PluginName, workspaceID, err)
return newHTTPErr(http.StatusInternalServerError, gin.H{"error": "failed to copy plugin to container"})
}