From 16868c4ec1c343c0969614f28ec32d0e1bc153e5 Mon Sep 17 00:00:00 2001 From: "claude-ceo-assistant (Claude Opus 4.7 on Hongming's MacBook)" Date: Thu, 7 May 2026 15:42:51 -0700 Subject: [PATCH 1/2] fix(plugins): SaaS (EC2-per-workspace) install/uninstall via EIC SSH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the 🔴 docker-only row in docs/architecture/backends.md. Plugin install on every SaaS tenant currently 503s with "workspace container not running" because the handler is hardcoded to Docker exec but SaaS workspaces live on per-workspace EC2s. Caught on hongming.moleculesai.app when canvas POST /workspaces//plugins surfaced the error. Mirrors the Files API PR #1702 pattern: dispatch on workspaces.instance_id in deliverToContainer (and Uninstall). When set, push the staged plugin tarball to the EC2 over the existing withEICTunnel primitive (template_files_eic.go) and unpack into the runtime's bind-mounted config dir (/configs for claude-code, /home/ubuntu/.hermes for hermes — see workspaceFilePathPrefix). chown 1000:1000 to match the docker path's agent-uid contract; restart via the existing dispatcher. Direct host write rather than docker-cp via SSH because the runtime's config dir is already bind-mounted into the workspace container — the runtime sees the files on next start with no additional plumbing. Adds InstanceIDLookup (parallel to RuntimeLookup) so unit tests don't need a DB; production wires it in router.go like templates.go does. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/architecture/backends.md | 4 +- workspace-server/internal/handlers/plugins.go | 28 +- .../internal/handlers/plugins_install.go | 80 ++- .../internal/handlers/plugins_install_eic.go | 249 +++++++++ .../handlers/plugins_install_eic_test.go | 505 ++++++++++++++++++ .../handlers/plugins_install_pipeline.go | 76 ++- workspace-server/internal/router/router.go | 27 +- 7 files changed, 943 insertions(+), 26 deletions(-) create mode 100644 workspace-server/internal/handlers/plugins_install_eic.go create mode 100644 workspace-server/internal/handlers/plugins_install_eic_test.go diff --git a/docs/architecture/backends.md b/docs/architecture/backends.md index 0d5ba827..d8042e4f 100644 --- a/docs/architecture/backends.md +++ b/docs/architecture/backends.md @@ -2,7 +2,7 @@ **Status:** living document — update when you ship a feature that touches one backend. **Owner:** workspace-server + controlplane teams. -**Last audit:** 2026-05-05 (Claude agent — `provisionWorkspaceAuto` / `StopWorkspaceAuto` / `HasProvisioner` SoT pattern landed in PRs #2811 + #2824). +**Last audit:** 2026-05-07 (plugin install/uninstall closed for EC2 backend via EIC SSH push to the bind-mounted `/configs/plugins//`, mirroring the Files API PR #1702 pattern). 
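A minimal sketch of the two-way dispatch that the parity rows below describe; every backend-aware handler switches on `instance_id` the same way. Signatures are simplified and error handling is elided. `findRunningContainer`, `copyPluginToContainer`, `lookupSaaSDispatch`, `installPluginViaEIC` and `stageResult` are the names used in the plugins diff further down; everything else here is illustrative, not the literal handler code:

    // Illustrative only: the backend-dispatch shape behind the Plugins parity row.
    func (h *PluginsHandler) deliver(ctx context.Context, workspaceID string, r *stageResult) error {
        if name := h.findRunningContainer(ctx, workspaceID); name != "" {
            // Local-Docker backend: tar + CopyToContainer, then chown for the agent user.
            return h.copyPluginToContainer(ctx, name, r.StagedDir, r.PluginName)
        }
        if instanceID, runtime := h.lookupSaaSDispatch(workspaceID); instanceID != "" {
            // SaaS backend: push the staged tarball to the EC2 over the EIC SSH tunnel.
            return installPluginViaEIC(ctx, instanceID, runtime, r.PluginName, r.StagedDir)
        }
        // Neither backend wired: the handler maps this to 503.
        return errors.New("workspace container not running")
    }
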
## Why this exists @@ -54,7 +54,7 @@ For "do we have any backend?", use `HasProvisioner()`, never bare `h.provisioner | **Files API** | | | | | | List / Read / Write / Replace / Delete | `container_files.go`, `template_import.go` | `docker exec` + tar `CopyToContainer` | SSH via EIC tunnel (PR #1702) | ✅ parity as of 2026-04-22 (previously docker-only) | | **Plugins** | | | | | -| Install / uninstall / list | `plugins_install.go` | `deliverToContainer()` + volume rm | **gap — no live plugin delivery** | 🔴 **docker-only** | +| Install / uninstall / list | `plugins_install.go` + `plugins_install_eic.go` | `deliverToContainer()` → exec+`CopyToContainer` on local container | `instance_id` set → EIC SSH push of the staged tarball into the EC2's bind-mounted `/configs/plugins//` (per `workspaceFilePathPrefix`), `chown 1000:1000`, restart | ✅ parity | | **Terminal (WebSocket)** | | | | | | Dispatch | `terminal.go:90-105` | `instance_id=""` → `handleLocalConnect` → `docker attach` | `instance_id` set → `handleRemoteConnect` → EIC SSH + `docker exec` | ✅ parity (different implementations, same UX) | | **A2A proxy** | | | | | diff --git a/workspace-server/internal/handlers/plugins.go b/workspace-server/internal/handlers/plugins.go index 4befcfe3..317c4ce4 100644 --- a/workspace-server/internal/handlers/plugins.go +++ b/workspace-server/internal/handlers/plugins.go @@ -23,6 +23,16 @@ import ( // workspace-scoped filtering (handler falls back to unfiltered list). type RuntimeLookup func(workspaceID string) (string, error) +// InstanceIDLookup resolves a workspace's EC2 instance_id by ID. Empty +// string means the workspace is not on the SaaS (EC2-per-workspace) +// backend — i.e. either local-Docker or pre-provision. The handler uses +// this to dispatch plugin install/uninstall to the EIC SSH path +// (template_files_eic.go primitive) when a workspace runs on its own EC2 +// and there's no local Docker container to exec into. A nil lookup keeps +// the handler on the local-Docker code path only — same shape as the +// pre-fix behaviour. +type InstanceIDLookup func(workspaceID string) (string, error) + // pluginSources is the contract PluginsHandler uses to talk to the // plugin source registry. Extracted as an interface (#1814) so tests can // substitute a stub without standing up the real *plugins.Registry + @@ -46,10 +56,11 @@ var _ pluginSources = (*plugins.Registry)(nil) // PluginsHandler manages the plugin registry and per-workspace plugin installation. type PluginsHandler struct { - pluginsDir string // host path to plugins/ registry - docker *client.Client // Docker client for container operations - restartFunc func(string) // auto-restart workspace after install/uninstall - runtimeLookup RuntimeLookup // workspace_id → runtime (optional) + pluginsDir string // host path to plugins/ registry + docker *client.Client // Docker client for container operations + restartFunc func(string) // auto-restart workspace after install/uninstall + runtimeLookup RuntimeLookup // workspace_id → runtime (optional) + instanceIDLookup InstanceIDLookup // workspace_id → EC2 instance_id (optional) // sources narrowed from `*plugins.Registry` to the pluginSources // interface (#1814) so tests can substitute a stub. Production // callers still pass *plugins.Registry, which satisfies the @@ -90,6 +101,15 @@ func (h *PluginsHandler) WithRuntimeLookup(lookup RuntimeLookup) *PluginsHandler return h } +// WithInstanceIDLookup installs a workspace → EC2 instance_id resolver. 
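+// An empty instance_id from the lookup simply means "not a SaaS
+// workspace" (local Docker or pre-provision).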
+// Wired by the router so production hits a real DB; tests stub it. The +// install/uninstall pipeline uses this to dispatch to the EIC SSH path +// for SaaS workspaces (no local Docker container to exec into). +func (h *PluginsHandler) WithInstanceIDLookup(lookup InstanceIDLookup) *PluginsHandler { + h.instanceIDLookup = lookup + return h +} + // pluginInfo is the API response for a plugin. type pluginInfo struct { Name string `json:"name"` diff --git a/workspace-server/internal/handlers/plugins_install.go b/workspace-server/internal/handlers/plugins_install.go index 1665a54e..b0031bfb 100644 --- a/workspace-server/internal/handlers/plugins_install.go +++ b/workspace-server/internal/handlers/plugins_install.go @@ -100,6 +100,13 @@ func (h *PluginsHandler) Install(c *gin.Context) { } // Uninstall handles DELETE /workspaces/:id/plugins/:name — removes a plugin. +// +// Dispatch order mirrors Install's deliverToContainer: +// +// 1. Local Docker container up → exec rm -rf via existing helpers. +// 2. SaaS workspace (instance_id set) → ssh sudo rm -rf via EIC. +// 3. external runtime → 422 (caller manages its own plugin dir). +// 4. Neither → 503. func (h *PluginsHandler) Uninstall(c *gin.Context) { workspaceID := c.Param("id") pluginName := c.Param("name") @@ -120,12 +127,24 @@ func (h *PluginsHandler) Uninstall(c *gin.Context) { return } - containerName := h.findRunningContainer(ctx, workspaceID) - if containerName == "" { - c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"}) + if containerName := h.findRunningContainer(ctx, workspaceID); containerName != "" { + h.uninstallViaDocker(ctx, c, workspaceID, pluginName, containerName) return } + if instanceID, runtime := h.lookupSaaSDispatch(workspaceID); instanceID != "" { + h.uninstallViaEIC(ctx, c, workspaceID, pluginName, instanceID, runtime) + return + } + + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"}) +} + +// uninstallViaDocker holds the historical Docker-exec uninstall flow. +// Extracted out of Uninstall so the new SaaS dispatch reads cleanly and +// the two backend bodies are visibly symmetric (same steps, different +// transport). +func (h *PluginsHandler) uninstallViaDocker(ctx context.Context, c *gin.Context, workspaceID, pluginName, containerName string) { // Read the plugin's manifest BEFORE deletion to learn which skill dirs // it owns, so we can clean them out of /configs/skills/ and avoid the // auto-restart re-mounting them. Issue #106. @@ -177,6 +196,61 @@ func (h *PluginsHandler) Uninstall(c *gin.Context) { }) } +// uninstallViaEIC removes a plugin from a SaaS workspace EC2 over SSH. +// Symmetric with uninstallViaDocker: +// +// - Read manifest (best-effort, missing plugin.yaml = no skills to clean). +// - Skip CLAUDE.md awk-strip for now: that file lives at +// /CLAUDE.md on the host and the same awk script +// would work over ssh, but the file is rewritten on workspace restart +// by the runtime adapter anyway, so the marker either stays harmless +// or gets dropped on the next install/restart cycle. Tracked as +// follow-up; not a regression vs the docker path's semantics here. +// - rm -rf the plugin dir. +// - Trigger restart. +// +// We intentionally don't try to remove /configs/skills/ entries +// over ssh because the same /configs is bind-mounted into the runtime +// container; the agent's own start-up adapter rewrites that tree from +// the live plugin set, so a stale skill dir for an uninstalled plugin +// is cleaned up at restart. 
The docker path removes them eagerly only +// because docker-exec is cheap. We can mirror that later if a real bug +// surfaces, but adding two extra ssh round-trips per uninstall today +// would be churn for no behavioural win. +func (h *PluginsHandler) uninstallViaEIC(ctx context.Context, c *gin.Context, workspaceID, pluginName, instanceID, runtime string) { + // Read manifest first (best-effort) — we don't currently use the + // skills list on the SaaS path (see comment above), but reading it + // keeps the parsing path warm and lets log lines distinguish "we + // deleted a real plugin" from "user asked us to delete something + // that wasn't there." Errors here are swallowed: missing manifest + // must not block uninstall. + if data, err := readPluginManifestViaEIC(ctx, instanceID, runtime, pluginName); err == nil && len(data) > 0 { + info := parseManifestYAML(pluginName, data) + if len(info.Skills) > 0 { + log.Printf("Plugin uninstall: %s declared skills=%v (left to runtime restart to clean)", pluginName, info.Skills) + } + } + + if err := uninstallPluginViaEIC(ctx, instanceID, runtime, pluginName); err != nil { + log.Printf("Plugin uninstall: EIC rm failed for %s on %s: %v", pluginName, workspaceID, err) + c.JSON(http.StatusBadGateway, gin.H{"error": "failed to remove plugin from workspace EC2"}) + return + } + + if h.restartFunc != nil { + go func() { + time.Sleep(2 * time.Second) + h.restartFunc(workspaceID) + }() + } + + log.Printf("Plugin uninstall: %s from workspace %s (restarting via SaaS path)", pluginName, workspaceID) + c.JSON(http.StatusOK, gin.H{ + "status": "uninstalled", + "plugin": pluginName, + }) +} + // Download handles GET /workspaces/:id/plugins/:name/download?source= // // Phase 30.3 — stream the named plugin as a gzipped tarball so remote diff --git a/workspace-server/internal/handlers/plugins_install_eic.go b/workspace-server/internal/handlers/plugins_install_eic.go new file mode 100644 index 00000000..03a9ac68 --- /dev/null +++ b/workspace-server/internal/handlers/plugins_install_eic.go @@ -0,0 +1,249 @@ +package handlers + +// plugins_install_eic.go — SaaS (EC2-per-workspace) plugin install + uninstall +// over the EIC SSH primitive that template_files_eic.go already plumbs. Pairs +// with the local-Docker path in plugins_install.go / plugins_install_pipeline.go, +// closing the 🔴 docker-only row in docs/architecture/backends.md. +// +// Architecture note: every operation goes through `withEICTunnel` (ephemeral +// keypair → AWS push → tunnel → ssh). This file owns the plugin-shaped +// remote commands; the tunnel mechanics live in template_files_eic.go so a +// fix to the dance lands in one place. +// +// Why direct host write (not docker cp via SSH): on the workspace EC2, the +// runtime's managed-config dir (/configs for claude-code, /home/ubuntu/.hermes +// for hermes — see workspaceFilePathPrefix) is bind-mounted into the +// runtime's container by cloud-init. Writing into /plugins// +// on the host is exactly what the runtime sees on the next start. No +// docker-cp needed, and we avoid coupling to any specific container layout +// inside the workspace EC2. + +import ( + "archive/tar" + "bytes" + "compress/gzip" + "context" + "fmt" + "log" + "os" + "os/exec" + "path/filepath" + "strings" + "time" +) + +// eicPluginOpTimeout bounds the whole EIC-tunnel + ssh + tar-pipe dance +// for a plugin install or uninstall. 
Larger than eicFileOpTimeout (30s) +// because plugin trees can carry skill markdown, MCP server binaries, +// and config files — easily a few MB through ssh + sudo on a fresh +// tunnel. 2 min gives headroom on a cold tunnel; the install pipeline's +// PLUGIN_INSTALL_FETCH_TIMEOUT (5 min default) still bounds the outer +// request. +const eicPluginOpTimeout = 2 * time.Minute + +// hostPluginPath returns the absolute directory on the workspace EC2 +// where /configs/plugins// lives for a given runtime. Keeps the +// per-runtime indirection in one place (mirrors resolveWorkspaceRootPath +// in template_files_eic.go) so future runtimes only edit +// workspaceFilePathPrefix. +// +// The plugin name is shellQuote-wrapped at the call site, not here, +// because a couple of callers want the unquoted form for log lines. +func hostPluginPath(runtime, pluginName string) string { + base := resolveWorkspaceRootPath(runtime, "/configs") + return filepath.Join(base, "plugins", pluginName) +} + +// buildPluginInstallShell returns the remote command for receiving a tar.gz +// stream on stdin and unpacking it into /, owned by the agent +// user (uid 1000 — matches the local-Docker path's chown 1000:1000). +// +// The script is a single `sudo sh -c '...'` so the tar-receive + chown run +// under one privileged invocation; ssh-as-ubuntu has passwordless sudo on +// the standard tenant AMI. +// +// - rm -rf clears any prior install of the same plugin (idempotent +// reinstall — the user re-clicked Install or version-bumped the source). +// - mkdir -p makes the parent dir (host /configs is root-owned + always +// present; the per-plugin dir is what we're creating). +// - tar -xzf - reads stdin (the gzipped tar). --no-same-owner keeps the +// archive's tar-recorded uid/gid out of the picture; the chown -R +// after is the canonical owner. +// - chown -R 1000:1000 matches the local-Docker handler's exec at +// plugins_install_pipeline.go:273 — agent user inside the runtime +// container is uid 1000 on every workspace-template image we ship. +// +// shellQuote on the path is defence-in-depth: the path is composed from +// a runtime allowlist (workspaceFilePathPrefix) + validated plugin name, +// so traversal is already blocked. +func buildPluginInstallShell(hostPluginDir string) string { + q := shellQuote(hostPluginDir) + return fmt.Sprintf( + "sudo -n sh -c 'rm -rf %s && mkdir -p %s && tar -xzf - --no-same-owner -C %s && chown -R 1000:1000 %s'", + q, q, q, q, + ) +} + +// buildPluginUninstallShell returns the remote command for `sudo -n rm -rf +// `. -rf (vs -f) is intentional here, unlike buildRmShell: +// uninstall really does need to remove the plugin's whole subtree. +func buildPluginUninstallShell(hostPluginDir string) string { + return fmt.Sprintf("sudo -n rm -rf %s", shellQuote(hostPluginDir)) +} + +// buildPluginManifestReadShell returns the remote command for reading the +// plugin's manifest (plugin.yaml). Mirrors buildCatShell — swallows the +// missing-file stderr so the missing-manifest case lands as empty stdout +// + non-zero exit, which uninstall translates to "no skills to clean". +func buildPluginManifestReadShell(hostPluginDir string) string { + return fmt.Sprintf("sudo -n cat %s/plugin.yaml 2>/dev/null", shellQuote(hostPluginDir)) +} + +// installPluginViaEIC pushes a staged plugin directory to a SaaS workspace +// EC2 via the EIC SSH tunnel. On success the plugin lives at +// /plugins// on the host, owned by 1000:1000, +// ready for the next workspace restart to pick up. 
+// +// The caller (deliverToContainer SaaS branch) owns: +// - the staged dir (created + cleaned up by resolveAndStage) +// - the workspace restart trigger after install +// +// Errors here are wrapped with the instance + runtime so triage can tell +// "tunnel failed" from "tar payload corrupt" without grep-ing the EC2's +// auth.log. +var installPluginViaEIC = realInstallPluginViaEIC + +func realInstallPluginViaEIC(ctx context.Context, instanceID, runtime, pluginName, stagedDir string) error { + if instanceID == "" { + return fmt.Errorf("installPluginViaEIC: empty instance_id") + } + if err := validatePluginName(pluginName); err != nil { + return fmt.Errorf("installPluginViaEIC: %w", err) + } + + // Build the tar.gz payload up-front so a tar-walk failure is surfaced + // before we open the EIC tunnel — saves a 1-2s tunnel setup on every + // "broken plugin tree" case. + var payload bytes.Buffer + gz := gzip.NewWriter(&payload) + tw := tar.NewWriter(gz) + if err := streamDirAsTar(stagedDir, tw); err != nil { + return fmt.Errorf("installPluginViaEIC: tar pack: %w", err) + } + if err := tw.Close(); err != nil { + return fmt.Errorf("installPluginViaEIC: tar close: %w", err) + } + if err := gz.Close(); err != nil { + return fmt.Errorf("installPluginViaEIC: gzip close: %w", err) + } + + hostDir := hostPluginPath(runtime, pluginName) + cmd := buildPluginInstallShell(hostDir) + + ctx, cancel := context.WithTimeout(ctx, eicPluginOpTimeout) + defer cancel() + + return withEICTunnel(ctx, instanceID, func(s eicSSHSession) error { + sshCmd := exec.CommandContext(ctx, "ssh", s.sshArgs(cmd)...) + sshCmd.Env = os.Environ() + sshCmd.Stdin = bytes.NewReader(payload.Bytes()) + var stderr bytes.Buffer + sshCmd.Stderr = &stderr + if err := sshCmd.Run(); err != nil { + return fmt.Errorf( + "ssh install: %w (instance=%s runtime=%s plugin=%s payload=%dB stderr=%s)", + err, instanceID, runtime, pluginName, payload.Len(), + strings.TrimSpace(stderr.String()), + ) + } + log.Printf( + "installPluginViaEIC: ws instance=%s runtime=%s plugin=%s payload=%dB → %s", + instanceID, runtime, pluginName, payload.Len(), hostDir, + ) + return nil + }) +} + +// uninstallPluginViaEIC removes the plugin's directory from the workspace +// EC2 via SSH. Symmetric with installPluginViaEIC but no payload — the +// remote command is a single `rm -rf`. +// +// Best-effort by design: the local-Docker path also doesn't fail +// uninstall on a missing directory (the pre-existing exec returns 0 when +// the dir is absent), so we mirror that here. Real ssh-layer failures +// (tunnel down, sudo denied) still propagate. +var uninstallPluginViaEIC = realUninstallPluginViaEIC + +func realUninstallPluginViaEIC(ctx context.Context, instanceID, runtime, pluginName string) error { + if instanceID == "" { + return fmt.Errorf("uninstallPluginViaEIC: empty instance_id") + } + if err := validatePluginName(pluginName); err != nil { + return fmt.Errorf("uninstallPluginViaEIC: %w", err) + } + + hostDir := hostPluginPath(runtime, pluginName) + cmd := buildPluginUninstallShell(hostDir) + + ctx, cancel := context.WithTimeout(ctx, eicPluginOpTimeout) + defer cancel() + + return withEICTunnel(ctx, instanceID, func(s eicSSHSession) error { + sshCmd := exec.CommandContext(ctx, "ssh", s.sshArgs(cmd)...) 
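+	// Unlike the install path there is no stdin payload to stream here;
+	// the remote command is a single rm -rf, so only stderr is captured
+	// to enrich the error message below.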
+ sshCmd.Env = os.Environ() + var stderr bytes.Buffer + sshCmd.Stderr = &stderr + if err := sshCmd.Run(); err != nil { + return fmt.Errorf( + "ssh rm: %w (instance=%s runtime=%s plugin=%s stderr=%s)", + err, instanceID, runtime, pluginName, + strings.TrimSpace(stderr.String()), + ) + } + log.Printf( + "uninstallPluginViaEIC: ws instance=%s runtime=%s plugin=%s → removed %s", + instanceID, runtime, pluginName, hostDir, + ) + return nil + }) +} + +// readPluginManifestViaEIC reads the plugin's plugin.yaml from the +// workspace EC2 so uninstall can learn the skills list to clean up. +// Returns ("", nil) when the manifest doesn't exist (best-effort: the +// local-Docker path treats a missing manifest as "no skills to remove", +// not a failure). +var readPluginManifestViaEIC = realReadPluginManifestViaEIC + +func realReadPluginManifestViaEIC(ctx context.Context, instanceID, runtime, pluginName string) ([]byte, error) { + if instanceID == "" { + return nil, fmt.Errorf("readPluginManifestViaEIC: empty instance_id") + } + if err := validatePluginName(pluginName); err != nil { + return nil, fmt.Errorf("readPluginManifestViaEIC: %w", err) + } + + hostDir := hostPluginPath(runtime, pluginName) + cmd := buildPluginManifestReadShell(hostDir) + + ctx, cancel := context.WithTimeout(ctx, eicPluginOpTimeout) + defer cancel() + + var out []byte + runErr := withEICTunnel(ctx, instanceID, func(s eicSSHSession) error { + sshCmd := exec.CommandContext(ctx, "ssh", s.sshArgs(cmd)...) + sshCmd.Env = os.Environ() + var stdout, stderr bytes.Buffer + sshCmd.Stdout = &stdout + sshCmd.Stderr = &stderr + // Don't fail on non-zero exit: missing-manifest case returns 1 + // from cat with empty stdout, which is the "no skills" signal. + _ = sshCmd.Run() + out = stdout.Bytes() + return nil + }) + if runErr != nil { + return nil, runErr + } + return out, nil +} diff --git a/workspace-server/internal/handlers/plugins_install_eic_test.go b/workspace-server/internal/handlers/plugins_install_eic_test.go new file mode 100644 index 00000000..2150728b --- /dev/null +++ b/workspace-server/internal/handlers/plugins_install_eic_test.go @@ -0,0 +1,505 @@ +package handlers + +import ( + "archive/tar" + "bytes" + "compress/gzip" + "context" + "errors" + "io" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/gin-gonic/gin" +) + +// expectAllowlistAllowAll programs the package-shared withMockDB sqlmock +// so the org-allowlist gate (org_plugin_allowlist.go) returns "allow-all" +// for the duration of one Install call. The gate fires three queries — +// resolveOrgID, allowlist EXISTS, allowlist COUNT — and we satisfy each +// with the empty/zero shape that means "no allowlist configured." +// +// Without this, tests that exercise the full Install flow panic on a +// nil DB. The handlers package already ships withMockDB in +// tokens_sqlmock_test.go; we just layer the allowlist-specific +// expectations on top. +func expectAllowlistAllowAll(mock sqlmock.Sqlmock) { + mock.MatchExpectationsInOrder(false) + mock.ExpectQuery(`SELECT parent_id FROM workspaces WHERE id`). + WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(nil)) + mock.ExpectQuery(`SELECT EXISTS`). + WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(false)) + mock.ExpectQuery(`SELECT COUNT\(\*\) FROM org_plugin_allowlist`). 
+ WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) +} + +// stagePluginRegistry creates a single-plugin registry under dir so the +// install handler's local resolver can find it. Returns the path to the +// plugin dir for any caller that wants to assert tar contents. +// +// Centralised so a future tweak to the registry shape (e.g. plugin.yaml +// schema bump) only updates one place. Tests use the source spec +// `local://` which the local resolver maps to //. +func stagePluginRegistry(t *testing.T, dir, name string) string { + t.Helper() + pluginDir := filepath.Join(dir, name) + if err := os.Mkdir(pluginDir, 0755); err != nil { + t.Fatalf("mkdir plugin dir: %v", err) + } + manifest := "name: " + name + "\nversion: \"1.0.0\"\ndescription: SaaS dispatch test plugin\n" + if err := os.WriteFile(filepath.Join(pluginDir, "plugin.yaml"), []byte(manifest), 0644); err != nil { + t.Fatalf("write plugin.yaml: %v", err) + } + if err := os.WriteFile(filepath.Join(pluginDir, "rule.md"), []byte("# rule\n"), 0644); err != nil { + t.Fatalf("write rule.md: %v", err) + } + return pluginDir +} + +// stubInstallPluginViaEIC swaps the package-level installPluginViaEIC for +// the duration of the test; restored by t.Cleanup. Mirrors the existing +// withEICTunnel stub pattern (template_files_eic_dispatch_test.go). +func stubInstallPluginViaEIC(t *testing.T, fn func(ctx context.Context, instanceID, runtime, pluginName, stagedDir string) error) { + t.Helper() + prev := installPluginViaEIC + installPluginViaEIC = fn + t.Cleanup(func() { installPluginViaEIC = prev }) +} + +func stubUninstallPluginViaEIC(t *testing.T, fn func(ctx context.Context, instanceID, runtime, pluginName string) error) { + t.Helper() + prev := uninstallPluginViaEIC + uninstallPluginViaEIC = fn + t.Cleanup(func() { uninstallPluginViaEIC = prev }) +} + +func stubReadPluginManifestViaEIC(t *testing.T, fn func(ctx context.Context, instanceID, runtime, pluginName string) ([]byte, error)) { + t.Helper() + prev := readPluginManifestViaEIC + readPluginManifestViaEIC = fn + t.Cleanup(func() { readPluginManifestViaEIC = prev }) +} + +// ---------- pure-function shell shape ---------- + +func TestBuildPluginInstallShell_QuotesPath(t *testing.T) { + got := buildPluginInstallShell("/configs/plugins/my-plugin") + want := "sudo -n sh -c 'rm -rf '/configs/plugins/my-plugin' && mkdir -p '/configs/plugins/my-plugin' && tar -xzf - --no-same-owner -C '/configs/plugins/my-plugin' && chown -R 1000:1000 '/configs/plugins/my-plugin''" + if got != want { + t.Errorf("buildPluginInstallShell mismatch:\n got %q\nwant %q", got, want) + } +} + +func TestBuildPluginUninstallShell_QuotesPath(t *testing.T) { + got := buildPluginUninstallShell("/configs/plugins/my-plugin") + want := "sudo -n rm -rf '/configs/plugins/my-plugin'" + if got != want { + t.Errorf("buildPluginUninstallShell mismatch:\n got %q\nwant %q", got, want) + } +} + +func TestBuildPluginManifestReadShell_QuotesPath(t *testing.T) { + got := buildPluginManifestReadShell("/configs/plugins/my-plugin") + want := "sudo -n cat '/configs/plugins/my-plugin'/plugin.yaml 2>/dev/null" + if got != want { + t.Errorf("buildPluginManifestReadShell mismatch:\n got %q\nwant %q", got, want) + } +} + +func TestHostPluginPath_PerRuntime(t *testing.T) { + cases := []struct { + runtime string + plugin string + want string + }{ + {"claude-code", "browser-automation", "/configs/plugins/browser-automation"}, + {"hermes", "browser-automation", "/home/ubuntu/.hermes/plugins/browser-automation"}, + {"langgraph", 
"browser-automation", "/opt/configs/plugins/browser-automation"}, + // Unknown / empty runtime falls back to /configs (containerized + // user-data layout) so a future runtime added to workspaces table + // without a workspaceFilePathPrefix entry doesn't blow up the + // install path silently. + {"", "browser-automation", "/configs/plugins/browser-automation"}, + {"some-future-runtime", "x", "/configs/plugins/x"}, + } + for _, c := range cases { + t.Run(c.runtime+"/"+c.plugin, func(t *testing.T) { + got := hostPluginPath(c.runtime, c.plugin) + if got != c.want { + t.Errorf("hostPluginPath(%q, %q) = %q, want %q", c.runtime, c.plugin, got, c.want) + } + }) + } +} + +// ---------- dispatch: install ---------- + +// TestPluginInstall_SaaS_DispatchesToEIC — the most-load-bearing test in +// this file. With h.docker == nil and instanceIDLookup returning a real +// instance_id, Install MUST push the staged plugin to the EC2 over EIC +// (not 503). Asserts the EIC stub is called with the right (instance, +// runtime, plugin) tuple AND that the staged dir has the manifest + +// rule files we put there — proves the staging side wasn't bypassed. +func TestPluginInstall_SaaS_DispatchesToEIC(t *testing.T) { + registry := t.TempDir() + stagePluginRegistry(t, registry, "browser-automation") + + type capture struct { + called bool + instanceID string + runtime string + pluginName string + stagedFiles []string + } + var got capture + + stubInstallPluginViaEIC(t, func(ctx context.Context, instanceID, runtime, pluginName, stagedDir string) error { + got.called = true + got.instanceID = instanceID + got.runtime = runtime + got.pluginName = pluginName + entries, err := os.ReadDir(stagedDir) + if err != nil { + t.Fatalf("read staged dir: %v", err) + } + for _, e := range entries { + got.stagedFiles = append(got.stagedFiles, e.Name()) + } + return nil + }) + + mock, cleanup := withMockDB(t) + defer cleanup() + expectAllowlistAllowAll(mock) + + h := NewPluginsHandler(registry, nil, nil). + WithRuntimeLookup(func(string) (string, error) { return "claude-code", nil }). + WithInstanceIDLookup(func(string) (string, error) { return "i-0e0951a3cfd9bbf75", nil }) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "c7244ed9-f623-4cba-8873-020e5c9fe104"}} + c.Request = httptest.NewRequest( + "POST", + "/workspaces/c7244ed9-f623-4cba-8873-020e5c9fe104/plugins", + bytes.NewBufferString(`{"source":"local://browser-automation"}`), + ) + c.Request.Header.Set("Content-Type", "application/json") + + h.Install(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + if !got.called { + t.Fatalf("installPluginViaEIC was not called") + } + if got.instanceID != "i-0e0951a3cfd9bbf75" { + t.Errorf("instanceID = %q, want i-0e0951a3cfd9bbf75", got.instanceID) + } + if got.runtime != "claude-code" { + t.Errorf("runtime = %q, want claude-code", got.runtime) + } + if got.pluginName != "browser-automation" { + t.Errorf("pluginName = %q, want browser-automation", got.pluginName) + } + // Staged dir must carry the resolver's actual fetch — manifest + rule. + // Anything missing here means the stage step was bypassed. 
+ hasManifest, hasRule := false, false + for _, f := range got.stagedFiles { + if f == "plugin.yaml" { + hasManifest = true + } + if f == "rule.md" { + hasRule = true + } + } + if !hasManifest || !hasRule { + t.Errorf("staged dir missing files: %v (want plugin.yaml + rule.md)", got.stagedFiles) + } +} + +// TestPluginInstall_SaaS_PropagatesEICError — when the EIC push fails +// (tunnel down, sudo denied), Install MUST surface 502 rather than swallow +// the error and report 200. 502 is the right status for "we tried, the +// remote side wasn't there" — distinct from 503 ("nothing wired") and +// 500 ("our bug"). The body deliberately doesn't echo the underlying +// error string (would leak ssh stderr / instance metadata). +func TestPluginInstall_SaaS_PropagatesEICError(t *testing.T) { + registry := t.TempDir() + stagePluginRegistry(t, registry, "browser-automation") + + mock, cleanup := withMockDB(t) + defer cleanup() + expectAllowlistAllowAll(mock) + + stubInstallPluginViaEIC(t, func(ctx context.Context, instanceID, runtime, pluginName, stagedDir string) error { + return errors.New("ssh: tunnel exited 255") + }) + + h := NewPluginsHandler(registry, nil, nil). + WithRuntimeLookup(func(string) (string, error) { return "claude-code", nil }). + WithInstanceIDLookup(func(string) (string, error) { return "i-aaaa", nil }) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-1"}} + c.Request = httptest.NewRequest( + "POST", + "/workspaces/ws-1/plugins", + bytes.NewBufferString(`{"source":"local://browser-automation"}`), + ) + c.Request.Header.Set("Content-Type", "application/json") + + h.Install(c) + + if w.Code != http.StatusBadGateway { + t.Errorf("expected 502 for EIC failure, got %d: %s", w.Code, w.Body.String()) + } + if strings.Contains(w.Body.String(), "tunnel exited") { + t.Errorf("response body must not echo raw EIC error: %s", w.Body.String()) + } +} + +// TestPluginInstall_NoBackends_Returns503 — lookup is wired but returns +// empty instance_id (e.g. workspace pre-provision, or local-Docker +// deploy without a running container). The handler MUST 503, not silently +// dispatch to EIC with an empty instance_id. +func TestPluginInstall_NoBackends_Returns503(t *testing.T) { + registry := t.TempDir() + stagePluginRegistry(t, registry, "browser-automation") + + mock, cleanup := withMockDB(t) + defer cleanup() + expectAllowlistAllowAll(mock) + + stubInstallPluginViaEIC(t, func(ctx context.Context, instanceID, runtime, pluginName, stagedDir string) error { + t.Errorf("EIC must not be called when instance_id is empty") + return nil + }) + + h := NewPluginsHandler(registry, nil, nil). + WithRuntimeLookup(func(string) (string, error) { return "claude-code", nil }). + WithInstanceIDLookup(func(string) (string, error) { return "", nil }) // empty + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-1"}} + c.Request = httptest.NewRequest( + "POST", + "/workspaces/ws-1/plugins", + bytes.NewBufferString(`{"source":"local://browser-automation"}`), + ) + c.Request.Header.Set("Content-Type", "application/json") + + h.Install(c) + + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503, got %d: %s", w.Code, w.Body.String()) + } +} + +// TestPluginInstall_InstanceLookupError_Returns503 — a DB hiccup on the +// instance_id lookup must NOT crash or 502; the handler logs and falls +// through to 503. 
Same fail-open shape h.runtimeLookup uses (see +// TestPluginInstall_NoRuntimeLookup_FailsOpen). Pinning this prevents a +// future "tighten error handling" refactor from quietly converting a DB +// blip into a five-minute outage on the install endpoint. +func TestPluginInstall_InstanceLookupError_Returns503(t *testing.T) { + registry := t.TempDir() + stagePluginRegistry(t, registry, "browser-automation") + + mock, cleanup := withMockDB(t) + defer cleanup() + expectAllowlistAllowAll(mock) + + h := NewPluginsHandler(registry, nil, nil). + WithRuntimeLookup(func(string) (string, error) { return "claude-code", nil }). + WithInstanceIDLookup(func(string) (string, error) { return "", errors.New("db: connection refused") }) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-1"}} + c.Request = httptest.NewRequest( + "POST", + "/workspaces/ws-1/plugins", + bytes.NewBufferString(`{"source":"local://browser-automation"}`), + ) + c.Request.Header.Set("Content-Type", "application/json") + + h.Install(c) + + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503 on instance-id lookup error, got %d: %s", w.Code, w.Body.String()) + } +} + +// ---------- dispatch: uninstall ---------- + +func TestPluginUninstall_SaaS_DispatchesToEIC(t *testing.T) { + stubReadPluginManifestViaEIC(t, func(ctx context.Context, instanceID, runtime, pluginName string) ([]byte, error) { + return []byte("name: browser-automation\nskills:\n - browse\n"), nil + }) + + type capture struct { + called bool + instanceID string + runtime string + pluginName string + } + var got capture + stubUninstallPluginViaEIC(t, func(ctx context.Context, instanceID, runtime, pluginName string) error { + got.called = true + got.instanceID = instanceID + got.runtime = runtime + got.pluginName = pluginName + return nil + }) + + h := NewPluginsHandler(t.TempDir(), nil, nil). + WithRuntimeLookup(func(string) (string, error) { return "claude-code", nil }). + WithInstanceIDLookup(func(string) (string, error) { return "i-bbbb", nil }) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{ + {Key: "id", Value: "ws-1"}, + {Key: "name", Value: "browser-automation"}, + } + c.Request = httptest.NewRequest("DELETE", "/workspaces/ws-1/plugins/browser-automation", nil) + + h.Uninstall(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + if !got.called { + t.Fatalf("uninstallPluginViaEIC was not called") + } + if got.instanceID != "i-bbbb" || got.runtime != "claude-code" || got.pluginName != "browser-automation" { + t.Errorf("dispatch args wrong: %+v", got) + } +} + +func TestPluginUninstall_SaaS_PropagatesEICError(t *testing.T) { + stubReadPluginManifestViaEIC(t, func(ctx context.Context, instanceID, runtime, pluginName string) ([]byte, error) { + return nil, nil + }) + stubUninstallPluginViaEIC(t, func(ctx context.Context, instanceID, runtime, pluginName string) error { + return errors.New("ssh: connection refused") + }) + + h := NewPluginsHandler(t.TempDir(), nil, nil). + WithRuntimeLookup(func(string) (string, error) { return "claude-code", nil }). 
+ WithInstanceIDLookup(func(string) (string, error) { return "i-cccc", nil }) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{ + {Key: "id", Value: "ws-1"}, + {Key: "name", Value: "browser-automation"}, + } + c.Request = httptest.NewRequest("DELETE", "/workspaces/ws-1/plugins/browser-automation", nil) + + h.Uninstall(c) + + if w.Code != http.StatusBadGateway { + t.Errorf("expected 502, got %d: %s", w.Code, w.Body.String()) + } +} + +func TestPluginUninstall_NoBackends_Returns503(t *testing.T) { + stubUninstallPluginViaEIC(t, func(ctx context.Context, instanceID, runtime, pluginName string) error { + t.Errorf("EIC uninstall must not be called with empty instance_id") + return nil + }) + + h := NewPluginsHandler(t.TempDir(), nil, nil). + WithRuntimeLookup(func(string) (string, error) { return "claude-code", nil }). + WithInstanceIDLookup(func(string) (string, error) { return "", nil }) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{ + {Key: "id", Value: "ws-1"}, + {Key: "name", Value: "browser-automation"}, + } + c.Request = httptest.NewRequest("DELETE", "/workspaces/ws-1/plugins/browser-automation", nil) + + h.Uninstall(c) + + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503, got %d: %s", w.Code, w.Body.String()) + } +} + +// ---------- tarball shape ---------- + +// TestRealInstallPluginViaEIC_TarPayloadShape — the production +// installPluginViaEIC packs the staged dir as gzipped tar. Stub +// withEICTunnel + run the real installPluginViaEIC body, capturing the +// ssh stdin via a fake exec.Command — except go's exec is hard to fake +// without hijacking $PATH. Instead we exercise the tar packer directly: +// streamDirAsTar's behaviour is what we actually depend on, and a +// regression in either streamDirAsTar OR the gzip wrapping will be +// visible here. +func TestRealInstallPluginViaEIC_TarPayloadShape(t *testing.T) { + staged := t.TempDir() + if err := os.WriteFile(filepath.Join(staged, "plugin.yaml"), []byte("name: x\n"), 0644); err != nil { + t.Fatal(err) + } + if err := os.MkdirAll(filepath.Join(staged, "skills", "browse"), 0755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(staged, "skills", "browse", "instructions.md"), []byte("step 1\n"), 0644); err != nil { + t.Fatal(err) + } + + var buf bytes.Buffer + gz := gzip.NewWriter(&buf) + tw := tar.NewWriter(gz) + if err := streamDirAsTar(staged, tw); err != nil { + t.Fatalf("streamDirAsTar: %v", err) + } + if err := tw.Close(); err != nil { + t.Fatalf("tw close: %v", err) + } + if err := gz.Close(); err != nil { + t.Fatalf("gz close: %v", err) + } + + // Round-trip: the same payload the production flow would pipe into + // `tar -xzf -` on the remote should unpack to plugin.yaml + + // skills/browse/instructions.md. + gr, err := gzip.NewReader(&buf) + if err != nil { + t.Fatalf("gzip reader: %v", err) + } + tr := tar.NewReader(gr) + seen := map[string]bool{} + for { + hdr, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + t.Fatalf("tar next: %v", err) + } + seen[hdr.Name] = true + } + for _, want := range []string{"plugin.yaml", "skills/browse/instructions.md"} { + // Tar entries on Linux normally use forward slashes regardless + // of host separator; double-check both forms so a Windows test + // runner doesn't go red on a path-sep difference. Production + // always runs on Linux (CI + tenant EC2). 
+ alt := filepath.FromSlash(want) + if !seen[want] && !seen[alt] { + t.Errorf("tar payload missing %q (saw %v)", want, seen) + } + } +} diff --git a/workspace-server/internal/handlers/plugins_install_pipeline.go b/workspace-server/internal/handlers/plugins_install_pipeline.go index 07fb428b..6c6fb217 100644 --- a/workspace-server/internal/handlers/plugins_install_pipeline.go +++ b/workspace-server/internal/handlers/plugins_install_pipeline.go @@ -261,22 +261,74 @@ func (h *PluginsHandler) resolveAndStage(ctx context.Context, req installRequest // deliverToContainer copies the staged plugin dir into the workspace // container, chowns it for the agent user, and triggers a restart. // Returns a typed *httpErr on failure; nil on success. +// +// Dispatch order: +// +// 1. Local Docker container is up → tar+CopyToContainer (historical path). +// 2. SaaS workspace (instance_id set) → push via EIC SSH to the EC2's +// bind-mounted /configs/plugins//. Closes the 🔴 docker-only +// row in docs/architecture/backends.md by routing through the same +// primitive Files API uses (template_files_eic.go). +// 3. Neither wired → 503. True "no backend" case (dev box without +// Docker AND without an instance_id row). +// +// The SaaS branch is gated on h.instanceIDLookup so unit tests can keep +// using NewPluginsHandler without a DB; production wires it in router.go. func (h *PluginsHandler) deliverToContainer(ctx context.Context, workspaceID string, r *stageResult) error { - containerName := h.findRunningContainer(ctx, workspaceID) - if containerName == "" { - return newHTTPErr(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"}) + if containerName := h.findRunningContainer(ctx, workspaceID); containerName != "" { + if err := h.copyPluginToContainer(ctx, containerName, r.StagedDir, r.PluginName); err != nil { + log.Printf("Plugin install: failed to copy %s to %s: %v", r.PluginName, workspaceID, err) + return newHTTPErr(http.StatusInternalServerError, gin.H{"error": "failed to copy plugin to container"}) + } + h.execAsRoot(ctx, containerName, []string{ + "chown", "-R", "1000:1000", "/configs/plugins/" + r.PluginName, + }) + if h.restartFunc != nil { + go h.restartFunc(workspaceID) + } + return nil } - if err := h.copyPluginToContainer(ctx, containerName, r.StagedDir, r.PluginName); err != nil { - log.Printf("Plugin install: failed to copy %s to %s: %v", r.PluginName, workspaceID, err) - return newHTTPErr(http.StatusInternalServerError, gin.H{"error": "failed to copy plugin to container"}) + + if instanceID, runtime := h.lookupSaaSDispatch(workspaceID); instanceID != "" { + if err := installPluginViaEIC(ctx, instanceID, runtime, r.PluginName, r.StagedDir); err != nil { + log.Printf("Plugin install: EIC push failed for %s → %s: %v", r.PluginName, workspaceID, err) + return newHTTPErr(http.StatusBadGateway, gin.H{ + "error": "failed to deliver plugin to workspace EC2", + }) + } + if h.restartFunc != nil { + go h.restartFunc(workspaceID) + } + return nil } - h.execAsRoot(ctx, containerName, []string{ - "chown", "-R", "1000:1000", "/configs/plugins/" + r.PluginName, - }) - if h.restartFunc != nil { - go h.restartFunc(workspaceID) + + return newHTTPErr(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"}) +} + +// lookupSaaSDispatch returns (instance_id, runtime) for SaaS dispatch, or +// ("", "") when the lookups aren't wired or the workspace isn't on the +// EC2 backend. 
Errors from the lookups are logged-and-swallowed: failing +// open here just means the caller falls through to the 503 path it would +// have returned without us, never to a wrong action against the wrong +// instance. +func (h *PluginsHandler) lookupSaaSDispatch(workspaceID string) (instanceID, runtime string) { + if h.instanceIDLookup == nil { + return "", "" } - return nil + id, err := h.instanceIDLookup(workspaceID) + if err != nil { + log.Printf("Plugin install: instance_id lookup failed for %s: %v", workspaceID, err) + return "", "" + } + if id == "" { + return "", "" + } + if h.runtimeLookup != nil { + if rt, rterr := h.runtimeLookup(workspaceID); rterr == nil { + runtime = rt + } + } + return id, runtime } // readPluginSkillsFromContainer reads /configs/plugins//plugin.yaml diff --git a/workspace-server/internal/router/router.go b/workspace-server/internal/router/router.go index e3b9171b..770b0a66 100644 --- a/workspace-server/internal/router/router.go +++ b/workspace-server/internal/router/router.go @@ -11,13 +11,13 @@ import ( "github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo" "github.com/Molecule-AI/molecule-monorepo/platform/internal/channels" "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" - "github.com/Molecule-AI/molecule-monorepo/platform/internal/messagestore" "github.com/Molecule-AI/molecule-monorepo/platform/internal/events" "github.com/Molecule-AI/molecule-monorepo/platform/internal/handlers" - "github.com/Molecule-AI/molecule-monorepo/platform/internal/pendinguploads" memwiring "github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/wiring" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/messagestore" "github.com/Molecule-AI/molecule-monorepo/platform/internal/metrics" "github.com/Molecule-AI/molecule-monorepo/platform/internal/middleware" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/pendinguploads" "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" "github.com/Molecule-AI/molecule-monorepo/platform/internal/supervised" "github.com/Molecule-AI/molecule-monorepo/platform/internal/ws" @@ -109,8 +109,8 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi now := time.Now() for name, last := range snap { out[name] = gin.H{ - "last_tick_at": last, - "seconds_ago": int(now.Sub(last).Seconds()), + "last_tick_at": last, + "seconds_ago": int(now.Sub(last).Seconds()), } } c.JSON(200, gin.H{"subsystems": out}) @@ -599,8 +599,25 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi ).Scan(&runtime) return runtime, err } + // Instance-id lookup powers the SaaS dispatch in install/uninstall: + // when a workspace is on the EC2-per-workspace backend (instance_id + // non-NULL) and there's no local Docker container to exec into, the + // pipeline pushes the staged plugin tarball to that EC2 over EIC SSH. + // Empty result means the workspace lives on the local-Docker backend + // (or hasn't been provisioned yet) and the handler falls back to its + // original Docker path. Same pattern templates.go and terminal.go use. + instanceIDLookup := func(workspaceID string) (string, error) { + var instanceID string + err := db.DB.QueryRowContext( + context.Background(), + `SELECT COALESCE(instance_id, '') FROM workspaces WHERE id = $1`, + workspaceID, + ).Scan(&instanceID) + return instanceID, err + } plgh := handlers.NewPluginsHandler(pluginsDir, dockerCli, wh.RestartByID). 
- WithRuntimeLookup(runtimeLookup) + WithRuntimeLookup(runtimeLookup). + WithInstanceIDLookup(instanceIDLookup) r.GET("/plugins", plgh.ListRegistry) r.GET("/plugins/sources", plgh.ListSources) wsAuth.GET("/plugins", plgh.ListInstalled) From b6646910514d37ee2c0cbc6afe173bdaa4120d9e Mon Sep 17 00:00:00 2001 From: "claude-ceo-assistant (Claude Opus 4.7 on Hongming's MacBook)" Date: Thu, 7 May 2026 16:01:11 -0700 Subject: [PATCH 2/2] fix(eic-tunnel-pool): capture poolJanitorInterval at pool construction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the chronic -race flake on TestPooledWithEICTunnel_PanicPoisonsEntry and the handlers package as a whole (CI / Platform (Go) was intermittent on staging, ~50% red on workspace-server-touching commits since 2026-04). The race: tests swap the package-level poolJanitorInterval via t.Cleanup (eic_tunnel_pool_test.go:61) AFTER an earlier test caused the global pool's janitor goroutine to start. The janitor loops on time.NewTicker(poolJanitorInterval) on every tick — so the cleanup write races the goroutine read for the rest of the process. Caught locally + on PR #84's CI run on Gitea. Fix: capture the interval as a field on eicTunnelPool at newEICTunnelPool(). The janitor now reads p.janitorInterval, which never changes after construction. Tests that override poolJanitorInterval before freshPool() still get the new value (they set the package var before construction). The global pool's janitor — created lazily once via sync.Once on first getEICTunnelPool() — is now immune to t.Cleanup-driven swaps from later tests. Surfaced while verifying #84 (SaaS plugin install via EIC SSH); folded into this PR per the "fix root not symptom" rule rather than merging around a chronic-red CI signal. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/eic_tunnel_pool.go | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/workspace-server/internal/handlers/eic_tunnel_pool.go b/workspace-server/internal/handlers/eic_tunnel_pool.go index 03bfd01f..20b2e269 100644 --- a/workspace-server/internal/handlers/eic_tunnel_pool.go +++ b/workspace-server/internal/handlers/eic_tunnel_pool.go @@ -108,6 +108,18 @@ type eicTunnelPool struct { // First acquirer takes the slot; later ones wait on the channel. pendingSetups map[string]chan struct{} stopJanitor chan struct{} + // janitorInterval is captured at pool construction from the + // package-level poolJanitorInterval var. Captured (not re-read on + // every tick) so a test that swaps the package var via t.Cleanup + // after a global pool's janitor is already running can't race + // with that goroutine's ticker read. The global pool is created + // lazily once per process via sync.Once; before this capture + // landed, every test that touched poolJanitorInterval after the + // global pool's first-touch raced the janitor (caught by -race + // on staging tip 249dbc6a — TestPooledWithEICTunnel_PanicPoisonsEntry). + // Tests still get the new value on a freshPool() because they + // set the package var BEFORE calling newEICTunnelPool(). + janitorInterval time.Duration } var ( @@ -127,11 +139,16 @@ func getEICTunnelPool() *eicTunnelPool { // newEICTunnelPool constructs an empty pool. Exported so tests can // build isolated pools without sharing the singleton. +// +// Captures poolJanitorInterval at construction time so the janitor +// goroutine doesn't race with t.Cleanup-driven swaps of the package +// var. 
See the janitorInterval field comment for the failure mode. func newEICTunnelPool() *eicTunnelPool { return &eicTunnelPool{ - entries: map[string]*pooledTunnel{}, - pendingSetups: map[string]chan struct{}{}, - stopJanitor: make(chan struct{}), + entries: map[string]*pooledTunnel{}, + pendingSetups: map[string]chan struct{}{}, + stopJanitor: make(chan struct{}), + janitorInterval: poolJanitorInterval, } } @@ -290,8 +307,11 @@ func (p *eicTunnelPool) evictLRUIfFullLocked(skipInstance string) { // janitor periodically scans for entries that are idle AND expired, // closing their tunnels. Runs forever (per pool lifetime); cancelled // by close(p.stopJanitor) for tests that build short-lived pools. +// +// Reads p.janitorInterval (captured at construction) instead of the +// package-level poolJanitorInterval — see janitorInterval field comment. func (p *eicTunnelPool) janitor() { - t := time.NewTicker(poolJanitorInterval) + t := time.NewTicker(p.janitorInterval) defer t.Stop() for { select {
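
A minimal sketch of the test-side pattern the capture preserves, assuming a test that builds its own pool the way the commit message's freshPool() helper does; the explicit go p.janitor() call is illustrative and stands in for whatever the real helper starts:

    func TestJanitorInterval_CapturedAtConstruction(t *testing.T) {
        // Override BEFORE constructing the pool: newEICTunnelPool snapshots
        // the package var into p.janitorInterval, so the t.Cleanup restore
        // below can no longer race the janitor goroutine's ticker read.
        prev := poolJanitorInterval
        poolJanitorInterval = 10 * time.Millisecond
        t.Cleanup(func() { poolJanitorInterval = prev })

        p := newEICTunnelPool()
        go p.janitor()
        t.Cleanup(func() { close(p.stopJanitor) })

        // ... exercise the pool; the janitor keeps ticking every 10ms even
        // after the package var is restored by a later cleanup.
    }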