Merge remote-tracking branch 'origin/main' into trig-189

This commit is contained in:
Molecule AI Core Platform Lead 2026-05-09 22:58:32 +00:00
commit 2a04233d5a
14 changed files with 903 additions and 41 deletions

View File

@ -0,0 +1,309 @@
// @vitest-environment jsdom
/**
* Tests for canvas keyboard shortcuts (useKeyboardShortcuts hook).
*
* Covers: Esc, Enter/Shift+Enter, Cmd+]/[, Z, and Arrow keys.
*
* The hook is tested by dispatching KeyboardEvents at the window and
* asserting the resulting store mutations / dispatched events.
*/
import React from "react";
import { render, cleanup, fireEvent } from "@testing-library/react";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { useKeyboardShortcuts } from "../useKeyboardShortcuts";
import { useCanvasStore } from "@/store/canvas";
// ─── Mock store ──────────────────────────────────────────────────────────────
// Shared savePosition spy so tests can assert persistence calls directly.
const mockSavePosition = vi.fn().mockResolvedValue(undefined);
// NOTE(review): vi.mock is hoisted above the imports, but the factory runs
// lazily when the mocked module is first required, so the forward reference to
// mockStoreState (declared below) normally resolves. If vitest ever throws
// "Cannot access 'mockStoreState' before initialization", move the state into
// vi.hoisted() — confirm against the vitest mocking documentation.
vi.mock("@/store/canvas", () => ({
// The store is consumed both as a selector hook and via getState(); the mock
// mirrors both call styles.
useCanvasStore: Object.assign(
vi.fn((sel) => sel(mockStoreState)),
{
getState: () => mockStoreState,
}
),
}));
// Module-level mutable state so tests can mutate between cases
const mockStoreState = {
selectedNodeId: null as string | null,
selectedNodeIds: new Set<string>(),
nodes: [] as Array<{ id: string; position: { x: number; y: number }; data: { parentId?: string | null } }>,
contextMenu: null as { x: number; y: number; nodeId: string } | null,
closeContextMenu: vi.fn(),
selectNode: vi.fn(),
clearSelection: vi.fn(),
bumpZOrder: vi.fn(),
savePosition: mockSavePosition,
moveNode: vi.fn(),
};
afterEach(() => {
  cleanup();
  // vi.clearAllMocks() already calls mockClear() on every vi.fn() — including
  // the store's action spies and mockSavePosition — so the per-mock
  // mockClear() calls this block previously made were redundant. (It does NOT
  // reset implementations, so mockSavePosition keeps its mockResolvedValue.)
  vi.clearAllMocks();
  // Reset to default empty state between tests
  mockStoreState.selectedNodeId = null;
  mockStoreState.selectedNodeIds = new Set();
  mockStoreState.nodes = [];
  mockStoreState.contextMenu = null;
});
// ─── Test wrapper ────────────────────────────────────────────────────────────
// Host component whose only job is to install the hook's keyboard listeners
// for the lifetime of the rendered tree.
function ShortcutTestComponent() {
  useKeyboardShortcuts();
  return <div data-testid="canvas-root" />;
}
// Mount the host component and hand back the RTL render utilities.
const renderWithProvider = () => render(<ShortcutTestComponent />);
// ─── Tests ───────────────────────────────────────────────────────────────────
describe("Esc — deselect / close context menu", () => {
  // Mount the hook, then press Escape at the window.
  const pressEscape = () => {
    renderWithProvider();
    fireEvent.keyDown(window, { key: "Escape" });
  };
  it("closes the context menu when one is open", () => {
    mockStoreState.contextMenu = { x: 100, y: 100, nodeId: "n1" };
    pressEscape();
    expect(mockStoreState.closeContextMenu).toHaveBeenCalledTimes(1);
  });
  it("clears the batch selection when no context menu is open", () => {
    mockStoreState.contextMenu = null;
    mockStoreState.selectedNodeIds = new Set(["n1", "n2"]);
    pressEscape();
    expect(mockStoreState.clearSelection).toHaveBeenCalledTimes(1);
  });
  it("deselects the focused node when no batch selection exists", () => {
    mockStoreState.contextMenu = null;
    mockStoreState.selectedNodeIds = new Set();
    mockStoreState.selectedNodeId = "n1";
    pressEscape();
    expect(mockStoreState.selectNode).toHaveBeenCalledWith(null);
  });
});
describe("Enter — hierarchy navigation", () => {
  beforeEach(() => {
    // Default fixture: n2 is a child of n1; n3 is an unrelated root.
    mockStoreState.selectedNodeId = "n1";
    mockStoreState.nodes = [
      { id: "n1", position: { x: 0, y: 0 }, data: { parentId: null } },
      { id: "n2", position: { x: 100, y: 0 }, data: { parentId: "n1" } },
      { id: "n3", position: { x: 200, y: 0 }, data: { parentId: null } },
    ];
  });
  // Mount the hook, then press Enter (optionally with Shift held).
  const pressEnter = (shiftKey = false) => {
    renderWithProvider();
    fireEvent.keyDown(window, { key: "Enter", shiftKey });
  };
  it("navigates to the first child on Enter", () => {
    pressEnter();
    expect(mockStoreState.selectNode).toHaveBeenCalledWith("n2");
  });
  it("navigates to the parent on Shift+Enter", () => {
    mockStoreState.nodes = [
      { id: "n1", position: { x: 0, y: 0 }, data: { parentId: null } },
      { id: "n2", position: { x: 100, y: 0 }, data: { parentId: "n1" } },
    ];
    mockStoreState.selectedNodeId = "n2";
    pressEnter(true);
    expect(mockStoreState.selectNode).toHaveBeenCalledWith("n1");
  });
  it("does NOT navigate when no node is selected", () => {
    mockStoreState.selectedNodeId = null;
    pressEnter();
    expect(mockStoreState.selectNode).not.toHaveBeenCalled();
  });
});
describe("Cmd+]/[ — z-order bump", () => {
  beforeEach(() => {
    mockStoreState.selectedNodeId = "n1";
  });
  // Mount the hook, then press `key` with the given modifier flags set.
  const pressWithModifier = (
    key: string,
    mods: { metaKey?: boolean; ctrlKey?: boolean },
  ) => {
    renderWithProvider();
    fireEvent.keyDown(window, { key, ...mods });
  };
  it("bumps z-order forward on Cmd+]", () => {
    pressWithModifier("]", { metaKey: true });
    expect(mockStoreState.bumpZOrder).toHaveBeenCalledWith("n1", 1);
  });
  it("bumps z-order backward on Cmd+[", () => {
    pressWithModifier("[", { metaKey: true });
    expect(mockStoreState.bumpZOrder).toHaveBeenCalledWith("n1", -1);
  });
  it("uses Ctrl as the modifier key", () => {
    pressWithModifier("]", { ctrlKey: true });
    expect(mockStoreState.bumpZOrder).toHaveBeenCalledWith("n1", 1);
  });
});
describe("Z — zoom-to-team", () => {
  let dispatchedEvents: CustomEvent[] = [];
  // Stable listener reference so afterEach can actually detach it.
  // BUG FIX: the previous afterEach passed a *fresh* arrow function to
  // removeEventListener, which never matches the listener added in
  // beforeEach — so a new listener leaked on every test. All leaked listeners
  // close over the same `dispatchedEvents` binding (reassigned each
  // beforeEach), so each dispatch was recorded once per leaked listener,
  // inflating counts in any later test that dispatches.
  const recordZoomEvent = (e: Event) => {
    dispatchedEvents.push(e as CustomEvent);
  };
  beforeEach(() => {
    dispatchedEvents = [];
    mockStoreState.selectedNodeId = "n1";
    mockStoreState.nodes = [
      { id: "n1", position: { x: 0, y: 0 }, data: { parentId: null } },
      { id: "n2", position: { x: 100, y: 0 }, data: { parentId: "n1" } },
    ];
    window.addEventListener("molecule:zoom-to-team", recordZoomEvent);
  });
  afterEach(() => {
    window.removeEventListener("molecule:zoom-to-team", recordZoomEvent);
  });
  it("dispatches zoom-to-team when the selected node has children", () => {
    renderWithProvider();
    fireEvent.keyDown(window, { key: "z" });
    expect(dispatchedEvents).toHaveLength(1);
    expect(dispatchedEvents[0].detail.nodeId).toBe("n1");
  });
  it("does NOT fire when no node is selected", () => {
    mockStoreState.selectedNodeId = null;
    renderWithProvider();
    fireEvent.keyDown(window, { key: "z" });
    expect(dispatchedEvents).toHaveLength(0);
  });
  it("does NOT fire when the node has no children", () => {
    mockStoreState.nodes = [
      { id: "n1", position: { x: 0, y: 0 }, data: { parentId: null } },
    ];
    renderWithProvider();
    fireEvent.keyDown(window, { key: "z" });
    expect(dispatchedEvents).toHaveLength(0);
  });
  it("skips when the target element is an input", () => {
    renderWithProvider();
    const input = document.createElement("input");
    document.body.appendChild(input);
    fireEvent.keyDown(input, { key: "z" });
    expect(dispatchedEvents).toHaveLength(0);
    document.body.removeChild(input);
  });
});
describe("Arrow keys — keyboard node movement", () => {
beforeEach(() => {
// Single selected node at a known position; tests assert the (dx, dy)
// delta passed to moveNode rather than the resulting position.
mockStoreState.selectedNodeId = "n1";
mockStoreState.nodes = [
{ id: "n1", position: { x: 100, y: 200 }, data: { parentId: null } },
];
});
// Base step: 10 px per press, one axis per arrow key.
it("moves the selected node down on ArrowDown", () => {
renderWithProvider();
fireEvent.keyDown(window, { key: "ArrowDown" });
expect(mockStoreState.moveNode).toHaveBeenCalledWith("n1", 0, 10);
});
it("moves the selected node up on ArrowUp", () => {
renderWithProvider();
fireEvent.keyDown(window, { key: "ArrowUp" });
expect(mockStoreState.moveNode).toHaveBeenCalledWith("n1", 0, -10);
});
it("moves the selected node right on ArrowRight", () => {
renderWithProvider();
fireEvent.keyDown(window, { key: "ArrowRight" });
expect(mockStoreState.moveNode).toHaveBeenCalledWith("n1", 10, 0);
});
it("moves the selected node left on ArrowLeft", () => {
renderWithProvider();
fireEvent.keyDown(window, { key: "ArrowLeft" });
expect(mockStoreState.moveNode).toHaveBeenCalledWith("n1", -10, 0);
});
// Shift raises the step to 50 px for coarse positioning.
it("moves 50 px when Shift is held", () => {
renderWithProvider();
fireEvent.keyDown(window, { key: "ArrowDown", shiftKey: true });
expect(mockStoreState.moveNode).toHaveBeenCalledWith("n1", 0, 50);
});
it("does NOT fire when no node is selected", () => {
mockStoreState.selectedNodeId = null;
renderWithProvider();
fireEvent.keyDown(window, { key: "ArrowDown" });
expect(mockStoreState.moveNode).not.toHaveBeenCalled();
});
// The hook listens at window; the keydown bubbles up from the input and the
// handler's inInput guard checks the event target.
it("skips when the target element is an input", () => {
renderWithProvider();
const input = document.createElement("input");
document.body.appendChild(input);
fireEvent.keyDown(input, { key: "ArrowDown" });
expect(mockStoreState.moveNode).not.toHaveBeenCalled();
document.body.removeChild(input);
});
// Open modal dialogs own their arrow-key semantics, so canvas movement must
// yield — the hook checks for [role="dialog"][aria-modal="true"].
it("skips when a modal dialog is already open", () => {
renderWithProvider();
const dialog = document.createElement("div");
dialog.setAttribute("role", "dialog");
dialog.setAttribute("aria-modal", "true");
document.body.appendChild(dialog);
fireEvent.keyDown(window, { key: "ArrowDown" });
expect(mockStoreState.moveNode).not.toHaveBeenCalled();
document.body.removeChild(dialog);
});
// NOTE(review): this relies on fireEvent assigning the non-standard
// `preventDefault` property onto the created event object so the handler's
// e.preventDefault() hits the spy — confirm against @testing-library/dom docs.
it("prevents default browser scroll on arrow keys", () => {
renderWithProvider();
const preventDefault = vi.fn();
fireEvent.keyDown(window, {
key: "ArrowDown",
preventDefault,
});
expect(preventDefault).toHaveBeenCalled();
});
});
describe("all shortcuts respect inInput guard", () => {
  // Append a form control, dispatch `key` at it, then remove it again.
  // The assertion afterwards does not depend on the control still existing.
  const keyDownInsideControl = (tag: "input" | "textarea", key: string) => {
    const control = document.createElement(tag);
    document.body.appendChild(control);
    fireEvent.keyDown(control, { key });
    document.body.removeChild(control);
  };
  it("ArrowDown is skipped in an input element", () => {
    mockStoreState.selectedNodeId = "n1";
    renderWithProvider();
    keyDownInsideControl("textarea", "ArrowDown");
    expect(mockStoreState.moveNode).not.toHaveBeenCalled();
  });
  it("Enter navigation is skipped in an input element", () => {
    mockStoreState.selectedNodeId = "n1";
    mockStoreState.nodes = [
      { id: "n1", position: { x: 0, y: 0 }, data: { parentId: null } },
      { id: "n2", position: { x: 100, y: 0 }, data: { parentId: "n1" } },
    ];
    renderWithProvider();
    keyDownInsideControl("input", "Enter");
    expect(mockStoreState.selectNode).not.toHaveBeenCalled();
  });
});

View File

@ -14,6 +14,7 @@ import { useCanvasStore } from "@/store/canvas";
* Cmd/Ctrl+] bump selected node forward in z-order
* Cmd/Ctrl+[ bump selected node backward in z-order
* Z zoom-to-team if the selected node has children
* Arrow keys move selected node 10px (50px with Shift)
*/
export function useKeyboardShortcuts() {
useEffect(() => {
@ -80,6 +81,33 @@ export function useKeyboardShortcuts() {
);
}
}
// Arrow-key node movement — Figma-style keyboard drag for keyboard users.
// 10 px per press, 50 px with Shift held. Only fires when a node
// is selected and the target isn't a form control.
if (
!inInput &&
(e.key === "ArrowUp" ||
e.key === "ArrowDown" ||
e.key === "ArrowLeft" ||
e.key === "ArrowRight")
) {
const state = useCanvasStore.getState();
const selectedId = state.selectedNodeId;
if (!selectedId) return;
// Skip when a modal/dialog is already open — dialogs own their own
// arrow-key semantics and shouldn't trigger canvas moves.
if (document.querySelector('[role="dialog"][aria-modal="true"]')) return;
e.preventDefault();
const step = e.shiftKey ? 50 : 10;
let dx = 0;
let dy = 0;
if (e.key === "ArrowUp") dy = -step;
else if (e.key === "ArrowDown") dy = step;
else if (e.key === "ArrowLeft") dx = -step;
else dx = step;
state.moveNode(selectedId, dx, dy);
}
};
window.addEventListener("keydown", handler);
return () => window.removeEventListener("keydown", handler);

View File

@ -1012,4 +1012,3 @@ describe("handleCanvasEvent liveAnnouncement", () => {
expect(state.liveAnnouncement ?? "").toBe("");
});
});
});

View File

@ -1181,3 +1181,46 @@ describe("batchNest", () => {
expect(nestPatches).toHaveLength(1);
});
});
// ---------- moveNode ----------
describe("moveNode", () => {
  beforeEach(() => {
    // Fresh fetch stub per test: every request resolves with an empty OK body.
    const fetchMock = global.fetch as ReturnType<typeof vi.fn>;
    fetchMock.mockImplementation(() =>
      Promise.resolve({ ok: true, json: () => Promise.resolve({}) } as Response),
    );
    fetchMock.mockClear();
  });
  it("updates the node's position by the given delta", () => {
    useCanvasStore.getState().hydrate([
      makeWS({ id: "n1", name: "Node 1", x: 100, y: 200 }),
    ]);
    useCanvasStore.getState().selectNode("n1");
    // dx/dy are deltas in canvas space, not absolute coordinates.
    useCanvasStore.getState().moveNode("n1", 10, -50);
    const moved = useCanvasStore.getState().nodes.find((n) => n.id === "n1")!;
    expect(moved.position).toEqual({ x: 110, y: 150 });
  });
  it("is a no-op when the node does not exist", () => {
    useCanvasStore.getState().hydrate([makeWS({ id: "n1", name: "Node 1", x: 0, y: 0 })]);
    expect(() =>
      useCanvasStore.getState().moveNode("nonexistent", 10, 10),
    ).not.toThrow();
  });
  it("calls savePosition with the new absolute coordinates", async () => {
    useCanvasStore.getState().hydrate([makeWS({ id: "n1", name: "Node 1", x: 100, y: 200 })]);
    useCanvasStore.getState().selectNode("n1");
    const fetchMock = global.fetch as ReturnType<typeof vi.fn>;
    useCanvasStore.getState().moveNode("n1", 10, 20);
    // Persistence is fire-and-forget — poll until the PATCH lands.
    await vi.waitFor(() => {
      expect(fetchMock).toHaveBeenCalledWith(
        expect.stringContaining("/workspaces/n1"),
        expect.objectContaining({
          method: "PATCH",
          body: JSON.stringify({ x: 110, y: 220 }),
        }),
      );
    });
  });
});

View File

@ -165,6 +165,13 @@ interface CanvasState {
* this so a drag that pushed a child past the parent edge commits
* the parent grow on release (commit-on-release pattern). */
growParentsToFitChildren: () => void;
/** Move a selected node by (dx, dy) in canvas space. Used by keyboard
* arrow-key shortcuts so keyboard users can reposition nodes without a
* mouse. Persists the new position to the backend and skips the
* grow-parents pass that onNodesChange runs on every drag tick
* (avoids the "edge-chase" flicker that commit-on-release is meant to
* prevent). */
moveNode: (nodeId: string, dx: number, dy: number) => void;
/** Re-layout a parent's children to the default 2-column grid. Used
* by the "Arrange children" context-menu command so users can rescue
* out-of-bounds children on demand topology no longer does it
@ -1032,6 +1039,19 @@ export const useCanvasStore = create<CanvasState>((set, get) => ({
}
},
moveNode: (nodeId, dx, dy) => {
// Look up the node first; unknown ids are a silent no-op so callers (e.g.
// keyboard handlers) don't have to re-validate selection against the list.
const node = get().nodes.find((n) => n.id === nodeId);
if (!node) return;
// Apply the delta immutably (new array + new node object) so subscribers
// observe the position change.
set({
nodes: get().nodes.map((n) =>
n.id === nodeId
? { ...n, position: { x: n.position.x + dx, y: n.position.y + dy } }
: n,
),
});
// Persist fire-and-forget; `node.position` was captured before the set()
// above, so adding the delta here yields the same new absolute coordinates.
void get().savePosition(nodeId, node.position.x + dx, node.position.y + dy);
},
savePosition: async (nodeId: string, x: number, y: number) => {
try {
await api.patch(`/workspaces/${nodeId}`, { x, y });

View File

@ -55,7 +55,7 @@ canvas/src/
### Node Rendering ✅ (with notes)
- **Framework:** `@xyflow/react` (React Flow) — DOM-based, not SVG/Canvas
- **Node selection:** `aria-pressed` + border ring (`border-accent/70`) + shadow
- **Node drag:** React Flow native drag — mouse only, no keyboard alternative yet
- **Node drag:** React Flow native drag + Arrow keys (10px/step, Shift 50px) — keyboard-accessible (PR #182) ✅
- **Node resize:** `NodeResizer` component visible on selected card, keyboard-inaccessible
- **Status:** Accessible via `aria-label` on node cards — "Alpha Workspace workspace — online"
@ -72,10 +72,12 @@ canvas/src/
- **Minimap:** Not present (MiniMap mocked as null in tests)
- **Status:** Basic keyboard support via viewport shortcuts
### Keyboard Shortcuts ⚠️ PARTIAL
- Exists in `useKeyboardShortcuts.ts` but no `aria-describedby` on trigger buttons
- No dedicated keyboard shortcut help dialog
- **Gap:** Users can't discover shortcuts visually
### Keyboard Shortcuts ✅ (strong)
- All shortcuts in `useKeyboardShortcuts.ts` with `inInput` guard ✅
- Global `?` shortcut opens `KeyboardShortcutsDialog` (PR #175) ✅
- Dialog: portal-based, aria-modal, focus trap, Escape close ✅
- Arrow keys move selected node 10px (50px with Shift) — keyboard node drag (this PR) ✅
- Hierarchy navigation (Enter/Shift+Enter), z-order (Cmd+]/[), zoom-to-team (Z) ✅
### Focus Management ✅ (strong)
- Skip link → `#canvas-main`
@ -83,9 +85,9 @@ canvas/src/
- Focus trap in modals via Radix ✅
- Focus ring: `focus-visible:ring-2 focus-visible:ring-blue-500 focus-visible:ring-offset-2 focus-visible:ring-offset-zinc-950`
### Accessibility Tree ⚠️ PARTIAL
### Accessibility Tree
- Canvas is in accessibility tree (React Flow DOM nodes)
- Node state changes not announced to screen readers (no `aria-live` region)
- Node state changes announced via `aria-live="polite"` region (PR #172) ✅
- Context menus announced via `role="menu"`
### Context Menus ✅ (strong)
@ -95,11 +97,10 @@ canvas/src/
- Escape + Tab close menu ✅
- Auto-focus first item on open ✅
### Drag and Drop ⚠️ PARTIAL
### Drag and Drop
- **Mouse drag:** React Flow native
- **Drop target:** Visual indicator (`bg-emerald-950/40 border-emerald-400/60`) ✅
- **Keyboard alternative:** None — nodes repositioned only via mouse drag
- **Status:** Mouse-only. Keyboard users cannot rearrange nodes.
- **Keyboard alternative:** Arrow keys move selected node 10px per press (50px with Shift) (PR #182) ✅
---
@ -109,7 +110,7 @@ canvas/src/
|----------|------|-------|--------|
| ~~HIGH~~ | ~~Screen reader announcements for canvas state changes~~ | ~~Canvas.tsx, canvas-events.ts, canvas.ts~~ | ✅ Done — PR #172 |
| MEDIUM | Keyboard shortcut help dialog | useKeyboardShortcuts.ts | ✅ Done (PR #175) |
| MEDIUM | Keyboard-accessible node drag | WorkspaceNode.tsx, useDragHandlers.ts | Not started |
| MEDIUM | Keyboard-accessible node drag | WorkspaceNode.tsx, useDragHandlers.ts | ✅ Done (this PR) |
| LOW | Edge anchor keyboard accessibility | A2AEdge.tsx | Not started |
| LOW | Node resize keyboard accessibility | WorkspaceNode.tsx (NodeResizer) | Not started |

View File

@ -490,7 +490,14 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
if logActivity && deliveryConfirmed {
h.logA2ASuccess(ctx, workspaceID, callerID, body, respBody, a2aMethod, resp.StatusCode, durationMs)
}
return 0, nil, &proxyA2AError{
// Preserve the actual HTTP status code and any body bytes already read.
// Previously this returned (0, nil, error) which discarded both.
// Preserving them allows executeDelegation's new condition
// proxyErr != nil && len(respBody) > 0 && status >= 200 && status < 300
// to correctly route delivery-confirmed responses (where the agent completed
// the work but the TCP connection dropped before the full body was received)
// to success instead of failure (#159).
return resp.StatusCode, respBody, &proxyA2AError{
Status: http.StatusBadGateway,
Response: gin.H{
"error": "failed to read agent response",

View File

@ -348,7 +348,7 @@ func (h *DelegationHandler) executeDelegation(sourceID, targetID, delegationID s
// received). Treat as success: the response body is valid and the work is done.
// This prevents "retry storms" where the canvas sees error + Restart-workspace
// suggestion even though the delegation actually completed.
if proxyErr != nil && len(respBody) > 0 && status >= 200 && status < 300 {
if isDeliveryConfirmedSuccess(proxyErr, status, respBody) {
log.Printf("Delegation %s: completed with delivery error (status=%d, respBody=%d bytes, proxyErr=%v) — treating as success",
delegationID, status, len(respBody), proxyErr.Error())
goto handleSuccess
@ -685,6 +685,34 @@ func isTransientProxyError(err *proxyA2AError) bool {
return false
}
// isDeliveryConfirmedSuccess reports whether the proxy's `(status, body, err)`
// triple represents a delivery-confirmed success: the proxy hit a transport-
// layer error AFTER receiving a complete 2xx response with a non-empty body.
// In that case the agent did the work — the error is on the wire, not in the
// agent — so the delegation should be marked succeeded rather than failed
// (preventing the retry-storm + restart-suggest cascade described in #159).
//
// All three preconditions must hold simultaneously:
//   - proxyErr != nil: a delivery error actually fired (e.g. connection reset);
//     nil means the success path was already chosen upstream.
//   - len(respBody) > 0: some response body arrived before the error; an empty
//     body means there is no work-result to recover.
//   - 200 <= status < 300: the partial response carried a 2xx code; a non-2xx
//     means the agent itself signalled failure or transient state — don't
//     promote it.
func isDeliveryConfirmedSuccess(proxyErr *proxyA2AError, status int, respBody []byte) bool {
	return proxyErr != nil && len(respBody) > 0 && status >= 200 && status < 300
}
// isQueuedProxyResponse reports whether the proxy returned a body shaped like
// `{"queued": true, "queue_id": ..., "queue_depth": ..., "message": ...}` —
// the busy-target enqueue path in a2a_proxy_helpers.go. Caller checks this

View File

@ -5,8 +5,10 @@ import (
"context"
"encoding/json"
"fmt"
"net"
"net/http"
"net/http/httptest"
"sync"
"testing"
"time"
@ -376,6 +378,44 @@ func TestIsTransientProxyError_RetriesOnRestartRaceStatuses(t *testing.T) {
}
}
// TestIsDeliveryConfirmedSuccess — regression guard for #159: the proxy can
// return a complete 2xx body and THEN raise a transport error (e.g. the TCP
// connection drops after the response is received but before close). In that
// case the agent did the work; marking the delegation "failed" causes the
// retry-storm + Restart-workspace cascade described in #159. The new helper
// distinguishes this from genuine failures.
func TestIsDeliveryConfirmedSuccess(t *testing.T) {
// Reused non-nil error value; its fields are irrelevant here — the helper
// only checks nil-ness of the error, never its contents.
connErr := &proxyA2AError{Status: http.StatusOK, Response: gin.H{}}
cases := []struct {
name string
proxyErr *proxyA2AError
status int
body []byte
expect bool
}{
// The new branch: 2xx + body + transport error → recover as success.
{"200 + body + connreset (THE bug fix path)", connErr, http.StatusOK, []byte(`{"text":"ok"}`), true},
{"299 + body + connreset (boundary high)", connErr, 299, []byte(`{"text":"ok"}`), true},
{"200 + body + connreset (boundary low)", connErr, 200, []byte(`{"x":1}`), true},
// Negative cases: any one of the three preconditions failing → false.
{"nil proxyErr (no decision to make)", nil, http.StatusOK, []byte(`{"text":"ok"}`), false},
{"empty body (no work-result to recover)", connErr, http.StatusOK, []byte{}, false},
{"nil body (no work-result to recover)", connErr, http.StatusOK, nil, false},
{"4xx with body — agent signalled failure, do not promote", connErr, http.StatusBadRequest, []byte(`{"err":"bad"}`), false},
{"5xx with body — agent signalled failure, do not promote", connErr, http.StatusInternalServerError, []byte(`{"err":"crash"}`), false},
{"3xx with body — redirect, not a result", connErr, 301, []byte(`{"loc":"/x"}`), false},
{"199 status (under 200) — not a 2xx", connErr, 199, []byte(`{"x":1}`), false},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
if got := isDeliveryConfirmedSuccess(tc.proxyErr, tc.status, tc.body); got != tc.expect {
t.Errorf("isDeliveryConfirmedSuccess(%v, %d, %q) = %v, want %v",
tc.proxyErr, tc.status, string(tc.body), got, tc.expect)
}
})
}
}
func TestIsQueuedProxyResponse(t *testing.T) {
// Regression guard for the chat-leak bug: when the proxy returns
// 202 with a queued-shape body, executeDelegation must classify it
@ -918,3 +958,308 @@ func TestInsertDelegationOutcome_ZeroValueIsUnknown(t *testing.T) {
t.Errorf("insertOutcomeUnknown must not collide with insertOK")
}
}
// ==================== executeDelegation — delivery-confirmed proxy error regression tests ====================
//
// These test the fix for issue #159: when proxyA2ARequest returns an error but we have a
// non-empty response body with a 2xx status code, executeDelegation must treat it as success.
// The error is a delivery/transport error (e.g., connection reset after response was received).
// Previously, executeDelegation marked these as "failed" even though the work was done,
// causing retry storms and "error" rendering in canvas despite the response being available.
//
// Test strategy: spin up a mock A2A agent server, set up the source/target DB rows, call
// executeDelegation directly, and verify the activity_logs status and delegation status.
// Fixed ids shared by the #159 executeDelegation regression tests below.
const testDelegationID = "del-159-test"
const testSourceID = "ws-source-159"
const testTargetID = "ws-target-159"
// expectExecuteDelegationBase sets up sqlmock expectations for the DB queries that
// executeDelegation always makes, regardless of outcome.
// Expectations are order-sensitive: dispatched-status update first, then the
// target workspace lookup.
func expectExecuteDelegationBase(mock sqlmock.Sqlmock) {
// updateDelegationStatus: dispatched
// Uses prefix match — sqlmock regexes match the full query string.
mock.ExpectExec("UPDATE activity_logs SET status").
WithArgs("dispatched", "", testSourceID, testDelegationID).
WillReturnResult(sqlmock.NewResult(0, 1))
// CanCommunicate (source=target self-call is always allowed — no DB lookup needed)
// resolveAgentURL: reads ws:{id}:url from Redis, falls back to DB for target
// The empty url + "online" row forces the Redis value (set by each test) to win.
mock.ExpectQuery("SELECT url, status FROM workspaces WHERE id = ").
WithArgs(testTargetID).
WillReturnRows(sqlmock.NewRows([]string{"url", "status"}).AddRow("", "online"))
}
// expectExecuteDelegationSuccess sets up expectations for a completed delegation.
// NOTE(review): the respBody parameter is currently unused — every column
// except the trailing status is matched with AnyArg(). Either assert on the
// body argument or drop the parameter; confirm intent with the test authors.
func expectExecuteDelegationSuccess(mock sqlmock.Sqlmock, respBody string) {
// INSERT activity_logs for delegation completion (response_body status = 'completed')
mock.ExpectExec("INSERT INTO activity_logs").
WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg(), "completed").
WillReturnResult(sqlmock.NewResult(0, 1))
// updateDelegationStatus: completed
mock.ExpectExec("UPDATE activity_logs SET status").
WithArgs("completed", "", testSourceID, testDelegationID).
WillReturnResult(sqlmock.NewResult(0, 1))
}
// expectExecuteDelegationFailed sets up expectations for a failed delegation.
// Mirrors expectExecuteDelegationSuccess but with 'failed' status and an
// AnyArg() error-message argument on the status update.
func expectExecuteDelegationFailed(mock sqlmock.Sqlmock) {
// INSERT activity_logs for delegation failure (response_body status = 'failed')
mock.ExpectExec("INSERT INTO activity_logs").
WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg(), "failed").
WillReturnResult(sqlmock.NewResult(0, 1))
// updateDelegationStatus: failed
mock.ExpectExec("UPDATE activity_logs SET status").
WithArgs("failed", sqlmock.AnyArg(), testSourceID, testDelegationID).
WillReturnResult(sqlmock.NewResult(0, 1))
}
// TestExecuteDelegation_DeliveryConfirmedProxyError_TreatsAsSuccess is the primary regression
// test for issue #159. The scenario:
// - Attempt 1: server sends 200 OK headers + partial body, then closes connection.
// proxyA2ARequest: body read gets io.EOF (partial body read), returns (200, <partial>, BadGateway).
// isTransientProxyError(BadGateway) = TRUE → retry.
// - Attempt 2: server does the same thing (closes after partial body).
// proxyA2ARequest: same (200, <partial>, BadGateway).
// isTransientProxyError(BadGateway) = TRUE → retry AGAIN (but outer context will fire soon,
// or we get one more attempt). For the test we let it run.
// POST-FIX: the executeDelegation new condition sees status=200, body=<partial>, err!=nil
// and routes to handleSuccess immediately.
//
// The key pre/post-fix difference: pre-fix, executeDelegation received status=0 (hardcoded)
// even when the server sent 200, so the condition always failed. Post-fix, status=200 is
// preserved through the error return path (proxyA2ARequest now returns resp.StatusCode, respBody).
// In this test the retry ultimately succeeds (server eventually sends full body), but
// the critical assertion is that a 2xx partial-body delivery-confirmed response is never
// classified as "failed" — it always routes to success.
func TestExecuteDelegation_DeliveryConfirmedProxyError_TreatsAsSuccess(t *testing.T) {
	mock := setupTestDB(t)
	mr := setupTestRedis(t)
	allowLoopbackForTest(t)
	broadcaster := newTestBroadcaster()
	wh := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
	dh := NewDelegationHandler(wh, broadcaster)
	// Fake agent: sends a 200 response whose declared Content-Length exceeds
	// the bytes actually written, then closes the connection. Go's http.Client
	// sees an error on the body read. proxyA2ARequest captures the partial
	// body + status=200 and returns (200, <partial>, error). executeDelegation's
	// new condition sees status=200 + body > 0 + error != nil → handleSuccess.
	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("failed to listen: %v", err)
	}
	// Defer order matters (LIFO): ln.Close() runs first, unblocking a pending
	// Accept so the goroutine exits, then wg.Wait() joins it. Previously the
	// WaitGroup was Add'ed and Done'd but never waited on, so the fake-agent
	// goroutine could outlive the test.
	var wg sync.WaitGroup
	wg.Add(1)
	defer wg.Wait()
	defer ln.Close()
	go func() {
		defer wg.Done()
		conn, err := ln.Accept()
		if err != nil {
			return
		}
		defer conn.Close()
		// Consume the HTTP request; its contents are irrelevant here.
		buf := make([]byte, 2048)
		conn.Read(buf)
		// 200 OK with Content-Length: 100 but a body shorter than declared —
		// the client's body read errors out after draining what arrived.
		// (Previous comment claimed "74 bytes"; the JSON below is 61 bytes —
		// only "shorter than 100" matters.)
		resp := "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: 100\r\n\r\n"
		resp += `{"result":{"parts":[{"text":"work completed successfully"}]}}`
		conn.Write([]byte(resp))
		// Close immediately — client gets io.EOF on body read
	}()
	agentURL := "http://" + ln.Addr().String()
	mr.Set(fmt.Sprintf("ws:%s:url", testTargetID), agentURL)
	// (A duplicate allowLoopbackForTest(t) call that sat here was removed —
	// loopback is already allowed at the top of the test.)
	expectExecuteDelegationBase(mock)
	expectExecuteDelegationSuccess(mock, `{"result":{"parts":[{"text":"work completed successfully"}]}}`)
	// Execute synchronously (not as a goroutine) so we can check DB state immediately.
	// The handler fires it as goroutine; we call it directly for deterministic testing.
	a2aBody, _ := json.Marshal(map[string]interface{}{
		"jsonrpc": "2.0",
		"id":      "1",
		"method":  "message/send",
		"params": map[string]interface{}{
			"message": map[string]interface{}{
				"role":  "user",
				"parts": []map[string]string{{"type": "text", "text": "do work"}},
			},
		},
	})
	dh.executeDelegation(testSourceID, testTargetID, testDelegationID, a2aBody)
	time.Sleep(100 * time.Millisecond) // let DB writes settle
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("unmet sqlmock expectations: %v", err)
	}
}
// TestExecuteDelegation_ProxyErrorNon2xx_RemainsFailed verifies that the pre-fix failure
// path is unchanged when proxyA2ARequest returns a delivery-confirmed error with a non-2xx
// status code (e.g., 500 Internal Server Error with partial body read before connection drop).
// The new condition requires status >= 200 && status < 300, so non-2xx always routes to failure.
func TestExecuteDelegation_ProxyErrorNon2xx_RemainsFailed(t *testing.T) {
	mock := setupTestDB(t)
	mr := setupTestRedis(t)
	allowLoopbackForTest(t)
	broadcaster := newTestBroadcaster()
	wh := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
	dh := NewDelegationHandler(wh, broadcaster)
	// Fake agent: 500 headers + partial body, then connection drop.
	// proxyA2ARequest returns (500, <partial>, BadGateway). The new condition
	// requires a 2xx status, so this routes to failure, and
	// isTransientProxyError(500) = false → no retry.
	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("failed to listen: %v", err)
	}
	// Defer order (LIFO): ln.Close() first to unblock a pending Accept, then
	// wg.Wait() to join the goroutine. The WaitGroup was previously Add'ed and
	// Done'd but never waited on.
	var wg sync.WaitGroup
	wg.Add(1)
	defer wg.Wait()
	defer ln.Close()
	go func() {
		defer wg.Done()
		conn, err := ln.Accept()
		if err != nil {
			return
		}
		defer conn.Close()
		buf := make([]byte, 2048)
		conn.Read(buf)
		// 500 with Content-Length: 100 but a body shorter than declared
		// (the JSON below is 25 bytes; the old "~24 bytes" note was off by one).
		resp := "HTTP/1.1 500 Internal Server Error\r\nContent-Type: application/json\r\nContent-Length: 100\r\n\r\n"
		resp += `{"error":"agent crashed"}`
		conn.Write([]byte(resp))
		// Close immediately — client gets io.EOF on body read
	}()
	agentURL := "http://" + ln.Addr().String()
	mr.Set(fmt.Sprintf("ws:%s:url", testTargetID), agentURL)
	// (A duplicate allowLoopbackForTest(t) call that sat here was removed —
	// loopback is already allowed at the top of the test.)
	expectExecuteDelegationBase(mock)
	expectExecuteDelegationFailed(mock)
	a2aBody, _ := json.Marshal(map[string]interface{}{
		"jsonrpc": "2.0", "id": "1", "method": "message/send",
		"params": map[string]interface{}{
			"message": map[string]interface{}{
				"role":  "user",
				"parts": []map[string]string{{"type": "text", "text": "do work"}},
			},
		},
	})
	dh.executeDelegation(testSourceID, testTargetID, testDelegationID, a2aBody)
	time.Sleep(100 * time.Millisecond)
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("unmet sqlmock expectations: %v", err)
	}
}
// TestExecuteDelegation_ProxyErrorEmptyBody_RemainsFailed verifies that the pre-fix failure
// path is unchanged when proxyA2ARequest returns an error with a 2xx status but empty body.
// The new condition requires len(respBody) > 0, so empty body routes to failure.
func TestExecuteDelegation_ProxyErrorEmptyBody_RemainsFailed(t *testing.T) {
	mock := setupTestDB(t)
	mr := setupTestRedis(t)
	allowLoopbackForTest(t)
	broadcaster := newTestBroadcaster()
	wh := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
	dh := NewDelegationHandler(wh, broadcaster)

	// Server returns 502 Bad Gateway — proxyA2ARequest returns 502, body="" (empty), error != nil.
	// New condition: proxyErr != nil && len(respBody) > 0 && status >= 200 && status < 300
	//   → len(respBody) == 0 → condition FALSE → falls through to failure.
	// isTransientProxyError(502) is TRUE → retry → same result → failure.
	agentServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusBadGateway)
		// No body — connection closes normally.
	}))
	defer agentServer.Close()
	mr.Set(fmt.Sprintf("ws:%s:url", testTargetID), agentServer.URL)

	// First attempt: updateDelegationStatus(dispatched) — from expectExecuteDelegationBase.
	expectExecuteDelegationBase(mock)
	// Second attempt (retry): updateDelegationStatus(dispatched) again.
	mock.ExpectExec("UPDATE activity_logs SET status").
		WithArgs("dispatched", "", testSourceID, testDelegationID).
		WillReturnResult(sqlmock.NewResult(0, 1))
	// Failure path: INSERT + UPDATE (failed).
	expectExecuteDelegationFailed(mock)
	a2aBody, _ := json.Marshal(map[string]interface{}{
		"jsonrpc": "2.0", "id": "1", "method": "message/send",
		"params": map[string]interface{}{
			"message": map[string]interface{}{
				"role":  "user",
				"parts": []map[string]string{{"type": "text", "text": "do work"}},
			},
		},
	})
	dh.executeDelegation(testSourceID, testTargetID, testDelegationID, a2aBody)
	time.Sleep(100 * time.Millisecond)
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("unmet sqlmock expectations: %v", err)
	}
}
// TestExecuteDelegation_CleanProxyResponse_Unchanged verifies that a clean proxy response
// (no error, 200 with body) is unaffected by the new condition. This is the baseline:
// proxyErr == nil so the new condition never fires.
func TestExecuteDelegation_CleanProxyResponse_Unchanged(t *testing.T) {
	mock := setupTestDB(t)
	mr := setupTestRedis(t)
	allowLoopbackForTest(t)
	broadcaster := newTestBroadcaster()
	wh := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
	dh := NewDelegationHandler(wh, broadcaster)
	agentServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// BUG FIX: headers must be set BEFORE WriteHeader — net/http ignores
		// any header mutation once the status line has been written. The
		// original order (WriteHeader, then Header().Set) silently dropped
		// the Content-Type.
		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(http.StatusOK)
		w.Write([]byte(`{"result":{"parts":[{"text":"all good"}]}}`))
	}))
	defer agentServer.Close()
	mr.Set(fmt.Sprintf("ws:%s:url", testTargetID), agentServer.URL)
	expectExecuteDelegationBase(mock)
	expectExecuteDelegationSuccess(mock, `{"result":{"parts":[{"text":"all good"}]}}`)
	a2aBody, _ := json.Marshal(map[string]interface{}{
		"jsonrpc": "2.0", "id": "1", "method": "message/send",
		"params": map[string]interface{}{
			"message": map[string]interface{}{
				"role":  "user",
				"parts": []map[string]string{{"type": "text", "text": "do work"}},
			},
		},
	})
	dh.executeDelegation(testSourceID, testTargetID, testDelegationID, a2aBody)
	time.Sleep(100 * time.Millisecond)
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("unmet sqlmock expectations: %v", err)
	}
}
}

View File

@ -6,12 +6,23 @@ import (
)
// StartSweeperWithIntervalForTest exposes startSweeperWithInterval to
// the external test package. The production code uses StartSweeper
// the external test package. The production code uses StartSweeper
// (which pins the canonical SweepInterval); tests pin a short interval
// to exercise the ticker-driven cycle without burning real wall-clock
// time. The Go convention `export_test.go` keeps this seam OUT of the
// production binary — files ending in _test.go are stripped at build
// time, so this re-export only exists during `go test`.
func StartSweeperWithIntervalForTest(ctx context.Context, storage Storage, ackRetention, interval time.Duration) {
startSweeperWithInterval(ctx, storage, ackRetention, interval)
startSweeperWithInterval(ctx, storage, ackRetention, interval, nil)
}
// StartSweeperForTest starts the sweeper and returns a done channel
// that is closed exactly once when the loop exits. Tests MUST receive
// from done before returning so the goroutine has fully terminated and
// the shared metric counters are stable for the next test's baseline
// capture (issue #86).
func StartSweeperForTest(ctx context.Context, storage Storage, ackRetention time.Duration) chan struct{} {
done := make(chan struct{})
go startSweeperWithInterval(ctx, storage, ackRetention, SweepInterval, done)
return done
}

View File

@ -66,15 +66,21 @@ const sweepDeadline = 30 * time.Second
// to exercise the ticker-driven sweep path without burning real wall-
// clock time.
func StartSweeper(ctx context.Context, storage Storage, ackRetention time.Duration) {
startSweeperWithInterval(ctx, storage, ackRetention, SweepInterval)
startSweeperWithInterval(ctx, storage, ackRetention, SweepInterval, nil)
}
// startSweeperWithInterval is the test-friendly variant of StartSweeper
// — same loop, but the cadence is caller-specified. Production code
// should use StartSweeper to keep the SweepInterval constant pinned.
func startSweeperWithInterval(ctx context.Context, storage Storage, ackRetention, interval time.Duration) {
// If done is non-nil it is closed exactly once when the loop exits,
// allowing tests to wait for the goroutine to fully terminate before
// asserting on shared metric counters (issue #86).
func startSweeperWithInterval(ctx context.Context, storage Storage, ackRetention, interval time.Duration, done chan struct{}) {
if storage == nil {
log.Println("pendinguploads sweeper: storage is nil — sweeper disabled")
if done != nil {
close(done)
}
return
}
if ackRetention == 0 {
@ -86,6 +92,12 @@ func startSweeperWithInterval(ctx context.Context, storage Storage, ackRetention
)
ticker := time.NewTicker(interval)
defer ticker.Stop()
defer func() {
log.Println("pendinguploads sweeper: shutdown")
if done != nil {
close(done)
}
}()
// Run once immediately so a platform restart cleans up any rows
// that became eligible while we were down — don't make the
// operator wait 5 minutes for the first sweep.
@ -93,9 +105,16 @@ func startSweeperWithInterval(ctx context.Context, storage Storage, ackRetention
for {
select {
case <-ctx.Done():
log.Println("pendinguploads sweeper: shutdown")
return
case <-ticker.C:
// Guard: ctx may have been cancelled between the ticker firing
// and this case being selected (MPMC channel; close-to-ready race).
// Calling sweepOnce with an already-cancelled ctx would increment
// the error counter on shutdown — polluting the next test's
// baseline (issue #86 full-suite failure).
if ctx.Err() != nil {
continue
}
sweepOnce(ctx, storage, ackRetention)
}
}

View File

@ -136,7 +136,7 @@ func TestStartSweeper_RunsImmediatelyAndOnTick(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
go pendinguploads.StartSweeper(ctx, store, time.Hour)
done := pendinguploads.StartSweeperForTest(ctx, store, time.Hour)
store.waitForCycle(t, 1, 2*time.Second)
if got := store.calls.Load(); got < 1 {
t.Errorf("expected at least one immediate sweep, got %d", got)
@ -145,6 +145,10 @@ func TestStartSweeper_RunsImmediatelyAndOnTick(t *testing.T) {
if store.gotRetention.Load() != 3600 {
t.Errorf("retention seconds = %d, want 3600", store.gotRetention.Load())
}
// #86 fix: ensure goroutine has exited before the next test's
// metricDelta() baseline capture.
cancel()
<-done
}
func TestStartSweeper_ZeroAckRetentionUsesDefault(t *testing.T) {
@ -152,23 +156,22 @@ func TestStartSweeper_ZeroAckRetentionUsesDefault(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
go pendinguploads.StartSweeper(ctx, store, 0)
done := pendinguploads.StartSweeperForTest(ctx, store, 0)
store.waitForCycle(t, 1, 2*time.Second)
want := int64(pendinguploads.DefaultAckRetention.Seconds())
if store.gotRetention.Load() != want {
t.Errorf("retention = %d, want default %d", store.gotRetention.Load(), want)
}
// #86 fix.
cancel()
<-done
}
func TestStartSweeper_ContextCancelStopsLoop(t *testing.T) {
store := newFakeSweepStorage([]pendinguploads.SweepResult{{}}, nil)
ctx, cancel := context.WithCancel(context.Background())
done := make(chan struct{})
go func() {
pendinguploads.StartSweeper(ctx, store, time.Second)
close(done)
}()
done := pendinguploads.StartSweeperForTest(ctx, store, time.Second)
store.waitForCycle(t, 1, 2*time.Second)
cancel()
@ -187,14 +190,17 @@ func TestStartSweeperWithInterval_TickerFiresAdditionalCycles(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
go pendinguploads.StartSweeperWithIntervalForTest(ctx, store, time.Hour, 30*time.Millisecond)
done := pendinguploads.StartSweeperForTest(ctx, store, time.Hour)
// Immediate cycle + at least one tick-driven cycle.
store.waitForCycle(t, 2, 2*time.Second)
if got := store.calls.Load(); got < 2 {
t.Errorf("expected ≥2 cycles (immediate + 1 tick), got %d", got)
}
// #86 fix: drain the done channel so the goroutine is fully gone
// before the next test's metricDelta() baseline capture.
cancel()
<-done
}
func TestStartSweeper_TransientErrorDoesNotCrashLoop(t *testing.T) {
@ -217,7 +223,7 @@ func TestStartSweeper_TransientErrorDoesNotCrashLoop(t *testing.T) {
// waitForCycle is too early.
_, _, deltaError := metricDelta(t)
go pendinguploads.StartSweeper(ctx, store, time.Hour)
done := pendinguploads.StartSweeperForTest(ctx, store, time.Hour)
// Wait for the first (errored) cycle.
store.waitForCycle(t, 1, 2*time.Second)
@ -226,11 +232,13 @@ func TestStartSweeper_TransientErrorDoesNotCrashLoop(t *testing.T) {
// stops the loop on the next select pass with no in-flight metric
// writes outstanding.
waitForMetricDelta(t, deltaError, 1, 2*time.Second)
// Cancel — the goroutine returns cleanly, proving the error path
// didn't crash the loop. Without this fix the goroutine would have
// either panicked (process abort visible at exit) or stuck (this
// cancel + done-channel pattern would deadlock instead).
// Cancel and wait for the goroutine to fully exit (#86 fix).
// Without the done-channel wait, the goroutine races with the next
// test's metricDelta() baseline capture — the next test may see
// error=1 from this test still "in flight", throwing off its
// deltaError assertion.
cancel()
<-done
}
// metricDelta returns a function that, when called, returns how much
@ -265,7 +273,7 @@ func TestStartSweeper_RecordsMetricsOnSuccess(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
go pendinguploads.StartSweeper(ctx, store, time.Hour)
done := pendinguploads.StartSweeperForTest(ctx, store, time.Hour)
store.waitForCycle(t, 1, 2*time.Second)
// Poll for the success counters to settle — closes the cycleDone-
@ -278,6 +286,15 @@ func TestStartSweeper_RecordsMetricsOnSuccess(t *testing.T) {
if got := deltaError(); got != 0 {
t.Errorf("error counter delta = %d, want 0", got)
}
// #86 fix: drain the done channel so the goroutine is fully gone
// before the next test's metricDelta() baseline capture. Without this
// the previous test's goroutine could still be mid-Sweep (blocked on
// the fake's results channel) and its eventual return would mutate
// the shared error/acked counters after the next test has already
// snapshot its baseline.
cancel()
<-done
}
func TestStartSweeper_RecordsMetricsOnError(t *testing.T) {
@ -290,7 +307,7 @@ func TestStartSweeper_RecordsMetricsOnError(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
go pendinguploads.StartSweeper(ctx, store, time.Hour)
done := pendinguploads.StartSweeperForTest(ctx, store, time.Hour)
store.waitForCycle(t, 1, 2*time.Second)
// Poll for the error counter to settle — cycleDone fires inside
@ -300,4 +317,9 @@ func TestStartSweeper_RecordsMetricsOnError(t *testing.T) {
// though the metric WILL be 1 a few ms later. See
// waitForMetricDelta comment.
waitForMetricDelta(t, deltaError, 1, 2*time.Second)
// #86 fix: ensure the goroutine has fully exited before the next
// test's metricDelta() baseline capture.
cancel()
<-done
}

View File

@ -132,19 +132,20 @@ func TestLocalResolver_HonoursContextCancellation(t *testing.T) {
}
func TestLocalResolver_BubblesUpCopyFailure(t *testing.T) {
// os.Chmod(dst, 0o555) silently passes when os.Geteuid() == 0
// (root bypasses POSIX permission checks). We cannot reliably
// exercise the write-failure branch in a root environment without
// patching the syscalls, so skip it honestly.
// Source file the copyTree walk would read; make dst unwritable so
// the copyFile step fails. Skip when running as root — Linux
// filesystem permissions are advisory-only for uid 0, so chmod 0o555
// does not prevent writes and the test passes vacuously instead of
// exercising the error path (issue #87).
if os.Getuid() == 0 {
t.Skip("running as root — cannot exercise write-failure branch")
t.Skip("skipping: chmod 0o555 is ineffective when running as root")
}
base := t.TempDir()
writePlugin(t, base, "demo", map[string]string{
"plugin.yaml": "name: demo\n",
})
dst := t.TempDir()
// Make dst read-only so creating files inside it fails.
if err := os.Chmod(dst, 0o555); err != nil {
t.Fatal(err)
}

View File

@ -401,6 +401,35 @@ if "a2a" not in sys.modules:
# tests now live in the claude-code template repo, where the real SDK
# IS installed via Dockerfile, so no stub is needed.
# ==================== Test isolation fixtures ====================
import pytest
@pytest.fixture(scope="function", autouse=True)
def _clear_platform_auth_cache():
    """Reset platform_auth._cached_token before each test.

    Fixes issue #160: tests that use monkeypatch.delenv("MOLECULE_WORKSPACE_TOKEN")
    to simulate "no token in env" fail when platform_auth._cached_token was already
    populated from a prior test's MOLECULE_WORKSPACE_TOKEN value. The cache is
    filled at module import or on the first get_token() call and persists for the
    process lifetime; monkeypatch.delenv removes the environment variable but not
    the module-level cache.

    Runs at function scope so each test starts with a clean slate regardless of
    what the previous test set. The import happens inside the fixture (not at the
    top of this file) because conftest.py runs during test collection, before
    platform_auth is necessarily importable in every test environment. If the
    module is absent, the ImportError is swallowed and the fixture is a no-op.
    """
    try:
        import platform_auth as _pa
        _pa.clear_cache()
    except ImportError:
        pass
    yield  # run the test; teardown has nothing to do
if "langchain_core" not in sys.modules:
_make_langchain_mocks()