diff --git a/workspace-server/internal/handlers/a2a_proxy.go b/workspace-server/internal/handlers/a2a_proxy.go
index 816d5c81..5737b156 100644
--- a/workspace-server/internal/handlers/a2a_proxy.go
+++ b/workspace-server/internal/handlers/a2a_proxy.go
@@ -501,8 +501,18 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
 		// to correctly route delivery-confirmed responses (where the agent completed
 		// the work but the TCP connection dropped before the full body was received)
 		// to success instead of failure (#159).
+		//
+		// For non-success statuses (>= 300), preserve resp.StatusCode in the
+		// proxyA2AError.Status so isTransientProxyError returns false — a
+		// server-authored rejection is not a transient transport error and
+		// must not be retried. Only 2xx body-read errors keep Status=502
+		// (the agent completed the work but the TCP layer dropped the response).
+		errStatus := http.StatusBadGateway
+		if resp.StatusCode >= 300 {
+			errStatus = resp.StatusCode
+		}
 		return resp.StatusCode, respBody, &proxyA2AError{
-			Status: http.StatusBadGateway,
+			Status: errStatus,
 			Response: gin.H{
 				"error":              "failed to read agent response",
 				"delivery_confirmed": deliveryConfirmed,
@@ -510,6 +520,21 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
 		}
 	}
 
+	// 2xx with an empty body: the agent completed the request but returned no
+	// content. An A2A agent must always return a JSON body; an empty one means
+	// the agent is broken or the connection closed before any body bytes were
+	// written. Return a proxyA2AError so executeDelegation routes this to
+	// failure rather than silently marking it completed with a nil body.
+	// logA2ASuccess is intentionally NOT called here — delivery was not confirmed.
+	if resp.StatusCode >= 200 && resp.StatusCode < 300 && len(respBody) == 0 {
+		log.Printf("ProxyA2A: agent %s returned %d with empty body — treating as failure",
+			workspaceID, resp.StatusCode)
+		return resp.StatusCode, respBody, &proxyA2AError{
+			Status:   resp.StatusCode,
+			Response: gin.H{"error": "agent returned empty response body"},
+		}
+	}
+
 	if logActivity {
 		h.logA2ASuccess(ctx, workspaceID, callerID, body, respBody, a2aMethod, resp.StatusCode, durationMs)
 	}
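
(Reviewer sketch, not part of the patch: isTransientProxyError is referenced by
the Status-preservation comment above but defined outside this diff. A minimal
version consistent with that comment might look as follows; the signature and
the single-condition body are assumptions, not the repository's actual code.)

	// isTransientProxyError reports whether a proxy failure is a transport-level
	// error worth retrying. Under the contract above, only the synthetic 502
	// (body read failed on a 2xx response) qualifies; any server-authored
	// status (>= 300) preserved in Status marks a deliberate rejection and
	// must not be retried.
	func isTransientProxyError(err *proxyA2AError) bool {
		return err != nil && err.Status == http.StatusBadGateway
	}
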
diff --git a/workspace-server/internal/handlers/delegation.go b/workspace-server/internal/handlers/delegation.go
index c723795a..3af66150 100644
--- a/workspace-server/internal/handlers/delegation.go
+++ b/workspace-server/internal/handlers/delegation.go
@@ -309,7 +309,7 @@ func insertDelegationRow(ctx context.Context, c *gin.Context, sourceID string, b
 // to land a fresh URL in the cache before we try again. Fixes #74 —
 // bulk restarts used to produce spurious "failed to reach workspace
 // agent" errors when delegations fired within the warm-up window.
-const delegationRetryDelay = 8 * time.Second
+var delegationRetryDelay = 8 * time.Second
 
 // NB: the log.Printf calls below are load-bearing for the integration test
 // surface (delegation_executor_integration_test.go). The test uses a raw TCP
@@ -342,6 +342,18 @@ func (h *DelegationHandler) executeDelegation(ctx context.Context, sourceID, tar
 	status, respBody, proxyErr := h.workspace.proxyA2ARequest(ctx, targetID, a2aBody, sourceID, true)
 	log.Printf("Delegation %s: step=proxy_done status=%d bodyLen=%d err=%v", delegationID, status, len(respBody), proxyErr)
 
+	// When proxyA2ARequest returns an error but we have a non-empty response body
+	// with a 2xx status code, the agent completed the work successfully — the
+	// error is a delivery/transport error (e.g., a connection reset after the
+	// response was received). Treat as success: the response body is valid and
+	// the work is done. This check MUST run before the transient-retry gate so a
+	// delivery-confirmed partial-body 2xx response is never retried.
+	if isDeliveryConfirmedSuccess(proxyErr, status, respBody) {
+		log.Printf("Delegation %s: completed with delivery error (status=%d, respBody=%d bytes, proxyErr=%v) — treating as success",
+			delegationID, status, len(respBody), proxyErr.Error())
+		goto handleSuccess
+	}
+
 	// #74: one retry after the reactive URL refresh has had a chance to
 	// run. The proxyA2ARequest's health-check path on a connection error
 	// marks the workspace offline, clears cached keys, and kicks off a
@@ -360,18 +372,6 @@ func (h *DelegationHandler) executeDelegation(ctx context.Context, sourceID, tar
 		}
 	}
 
-	// When proxyA2ARequest returns an error but we have a non-empty response body
-	// with a 2xx status code, the agent completed the work successfully — the error
-	// is a delivery/transport error (e.g., connection reset after response was
-	// received). Treat as success: the response body is valid and the work is done.
-	// This prevents "retry storms" where the canvas sees error + Restart-workspace
-	// suggestion even though the delegation actually completed.
-	if isDeliveryConfirmedSuccess(proxyErr, status, respBody) {
-		log.Printf("Delegation %s: completed with delivery error (status=%d, respBody=%d bytes, proxyErr=%v) — treating as success",
-			delegationID, status, len(respBody), proxyErr.Error())
-		goto handleSuccess
-	}
-
 	if proxyErr != nil {
 		log.Printf("Delegation %s: step=handling_failure err=%v", delegationID, proxyErr)
 		log.Printf("Delegation %s: failed — %s", delegationID, proxyErr.Error())
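
(Reviewer sketch, not part of the patch: isDeliveryConfirmedSuccess is called in
both hunks above but its body is outside this diff. The call sites and comments
imply roughly the following; the signature, the error type, and the exact
conditions are assumptions.)

	// isDeliveryConfirmedSuccess reports whether a proxy error can be
	// reclassified as success: the agent demonstrably completed the work
	// (2xx status with a non-empty body) and only delivery of the response
	// failed. A nil proxyErr is plain success and is handled elsewhere.
	func isDeliveryConfirmedSuccess(proxyErr *proxyA2AError, status int, respBody []byte) bool {
		return proxyErr != nil && status >= 200 && status < 300 && len(respBody) > 0
	}
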
diff --git a/workspace-server/internal/handlers/mcp_test.go b/workspace-server/internal/handlers/mcp_test.go
index d200f572..385b03ec 100644
--- a/workspace-server/internal/handlers/mcp_test.go
+++ b/workspace-server/internal/handlers/mcp_test.go
@@ -417,11 +417,32 @@ func TestMCPHandler_CommitMemory_LocalScope_Success(t *testing.T) {
 	}
 }
 
-// TestMCPHandler_CommitMemory_GlobalScope_Blocked verifies that C3 is enforced:
-// GLOBAL scope is not permitted on the MCP bridge.
-func TestMCPHandler_CommitMemory_GlobalScope_Blocked(t *testing.T) {
+// TestMCPHandler_CommitMemory_GlobalScope_Blocked_ScrubsInternalError verifies
+// two contracts at once on the GLOBAL-scope-blocked path:
+//
+//  1. C3 invariant (commit_memory with scope=GLOBAL aborts on the MCP bridge
+//     before touching the DB), AND
+//  2. OFFSEC-001 / #259 scrub contract (commit 7d1a189f): the JSON-RPC error
+//     returned to the client is a CONSTANT — code=-32000, message="tool call
+//     failed" — with the production-internal err.Error() text logged
+//     server-side, never reflected back to the caller.
+//
+// Prior to this rename the test asserted that the client-visible message
+// CONTAINED the substring "GLOBAL", which was the human-readable internal
+// error from toolCommitMemory. mc#664 Class 2 flipped that assertion the
+// right way around: the test now FAILS if the scrub regresses (i.e. if the
+// internal string is ever reflected back to the wire), and PASSES iff the
+// scrubbed constant reaches the client.
+//
+// Coupling note: the constant string "tool call failed" and the code -32000
+// are the same values asserted by
+// TestMCPHandler_dispatchRPC_UnknownTool_ReturnsConstantMessage — both are
+// the OFFSEC-001 contract for the dispatch-failure branch in mcp.go (the
+// third err.Error() leak that 7d1a189f scrubbed). If those constants ever
+// change, both tests must be updated together.
+func TestMCPHandler_CommitMemory_GlobalScope_Blocked_ScrubsInternalError(t *testing.T) {
 	h, mock := newMCPHandler(t)
-	// No DB expectations — handler must abort before touching the DB.
+	// No DB expectations — handler must abort before touching the DB (C3).
 
 	w := mcpPost(t, h, "ws-1", map[string]interface{}{
 		"jsonrpc": "2.0",
@@ -436,14 +457,53 @@
 		},
 	})
 
+	// JSON-RPC envelope returns 200 with the error in the body — only
+	// malformed-JSON-at-the-envelope-layer returns 400 (see Call() in mcp.go).
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200 (JSON-RPC error in body), got %d: %s", w.Code, w.Body.String())
+	}
+
 	var resp mcpResponse
-	json.Unmarshal(w.Body.Bytes(), &resp)
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("response is not valid JSON: %v", err)
+	}
+
+	// (1) C3: an error must be reported.
 	if resp.Error == nil {
-		t.Error("expected JSON-RPC error for GLOBAL scope, got nil")
+		t.Fatal("expected JSON-RPC error for GLOBAL scope, got nil")
 	}
-	if resp.Error != nil && resp.Error.Message != "tool call failed" {
-		t.Errorf("client error should use the OFFSEC constant message, got: %s", resp.Error.Message)
+
+	// (2) OFFSEC-001 positive assertions — exact equality on the scrubbed
+	// constants so any change (a re-leak of err.Error(), a code mutation) trips
+	// the test. A substring match would not catch a partial re-leak.
+	if resp.Error.Code != -32000 {
+		t.Errorf("error code should be -32000 (Server error / dispatch failure), got: %d", resp.Error.Code)
 	}
+	if resp.Error.Message != "tool call failed" {
+		t.Errorf("error message should be the OFFSEC-001 constant %q, got: %q", "tool call failed", resp.Error.Message)
+	}
+
+	// (3) OFFSEC-001 negative assertions — the internal err.Error() text
+	// from toolCommitMemory ("GLOBAL scope is not permitted via the MCP
+	// bridge — use LOCAL or TEAM") must NOT appear in the client-visible
+	// message. Each token below is a distinct substring of that internal
+	// string; if ANY of them leaks through, the scrub in mcp.go's dispatchRPC
+	// has regressed and this assertion fires the canary.
+	leakedTokens := []string{
+		"GLOBAL",    // scope name
+		"scope",     // policy lexicon
+		"permitted", // policy verb
+		"bridge",    // internal architecture term
+		"LOCAL",     // alternative scope name
+		"TEAM",      // alternative scope name
+	}
+	for _, tok := range leakedTokens {
+		if strings.Contains(resp.Error.Message, tok) {
+			t.Errorf("OFFSEC-001 scrub regression: client-visible error.message leaks internal token %q (got: %q)", tok, resp.Error.Message)
+		}
+	}
+
+	// (4) C3 invariant preserved: handler must short-circuit before any DB call.
 	if err := mock.ExpectationsWereMet(); err != nil {
 		t.Errorf("unexpected DB calls on GLOBAL scope block: %v", err)
 	}
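
(Reviewer sketch, not part of the patch: the OFFSEC-001 scrub contract this test
pins down lives in dispatchRPC in mcp.go. As described in the doc comment above,
the shape is roughly the following; everything except the constants -32000 and
"tool call failed" is an assumption, including the helper and type names.)

	// On any tool-dispatch failure: keep the full err.Error() text in the
	// server-side log, and send the client only the constant code/message
	// pair, so no internal policy or architecture detail crosses the wire.
	result, err := h.dispatchTool(ctx, req)
	if err != nil {
		log.Printf("MCP: tool dispatch failed: %v", err) // internal detail stays server-side
		return nil, &mcpError{Code: -32000, Message: "tool call failed"}
	}
	return result, nil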