diff --git a/workspace-server/internal/handlers/delegation.go b/workspace-server/internal/handlers/delegation.go index 0156f864..6761ec7e 100644 --- a/workspace-server/internal/handlers/delegation.go +++ b/workspace-server/internal/handlers/delegation.go @@ -348,7 +348,7 @@ func (h *DelegationHandler) executeDelegation(sourceID, targetID, delegationID s // received). Treat as success: the response body is valid and the work is done. // This prevents "retry storms" where the canvas sees error + Restart-workspace // suggestion even though the delegation actually completed. - if proxyErr != nil && len(respBody) > 0 && status >= 200 && status < 300 { + if isDeliveryConfirmedSuccess(proxyErr, status, respBody) { log.Printf("Delegation %s: completed with delivery error (status=%d, respBody=%d bytes, proxyErr=%v) — treating as success", delegationID, status, len(respBody), proxyErr.Error()) goto handleSuccess @@ -685,6 +685,34 @@ func isTransientProxyError(err *proxyA2AError) bool { return false } +// isDeliveryConfirmedSuccess reports whether the proxy's `(status, body, err)` +// triple represents a delivery-confirmed success: the proxy hit a transport- +// layer error AFTER receiving a complete 2xx response with a non-empty body. +// In that case the agent did the work — the error is on the wire, not in the +// agent — so the delegation should be marked succeeded rather than failed +// (preventing the retry-storm + restart-suggest cascade described in #159). +// +// Caller invariants: +// - proxyErr != nil: a delivery error fired (e.g. connection reset). +// - len(respBody) > 0: a response body was received before the error. +// - 200 <= status < 300: the partial response carried a 2xx code. +// +// All three must hold. nil proxyErr → no decision to make (success path +// already chosen upstream). Empty body → no work-result to recover. Non-2xx → +// the agent itself signalled failure or transient state; don't promote it. 
+func isDeliveryConfirmedSuccess(proxyErr *proxyA2AError, status int, respBody []byte) bool { + if proxyErr == nil { + return false + } + if len(respBody) == 0 { + return false + } + if status < 200 || status >= 300 { + return false + } + return true +} + // isQueuedProxyResponse reports whether the proxy returned a body shaped like // `{"queued": true, "queue_id": ..., "queue_depth": ..., "message": ...}` — // the busy-target enqueue path in a2a_proxy_helpers.go. Caller checks this diff --git a/workspace-server/internal/handlers/delegation_test.go b/workspace-server/internal/handlers/delegation_test.go index 797686bf..427e71b2 100644 --- a/workspace-server/internal/handlers/delegation_test.go +++ b/workspace-server/internal/handlers/delegation_test.go @@ -378,6 +378,44 @@ func TestIsTransientProxyError_RetriesOnRestartRaceStatuses(t *testing.T) { } } +// TestIsDeliveryConfirmedSuccess — regression guard for #159: the proxy can +// return a complete 2xx body and THEN raise a transport error (e.g. the TCP +// connection drops after the response is received but before close). In that +// case the agent did the work; marking the delegation "failed" causes the +// retry-storm + Restart-workspace cascade described in #159. The new helper +// distinguishes this from genuine failures. +func TestIsDeliveryConfirmedSuccess(t *testing.T) { + connErr := &proxyA2AError{Status: http.StatusOK, Response: gin.H{}} + cases := []struct { + name string + proxyErr *proxyA2AError + status int + body []byte + expect bool + }{ + // The new branch: 2xx + body + transport error → recover as success. + {"200 + body + connreset (THE bug fix path)", connErr, http.StatusOK, []byte(`{"text":"ok"}`), true}, + {"299 + body + connreset (boundary high)", connErr, 299, []byte(`{"text":"ok"}`), true}, + {"200 + body + connreset (boundary low)", connErr, 200, []byte(`{"x":1}`), true}, + // Negative cases: any one of the three preconditions failing → false. 
+ {"nil proxyErr (no decision to make)", nil, http.StatusOK, []byte(`{"text":"ok"}`), false}, + {"empty body (no work-result to recover)", connErr, http.StatusOK, []byte{}, false}, + {"nil body (no work-result to recover)", connErr, http.StatusOK, nil, false}, + {"4xx with body — agent signalled failure, do not promote", connErr, http.StatusBadRequest, []byte(`{"err":"bad"}`), false}, + {"5xx with body — agent signalled failure, do not promote", connErr, http.StatusInternalServerError, []byte(`{"err":"crash"}`), false}, + {"3xx with body — redirect, not a result", connErr, 301, []byte(`{"loc":"/x"}`), false}, + {"199 status (under 200) — not a 2xx", connErr, 199, []byte(`{"x":1}`), false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := isDeliveryConfirmedSuccess(tc.proxyErr, tc.status, tc.body); got != tc.expect { + t.Errorf("isDeliveryConfirmedSuccess(%v, %d, %q) = %v, want %v", + tc.proxyErr, tc.status, string(tc.body), got, tc.expect) + } + }) + } +} + func TestIsQueuedProxyResponse(t *testing.T) { // Regression guard for the chat-leak bug: when the proxy returns // 202 with a queued-shape body, executeDelegation must classify it