fix(a2a): avoid false failure on busy queue fallback
ci-arm64-advisory / fast-checks (pull_request) Waiting to run
Lint shellcheck (arm64 pilot) / shellcheck-arm64 (pilot) (pull_request) Successful in 9s
Block internal-flavored paths / Block forbidden paths (pull_request) Successful in 8s
CI / Python Lint & Test (pull_request) Successful in 5s
CI / Detect changes (pull_request) Successful in 8s
E2E Staging Canvas (Playwright) / detect-changes (pull_request) Successful in 16s
E2E Staging SaaS (full lifecycle) / E2E Staging SaaS (pull_request) Has been skipped
E2E Chat / detect-changes (pull_request) Successful in 19s
E2E API Smoke Test / detect-changes (pull_request) Successful in 20s
Handlers Postgres Integration / detect-changes (pull_request) Successful in 6s
Lint forbidden tenant-env keys / Scan workspace_secrets writers for forbidden env keys (pull_request) Successful in 8s
Harness Replays / detect-changes (pull_request) Successful in 8s
Lint no tenant GITEA or GITHUB token write / Scan for repo-host token write into tenant workspace surface (pull_request) Successful in 7s
Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 7s
gate-check-v3 / gate-check (pull_request) Successful in 8s
E2E Staging SaaS (full lifecycle) / pr-validate (pull_request) Successful in 35s
sop-checklist / review-refire (pull_request) Has been skipped
qa-review / approved (pull_request) Failing after 9s
security-review / approved (pull_request) Failing after 6s
sop-checklist / na-declarations (pull_request) N/A: (none)
sop-checklist / all-items-acked (pull_request) Successful in 6s
sop-tier-check / tier-check (pull_request) Successful in 6s
CI / Shellcheck (E2E scripts) (pull_request) Successful in 3s
CI / Canvas (Next.js) (pull_request) Successful in 4s
E2E Staging Canvas (Playwright) / Canvas tabs E2E (pull_request) Successful in 4s
E2E Chat / E2E Chat (pull_request) Successful in 23s
Harness Replays / Harness Replays (pull_request) Successful in 3s
lint-required-no-paths / lint-required-no-paths (pull_request) Successful in 1m8s
E2E API Smoke Test / E2E API Smoke Test (pull_request) Successful in 1m51s
CI / Canvas Deploy Reminder (pull_request) Has been skipped
Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Successful in 2m26s
CI / Platform (Go) (pull_request) Successful in 5m48s
CI / all-required (pull_request) Successful in 6m45s
audit-force-merge / audit (pull_request) Successful in 8s
ci-arm64-advisory / fast-checks (pull_request) Waiting to run
Lint shellcheck (arm64 pilot) / shellcheck-arm64 (pilot) (pull_request) Successful in 9s
Block internal-flavored paths / Block forbidden paths (pull_request) Successful in 8s
CI / Python Lint & Test (pull_request) Successful in 5s
CI / Detect changes (pull_request) Successful in 8s
E2E Staging Canvas (Playwright) / detect-changes (pull_request) Successful in 16s
E2E Staging SaaS (full lifecycle) / E2E Staging SaaS (pull_request) Has been skipped
E2E Chat / detect-changes (pull_request) Successful in 19s
E2E API Smoke Test / detect-changes (pull_request) Successful in 20s
Handlers Postgres Integration / detect-changes (pull_request) Successful in 6s
Lint forbidden tenant-env keys / Scan workspace_secrets writers for forbidden env keys (pull_request) Successful in 8s
Harness Replays / detect-changes (pull_request) Successful in 8s
Lint no tenant GITEA or GITHUB token write / Scan for repo-host token write into tenant workspace surface (pull_request) Successful in 7s
Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 7s
gate-check-v3 / gate-check (pull_request) Successful in 8s
E2E Staging SaaS (full lifecycle) / pr-validate (pull_request) Successful in 35s
sop-checklist / review-refire (pull_request) Has been skipped
qa-review / approved (pull_request) Failing after 9s
security-review / approved (pull_request) Failing after 6s
sop-checklist / na-declarations (pull_request) N/A: (none)
sop-checklist / all-items-acked (pull_request) Successful in 6s
sop-tier-check / tier-check (pull_request) Successful in 6s
CI / Shellcheck (E2E scripts) (pull_request) Successful in 3s
CI / Canvas (Next.js) (pull_request) Successful in 4s
E2E Staging Canvas (Playwright) / Canvas tabs E2E (pull_request) Successful in 4s
E2E Chat / E2E Chat (pull_request) Successful in 23s
Harness Replays / Harness Replays (pull_request) Successful in 3s
lint-required-no-paths / lint-required-no-paths (pull_request) Successful in 1m8s
E2E API Smoke Test / E2E API Smoke Test (pull_request) Successful in 1m51s
CI / Canvas Deploy Reminder (pull_request) Has been skipped
Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Successful in 2m26s
CI / Platform (Go) (pull_request) Successful in 5m48s
CI / all-required (pull_request) Successful in 6m45s
audit-force-merge / audit (pull_request) Successful in 8s
This commit is contained in:
@@ -111,12 +111,13 @@ const maxProxyResponseBody = 10 << 20
|
||||
// a generic 502 page to canvas. 10s is well above realistic intra-region
|
||||
// latencies and well below CF's edge timeout.
|
||||
//
|
||||
// 3. Transport.ResponseHeaderTimeout — 180s default. From request-body-end
|
||||
// 3. Transport.ResponseHeaderTimeout — 5min default. From request-body-end
|
||||
// to response-headers-start. Configurable via
|
||||
// A2A_PROXY_RESPONSE_HEADER_TIMEOUT (envx.Duration). Covers cold-start
|
||||
// first-byte (30-60s OAuth flow above) with enough room for Opus agent
|
||||
// turns (big context + internal delegate_task round-trips routinely exceed
|
||||
// the old 60s ceiling). Body streaming after headers is governed by the
|
||||
// turns and Codex scheduled tasks (big context + internal delegate_task
|
||||
// round-trips routinely exceed the old 60s/180s ceilings). Body streaming
|
||||
// after headers is governed by the
|
||||
// per-request context deadline, NOT this timeout — so multi-minute agent
|
||||
// responses still work fine.
|
||||
//
|
||||
@@ -131,7 +132,7 @@ var a2aClient = &http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
KeepAlive: 30 * time.Second,
|
||||
}).DialContext,
|
||||
ResponseHeaderTimeout: envx.Duration("A2A_PROXY_RESPONSE_HEADER_TIMEOUT", 180*time.Second),
|
||||
ResponseHeaderTimeout: envx.Duration("A2A_PROXY_RESPONSE_HEADER_TIMEOUT", 5*time.Minute),
|
||||
TLSHandshakeTimeout: 10 * time.Second,
|
||||
// MaxIdleConns / IdleConnTimeout: stdlib defaults are fine; agent
|
||||
// fan-in is bounded by the platform's broadcaster fan-out, not by
|
||||
|
||||
@@ -28,8 +28,8 @@ type proxyDispatchBuildError struct{ err error }
|
||||
func (e *proxyDispatchBuildError) Error() string { return e.err.Error() }
|
||||
|
||||
// handleA2ADispatchError translates a forward-call failure into a proxyA2AError,
|
||||
// runs the reactive container-health check, and (when `logActivity` is true)
|
||||
// schedules a detached LogActivity goroutine for the failed attempt.
|
||||
// runs the reactive container-health check, and records the outcome. Busy
|
||||
// targets that are successfully queued are logged as queued, not failed.
|
||||
func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspaceID, callerID string, body []byte, a2aMethod string, err error, durationMs int, logActivity bool) (int, []byte, *proxyA2AError) {
|
||||
// Build-time failure (couldn't even create the http.Request) — return
|
||||
// a 500 without the reactive-health / busy-retry paths.
|
||||
@@ -45,10 +45,10 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
|
||||
|
||||
containerDead := h.maybeMarkContainerDead(ctx, workspaceID)
|
||||
|
||||
if logActivity {
|
||||
h.logA2AFailure(ctx, workspaceID, callerID, body, a2aMethod, err, durationMs)
|
||||
}
|
||||
if containerDead {
|
||||
if logActivity {
|
||||
h.logA2AFailure(ctx, workspaceID, callerID, body, a2aMethod, err, durationMs)
|
||||
}
|
||||
return 0, nil, &proxyA2AError{
|
||||
Status: http.StatusServiceUnavailable,
|
||||
Response: gin.H{"error": "workspace agent unreachable — container restart triggered", "restarting": true},
|
||||
@@ -108,6 +108,9 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
|
||||
ctx, workspaceID, callerID, PriorityTask, body, a2aMethod, idempotencyKey, expiresAt,
|
||||
); qerr == nil {
|
||||
log.Printf("ProxyA2A: target %s busy — enqueued as %s (depth=%d)", workspaceID, qid, depth)
|
||||
if logActivity {
|
||||
h.logA2ABusyQueued(ctx, workspaceID, callerID, body, a2aMethod, durationMs)
|
||||
}
|
||||
respBody, _ := json.Marshal(gin.H{
|
||||
"queued": true,
|
||||
"queue_id": qid,
|
||||
@@ -121,6 +124,9 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
|
||||
// make delegation silently disappear.
|
||||
log.Printf("ProxyA2A: enqueue for %s failed (%v) — falling back to 503", workspaceID, qerr)
|
||||
}
|
||||
if logActivity {
|
||||
h.logA2AFailure(ctx, workspaceID, callerID, body, a2aMethod, err, durationMs)
|
||||
}
|
||||
return 0, nil, &proxyA2AError{
|
||||
Status: http.StatusServiceUnavailable,
|
||||
Headers: map[string]string{"Retry-After": strconv.Itoa(busyRetryAfterSeconds)},
|
||||
@@ -131,6 +137,9 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
|
||||
},
|
||||
}
|
||||
}
|
||||
if logActivity {
|
||||
h.logA2AFailure(ctx, workspaceID, callerID, body, a2aMethod, err, durationMs)
|
||||
}
|
||||
return 0, nil, &proxyA2AError{
|
||||
Status: http.StatusBadGateway,
|
||||
Response: gin.H{"error": "failed to reach workspace agent"},
|
||||
@@ -311,6 +320,33 @@ func (h *WorkspaceHandler) logA2AFailure(ctx context.Context, workspaceID, calle
|
||||
})
|
||||
}
|
||||
|
||||
// logA2ABusyQueued records that a push attempt reached a live but busy
|
||||
// workspace and was durably queued for heartbeat drain.
|
||||
func (h *WorkspaceHandler) logA2ABusyQueued(ctx context.Context, workspaceID, callerID string, body []byte, a2aMethod string, durationMs int) {
|
||||
var wsName string
|
||||
db.DB.QueryRowContext(ctx, `SELECT name FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsName)
|
||||
if wsName == "" {
|
||||
wsName = workspaceID
|
||||
}
|
||||
summary := a2aMethod + " → " + wsName + " (queued: target busy)"
|
||||
parent := ctx
|
||||
h.goAsync(func() {
|
||||
logCtx, cancel := context.WithTimeout(context.WithoutCancel(parent), 30*time.Second)
|
||||
defer cancel()
|
||||
LogActivity(logCtx, h.broadcaster, ActivityParams{
|
||||
WorkspaceID: workspaceID,
|
||||
ActivityType: "a2a_receive",
|
||||
SourceID: nilIfEmpty(callerID),
|
||||
TargetID: &workspaceID,
|
||||
Method: &a2aMethod,
|
||||
Summary: &summary,
|
||||
RequestBody: json.RawMessage(body),
|
||||
DurationMs: &durationMs,
|
||||
Status: "ok",
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// logA2ASuccess records a successful A2A round-trip and (for canvas-initiated
|
||||
// 2xx/3xx responses) broadcasts an A2A_RESPONSE event so the frontend can
|
||||
// receive the reply without polling.
|
||||
|
||||
@@ -1779,6 +1779,58 @@ func TestHandleA2ADispatchError_ContextDeadline(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleA2ADispatchError_BusyEnqueueLogsQueuedNotFailure(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
|
||||
waitForHandlerAsyncBeforeDBCleanup(t, handler)
|
||||
|
||||
mock.ExpectQuery(`INSERT INTO a2a_queue`).
|
||||
WithArgs("ws-busy", nil, PriorityTask, "{}", "message/send", nil, nil).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("11111111-1111-1111-1111-111111111111"))
|
||||
mock.ExpectQuery(`SELECT COUNT\(\*\) FROM a2a_queue`).
|
||||
WithArgs("ws-busy").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1))
|
||||
mock.ExpectQuery(`SELECT name FROM workspaces WHERE id =`).
|
||||
WithArgs("ws-busy").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("Busy Target"))
|
||||
mock.ExpectExec("INSERT INTO activity_logs").
|
||||
WithArgs(
|
||||
"ws-busy",
|
||||
"a2a_receive",
|
||||
nil,
|
||||
sqlmock.AnyArg(),
|
||||
sqlmock.AnyArg(),
|
||||
sqlmock.AnyArg(),
|
||||
sqlmock.AnyArg(),
|
||||
nil,
|
||||
nil,
|
||||
sqlmock.AnyArg(),
|
||||
"ok",
|
||||
nil,
|
||||
).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
status, body, perr := handler.handleA2ADispatchError(
|
||||
context.Background(), "ws-busy", "", []byte("{}"), "message/send",
|
||||
context.DeadlineExceeded, 180002, true,
|
||||
)
|
||||
if perr != nil {
|
||||
t.Fatalf("expected busy enqueue success, got proxy error: %+v", perr)
|
||||
}
|
||||
if status != http.StatusAccepted {
|
||||
t.Fatalf("got status %d, want 202", status)
|
||||
}
|
||||
if !bytes.Contains(body, []byte(`"queued":true`)) {
|
||||
t.Fatalf("expected queued response body, got %s", string(body))
|
||||
}
|
||||
|
||||
time.Sleep(80 * time.Millisecond)
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Fatalf("unmet expectations; busy enqueue must log status=ok, not error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleA2ADispatchError_BuildError(t *testing.T) {
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
@@ -2354,7 +2406,7 @@ func TestLookupDeliveryMode_ContextCanceled_FailsClosed(t *testing.T) {
|
||||
// ==================== a2aClient ResponseHeaderTimeout config ====================
|
||||
|
||||
func TestA2AClientResponseHeaderTimeout(t *testing.T) {
|
||||
const defaultTimeout = 180 * time.Second
|
||||
const defaultTimeout = 5 * time.Minute
|
||||
|
||||
// Default (unset env) — a2aClient was initialised at package load time.
|
||||
if a2aClient.Transport.(*http.Transport).ResponseHeaderTimeout != defaultTimeout {
|
||||
@@ -2378,7 +2430,7 @@ func TestA2AClientResponseHeaderTimeout(t *testing.T) {
|
||||
t.Run("invalid A2A_PROXY_RESPONSE_HEADER_TIMEOUT falls back to default", func(t *testing.T) {
|
||||
t.Setenv("A2A_PROXY_RESPONSE_HEADER_TIMEOUT", "not-a-duration")
|
||||
// Simulate what envx.Duration does with an invalid value.
|
||||
var fallback = 180 * time.Second
|
||||
var fallback = 5 * time.Minute
|
||||
override := fallback
|
||||
if v := os.Getenv("A2A_PROXY_RESPONSE_HEADER_TIMEOUT"); v != "" {
|
||||
if d, err := time.ParseDuration(v); err == nil && d > 0 {
|
||||
|
||||
Reference in New Issue
Block a user