diff --git a/canvas/e2e/staging-setup.ts b/canvas/e2e/staging-setup.ts index 873ac07bd..b94d6793f 100644 --- a/canvas/e2e/staging-setup.ts +++ b/canvas/e2e/staging-setup.ts @@ -234,23 +234,48 @@ export default async function globalSetup(_config: FullConfig): Promise { "Authorization": `Bearer ${tenantToken}`, "X-Molecule-Org-Id": orgID, }; - const ws = await jsonFetch(`${tenantURL}/workspaces`, { - method: "POST", - headers: tenantAuth, - body: JSON.stringify({ - name: "E2E Canvas Test", - runtime: "hermes", - tier: 2, - model: "gpt-4o", - }), - }); - if (ws.status >= 400 || !ws.body?.id) { - throw new Error(`Workspace create ${ws.status}: ${JSON.stringify(ws.body)}`); + // Retry workspace creation on transient 5xx / timeout — staging CP can + // return 502/503/504 under load and a single-shot failure kills the + // entire E2E run. 3 attempts with exponential backoff between them (3s, + // then 6s) give ~9s of total wait, well inside the 20-min provision envelope. + let workspaceId = ""; + for (let attempt = 1; attempt <= 3; attempt++) { + const ws = await jsonFetch(`${tenantURL}/workspaces`, { + method: "POST", + headers: tenantAuth, + body: JSON.stringify({ + name: "E2E Canvas Test", + runtime: "hermes", + tier: 2, + model: "gpt-4o", + }), + }); + if (ws.status >= 200 && ws.status < 300 && ws.body?.id) { + workspaceId = ws.body.id as string; + break; + } + const isTransient = ws.status >= 500 || ws.status === 0; + if (!isTransient || attempt === 3) { + throw new Error(`Workspace create ${ws.status} (attempt ${attempt}): ${JSON.stringify(ws.body)}`); + } + const backoff = 3000 * Math.pow(2, attempt - 1); + console.log(`[staging-setup] Workspace create transient ${ws.status}, retrying in ${backoff}ms...`); + await new Promise((r) => setTimeout(r, backoff)); } - const workspaceId = ws.body.id as string; console.log(`[staging-setup] Workspace created: ${workspaceId}`); // 6. 
Wait for workspace online + // + // Hermes cold-boot takes 10-13 min on slow apt days (apt + uv + hermes + // install + npm browser-tools). The controlplane bootstrap-watcher + // deadline fires at 5 min and sets status=failed prematurely; heartbeat + // then transitions failed → online after install.sh finishes. So + // 'failed' is a TRANSIENT state we must tolerate — log once and keep + // polling, only hard-fail at the deadline. Pre-fix this was a flake + // generator: workspace went failed→online inside our window but we + // bailed at the failed read. See test_staging_full_saas.sh step 7/11 + // and issue #2632. + let wsFailedLogged = false; await waitFor( async () => { const r = await jsonFetch(`${tenantURL}/workspaces/${workspaceId}`, { @@ -259,17 +284,21 @@ export default async function globalSetup(_config: FullConfig): Promise { if (r.status !== 200) return null; if (r.body?.status === "online") return true; if (r.body?.status === "failed") { - // last_sample_error is often empty when the failure happens before - // the agent emits a sample (e.g. boot crash, image pull error, - // missing PYTHONPATH, OpenAI quota at startup). Dumping the full - // body gives triage the boot_stage / last_error / image fields it - // needs without a second probe. Otherwise this propagates as a - // bare "Workspace failed: " — the exact useless message that - // sent #2632 to the issue tracker. - const detail = r.body.last_sample_error - ? r.body.last_sample_error - : `(no last_sample_error) full body: ${JSON.stringify(r.body)}`; - throw new Error(`Workspace failed: ${detail}`); + if (!wsFailedLogged) { + // last_sample_error is often empty when the failure happens before + // the agent emits a sample (e.g. boot crash, image pull error, + // missing PYTHONPATH, OpenAI quota at startup). Dumping the full + // body gives triage the boot_stage / last_error / image fields it + // needs without a second probe. 
Otherwise this propagates as a + // bare "Workspace failed: " — the exact useless message that + // sent #2632 to the issue tracker. + const detail = r.body.last_sample_error + ? r.body.last_sample_error + : `(no last_sample_error) full body: ${JSON.stringify(r.body)}`; + console.log(`[staging-setup] workspace ${workspaceId} transiently failed — waiting for heartbeat recovery (bootstrap-watcher deadline, see cp#245). detail: ${detail}`); + wsFailedLogged = true; + } + return null; } return null; }, diff --git a/workspace-server/internal/channels/telegram.go b/workspace-server/internal/channels/telegram.go index 3d323057c..11e59ee57 100644 --- a/workspace-server/internal/channels/telegram.go +++ b/workspace-server/internal/channels/telegram.go @@ -517,7 +517,9 @@ func (t *TelegramAdapter) StartPolling(ctx context.Context, config map[string]in // Acknowledge the button press (removes loading spinner) ackCfg := tgbotapi.NewCallback(cb.ID, "Received") - bot.Send(ackCfg) + if _, err := bot.Send(ackCfg); err != nil { + log.Printf("telegram: failed to send callback ack: %v", err) + } // Update the message to show what was clicked decision := "approved" @@ -529,7 +531,9 @@ func (t *TelegramAdapter) StartPolling(ctx context.Context, config map[string]in cb.Message.MessageID, cb.Message.Text+"\n\n✅ CEO "+decision, ) - bot.Send(editMsg) + if _, err := bot.Send(editMsg); err != nil { + log.Printf("telegram: failed to send edit message: %v", err) + } // Route the decision as an inbound message to the agent inbound := &InboundMessage{ diff --git a/workspace-server/internal/handlers/approvals.go b/workspace-server/internal/handlers/approvals.go index 985010cf8..40393c9f6 100644 --- a/workspace-server/internal/handlers/approvals.go +++ b/workspace-server/internal/handlers/approvals.go @@ -54,23 +54,29 @@ func (h *ApprovalsHandler) Create(c *gin.Context) { return } - h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalRequested), workspaceID, map[string]interface{}{ + 
if err := h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalRequested), workspaceID, map[string]interface{}{ "approval_id": approvalID, "action": body.Action, "reason": body.Reason, "task_id": body.TaskID, - }) + }); err != nil { + log.Printf("approvals: failed to broadcast approval requested: %v", err) + } // Auto-escalate to parent var parentID *string - db.DB.QueryRowContext(ctx, `SELECT parent_id FROM workspaces WHERE id = $1`, workspaceID).Scan(&parentID) + if err := db.DB.QueryRowContext(ctx, `SELECT parent_id FROM workspaces WHERE id = $1`, workspaceID).Scan(&parentID); err != nil { + log.Printf("approvals: failed to lookup parent for escalation: %v", err) + } if parentID != nil { - h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalEscalated), *parentID, map[string]interface{}{ + if err := h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalEscalated), *parentID, map[string]interface{}{ "approval_id": approvalID, "from_workspace_id": workspaceID, "action": body.Action, "reason": body.Reason, - }) + }); err != nil { + log.Printf("approvals: failed to broadcast approval escalated: %v", err) + } } c.JSON(http.StatusCreated, gin.H{"approval_id": approvalID, "status": "pending"}) @@ -221,11 +227,13 @@ func (h *ApprovalsHandler) Decide(c *gin.Context) { eventType = "APPROVAL_DENIED" } - h.broadcaster.RecordAndBroadcast(ctx, eventType, workspaceID, map[string]interface{}{ + if err := h.broadcaster.RecordAndBroadcast(ctx, eventType, workspaceID, map[string]interface{}{ "approval_id": approvalID, "decision": body.Decision, "decided_by": decidedBy, - }) + }); err != nil { + log.Printf("approvals: failed to broadcast approval decision: %v", err) + } c.JSON(http.StatusOK, gin.H{"status": body.Decision, "approval_id": approvalID}) }