diff --git a/canvas/e2e/staging-setup.ts b/canvas/e2e/staging-setup.ts
index 873ac07bd..b94d6793f 100644
--- a/canvas/e2e/staging-setup.ts
+++ b/canvas/e2e/staging-setup.ts
@@ -234,23 +234,48 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
     "Authorization": `Bearer ${tenantToken}`,
     "X-Molecule-Org-Id": orgID,
   };
-  const ws = await jsonFetch(`${tenantURL}/workspaces`, {
-    method: "POST",
-    headers: tenantAuth,
-    body: JSON.stringify({
-      name: "E2E Canvas Test",
-      runtime: "hermes",
-      tier: 2,
-      model: "gpt-4o",
-    }),
-  });
-  if (ws.status >= 400 || !ws.body?.id) {
-    throw new Error(`Workspace create ${ws.status}: ${JSON.stringify(ws.body)}`);
+  // Retry workspace creation on transient 5xx / timeout — staging CP can
+  // return 502/503/504 under load and a single-shot failure kills the
+  // entire E2E run. Up to 3 attempts, sleeping 3s then 6s between them
+  // (~9s of backoff at most), well inside the 20-min provision envelope.
+  let workspaceId = "";
+  for (let attempt = 1; attempt <= 3; attempt++) {
+    const ws = await jsonFetch(`${tenantURL}/workspaces`, {
+      method: "POST",
+      headers: tenantAuth,
+      body: JSON.stringify({
+        name: "E2E Canvas Test",
+        runtime: "hermes",
+        tier: 2,
+        model: "gpt-4o",
+      }),
+    });
+    if (ws.status >= 200 && ws.status < 300 && ws.body?.id) {
+      workspaceId = ws.body.id as string;
+      break;
+    }
+    const isTransient = ws.status >= 500 || ws.status === 0;
+    if (!isTransient || attempt === 3) {
+      throw new Error(`Workspace create ${ws.status} (attempt ${attempt}): ${JSON.stringify(ws.body)}`);
+    }
+    const backoff = 3000 * Math.pow(2, attempt - 1);
+    console.log(`[staging-setup] Workspace create transient ${ws.status}, retrying in ${backoff}ms...`);
+    await new Promise((r) => setTimeout(r, backoff));
   }
-  const workspaceId = ws.body.id as string;
   console.log(`[staging-setup] Workspace created: ${workspaceId}`);
 
   // 6. Wait for workspace online
+  //
+  // Hermes cold-boot takes 10-13 min on slow apt days (apt + uv + hermes
+  // install + npm browser-tools). The controlplane bootstrap-watcher
+  // deadline fires at 5 min and sets status=failed prematurely; heartbeat
+  // then transitions failed → online after install.sh finishes. So
+  // 'failed' is a TRANSIENT state we must tolerate — log once and keep
+  // polling, only hard-fail at the deadline. Pre-fix this was a flake
+  // generator: workspace went failed→online inside our window but we
+  // bailed at the failed read. See test_staging_full_saas.sh step 7/11
+  // and issue #2632.
+  let wsFailedLogged = false;
   await waitFor<boolean>(
     async () => {
       const r = await jsonFetch(`${tenantURL}/workspaces/${workspaceId}`, {
@@ -259,17 +284,21 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
       if (r.status !== 200) return null;
       if (r.body?.status === "online") return true;
       if (r.body?.status === "failed") {
-        // last_sample_error is often empty when the failure happens before
-        // the agent emits a sample (e.g. boot crash, image pull error,
-        // missing PYTHONPATH, OpenAI quota at startup). Dumping the full
-        // body gives triage the boot_stage / last_error / image fields it
-        // needs without a second probe. Otherwise this propagates as a
-        // bare "Workspace failed: " — the exact useless message that
-        // sent #2632 to the issue tracker.
-        const detail = r.body.last_sample_error
-          ? r.body.last_sample_error
-          : `(no last_sample_error) full body: ${JSON.stringify(r.body)}`;
-        throw new Error(`Workspace failed: ${detail}`);
+        if (!wsFailedLogged) {
+          // last_sample_error is often empty when the failure happens before
+          // the agent emits a sample (e.g. boot crash, image pull error,
+          // missing PYTHONPATH, OpenAI quota at startup). Dumping the full
+          // body gives triage the boot_stage / last_error / image fields it
+          // needs without a second probe. Otherwise this propagates as a
+          // bare "Workspace failed: " — the exact useless message that
+          // sent #2632 to the issue tracker.
+          const detail = r.body.last_sample_error
+            ? r.body.last_sample_error
+            : `(no last_sample_error) full body: ${JSON.stringify(r.body)}`;
+          console.log(`[staging-setup] workspace ${workspaceId} transiently failed — waiting for heartbeat recovery (bootstrap-watcher deadline, see cp#245). detail: ${detail}`);
+          wsFailedLogged = true;
+        }
+        return null;
       }
       return null;
     },
diff --git a/workspace-server/internal/channels/telegram.go b/workspace-server/internal/channels/telegram.go
index 3d323057c..11e59ee57 100644
--- a/workspace-server/internal/channels/telegram.go
+++ b/workspace-server/internal/channels/telegram.go
@@ -517,7 +517,9 @@ func (t *TelegramAdapter) StartPolling(ctx context.Context, config map[string]in
 
 				// Acknowledge the button press (removes loading spinner)
 				ackCfg := tgbotapi.NewCallback(cb.ID, "Received")
-				bot.Send(ackCfg)
+				if _, err := bot.Send(ackCfg); err != nil {
+					log.Printf("telegram: failed to send callback ack: %v", err)
+				}
 
 				// Update the message to show what was clicked
 				decision := "approved"
@@ -529,7 +531,9 @@ func (t *TelegramAdapter) StartPolling(ctx context.Context, config map[string]in
 					cb.Message.MessageID,
 					cb.Message.Text+"\n\n✅ CEO "+decision,
 				)
-				bot.Send(editMsg)
+				if _, err := bot.Send(editMsg); err != nil {
+					log.Printf("telegram: failed to send edit message: %v", err)
+				}
 
 				// Route the decision as an inbound message to the agent
 				inbound := &InboundMessage{
diff --git a/workspace-server/internal/handlers/approvals.go b/workspace-server/internal/handlers/approvals.go
index 985010cf8..40393c9f6 100644
--- a/workspace-server/internal/handlers/approvals.go
+++ b/workspace-server/internal/handlers/approvals.go
@@ -54,23 +54,29 @@ func (h *ApprovalsHandler) Create(c *gin.Context) {
 		return
 	}
 
-	h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalRequested), workspaceID, map[string]interface{}{
+	if err := h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalRequested), workspaceID, map[string]interface{}{
 		"approval_id": approvalID,
 		"action":      body.Action,
 		"reason":      body.Reason,
 		"task_id":     body.TaskID,
-	})
+	}); err != nil {
+		log.Printf("approvals: failed to broadcast approval requested: %v", err)
+	}
 
 	// Auto-escalate to parent
 	var parentID *string
-	db.DB.QueryRowContext(ctx, `SELECT parent_id FROM workspaces WHERE id = $1`, workspaceID).Scan(&parentID)
+	if err := db.DB.QueryRowContext(ctx, `SELECT parent_id FROM workspaces WHERE id = $1`, workspaceID).Scan(&parentID); err != nil {
+		log.Printf("approvals: failed to lookup parent for escalation: %v", err)
+	}
 	if parentID != nil {
-		h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalEscalated), *parentID, map[string]interface{}{
+		if err := h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalEscalated), *parentID, map[string]interface{}{
 			"approval_id":       approvalID,
 			"from_workspace_id": workspaceID,
 			"action":            body.Action,
 			"reason":            body.Reason,
-		})
+		}); err != nil {
+			log.Printf("approvals: failed to broadcast approval escalated: %v", err)
+		}
 	}
 
 	c.JSON(http.StatusCreated, gin.H{"approval_id": approvalID, "status": "pending"})
@@ -221,11 +227,13 @@ func (h *ApprovalsHandler) Decide(c *gin.Context) {
 		eventType = "APPROVAL_DENIED"
 	}
 
-	h.broadcaster.RecordAndBroadcast(ctx, eventType, workspaceID, map[string]interface{}{
+	if err := h.broadcaster.RecordAndBroadcast(ctx, eventType, workspaceID, map[string]interface{}{
 		"approval_id": approvalID,
 		"decision":    body.Decision,
 		"decided_by":  decidedBy,
-	})
+	}); err != nil {
+		log.Printf("approvals: failed to broadcast approval decision: %v", err)
+	}
 
 	c.JSON(http.StatusOK, gin.H{"status": body.Decision, "approval_id": approvalID})
 }