From 5c0c15eb4f2fabf63b5fa6129de4f3d22cfcc647 Mon Sep 17 00:00:00 2001
From: dev-lead <dev-lead@moleculesai.app>
Date: Fri, 8 May 2026 10:58:19 -0700
Subject: [PATCH] chore(canary): workflow_dispatch input keep_on_failure for
 log capture
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Investigating molecule-core#129 failure mode #1 (claude-code "Agent
error (Exception)") needs the workspace's docker logs to find the
actual exception. The canary tears down the tenant on every failure,
so the workspace container is destroyed before anyone can SSM in.

Add a workflow_dispatch input `keep_on_failure` (boolean, default
false). When true, the job sets `E2E_KEEP_ORG=1` for the canary script
(otherwise `E2E_KEEP_ORG=0`) — the script's existing debug path skips
teardown, leaving the tenant + EC2 + CF tunnel + DNS alive. The
operator can then SSM into the workspace EC2 (via the same flow as
recover-tunnels.py) and capture `docker logs` from the claude-code
container.

Cron-triggered runs cannot set the input (it only exists on
workflow_dispatch), so the expression evaluates to `0` and scheduled
canaries always tear down — no risk of an unattended cost leak.

Operator workflow:
  1. Dispatch canary-staging.yml with keep_on_failure=true
  2. Watch CI; on failure (likely, given the 38h chronic red),
     note the SLUG / TENANT_URL printed at step 1/11
  3. SSM exec into the workspace EC2 (us-east-2) and run
     `docker logs <claude-code-container>` to find the actual
     exception traceback
  4. Manually delete via DELETE /cp/admin/tenants/<slug> when done
     (the script logs this reminder on E2E_KEEP_ORG=1 path)

Refs: molecule-core#129 (canary investigation)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/canary-staging.yml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
diff --git a/.github/workflows/canary-staging.yml b/.github/workflows/canary-staging.yml
index 8f0d74ac..bf75c57f 100644
--- a/.github/workflows/canary-staging.yml
+++ b/.github/workflows/canary-staging.yml
@@ -20,6 +20,19 @@ on:
     # a few minutes under load — that's fine for a canary.
     - cron: '*/30 * * * *'
   workflow_dispatch:
+    inputs:
+      keep_on_failure:
+        description: >-
+          Skip teardown when the canary fails (debugging only). The
+          tenant org + EC2 + CF tunnel + DNS stay alive so an operator
+          can SSM into the workspace EC2 and capture docker logs of the
+          failing claude-code container. REMEMBER to manually delete
+          via DELETE /cp/admin/tenants/<slug> when done so the org
+          doesn't accumulate cost. Only honored on workflow_dispatch;
+          cron runs always tear down (we don't want unattended cron
+          to leak resources).
+        type: boolean
+        default: false
 
 # Serialise with the full-SaaS workflow so they don't contend for the
 # same org-create quota on staging. Different group key from
@@ -80,6 +93,14 @@ jobs:
       # is "Token Plan only" but cheap-per-token and fast.
       E2E_MODEL_SLUG: MiniMax-M2.7-highspeed
       E2E_RUN_ID: "canary-${{ github.run_id }}"
+      # Debug-only: when an operator dispatches with keep_on_failure=true,
+      # the canary script's E2E_KEEP_ORG=1 path skips teardown so the
+      # tenant org + EC2 stay alive for SSM-based log capture. On cron
+      # runs the input does not exist (workflow_dispatch only), so this
+      # evaluates to '0' and cron always tears down. See molecule-core#129
+      # failure mode #1 — capturing the actual exception requires
+      # docker logs from the live container.
+      E2E_KEEP_ORG: ${{ github.event.inputs.keep_on_failure == 'true' && '1' || '0' }}
 
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
-- 
2.45.2