From 24d64677abe0d48144bb7676188ca3e3a078849d Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 3 May 2026 16:04:04 -0700 Subject: [PATCH 1/2] canvas/Legend: focus rings + 24x24 close-button touch target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two small a11y fixes for the floating legend. 1. Both buttons (open pill + close ×) had no focus-visible ring — keyboard users couldn't tell where focus landed. Added the accent-ring pattern used across the rest of the canvas. 2. Close button was a ~10x16px hit area — well below WCAG 2.5.8's 24x24 minimum (Target Size, Minimum; 2.5.5 is the 44x44 AAA criterion). Bumped to w-6 h-6 with negative margin so the visible × stays in the same spot but the hit area + focus ring are larger. Hover bg added to make the hit area visible on hover. Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/src/components/Legend.tsx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/canvas/src/components/Legend.tsx b/canvas/src/components/Legend.tsx index c6d67365..f4137ff7 100644 --- a/canvas/src/components/Legend.tsx +++ b/canvas/src/components/Legend.tsx @@ -77,7 +77,7 @@ export function Legend() { onClick={openLegend} aria-label="Show legend" title="Show legend" - className={`fixed bottom-6 ${leftClass} z-30 flex items-center gap-1.5 rounded-full bg-surface-sunken/95 border border-line/50 px-3 py-1.5 text-[11px] font-semibold text-ink-mid uppercase tracking-wider shadow-xl shadow-black/30 backdrop-blur-sm hover:text-ink hover:border-line transition-[left,colors] duration-200`} + className={`fixed bottom-6 ${leftClass} z-30 flex items-center gap-1.5 rounded-full bg-surface-sunken/95 border border-line/50 px-3 py-1.5 text-[11px] font-semibold text-ink-mid uppercase tracking-wider shadow-xl shadow-black/30 backdrop-blur-sm hover:text-ink hover:border-line focus:outline-none focus-visible:ring-2 focus-visible:ring-accent/60 focus-visible:ring-offset-2 focus-visible:ring-offset-surface transition-[left,colors] duration-200`} 
> Legend @@ -94,7 +94,10 @@ export function Legend() { onClick={closeLegend} aria-label="Hide legend" title="Hide legend" - className="-mt-0.5 -mr-1 px-1.5 text-[14px] leading-none text-ink-soft hover:text-ink transition-colors" + // 24×24 touch target (was ~10×16, well under WCAG 2.5.8 min). + // Negative margin keeps the visual position the same as before + // — only the hit area + focus ring are larger. + className="-mt-1.5 -mr-1.5 w-6 h-6 inline-flex items-center justify-center rounded text-[14px] leading-none text-ink-soft hover:text-ink hover:bg-surface-card/40 focus:outline-none focus-visible:ring-2 focus-visible:ring-accent/60 transition-colors" > × From 3cd8c53de032f56a52ae4f6c526ef6654029447f Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 3 May 2026 16:08:40 -0700 Subject: [PATCH 2/2] ci: tighten e2e cleanup race window ~3h -> ~45m worst case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes that close one of the leak classes from the molecule-controlplane#420 vCPU audit: 1. sweep-stale-e2e-orgs.yml: cron */15 (was hourly), MAX_AGE_MINUTES 30 (was 120). E2E runs are 8-25 min wall clock; 30 min is safely above the longest run while shrinking the worst-case leak window from ~3h (hourly sweep cadence + 120-min threshold) to ~45 min (15-min sweep cadence + 30-min threshold). 2. canary-staging.yml teardown: the per-slug DELETE used `>/dev/null || true`, which swallowed every failure. A 5xx or timeout from CP looked identical to "successfully deleted" and the canary tenant kept eating ~2 vCPU until the sweeper caught it. Now we capture the response code and surface non-2xx as a workflow warning that names the leaked slug. The exit semantics stay unchanged — a single-canary cleanup miss shouldn't fail-flag the canary itself when the actual smoke check passed. The sweeper is the safety net for whatever slips past. 
Caught during the molecule-controlplane#420 audit on 2026-05-03 — 3 e2e canary tenant orphans were running for 24-95 min, all under the previous 120-min sweep threshold so they went unnoticed until manual cleanup. Same `|| true` pattern exists in e2e-staging-{canvas,external,saas,sanity}.yml; out of scope for this PR (mechanical port; tracking separately) but the sweeper tightening covers all of them by reducing the safety-net latency. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/canary-staging.yml | 28 ++++++++++++++++++++-- .github/workflows/sweep-stale-e2e-orgs.yml | 21 ++++++++++------ 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/.github/workflows/canary-staging.yml b/.github/workflows/canary-staging.yml index 396d53ca..93f53ca7 100644 --- a/.github/workflows/canary-staging.yml +++ b/.github/workflows/canary-staging.yml @@ -231,10 +231,34 @@ jobs: and o.get('status') not in ('purged',)] print('\n'.join(candidates)) " 2>/dev/null) + # Per-slug DELETE with HTTP-code verification. The previous + # `... >/dev/null || true` swallowed every failure, so a 5xx + # or timeout from CP looked identical to "successfully cleaned + # up" and the tenant kept eating ~2 vCPU until the hourly + # stale sweep caught it (up to ~3h later). Now we capture the + # response code and surface non-2xx as a workflow warning, so + # the run page shows which slug leaked. We still don't `exit 1` + # on cleanup failure — a single-canary cleanup miss shouldn't + # fail-flag the canary itself when the actual smoke check + # passed. The sweep-stale-e2e-orgs cron (now every 15 min, + # 30-min threshold) is the safety net for whatever slips past. + # See molecule-controlplane#420. 
+ leaks=() for slug in $orgs; do - curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ + code=$(curl -sS -o /tmp/canary-cleanup.out -w "%{http_code}" \ + -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \ -H "Authorization: Bearer $ADMIN_TOKEN" \ -H "Content-Type: application/json" \ - -d "{\"confirm\":\"$slug\"}" >/dev/null || true + -d "{\"confirm\":\"$slug\"}" \ + || echo "000") + if [ "$code" = "200" ] || [ "$code" = "204" ]; then + echo "[teardown] deleted $slug (HTTP $code)" + else + echo "::warning::canary teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/canary-cleanup.out 2>/dev/null)" + leaks+=("$slug") + fi done + if [ ${#leaks[@]} -gt 0 ]; then + echo "::warning::canary teardown left ${#leaks[@]} leak(s): ${leaks[*]}" + fi exit 0 diff --git a/.github/workflows/sweep-stale-e2e-orgs.yml b/.github/workflows/sweep-stale-e2e-orgs.yml index 5a0dce30..d2fcb8be 100644 --- a/.github/workflows/sweep-stale-e2e-orgs.yml +++ b/.github/workflows/sweep-stale-e2e-orgs.yml @@ -25,16 +25,23 @@ name: Sweep stale e2e-* orgs (staging) on: schedule: - # Every hour on the hour. E2E orgs are short-lived (~10-25 min wall - # clock from create to teardown). Anything older than the - # MAX_AGE_MINUTES threshold below is presumed dead. - - cron: '0 * * * *' + # Every 15 min. E2E orgs are short-lived (~8-25 min wall clock from + # create to teardown — canary is ~8 min, full SaaS ~25 min). The + # previous hourly + 120-min stale threshold meant a leaked tenant + # could keep an EC2 alive for up to ~3 hours, eating ~2 vCPU per + # leak. Tightening the cadence + threshold reduces the worst-case + # leak window from ~3h to ~45 min (15-min sweep cadence + 30-min + # threshold) without risk of catching in-progress runs (the longest + # e2e run is the ~25-min full SaaS run, under the 30-min threshold). + # See molecule-controlplane#420 for the leak-class accounting that + # motivated this tightening. 
+ - cron: '*/15 * * * *' workflow_dispatch: inputs: max_age_minutes: - description: "Delete e2e-* orgs older than N minutes (default 120)" + description: "Delete e2e-* orgs older than N minutes (default 30)" required: false - default: "120" + default: "30" dry_run: description: "Dry run only — list what would be deleted" required: false @@ -58,7 +65,7 @@ jobs: env: MOLECULE_CP_URL: https://staging-api.moleculesai.app ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }} - MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '120' }} + MAX_AGE_MINUTES: ${{ github.event.inputs.max_age_minutes || '30' }} DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }} # Refuse to delete more than this many orgs in one tick. If the # CP DB is briefly empty (or the admin endpoint goes weird and