molecule-core/.github/workflows/continuous-synth-e2e.yml
Hongming Wang a306a97dd3 ci(synth-e2e): move cron off :00 to dodge GH scheduler drops
GitHub Actions scheduler de-prioritises :00 cron firings under load.
Empirical 2026-05-03: the canary's cron was '0,20,40 * * * *' but
actual firings landed at :08, :03, :01, :03 — :20 and :40 silently
dropped. Detection latency degraded from claimed 20 min to actual
~60 min worst case.

Move to '10,30,50 * * * *':
- :10/:30/:50 sit 10 min off the top-of-hour load peak
- Still 5 min from :15 sweep-cf-orphans and :45 sweep-cf-tunnels
  (the original constraint that kept us off :15/:45)
- Same 20-min cadence; only the phase changes

No code change beyond the cron expression + comment refresh.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 17:28:45 -07:00

name: Continuous synthetic E2E (staging)
# Hard gate (#2342): cron-driven full-lifecycle E2E that catches
# regressions visible only at runtime — schema drift, deployment-pipeline
# gaps, vendor outages, env-var rotations, DNS / CF / Railway side-effects.
#
# Why this gate exists:
# PR-time CI catches code-level regressions but not deployment-time or
# integration-time ones. Today's empirical data:
#   • #2345 (A2A v0.2 silent drop) — passed all unit tests, broke at
#     the JSON-RPC parse layer between sender and receiver. Visible only
#     to a sender exercising the full path.
#   • RFC #2312 chat upload — landed on staging-branch but never
#     reached staging tenants because publish-workspace-server-image
#     was main-only. Caught by manual dogfooding hours after deploy.
# Both would have surfaced within 15-20 min of regression if a
# continuous synth-E2E had been running.
#
# Cadence: every 20 min (3x/hour). The script is conservatively
# bounded at 10 min wall-clock; even on degraded staging it should
# finish before the next firing. cron-overlap is guarded by the
# concurrency group below.
#
# Cost: ~3 runs/hour × 5-10 min × $0.008/min GHA ≈ $3-6/day.
# Plus a fresh tenant provisioned + torn down each run (Railway +
# AWS pennies). Negligible.
#
# Failure handling: when the run fails, the workflow exits non-zero
# and GitHub's standard email/notification path fires. Operators
# can subscribe to this workflow's failure channel for paging-grade
# alerting.
on:
  schedule:
    # Every 20 minutes, on :10 :30 :50. Two constraints:
    #   1. Stay off the top of the hour. The GitHub Actions scheduler
    #      drops :00 firings under high load (own docs:
    #      https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule).
    #      Empirical 2026-05-03: cron was '0,20,40 * * * *' but actual
    #      firings landed at :08, :03, :01, :03 with :20 + :40 silently
    #      dropped — only the :00-region run survived. Detection
    #      latency degraded from the claimed 20 min to ~60 min.
    #      :10/:30/:50 sit far enough from :00 that GH-load skips
    #      stop dropping us.
    #   2. Avoid colliding with the existing :15 sweep-cf-orphans
    #      and :45 sweep-cf-tunnels — both hit the CF API and we
    #      don't want to fight for rate-limit tokens.
    - cron: '10,30,50 * * * *'
  workflow_dispatch:
    inputs:
      runtime:
        description: "Runtime to provision (claude-code = default + cheapest via MiniMax; langgraph = OpenAI-only; hermes = SDK-native path, slower)"
        required: false
        default: "claude-code"
        type: string
      model_slug:
        description: "Model id to provision the workspace with (default MiniMax-M2.7-highspeed; e.g. 'sonnet' to test direct Anthropic, 'openai/gpt-4o' for hermes)"
        required: false
        default: "MiniMax-M2.7-highspeed"
        type: string
      keep_org:
        description: "Skip teardown for post-mortem debugging (only manual dispatch — never set this for cron runs)"
        required: false
        default: false
        type: boolean
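# Ad-hoc dispatch from the CLI, for operators who want to exercise a
# non-default path. A sketch, not a tested invocation (assumes an
# authenticated gh CLI; input names match the workflow_dispatch block
# above):
#
#   gh workflow run continuous-synth-e2e.yml \
#     -f runtime=langgraph \
#     -f model_slug=openai/gpt-4o \
#     -f keep_org=true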
permissions:
  contents: read
  # No issue-write here — failures surface as red runs in the workflow
  # history. If you want auto-issue-on-fail, add a follow-up step that
  # uses `gh issue create` gated on `if: failure()`. Keeping the surface
  # minimal until that's actually wanted.
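  # A commented-out sketch of that follow-up step, should it ever be
  # wanted (untested; would also need `issues: write` granted above):
  #
  #   - name: File issue on failure
  #     if: failure()
  #     env:
  #       GH_TOKEN: ${{ github.token }}
  #     run: |
  #       gh issue create \
  #         --title "synth-e2e failed: run ${{ github.run_id }}" \
  #         --body "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"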
# Serialize so two firings can never overlap. Cron fires every 20 min
# and the script is conservatively bounded at 10 min — overlap shouldn't
# happen in steady state, but if a run hangs we don't want N more
# stacking up.
concurrency:
  group: continuous-synth-e2e
  cancel-in-progress: false
jobs:
  synth:
    name: Synthetic E2E against staging
    runs-on: ubuntu-latest
    timeout-minutes: 12
    env:
      # claude-code default: cold-start ~5 min (comparable to langgraph),
      # but uses MiniMax-M2.7-highspeed via the template's third-party-
      # Anthropic-compat path (workspace-configs-templates/claude-code-
      # default/config.yaml:64-69). MiniMax is ~5-10x cheaper than
      # gpt-4.1-mini per token AND avoids the recurring OpenAI quota-
      # exhaustion class that took the canary down 2026-05-03 (#265).
      # Operators can pick langgraph / hermes via workflow_dispatch
      # when they specifically need to exercise the OpenAI or SDK-
      # native paths.
      E2E_RUNTIME: ${{ github.event.inputs.runtime || 'claude-code' }}
      # Pin the canary to a specific MiniMax model rather than relying
      # on the per-runtime default ("sonnet" → routes to direct
      # Anthropic, defeating the cost saving). Operators can override
      # via workflow_dispatch by setting a different model_slug input
      # if they need to exercise a specific model. M2.7-highspeed
      # is "Token Plan only" but cheap per token and fast.
      E2E_MODEL_SLUG: ${{ github.event.inputs.model_slug || 'MiniMax-M2.7-highspeed' }}
      # Bounded to 10 min so a stuck provision fails the run instead of
      # holding up the next cron firing. The 15-min default in the script
      # is for the on-PR full lifecycle, where we have more headroom.
      E2E_PROVISION_TIMEOUT_SECS: '600'
      # Slug suffix — namespaced "synth-" so these runs are
      # distinguishable from PR-driven runs in CP admin.
      E2E_RUN_ID: synth-${{ github.run_id }}
      # Forced false for cron; respected for manual dispatch.
      E2E_KEEP_ORG: ${{ github.event.inputs.keep_org == 'true' && '1' || '' }}
      MOLECULE_CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
      MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
      # MiniMax key is the canary's PRIMARY auth path. The claude-code
      # template's `minimax` provider routes ANTHROPIC_BASE_URL to
      # api.minimax.io/anthropic and reads MINIMAX_API_KEY at boot.
      # tests/e2e/test_staging_full_saas.sh branches SECRETS_JSON on
      # which key is present — MiniMax wins when set.
      E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
      # OpenAI fallback — kept wired so operators can dispatch with
      # E2E_RUNTIME=langgraph or =hermes and still have a working
      # canary path. The script picks the right blob shape based on
      # which key is non-empty.
      E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - name: Verify required secrets present
        env:
          # Re-bind so the per-runtime LLM-key check below sees the right
          # secret. The job-level env block already reads both; this just
          # makes the dispatch flag visible inside the conditional shell.
          IS_DISPATCH: ${{ github.event_name == 'workflow_dispatch' }}
        run: |
          # Schedule-vs-dispatch hardening (mirrors the sweep-cf-* and
          # redeploy-tenants-on-* workflows): hard-fail on a missing secret
          # for cron firings so a misconfigured repo doesn't silently
          # report green while doing nothing. Soft-skip on operator
          # dispatch — operators can dispatch ad hoc to verify a fix
          # without setting up the secret first.
          if [ -z "${MOLECULE_ADMIN_TOKEN:-}" ]; then
            if [ "$IS_DISPATCH" = "true" ]; then
              echo "::warning::CP_STAGING_ADMIN_API_TOKEN not set — synth E2E cannot run"
              echo "::warning::Set it at Settings → Secrets and Variables → Actions"
              exit 0
            fi
            echo "::error::CP_STAGING_ADMIN_API_TOKEN secret missing — synth E2E cannot run"
            echo "::error::Set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
            exit 1
          fi
          # The LLM-key requirement is per-runtime: claude-code uses MiniMax
          # (MOLECULE_STAGING_MINIMAX_API_KEY); langgraph + hermes use
          # OpenAI (MOLECULE_STAGING_OPENAI_KEY). A cron firing must have
          # the right key for the active runtime; dispatch can soft-skip.
          case "${E2E_RUNTIME}" in
            claude-code)
              required_secret_name="MOLECULE_STAGING_MINIMAX_API_KEY"
              required_secret_value="${E2E_MINIMAX_API_KEY:-}"
              ;;
            langgraph|hermes)
              required_secret_name="MOLECULE_STAGING_OPENAI_KEY"
              required_secret_value="${E2E_OPENAI_API_KEY:-}"
              ;;
            *)
              echo "::warning::Unknown E2E_RUNTIME='${E2E_RUNTIME}' — skipping LLM-key check"
              required_secret_name=""
              required_secret_value="present"
              ;;
          esac
          if [ -n "$required_secret_name" ] && [ -z "$required_secret_value" ]; then
            if [ "$IS_DISPATCH" = "true" ]; then
              echo "::warning::${required_secret_name} not set — synth E2E with runtime=${E2E_RUNTIME} cannot reach an LLM"
              echo "::warning::Set it at Settings → Secrets and Variables → Actions, OR dispatch with a different runtime"
              exit 0
            fi
            echo "::error::${required_secret_name} secret missing — runtime=${E2E_RUNTIME} cannot authenticate against its LLM provider"
            echo "::error::Set it at Settings → Secrets and Variables → Actions"
            exit 1
          fi
      - name: Verify required tools
        run: |
          # The script depends on jq + curl (already on ubuntu-latest)
          # and python3 (likewise). Verify they're all present so we
          # fail fast on a runner-image regression rather than mid-script.
          for cmd in jq curl python3; do
            command -v "$cmd" >/dev/null 2>&1 || {
              echo "::error::required tool '$cmd' not on PATH — runner image regression?"
              exit 1
            }
          done
      - name: Run synthetic E2E
        # The script handles its own teardown via an EXIT trap; even on
        # failure (timeout, assertion), the org is deprovisioned and
        # leaks are reported. The exit code propagates from the script.
        run: |
          bash tests/e2e/test_staging_full_saas.sh
      - name: Failure summary
        # Runs only on failure. Adds a job summary so the workflow run
        # page shows a quick "what happened" instead of forcing readers
        # to scroll through script output.
        if: failure()
        run: |
          {
            echo "## Continuous synth E2E failed"
            echo ""
            echo "**Run ID:** ${{ github.run_id }}"
            echo "**Trigger:** ${{ github.event_name }}"
            echo "**Runtime:** ${E2E_RUNTIME}"
            echo "**Slug:** synth-${{ github.run_id }}"
            echo ""
            echo "### What this means"
            echo ""
            echo "Staging just regressed on a path that previously worked. Likely classes:"
            echo "- Schema mismatch between sender and receiver (#2345 class)"
            echo "- Deployment-pipeline gap (RFC #2312 / staging-tenant-image-stale class)"
            echo "- Vendor outage (Cloudflare, Railway, AWS, GHCR)"
            echo "- Staging-CP env var rotation"
            echo ""
            echo "### Next steps"
            echo ""
            echo "1. Check the script output above for the assertion that failed"
            echo "2. If it's a vendor outage, no action needed — next firing in ~20 min"
            echo "3. If it's a code regression, find the offending PR via \`git log\` against the last green run and revert/fix"
            echo "4. Keep an eye on the next 1-2 firings — a flake vs a persistent fail differs in priority"
          } >> "$GITHUB_STEP_SUMMARY"
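# For step 3 in the summary above, a sketch for locating the last green
# run's commit with the gh CLI (workflow selected by its `name:` field;
# `origin/staging-branch` is an assumed remote ref, adjust to taste):
#
#   gh run list --workflow "Continuous synthetic E2E (staging)" \
#     --status success --limit 1 --json headSha
#   git log <last-green-sha>..origin/staging-branch --oneline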