From 85261b1af9c532fb3154e1769a5a67904132a6be Mon Sep 17 00:00:00 2001 From: Molecule AI Infra-SRE Date: Mon, 11 May 2026 06:07:08 +0000 Subject: [PATCH 1/9] fix(docker): resolve duplicate services conflict (PR #385) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - docker-compose.yml: remove duplicate postgres/redis/langfuse-db-init/ langfuse-clickhouse definitions; import all infra services via include: docker-compose.infra.yml (Docker Compose v2 require directive) - docker-compose.infra.yml: add networks + restart policies to infra services; rename clickhouse → langfuse-clickhouse to match the name docker-compose.yml was importing; update langfuse-web depends_on and CLICKHOUSE_URL accordingly Co-Authored-By: Claude Opus 4.7 --- docker-compose.infra.yml | 18 +++++++--- docker-compose.yml | 78 ---------------------------------------- 2 files changed, 14 insertions(+), 82 deletions(-) diff --git a/docker-compose.infra.yml b/docker-compose.infra.yml index 0b7dbced..e25834b6 100644 --- a/docker-compose.infra.yml +++ b/docker-compose.infra.yml @@ -11,6 +11,9 @@ services: - "5432:5432" volumes: - pgdata:/var/lib/postgresql/data + networks: + - molecule-core-net + restart: unless-stopped healthcheck: test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-dev}"] interval: 2s @@ -25,6 +28,8 @@ services: environment: POSTGRES_USER: ${POSTGRES_USER:-dev} POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-dev} + networks: + - molecule-core-net command: - /bin/sh - -c @@ -45,6 +50,9 @@ services: - "6379:6379" volumes: - redisdata:/data + networks: + - molecule-core-net + restart: unless-stopped healthcheck: test: ["CMD", "redis-cli", "ping"] interval: 2s @@ -52,7 +60,7 @@ services: retries: 10 # digest-pinned 2026-05-10 (sha256:5b296e0ba1da74efea3143c773ddd60245f249fb7c72eb1d866c2d6ebc759fbe, linux/amd64) - clickhouse: + langfuse-clickhouse: image: clickhouse/clickhouse-server@sha256:5b296e0ba1da74efea3143c773ddd60245f249fb7c72eb1d866c2d6ebc759fbe environment: CLICKHOUSE_DB: langfuse @@ -60,6 +68,8 @@ services: CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD:-langfuse-dev} volumes: - clickhousedata:/var/lib/clickhouse + networks: + - molecule-core-net healthcheck: test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://127.0.0.1:8123/ping || exit 1"] interval: 5s @@ -104,7 +114,7 @@ services: langfuse-web: image: langfuse/langfuse@sha256:e7aafd3ccf721821b40f8b2251220b4bb8af5e4877b5c5a8846af5b3318aaf1d depends_on: - clickhouse: + langfuse-clickhouse: condition: service_healthy langfuse-db-init: condition: service_completed_successfully @@ -113,8 +123,8 @@ services: # Langfuse v2 expects the HTTP interface (port 8123). The previous # clickhouse://...:9000 native-protocol URL is rejected with # "ClickHouse URL protocol must be either http or https". - CLICKHOUSE_URL: http://clickhouse:8123 - CLICKHOUSE_MIGRATION_URL: clickhouse://clickhouse:9000 + CLICKHOUSE_URL: http://langfuse-clickhouse:8123 + CLICKHOUSE_MIGRATION_URL: clickhouse://langfuse-clickhouse:9000 CLICKHOUSE_USER: langfuse CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD:-langfuse-dev} NEXTAUTH_SECRET: ${LANGFUSE_SECRET:-changeme-langfuse-secret} diff --git a/docker-compose.yml b/docker-compose.yml index 782a314c..eb80449e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,85 +3,7 @@ include: - docker-compose.infra.yml services: - # --- Infrastructure --- - # digest-pinned 2026-05-10 (sha256:4941ef97aaa2633ce9808f7766f8b8d746dd039ce8c51ca6da185c3dc63ab579, linux/amd64) - postgres: - image: postgres@sha256:4941ef97aaa2633ce9808f7766f8b8d746dd039ce8c51ca6da185c3dc63ab579 - environment: - POSTGRES_USER: ${POSTGRES_USER:-dev} - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-dev} - POSTGRES_DB: ${POSTGRES_DB:-molecule} - command: ["postgres", "-c", "wal_level=logical"] - ports: - - "5432:5432" - volumes: - - pgdata:/var/lib/postgresql/data - networks: - - molecule-core-net - restart: unless-stopped - healthcheck: - test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-dev}"] - interval: 2s - timeout: 5s - retries: 10 - - langfuse-db-init: - image: postgres@sha256:4941ef97aaa2633ce9808f7766f8b8d746dd039ce8c51ca6da185c3dc63ab579 - depends_on: - postgres: - condition: service_healthy - environment: - POSTGRES_USER: ${POSTGRES_USER:-dev} - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-dev} - command: - - /bin/sh - - -c - - | - export PGPASSWORD="$${POSTGRES_PASSWORD}" - until pg_isready -h postgres -U "$${POSTGRES_USER}" -d postgres >/dev/null 2>&1; do - sleep 1 - done - if ! psql -h postgres -U "$${POSTGRES_USER}" -d postgres -tAc "SELECT 1 FROM pg_database WHERE datname = 'langfuse'" | grep -q 1; then - psql -h postgres -U "$${POSTGRES_USER}" -d postgres -c "CREATE DATABASE langfuse" - fi - networks: - - molecule-core-net - - # digest-pinned 2026-05-10 (sha256:b1addbe72465a718643cff9e60a58e6df1841e29d6d7d60c9a85d8d72f08d1a7, linux/amd64) - redis: - image: redis@sha256:b1addbe72465a718643cff9e60a58e6df1841e29d6d7d60c9a85d8d72f08d1a7 - command: ["redis-server", "--notify-keyspace-events", "KEA"] - ports: - - "6379:6379" - volumes: - - redisdata:/data - networks: - - molecule-core-net - restart: unless-stopped - healthcheck: - test: ["CMD", "redis-cli", "ping"] - interval: 2s - timeout: 5s - retries: 10 - # --- Observability --- - # digest-pinned 2026-05-10 (sha256:5b296e0ba1da74efea3143c773ddd60245f249fb7c72eb1d866c2d6ebc759fbe, linux/amd64) - langfuse-clickhouse: - image: clickhouse/clickhouse-server@sha256:5b296e0ba1da74efea3143c773ddd60245f249fb7c72eb1d866c2d6ebc759fbe - environment: - CLICKHOUSE_DB: langfuse - CLICKHOUSE_USER: langfuse - CLICKHOUSE_PASSWORD: langfuse - volumes: - - clickhousedata:/var/lib/clickhouse - networks: - - molecule-core-net - healthcheck: - test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://127.0.0.1:8123/ping || exit 1"] - interval: 5s - timeout: 5s - retries: 10 - # digest-pinned 2026-05-10 (sha256:e7aafd3ccf721821b40f8b2251220b4bb8af5e4877b5c5a8846af5b3318aaf1d, linux/amd64) langfuse: image: langfuse/langfuse@sha256:e7aafd3ccf721821b40f8b2251220b4bb8af5e4877b5c5a8846af5b3318aaf1d From 7770af32bee8a850cb654875ff33c772c44cbe46 Mon Sep 17 00:00:00 2001 From: Molecule AI Infra-SRE Date: Mon, 11 May 2026 08:12:06 +0000 Subject: [PATCH 2/9] fix(docker-compose): remove redundant langfuse-web from infra MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit langfuse-web in docker-compose.infra.yml is a dead duplicate of langfuse in docker-compose.yml (same image, same port 3001:3000). Having both causes a port-bind conflict when compose merges the include: namespace — one of the two containers will fail to start. Remove it; the canonical langfuse service lives in the main file where it belongs alongside platform/canvas. Co-Authored-By: Claude Opus 4.7 --- docker-compose.infra.yml | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/docker-compose.infra.yml b/docker-compose.infra.yml index e25834b6..beabe71f 100644 --- a/docker-compose.infra.yml +++ b/docker-compose.infra.yml @@ -110,29 +110,6 @@ services: ports: - "8233:8080" - # digest-pinned 2026-05-10 (sha256:e7aafd3ccf721821b40f8b2251220b4bb8af5e4877b5c5a8846af5b3318aaf1d, linux/amd64) - langfuse-web: - image: langfuse/langfuse@sha256:e7aafd3ccf721821b40f8b2251220b4bb8af5e4877b5c5a8846af5b3318aaf1d - depends_on: - langfuse-clickhouse: - condition: service_healthy - langfuse-db-init: - condition: service_completed_successfully - environment: - DATABASE_URL: postgres://${POSTGRES_USER:-dev}:${POSTGRES_PASSWORD:-dev}@postgres:5432/langfuse - # Langfuse v2 expects the HTTP interface (port 8123). The previous - # clickhouse://...:9000 native-protocol URL is rejected with - # "ClickHouse URL protocol must be either http or https". - CLICKHOUSE_URL: http://langfuse-clickhouse:8123 - CLICKHOUSE_MIGRATION_URL: clickhouse://langfuse-clickhouse:9000 - CLICKHOUSE_USER: langfuse - CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD:-langfuse-dev} - NEXTAUTH_SECRET: ${LANGFUSE_SECRET:-changeme-langfuse-secret} - NEXTAUTH_URL: http://localhost:3001 - SALT: ${LANGFUSE_SALT:-changeme-langfuse-salt} - ports: - - "3001:3000" - networks: default: name: molecule-core-net From 2afcf5ab9947907a5381860f519f96c4198224df Mon Sep 17 00:00:00 2001 From: hongming-pc2 Date: Mon, 11 May 2026 01:20:36 -0700 Subject: [PATCH 3/9] fix(ci): reconcile drifted secret names per #425 audit (Section D / class-E) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .github→.gitea migration left 3 secret-name drifts that mean the ported workflows reference secret-store names that don't match the canonical names. Renaming the workflow refs so the upcoming secret-store PUT (#425 class-A) lands under the names the workflows actually look up: - CP_STAGING_ADMIN_TOKEN -> CP_STAGING_ADMIN_API_TOKEN (sweep-aws-secrets, sweep-cf-orphans, sweep-cf-tunnels — peers in redeploy-tenants-on-staging + continuous-synth-e2e already use the _API_TOKEN form; semantic precision wins, 3v2 caller split) - CP_PROD_ADMIN_TOKEN -> CP_ADMIN_API_TOKEN (same 3 sweep workflows — CP_ADMIN_API_TOKEN is already the canonical name for the prod variant on molecule-controlplane, and matches ops.sh's `mol_tenants` reading `CP_ADMIN_API_TOKEN` from Railway) - MOLECULE_STAGING_OPENAI_KEY -> MOLECULE_STAGING_OPENAI_API_KEY (canary-staging, continuous-synth-e2e, e2e-staging-saas — the `_KEY` vs `_API_KEY` drift; peers are MOLECULE_STAGING_ANTHROPIC_API_KEY / MOLECULE_STAGING_MINIMAX_API_KEY. Confirmed CONSUMED — langgraph + hermes runtime tests use openai/gpt-4o and check the env presence — so renamed, not deleted.) KEPT as-is (no rename): CF_ACCOUNT_ID / CF_API_TOKEN / CF_ZONE_ID — these are the documented CI-scoped duplicates of the operator-host CLOUDFLARE_* admin names; renaming would touch 3 sweep workflows for zero functional gain. Documented as CI-scoped-dup in the secrets-map follow-up. Also updated the inline `for var in ...` presence-check loops + the `required_secret_name="..."` error strings so the workflows' diagnostics match the renamed names. Sequence: this PR merges → #425 class-A PUT populates the secret store under the canonical names → the 3 schedule-only reds (canary-staging, sweep-aws-secrets, continuous-synth-e2e) go green within ~30 min → watchdog #423 auto-closes their [main-red] issues. Refs: molecule-core#425 (secret-store audit, Section D), internal#297. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitea/workflows/canary-staging.yml | 4 ++-- .gitea/workflows/continuous-synth-e2e.yml | 6 +++--- .gitea/workflows/e2e-staging-saas.yml | 4 ++-- .gitea/workflows/sweep-aws-secrets.yml | 6 +++--- .gitea/workflows/sweep-cf-orphans.yml | 6 +++--- .gitea/workflows/sweep-cf-tunnels.yml | 6 +++--- tests/e2e/test_staging_full_saas.sh | 8 ++++---- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.gitea/workflows/canary-staging.yml b/.gitea/workflows/canary-staging.yml index ff40d4db..d3d6b68e 100644 --- a/.gitea/workflows/canary-staging.yml +++ b/.gitea/workflows/canary-staging.yml @@ -85,7 +85,7 @@ jobs: # OpenAI fallback — kept wired so an operator-dispatched run with # E2E_RUNTIME=hermes overridden via workflow_dispatch can still # exercise the OpenAI path without re-editing the workflow. - E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }} + E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }} E2E_MODE: canary E2E_RUNTIME: claude-code # Pin the canary to a specific MiniMax model rather than relying @@ -140,7 +140,7 @@ jobs: fi ;; langgraph|hermes) - required_secret_name="MOLECULE_STAGING_OPENAI_KEY" + required_secret_name="MOLECULE_STAGING_OPENAI_API_KEY" required_secret_value="${E2E_OPENAI_API_KEY:-}" ;; *) diff --git a/.gitea/workflows/continuous-synth-e2e.yml b/.gitea/workflows/continuous-synth-e2e.yml index f0ed9e8f..299d42e0 100644 --- a/.gitea/workflows/continuous-synth-e2e.yml +++ b/.gitea/workflows/continuous-synth-e2e.yml @@ -147,7 +147,7 @@ jobs: # E2E_RUNTIME=langgraph or =hermes and still have a working # canary path. The script picks the right blob shape based on # which key is non-empty. - E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }} + E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -175,7 +175,7 @@ jobs: # LLM-key requirement is per-runtime: claude-code accepts # EITHER MiniMax OR direct-Anthropic (whichever is set first), - # langgraph + hermes use OpenAI (MOLECULE_STAGING_OPENAI_KEY). + # langgraph + hermes use OpenAI (MOLECULE_STAGING_OPENAI_API_KEY). case "${E2E_RUNTIME}" in claude-code) if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then @@ -190,7 +190,7 @@ jobs: fi ;; langgraph|hermes) - required_secret_name="MOLECULE_STAGING_OPENAI_KEY" + required_secret_name="MOLECULE_STAGING_OPENAI_API_KEY" required_secret_value="${E2E_OPENAI_API_KEY:-}" ;; *) diff --git a/.gitea/workflows/e2e-staging-saas.yml b/.gitea/workflows/e2e-staging-saas.yml index f0e501f6..7b6c093b 100644 --- a/.gitea/workflows/e2e-staging-saas.yml +++ b/.gitea/workflows/e2e-staging-saas.yml @@ -105,7 +105,7 @@ jobs: # OpenAI fallback — kept wired so an operator-dispatched run with # E2E_RUNTIME=hermes or =langgraph via workflow_dispatch can still # exercise the OpenAI path. - E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }} + E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }} E2E_RUNTIME: ${{ github.event.inputs.runtime || 'claude-code' }} # Pin the model when running on the default claude-code path — # the per-runtime default ("sonnet") routes to direct Anthropic @@ -152,7 +152,7 @@ jobs: fi ;; langgraph|hermes) - required_secret_name="MOLECULE_STAGING_OPENAI_KEY" + required_secret_name="MOLECULE_STAGING_OPENAI_API_KEY" required_secret_value="${E2E_OPENAI_API_KEY:-}" ;; *) diff --git a/.gitea/workflows/sweep-aws-secrets.yml b/.gitea/workflows/sweep-aws-secrets.yml index afa8f6fa..a6572e8e 100644 --- a/.gitea/workflows/sweep-aws-secrets.yml +++ b/.gitea/workflows/sweep-aws-secrets.yml @@ -73,8 +73,8 @@ jobs: AWS_REGION: ${{ secrets.AWS_REGION || 'us-east-1' }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_JANITOR_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_JANITOR_SECRET_ACCESS_KEY }} - CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }} - CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }} + CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }} + CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }} GRACE_HOURS: ${{ github.event.inputs.grace_hours || '24' }} @@ -90,7 +90,7 @@ jobs: # they already accepted the repo state) run: | missing=() - for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN; do + for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY CP_ADMIN_API_TOKEN CP_STAGING_ADMIN_API_TOKEN; do if [ -z "${!var:-}" ]; then missing+=("$var") fi diff --git a/.gitea/workflows/sweep-cf-orphans.yml b/.gitea/workflows/sweep-cf-orphans.yml index 18dc41cb..b18630b7 100644 --- a/.gitea/workflows/sweep-cf-orphans.yml +++ b/.gitea/workflows/sweep-cf-orphans.yml @@ -75,8 +75,8 @@ jobs: env: CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }} CF_ZONE_ID: ${{ secrets.CF_ZONE_ID }} - CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }} - CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }} + CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }} + CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: us-east-2 @@ -109,7 +109,7 @@ jobs: # so they can rerun after fixing the secret) run: | missing=() - for var in CF_API_TOKEN CF_ZONE_ID CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do + for var in CF_API_TOKEN CF_ZONE_ID CP_ADMIN_API_TOKEN CP_STAGING_ADMIN_API_TOKEN AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do if [ -z "${!var:-}" ]; then missing+=("$var") fi diff --git a/.gitea/workflows/sweep-cf-tunnels.yml b/.gitea/workflows/sweep-cf-tunnels.yml index 3fdc06c1..1fa12cfd 100644 --- a/.gitea/workflows/sweep-cf-tunnels.yml +++ b/.gitea/workflows/sweep-cf-tunnels.yml @@ -70,8 +70,8 @@ jobs: env: CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }} CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }} - CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }} - CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }} + CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }} + CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }} MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '90' }} steps: @@ -89,7 +89,7 @@ jobs: # they already accepted the repo state) run: | missing=() - for var in CF_API_TOKEN CF_ACCOUNT_ID CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN; do + for var in CF_API_TOKEN CF_ACCOUNT_ID CP_ADMIN_API_TOKEN CP_STAGING_ADMIN_API_TOKEN; do if [ -z "${!var:-}" ]; then missing+=("$var") fi diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh index 2caece5c..b494f8f3 100755 --- a/tests/e2e/test_staging_full_saas.sh +++ b/tests/e2e/test_staging_full_saas.sh @@ -341,7 +341,7 @@ tenant_call() { # MiniMax account). Lower friction than MiniMax for operators # who already have an Anthropic API key for their own Claude # Code session. Pricier per-token than MiniMax but billing is -# still independent of MOLECULE_STAGING_OPENAI_KEY. Pinned to the +# still independent of MOLECULE_STAGING_OPENAI_API_KEY. Pinned to the # claude-code runtime — hermes/langgraph use OpenAI-shaped envs. # # E2E_OPENAI_API_KEY → langgraph + hermes paths. Kept as fallback @@ -368,7 +368,7 @@ elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then # who already have an Anthropic API key (e.g. for their own Claude # Code session) and want to avoid setting up a separate MiniMax # account just for E2E. Pricier per-token than MiniMax but billing - # is still independent of MOLECULE_STAGING_OPENAI_KEY, so an OpenAI + # is still independent of MOLECULE_STAGING_OPENAI_API_KEY, so an OpenAI # quota collapse doesn't wedge this path. Pinned to the claude-code # runtime: hermes/langgraph use OpenAI-shaped envs and won't honour # ANTHROPIC_API_KEY without further wiring (out of scope for this @@ -623,7 +623,7 @@ fi # "Encrypted content is not supported" → hermes codex_responses API misroute (#14) # "Unknown provider" → bridge misconfigured PROVIDER= (regression of #13 fix) # "hermes-agent unreachable" → gateway process died -# "exceeded your current quota" → MOLECULE_STAGING_OPENAI_KEY billing (NOT a platform regression — #2578) +# "exceeded your current quota" → MOLECULE_STAGING_OPENAI_API_KEY billing (NOT a platform regression — #2578) # # Fail LOUD with the specific pattern so CI log + alert channel makes the # regression unambiguous. @@ -657,7 +657,7 @@ fi # with a provider-side 429, that is a billing event on the configured # OpenAI key, not a platform regression. Tracked in #2578. if echo "$AGENT_TEXT" | grep -qiE "exceeded your current quota|insufficient_quota"; then - fail "A2A — PROVIDER QUOTA EXHAUSTED (NOT a platform regression). Operator action: top up MOLECULE_STAGING_OPENAI_KEY billing or rotate to a higher-quota org at Settings → Secrets and Variables → Actions. Tracked in #2578. Raw: $AGENT_TEXT" + fail "A2A — PROVIDER QUOTA EXHAUSTED (NOT a platform regression). Operator action: top up MOLECULE_STAGING_OPENAI_API_KEY billing or rotate to a higher-quota org at Settings → Secrets and Variables → Actions. Tracked in #2578. Raw: $AGENT_TEXT" fi # Generic catch-all — falls through if none of the known regressions hit. if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then From 5373b5e7f697a36209dae645071bc605027a5a8c Mon Sep 17 00:00:00 2001 From: hongming-pc2 Date: Mon, 11 May 2026 01:32:26 -0700 Subject: [PATCH 4/9] fix(ci): extend class-E rename to scripts/ops/sweep-*.sh (chained-defect from #430 review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit core-devops lens review (review 1075) caught the chained defect: the 3 sweep workflows shell out to `bash scripts/ops/sweep-{aws-secrets,cf-orphans,cf-tunnels}.sh`, and those scripts still consume the OLD env-var names — `need CP_PROD_ADMIN_TOKEN`, `need CP_STAGING_ADMIN_TOKEN`, and `Bearer $CP_PROD_ADMIN_TOKEN` / `Bearer $CP_STAGING_ADMIN_TOKEN` in the CP-admin curl calls. The workflow- level presence-check loop (renamed in the first commit) would pass, then the shell script would `exit 1` at the `need CP_PROD_ADMIN_TOKEN` line. Classic `feedback_chained_defects_in_never_tested_workflows` — the YAML- surface rename looked complete; the actual consumer is one layer deeper. This commit completes the rename in the scripts: - `CP_PROD_ADMIN_TOKEN` -> `CP_ADMIN_API_TOKEN` - `CP_STAGING_ADMIN_TOKEN` -> `CP_STAGING_ADMIN_API_TOKEN` (6 occurrences total per script — comments, `need` checks, `Bearer $...` curl headers — across all 3). The .gitea/workflows/sweep-*.yml files (first commit) export `CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}` etc., so the scripts now read `$CP_ADMIN_API_TOKEN` — consistent end-to-end. Per core-devops's other (non-blocking) note: `workflow_dispatch` each sweep in dry-run after this lands + after the #425 class-A PUT, to confirm the path beyond the presence-check actually works (the `MINIMAX_TOKEN`-grade shape-match isn't enough — exercise the real CP-admin call). Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/ops/sweep-aws-secrets.sh | 12 ++++++------ scripts/ops/sweep-cf-orphans.sh | 12 ++++++------ scripts/ops/sweep-cf-tunnels.sh | 12 ++++++------ 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/scripts/ops/sweep-aws-secrets.sh b/scripts/ops/sweep-aws-secrets.sh index 55db0a11..20450026 100755 --- a/scripts/ops/sweep-aws-secrets.sh +++ b/scripts/ops/sweep-aws-secrets.sh @@ -40,8 +40,8 @@ # # Env vars required: # AWS_REGION — region the secrets live in (default: us-east-1) -# CP_PROD_ADMIN_TOKEN — CP admin bearer for api.moleculesai.app -# CP_STAGING_ADMIN_TOKEN — CP admin bearer for staging-api.moleculesai.app +# CP_ADMIN_API_TOKEN — CP admin bearer for api.moleculesai.app +# CP_STAGING_ADMIN_API_TOKEN — CP admin bearer for staging-api.moleculesai.app # AWS_ACCESS_KEY_ID, — IAM principal with secretsmanager:ListSecrets # AWS_SECRET_ACCESS_KEY and secretsmanager:DeleteSecret. Note: the # prod molecule-cp principal does NOT have @@ -88,8 +88,8 @@ need() { exit 1 fi } -need CP_PROD_ADMIN_TOKEN -need CP_STAGING_ADMIN_TOKEN +need CP_ADMIN_API_TOKEN +need CP_STAGING_ADMIN_API_TOKEN need AWS_ACCESS_KEY_ID need AWS_SECRET_ACCESS_KEY @@ -107,13 +107,13 @@ log() { echo "[$(date -u +%H:%M:%S)] $*"; } # response includes both `id` and `slug`; we extract `id` here. log "Fetching CP prod org ids..." -PROD_IDS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_PROD_ADMIN_TOKEN" \ +PROD_IDS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \ "https://api.moleculesai.app/cp/admin/orgs?limit=500" \ | python3 -c "import json,sys; print(' '.join(o['id'] for o in json.load(sys.stdin).get('orgs',[])))") log " prod orgs: $(echo "$PROD_IDS" | wc -w | tr -d ' ')" log "Fetching CP staging org ids..." -STAGING_IDS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_TOKEN" \ +STAGING_IDS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_API_TOKEN" \ "https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" \ | python3 -c "import json,sys; print(' '.join(o['id'] for o in json.load(sys.stdin).get('orgs',[])))") log " staging orgs: $(echo "$STAGING_IDS" | wc -w | tr -d ' ')" diff --git a/scripts/ops/sweep-cf-orphans.sh b/scripts/ops/sweep-cf-orphans.sh index 569bcbcf..8a4da90c 100755 --- a/scripts/ops/sweep-cf-orphans.sh +++ b/scripts/ops/sweep-cf-orphans.sh @@ -20,8 +20,8 @@ # Env vars required: # CF_API_TOKEN — Cloudflare token with zone:dns:edit # CF_ZONE_ID — the zone (moleculesai.app) -# CP_PROD_ADMIN_TOKEN — CP admin bearer for api.moleculesai.app -# CP_STAGING_ADMIN_TOKEN — CP admin bearer for staging-api.moleculesai.app +# CP_ADMIN_API_TOKEN — CP admin bearer for api.moleculesai.app +# CP_STAGING_ADMIN_API_TOKEN — CP admin bearer for staging-api.moleculesai.app # AWS_* — standard AWS creds (default region us-east-2) # # Exit codes: @@ -58,21 +58,21 @@ need() { } need CF_API_TOKEN need CF_ZONE_ID -need CP_PROD_ADMIN_TOKEN -need CP_STAGING_ADMIN_TOKEN +need CP_ADMIN_API_TOKEN +need CP_STAGING_ADMIN_API_TOKEN log() { echo "[$(date -u +%H:%M:%S)] $*"; } # --- Gather live sets ------------------------------------------------------ log "Fetching CP prod org slugs..." -PROD_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_PROD_ADMIN_TOKEN" \ +PROD_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \ "https://api.moleculesai.app/cp/admin/orgs?limit=500" \ | python3 -c "import json,sys; print(' '.join(o['slug'] for o in json.load(sys.stdin).get('orgs',[])))") log " prod orgs: $(echo "$PROD_SLUGS" | wc -w | tr -d ' ')" log "Fetching CP staging org slugs..." -STAGING_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_TOKEN" \ +STAGING_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_API_TOKEN" \ "https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" \ | python3 -c "import json,sys; print(' '.join(o['slug'] for o in json.load(sys.stdin).get('orgs',[])))") log " staging orgs: $(echo "$STAGING_SLUGS" | wc -w | tr -d ' ')" diff --git a/scripts/ops/sweep-cf-tunnels.sh b/scripts/ops/sweep-cf-tunnels.sh index bf948940..13734db3 100755 --- a/scripts/ops/sweep-cf-tunnels.sh +++ b/scripts/ops/sweep-cf-tunnels.sh @@ -31,8 +31,8 @@ # token must include the tunnel scope.) # CF_ACCOUNT_ID — the account that owns the tunnels (visible # in dash.cloudflare.com URL path) -# CP_PROD_ADMIN_TOKEN — CP admin bearer for api.moleculesai.app -# CP_STAGING_ADMIN_TOKEN — CP admin bearer for staging-api.moleculesai.app +# CP_ADMIN_API_TOKEN — CP admin bearer for api.moleculesai.app +# CP_STAGING_ADMIN_API_TOKEN — CP admin bearer for staging-api.moleculesai.app # # Exit codes: # 0 — dry-run completed or sweep executed successfully @@ -72,21 +72,21 @@ need() { } need CF_API_TOKEN need CF_ACCOUNT_ID -need CP_PROD_ADMIN_TOKEN -need CP_STAGING_ADMIN_TOKEN +need CP_ADMIN_API_TOKEN +need CP_STAGING_ADMIN_API_TOKEN log() { echo "[$(date -u +%H:%M:%S)] $*"; } # --- Gather live sets ------------------------------------------------------ log "Fetching CP prod org slugs..." -PROD_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_PROD_ADMIN_TOKEN" \ +PROD_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \ "https://api.moleculesai.app/cp/admin/orgs?limit=500" \ | python3 -c "import json,sys; print(' '.join(o['slug'] for o in json.load(sys.stdin).get('orgs',[])))") log " prod orgs: $(echo "$PROD_SLUGS" | wc -w | tr -d ' ')" log "Fetching CP staging org slugs..." -STAGING_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_TOKEN" \ +STAGING_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_API_TOKEN" \ "https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" \ | python3 -c "import json,sys; print(' '.join(o['slug'] for o in json.load(sys.stdin).get('orgs',[])))") log " staging orgs: $(echo "$STAGING_SLUGS" | wc -w | tr -d ' ')" From 39db2e6d7390b98732b668a79523b26a401f7e01 Mon Sep 17 00:00:00 2001 From: Molecule AI Core-BE Date: Mon, 11 May 2026 08:25:27 +0000 Subject: [PATCH 5/9] =?UTF-8?q?fix(workspace):=20complete=20OFFSEC-003=20f?= =?UTF-8?q?ix=20=E2=80=94=20promote=20full=20sanitization=20to=20main?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Promotes the complete OFFSEC-003 boundary-marker sanitization from staging to main, including: - _delegate_sync_via_polling: sanitize response_preview and error strings before returning (OFFSEC-003 polling-path fix from PR #417). - tool_check_task_status JSON endpoint: sanitize summary + response_preview in both the task_id filter path and the list path. - tool_delegate_task non-polling path: preserve main's existing sanitize_a2a_result(result) wrapper (staging accidentally removed it). Closes #418. Co-Authored-By: Molecule AI · core-be --- workspace/a2a_tools_delegation.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/workspace/a2a_tools_delegation.py b/workspace/a2a_tools_delegation.py index 5a40891b..c6416122 100644 --- a/workspace/a2a_tools_delegation.py +++ b/workspace/a2a_tools_delegation.py @@ -167,12 +167,19 @@ async def _delegate_sync_via_polling( break if terminal: if (terminal.get("status") or "").lower() == "completed": - return terminal.get("response_preview") or "" - err = ( + # OFFSEC-003: sanitize response_preview before returning so + # boundary markers injected by a malicious peer cannot escape + # the trust boundary. + return sanitize_a2a_result(terminal.get("response_preview") or "") + # OFFSEC-003: sanitize error_detail / summary before wrapping with + # the _A2A_ERROR_PREFIX sentinel so injected markers cannot appear + # inside the trusted error block returned to the agent. + err_raw = ( terminal.get("error_detail") or terminal.get("summary") or "delegation failed" ) + err = sanitize_a2a_result(err_raw) return f"{_A2A_ERROR_PREFIX}{err}" await asyncio.sleep(_SYNC_POLL_INTERVAL_S) @@ -408,12 +415,11 @@ async def tool_check_task_status( # Filter by delegation_id matching = [d for d in delegations if d.get("delegation_id") == task_id] if matching: - entry = dict(matching[0]) - # OFFSEC-003: sanitize peer-generated text fields - for field in ("result", "response_preview"): - if field in entry and entry[field]: - entry[field] = sanitize_a2a_result(str(entry[field])) - return json.dumps(entry) + # OFFSEC-003: sanitize peer-supplied fields + d = matching[0] + d["summary"] = sanitize_a2a_result(d.get("summary", "")) + d["response_preview"] = sanitize_a2a_result(d.get("response_preview", "")) + return json.dumps(d) return json.dumps({"status": "not_found", "delegation_id": task_id}) # Return all recent delegations summary = [] @@ -425,7 +431,7 @@ async def tool_check_task_status( "delegation_id": d.get("delegation_id", ""), "target_id": d.get("target_id", ""), "status": d.get("status", ""), - "summary": d.get("summary", ""), + "summary": sanitize_a2a_result(d.get("summary", "")), "response_preview": preview, }) return json.dumps({"delegations": summary, "count": len(delegations)}) From 318e0ad742380b96ebe389111649d91db5d74b7e Mon Sep 17 00:00:00 2001 From: Molecule AI Infra-Runtime-BE Date: Mon, 11 May 2026 09:30:32 +0000 Subject: [PATCH 6/9] fix(workspace): skip idle prompt when delegation results are pending (#381) (#432) Co-authored-by: Molecule AI Infra-Runtime-BE Co-committed-by: Molecule AI Infra-Runtime-BE --- workspace/main.py | 25 ++++++ .../tests/test_idle_loop_pending_check.py | 80 +++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 workspace/tests/test_idle_loop_pending_check.py diff --git a/workspace/main.py b/workspace/main.py index 77c2d2d6..8c569309 100644 --- a/workspace/main.py +++ b/workspace/main.py @@ -668,6 +668,31 @@ async def main(): # pragma: no cover if heartbeat.active_tasks > 0: continue + # Issue #381 fix: skip the idle prompt if there are unconsumed + # delegation results waiting. The heartbeat sends a self-message + # for every new result batch, so sending the idle prompt here would + # race: the agent would compose a stale tick BEFORE processing the + # results notification, producing repeated identical asks (peer sends + # correction, we respond with stale state, peer asks again). + # By skipping the idle prompt when results are pending, we let the + # heartbeat's own self-message wake the agent after results are + # written. The agent then sees the results in _prepare_prompt() + # and processes them before composing. + from heartbeat import DELEGATION_RESULTS_FILE as _DRF + try: + with open(_DRF) as _rf: + _rf.seek(0) + _content = _rf.read().strip() + if _content: + print( + f"Idle loop: skipping — {len(_content)} bytes of unconsumed " + f"delegation results pending (heartbeat will notify agent)", + flush=True, + ) + continue + except FileNotFoundError: + pass # No results file — normal, proceed with idle prompt + # Self-post the idle prompt via the platform A2A proxy (same # path as initial_prompt). The agent's own concurrency control # rejects if the workspace becomes busy between this check and diff --git a/workspace/tests/test_idle_loop_pending_check.py b/workspace/tests/test_idle_loop_pending_check.py new file mode 100644 index 00000000..6699bf8f --- /dev/null +++ b/workspace/tests/test_idle_loop_pending_check.py @@ -0,0 +1,80 @@ +"""Tests for issue #381: idle loop must not fire when delegation results are pending. + +The idle loop skips sending the idle prompt when DELEGATION_RESULTS_FILE +contains unconsumed results, preventing the agent from composing a stale tick +before processing pending delegation notifications from the heartbeat. + +Source: workspace/main.py:_run_idle_loop() pending-results guard. +""" +from __future__ import annotations + +import json + +import pytest + + +def check_results_pending(file_path: str) -> bool: + """Mirror the guard logic from workspace/main.py:_run_idle_loop(). + + Returns True if the results file exists and is non-empty, + meaning the idle loop should skip this tick. + """ + try: + with open(file_path) as rf: + rf.seek(0) + content = rf.read().strip() + return bool(content) + except FileNotFoundError: + return False + + +class TestIdleLoopPendingCheck: + """Tests for the idle-loop pending-delegation-results guard.""" + + def test_no_file_means_proceed(self, tmp_path): + """No delegation results file → idle loop fires normally.""" + results_file = tmp_path / "delegation_results.jsonl" + assert not check_results_pending(str(results_file)) + + def test_empty_file_means_proceed(self, tmp_path): + """Empty file → no pending results → idle loop fires.""" + results_file = tmp_path / "delegation_results.jsonl" + results_file.write_text("", encoding="utf-8") + assert not check_results_pending(str(results_file)) + + def test_whitespace_only_file_means_proceed(self, tmp_path): + """File with only whitespace → treated as empty → idle loop fires.""" + results_file = tmp_path / "delegation_results.jsonl" + results_file.write_text(" \n ", encoding="utf-8") + assert not check_results_pending(str(results_file)) + + def test_single_result_means_skip(self, tmp_path): + """File with one delegation result → skip idle tick.""" + results_file = tmp_path / "delegation_results.jsonl" + results_file.write_text( + json.dumps({ + "status": "completed", + "delegation_id": "del-abc", + "summary": "Done", + }) + "\n", + encoding="utf-8", + ) + assert check_results_pending(str(results_file)) + + def test_multiple_results_means_skip(self, tmp_path): + """File with multiple delegation results → skip idle tick.""" + results_file = tmp_path / "delegation_results.jsonl" + results_file.write_text( + json.dumps({"status": "completed", "delegation_id": "del-1", "summary": "A"}) + + "\n" + + json.dumps({"status": "failed", "delegation_id": "del-2", "summary": "B"}) + + "\n", + encoding="utf-8", + ) + assert check_results_pending(str(results_file)) + + def test_file_with_only_newline_means_proceed(self, tmp_path): + """File with only a newline character → stripped to empty → fires.""" + results_file = tmp_path / "delegation_results.jsonl" + results_file.write_text("\n", encoding="utf-8") + assert not check_results_pending(str(results_file)) From 651f44790bb5d1dfb3686d52fc09a55c9bcb51d0 Mon Sep 17 00:00:00 2001 From: Molecule AI Core-FE Date: Mon, 11 May 2026 09:41:16 +0000 Subject: [PATCH 7/9] fix(canvas/a11y): add accessible name to ConsoleModal + DeleteCascadeConfirmDialog backdrops (#410) Co-authored-by: Molecule AI Core-FE Co-committed-by: Molecule AI Core-FE --- canvas/src/components/ConsoleModal.tsx | 6 +++++- canvas/src/components/DeleteCascadeConfirmDialog.tsx | 6 +++++- canvas/src/components/__tests__/ConsoleModal.test.tsx | 4 ++-- .../__tests__/DeleteCascadeConfirmDialog.test.tsx | 4 ++-- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/canvas/src/components/ConsoleModal.tsx b/canvas/src/components/ConsoleModal.tsx index f20faa8a..907dc37f 100644 --- a/canvas/src/components/ConsoleModal.tsx +++ b/canvas/src/components/ConsoleModal.tsx @@ -90,7 +90,11 @@ export function ConsoleModal({ workspaceId, workspaceName, open, onClose }: Prop return createPortal(
-