forked from molecule-ai/molecule-core
Compare commits
7 Commits
main
...
migrate/is
| Author | SHA1 | Date | |
|---|---|---|---|
| 79c6f7f04c | |||
| b44d389b50 | |||
|
|
17f7bd0b31 | ||
| e353b54a85 | |||
|
|
bb124da6c5 | ||
|
|
30b8046235 | ||
|
|
3501e6bfd7 |
2
.github/scripts/lint_secret_pattern_drift.py
vendored
2
.github/scripts/lint_secret_pattern_drift.py
vendored
@ -37,7 +37,7 @@ CANONICAL_FILE = Path(".github/workflows/secret-scan.yml")
|
||||
CONSUMERS: list[tuple[str, str]] = [
|
||||
(
|
||||
"molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh",
|
||||
"https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-runtime/raw/branch/main/molecule_runtime/scripts/pre-commit-checks.sh",
|
||||
"https://raw.githubusercontent.com/Molecule-AI/molecule-ai-workspace-runtime/main/molecule_runtime/scripts/pre-commit-checks.sh",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
467
.github/workflows/auto-promote-on-e2e.yml
vendored
Normal file
467
.github/workflows/auto-promote-on-e2e.yml
vendored
Normal file
@ -0,0 +1,467 @@
|
||||
name: Auto-promote :latest after main image build
|
||||
|
||||
# Retags `ghcr.io/molecule-ai/{platform,platform-tenant}:staging-<sha>`
|
||||
# → `:latest` after either the image build or E2E completes on a `main`
|
||||
# push, gated on E2E Staging SaaS not being red for that SHA.
|
||||
#
|
||||
# Why two triggers:
|
||||
#
|
||||
# `publish-workspace-server-image` and `e2e-staging-saas` are both
|
||||
# paths-filtered, but with DIFFERENT path sets:
|
||||
#
|
||||
# publish-workspace-server-image:
|
||||
# workspace-server/**, canvas/**, manifest.json
|
||||
#
|
||||
# e2e-staging-saas (full lifecycle):
|
||||
# workspace-server/internal/handlers/{registry,workspace_provision,
|
||||
# a2a_proxy}.go, workspace-server/internal/middleware/**,
|
||||
# workspace-server/internal/provisioner/**, tests/e2e/test_staging_full_saas.sh
|
||||
#
|
||||
# Apart from tests/e2e/test_staging_full_saas.sh, the E2E set is a SUBSET of the publish set. So:
|
||||
# - canvas/** changes → publish fires, E2E does not
|
||||
# - workspace-server/cmd/** changes → publish fires, E2E does not
|
||||
# - workspace-server/internal/sweep/** → publish fires, E2E does not
|
||||
#
|
||||
# The previous version triggered ONLY on E2E completion, which meant
|
||||
# non-E2E-path changes (canvas, cmd, sweep, etc.) rebuilt the image
|
||||
# but never advanced `:latest`. Result: as of 2026-04-28 this workflow
|
||||
# had run zero times since merge despite eight main pushes — `:latest`
|
||||
# was ~7 hours / 9 PRs behind main with no human realising. See
|
||||
# `molecule-core` Slack discussion 2026-04-28.
|
||||
#
|
||||
# Adding `publish-workspace-server-image` as a second trigger closes
|
||||
# the gap: any image rebuild on main eligibly advances `:latest`.
|
||||
#
|
||||
# Why E2E remains a kill-switch (not the trigger):
|
||||
#
|
||||
# When E2E DID run for this SHA and ended red, we abort — `:latest`
|
||||
# stays on the prior known-good digest. When E2E didn't run (paths
|
||||
# filtered out), we proceed: pre-merge gates already validated this
|
||||
# SHA on staging via auto-promote-staging requiring CI + E2E Canvas +
|
||||
# E2E API + CodeQL all green. Image content for non-E2E-paths
|
||||
# (canvas, cmd, sweep) is exercised by those staging gates.
|
||||
#
|
||||
# Why `main` only:
|
||||
#
|
||||
# `:latest` is what prod tenants pull. We only want SHAs that have
|
||||
# reached main (via auto-promote-staging) to advance `:latest`.
|
||||
# Triggering on staging would let a staging-only revert advance
|
||||
# `:latest` to a SHA that never reaches main, breaking the "production
|
||||
# runs what's on main" invariant.
|
||||
#
|
||||
# Idempotency:
|
||||
#
|
||||
# When a SHA touches paths that match BOTH publish and E2E, both
|
||||
# workflows fire and complete. Both trigger this workflow on
|
||||
# completion → two runs race. Both retag `:staging-<sha>` →
|
||||
# `:latest`. crane tag is idempotent (re-tagging the same digest is a
|
||||
# no-op), so the second run is harmless. concurrency group serializes
|
||||
# them anyway.
|
||||
|
||||
on:
|
||||
workflow_run:
|
||||
workflows:
|
||||
- 'E2E Staging SaaS (full lifecycle)'
|
||||
- 'publish-workspace-server-image'
|
||||
types: [completed]
|
||||
branches: [main]
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
sha:
|
||||
description: 'Short sha to promote (override; defaults to upstream workflow_run head_sha)'
|
||||
required: false
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
|
||||
concurrency:
|
||||
# Serialize promotes per-SHA so the publish+E2E both-fired race lands
|
||||
# cleanly. Different SHAs can promote in parallel.
|
||||
group: auto-promote-latest-${{ github.event.workflow_run.head_sha || github.event.inputs.sha || github.sha }}
|
||||
cancel-in-progress: false
|
||||
|
||||
env:
|
||||
IMAGE_NAME: ghcr.io/molecule-ai/platform
|
||||
TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant
|
||||
|
||||
jobs:
|
||||
promote:
|
||||
# Proceed if upstream succeeded OR manual dispatch. Upstream-failure
|
||||
# paths are filtered here; the E2E-was-red kill-switch lives in the
|
||||
# gate-check step below (covers the case where upstream is publish
|
||||
# success but E2E for the same SHA failed).
|
||||
if: |
|
||||
github.event_name == 'workflow_dispatch' ||
|
||||
(github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Compute short sha
  id: sha
  # Resolves the commit to promote and publishes it as two step outputs:
  #   short — first 7 characters, used to build the `:staging-<sha>` tag
  #   full  — the value as given, consumed by the gate and ancestry steps
  # A manual-dispatch `sha` input wins; otherwise the upstream
  # workflow_run head_sha is used.
  #
  # Both values reach the script through `env` instead of being spliced
  # into the script text with an inline expression: the dispatch input is
  # free-form operator text, and inline expansion would let quotes or
  # command substitutions inside it be executed by the shell.
  env:
    DISPATCH_SHA: ${{ github.event.inputs.sha }}
    UPSTREAM_SHA: ${{ github.event.workflow_run.head_sha }}
  run: |
    set -euo pipefail
    FULL="${DISPATCH_SHA:-$UPSTREAM_SHA}"
    # Fail here, with a clear message, when no sha was supplied (manual
    # dispatch without the input) or the value is not a plain hex commit
    # id — otherwise the run only dies later on a malformed
    # `:staging-<sha>` tag lookup.
    if ! [[ "$FULL" =~ ^[0-9a-fA-F]{7,64}$ ]]; then
      echo '::error::Commit sha is empty or not 7-64 hex characters; pass a valid sha input'
      exit 1
    fi
    echo "short=${FULL:0:7}" >> "$GITHUB_OUTPUT"
    echo "full=${FULL}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Gate — E2E Staging SaaS state for this SHA
|
||||
# When upstream IS E2E success, we know it's green (filtered by
|
||||
# the job-level `if` already). When upstream is publish, look up
|
||||
# E2E state for the same SHA. Four buckets:
|
||||
#
|
||||
# - completed/success: E2E confirmed safe → proceed
|
||||
# - completed/failure|cancelled|timed_out: E2E found a
|
||||
# regression → ABORT (exit 1), `:latest` stays put
|
||||
# - in_progress|queued|requested: E2E is RACING with publish
|
||||
# for a runtime-touching SHA. publish typically completes
|
||||
# ~5-10min before E2E (~10-15min). If we promote on the
|
||||
# publish signal here, a later E2E failure can't roll back
|
||||
# `:latest` — it'd already be wrongly advanced. So we DEFER:
|
||||
# skip subsequent steps (proceed=false) and let E2E's own
|
||||
# completion event re-fire this workflow, which then takes
|
||||
# the upstream-is-E2E path. exit 0 so the run shows as
|
||||
# success rather than a noisy fake-failure.
|
||||
# - none/none: E2E was paths-filtered out for this SHA (the
|
||||
# change touched canvas/cmd/sweep/etc. — paths covered by
|
||||
# publish but not by E2E). pre-merge gates on staging
|
||||
# already validated this SHA → proceed.
|
||||
#
|
||||
# Manual dispatch skips this check — operator override.
|
||||
id: gate
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
REPO: ${{ github.repository }}
|
||||
SHA: ${{ steps.sha.outputs.full }}
|
||||
UPSTREAM_NAME: ${{ github.event.workflow_run.name }}
|
||||
EVENT_NAME: ${{ github.event_name }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
|
||||
echo "proceed=true" >> "$GITHUB_OUTPUT"
|
||||
echo "::notice::Manual dispatch — skipping E2E gate (operator override)"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ "$UPSTREAM_NAME" = "E2E Staging SaaS (full lifecycle)" ]; then
|
||||
echo "proceed=true" >> "$GITHUB_OUTPUT"
|
||||
echo "::notice::Upstream is E2E itself (success per job-level if) — gate trivially satisfied"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Upstream is publish-workspace-server-image. Check E2E state
|
||||
# for the same SHA via Gitea's commit-status API.
|
||||
#
|
||||
# GitHub-era this was `gh run list --workflow=X --commit=SHA
|
||||
# --json status,conclusion` returning either `[]` (no run on
|
||||
# this SHA) or `[{status, conclusion}]` (the run's state).
|
||||
# Gitea has NO workflow-runs API at all — `/api/v1/repos/.../
|
||||
# actions/runs` returns 404 (verified 2026-05-07, issue #75).
|
||||
# However Gitea Actions DOES emit a commit status per workflow
|
||||
# job, with `context = "<Workflow Name> / <Job Name> (<event>)"`,
|
||||
# which is exactly what we need: each E2E run leg becomes one
|
||||
# status row on the SHA, and the aggregate state encodes the
|
||||
# run's outcome.
|
||||
#
|
||||
# Mapping:
|
||||
# 0 matched contexts → "none/none" (E2E paths-
|
||||
# filtered
|
||||
# out — same
|
||||
# semantic
|
||||
# as before)
|
||||
# any context = pending → "in_progress/none" (defer)
|
||||
# any context = error|failure → "completed/failure" (abort)
|
||||
# all contexts = success → "completed/success" (proceed)
|
||||
#
|
||||
# The "completed/cancelled" and "completed/timed_out" buckets
|
||||
# don't have direct Gitea analogs (Gitea statuses are
|
||||
# success / failure / error / pending / warning). Per-SHA
|
||||
# concurrency cancellation surfaces as `error` on Gitea, which
|
||||
# we map to "completed/failure" rather than "completed/cancelled"
|
||||
# — losing the soft-defer semantic of the cancelled bucket on
|
||||
# this fleet. Tradeoff: the staleness alarm (auto-promote-stale-
|
||||
# alarm.yml) still catches a stuck :latest within 4h, and a
|
||||
# legitimate cancel is rare enough that aborting + manual
|
||||
# re-dispatch is acceptable. If we measure cancel frequency
|
||||
# > 1/week, revisit by reading the run-step-summary text via
|
||||
# a follow-up script.
|
||||
#
|
||||
# Network or auth blips collapse to "none/none" via the curl
|
||||
# `|| echo "[]"` fallback, matching the pre-Gitea behaviour where
|
||||
# an empty list also degenerated to none/none.
|
||||
GITEA_API_URL="${GITHUB_SERVER_URL:-https://git.moleculesai.app}/api/v1"
|
||||
STATUSES_JSON=$(curl --fail-with-body -sS \
|
||||
-H "Authorization: token ${GH_TOKEN}" \
|
||||
-H "Accept: application/json" \
|
||||
"${GITEA_API_URL}/repos/${REPO}/commits/${SHA}/statuses?limit=100" \
|
||||
2>/dev/null || echo "[]")
|
||||
# Collapse the status rows into one "<status>/<conclusion>" verdict for
# the case statement that follows. Any jq failure degrades to
# "none/none", same as an empty status list.
RESULT=$(printf '%s' "$STATUSES_JSON" | jq -r '
  # Keep only E2E Staging SaaS (full lifecycle) statuses. Match on the
  # leading workflow-name prefix so the "<job> (<event>)" tail is
  # irrelevant. Rows with a null context are skipped instead of
  # aborting the whole filter.
  [.[] | select((.context // "") | startswith("E2E Staging SaaS (full lifecycle) /"))]
  # The list endpoint can hold several rows per context, one per state
  # transition (pending, then success or failure). Keep only the newest
  # row (highest id) of each context so a superseded "pending" row
  # cannot pin the verdict at in_progress after the job has finished.
  | group_by(.context)
  | map(max_by(.id)) as $rows
  | if ($rows | length) == 0 then
      "none/none"
    elif any($rows[]; .status == "pending") then
      "in_progress/none"
    elif any($rows[]; .status == "failure" or .status == "error") then
      "completed/failure"
    elif all($rows[]; .status == "success") then
      "completed/success"
    else
      # Mixed / unknown (for example "warning"). Report the first
      # NON-success status so the catch-all case arm aborts; reporting
      # the first row could read "completed/success" and promote.
      "completed/" + ([$rows[] | .status | select(. != "success")][0] // "unknown")
    end
' 2>/dev/null || echo "none/none")
|
||||
|
||||
echo "E2E Staging SaaS for ${SHA:0:7}: $RESULT"
|
||||
|
||||
case "$RESULT" in
|
||||
completed/success)
|
||||
echo "proceed=true" >> "$GITHUB_OUTPUT"
|
||||
echo "::notice::E2E green for this SHA — proceeding with promote"
|
||||
;;
|
||||
completed/failure|completed/timed_out)
|
||||
echo "proceed=false" >> "$GITHUB_OUTPUT"
|
||||
{
|
||||
echo "## ❌ Auto-promote aborted — E2E Staging SaaS failed"
|
||||
echo
|
||||
echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\`"
|
||||
echo "\`:latest\` stays on the prior known-good digest."
|
||||
echo
|
||||
echo "If the failure was a flake, manually dispatch this workflow with the same sha to override."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
exit 1
|
||||
;;
|
||||
completed/cancelled)
|
||||
# GitHub-era only: cancelled ≠ failure. Gitea statuses
|
||||
# don't expose a "cancelled" state — a per-SHA concurrency
|
||||
# cancellation surfaces as `failure` or `error` on Gitea
|
||||
# and is now handled by the failure branch above. This
|
||||
# arm is kept for backwards compatibility / dual-host
|
||||
# operation (if we ever add a non-Gitea fallback) but
|
||||
# under the post-#75 flow it's unreachable.
|
||||
echo "proceed=false" >> "$GITHUB_OUTPUT"
|
||||
{
|
||||
echo "## ⏭ Auto-promote deferred — E2E Staging SaaS was cancelled"
|
||||
echo
|
||||
echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\`"
|
||||
echo "Likely per-SHA concurrency (newer push superseded this E2E run)."
|
||||
echo "The newer SHA's E2E will fire its own promote when it lands."
|
||||
echo "If you need this specific SHA promoted, manually dispatch."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
;;
|
||||
in_progress/*|queued/*|requested/*|waiting/*|pending/*)
|
||||
echo "proceed=false" >> "$GITHUB_OUTPUT"
|
||||
{
|
||||
echo "## ⏳ Auto-promote deferred — E2E Staging SaaS still running"
|
||||
echo
|
||||
echo "Publish completed before E2E for \`${SHA:0:7}\` (state: \`$RESULT\`)."
|
||||
echo "Skipping retag here — E2E's own completion event will re-fire this workflow."
|
||||
echo "If E2E ends green, that run promotes \`:latest\`. If red, it aborts."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
;;
|
||||
none/none)
|
||||
echo "proceed=true" >> "$GITHUB_OUTPUT"
|
||||
echo "::notice::E2E paths-filtered out for this SHA — pre-merge staging gates carry"
|
||||
;;
|
||||
*)
|
||||
echo "proceed=false" >> "$GITHUB_OUTPUT"
|
||||
{
|
||||
echo "## ❓ Auto-promote aborted — unexpected E2E state"
|
||||
echo
|
||||
echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\` (unhandled)"
|
||||
echo "Manual investigation needed; re-dispatch with the same sha once resolved."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
- if: steps.gate.outputs.proceed == 'true'
|
||||
uses: imjasonh/setup-crane@6da1ae018866400525525ce74ff892880c099987 # v0.5
|
||||
|
||||
- name: GHCR login
|
||||
if: steps.gate.outputs.proceed == 'true'
|
||||
run: |
|
||||
echo "${{ secrets.GITHUB_TOKEN }}" | \
|
||||
crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin
|
||||
|
||||
- name: Verify :staging-<sha> exists for both images
|
||||
# Better to fail fast with a clear message than to half-tag
|
||||
# (platform retagged but platform-tenant missing → tenants pull
|
||||
# a stale image).
|
||||
if: steps.gate.outputs.proceed == 'true'
|
||||
run: |
|
||||
set -euo pipefail
|
||||
for img in "${IMAGE_NAME}" "${TENANT_IMAGE_NAME}"; do
|
||||
tag="${img}:staging-${{ steps.sha.outputs.short }}"
|
||||
if ! crane manifest "$tag" >/dev/null 2>&1; then
|
||||
echo "::error::Missing tag: $tag"
|
||||
echo "::error::publish-workspace-server-image must complete on this SHA before auto-promote can retag :latest."
|
||||
exit 1
|
||||
fi
|
||||
echo " ok: $tag exists"
|
||||
done
|
||||
|
||||
- name: Ancestry check — refuse to promote :latest backwards
|
||||
# #2244: workflow_run completions arrive in arbitrary order. If
|
||||
# SHA-A and SHA-B both reach main within ~10 min and SHA-B's E2E
|
||||
# completes before SHA-A's, this workflow can fire for SHA-A
|
||||
# AFTER it already promoted SHA-B → :latest goes backwards. The
|
||||
# orphan-reconciler "next run corrects it" doesn't apply: there's
|
||||
# no auto-corrective re-promote, :latest stays wrong until the
|
||||
# next main push lands.
|
||||
#
|
||||
# Detection: read current :latest's `org.opencontainers.image.revision`
|
||||
# label (set by publish-workspace-server-image.yml at build time)
|
||||
# and ask the GitHub compare API whether the candidate SHA is
|
||||
# ahead-of / identical-to / behind / diverged-from current.
|
||||
# Hard-fail on `behind` and `diverged` per the approved design —
|
||||
# silent-bypass is the class we're moving away from. Workflow
|
||||
# goes red, oncall sees it, operator decides how to recover
|
||||
# (manual dispatch with the right SHA, force-promote, etc.).
|
||||
#
|
||||
# Manual dispatch skips this check — operator override semantics
|
||||
# match the gate-check step above.
|
||||
#
|
||||
# Backward-compat: when current :latest carries no revision
|
||||
# label (legacy image pre-publish-with-label), skip-with-warning.
|
||||
# All :latest images on main are post-label as of 2026-04-29, so
|
||||
# this branch will be dead within 90 days; remove then.
|
||||
if: steps.gate.outputs.proceed == 'true' && github.event_name != 'workflow_dispatch'
|
||||
id: ancestry
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
REPO: ${{ github.repository }}
|
||||
TARGET_SHA: ${{ steps.sha.outputs.full }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Read the current :latest config and pull the revision label.
|
||||
# `crane config` returns the OCI image config blob (not the manifest);
|
||||
# labels live under `.config.Labels`. `// empty` makes jq return ""
|
||||
# rather than the literal "null" so the test below works.
|
||||
CURRENT_REVISION=$(crane config "${IMAGE_NAME}:latest" 2>/dev/null \
|
||||
| jq -r '.config.Labels["org.opencontainers.image.revision"] // empty' \
|
||||
|| true)
|
||||
|
||||
if [ -z "$CURRENT_REVISION" ]; then
|
||||
echo "decision=skip-no-label" >> "$GITHUB_OUTPUT"
|
||||
{
|
||||
echo "## ⚠ Ancestry check skipped — current :latest has no revision label"
|
||||
echo
|
||||
echo "Likely a legacy image built before \`org.opencontainers.image.revision\` was set."
|
||||
echo "Falling through to retag. After all \`:latest\` images are post-label (TODO 90 days), this branch is dead and should be removed."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "::warning::Current :latest carries no revision label — skipping ancestry check (legacy image)"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ "$CURRENT_REVISION" = "$TARGET_SHA" ]; then
|
||||
echo "decision=identical" >> "$GITHUB_OUTPUT"
|
||||
echo "::notice:::latest already at ${TARGET_SHA:0:7} — retag will be a no-op"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
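# NOTE(review): the gate step above was ported to Gitea REST via curl,
# but this call still goes through `gh api` and reads the GitHub compare
# field `.status` — confirm it resolves on the Gitea host; if it does
# not, every run that reaches this call lands in the `error` arm below.
# Ask GitHub which side of the merge graph TARGET_SHA sits on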
|
||||
# relative to CURRENT_REVISION. Returns one of: ahead | identical
|
||||
# | behind | diverged. Network or auth errors collapse to "error"
|
||||
# via the explicit fallback so the case below always matches.
|
||||
STATUS=$(gh api \
|
||||
"repos/${REPO}/compare/${CURRENT_REVISION}...${TARGET_SHA}" \
|
||||
--jq '.status' 2>/dev/null || echo "error")
|
||||
|
||||
echo "ancestry compare ${CURRENT_REVISION:0:7} → ${TARGET_SHA:0:7}: $STATUS"
|
||||
|
||||
case "$STATUS" in
|
||||
ahead)
|
||||
echo "decision=ahead" >> "$GITHUB_OUTPUT"
|
||||
echo "::notice::Target ${TARGET_SHA:0:7} is ahead of current :latest (${CURRENT_REVISION:0:7}) — proceeding with retag"
|
||||
;;
|
||||
identical)
|
||||
echo "decision=identical" >> "$GITHUB_OUTPUT"
|
||||
echo "::notice::Target identical to :latest — retag will be a no-op"
|
||||
;;
|
||||
behind)
|
||||
echo "decision=behind" >> "$GITHUB_OUTPUT"
|
||||
{
|
||||
echo "## ❌ Auto-promote refused — target is BEHIND current :latest"
|
||||
echo
|
||||
echo "| Field | Value |"
|
||||
echo "|---|---|"
|
||||
echo "| Target SHA | \`$TARGET_SHA\` |"
|
||||
echo "| Current :latest revision | \`$CURRENT_REVISION\` |"
|
||||
echo "| GitHub compare status | \`behind\` |"
|
||||
echo
|
||||
echo "This guard catches the workflow_run-completion-order race (#2244):"
|
||||
echo "two rapid main pushes whose E2Es complete out-of-order can otherwise"
|
||||
echo "promote \`:latest\` backwards. \`:latest\` stays on \`${CURRENT_REVISION:0:7}\`."
|
||||
echo
|
||||
echo "**Recovery:** if this is a legitimate revert that should land on \`:latest\`,"
|
||||
echo "manually dispatch this workflow with the target sha as input — the manual-dispatch"
|
||||
echo "path skips the ancestry check (operator override)."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
exit 1
|
||||
;;
|
||||
diverged)
|
||||
echo "decision=diverged" >> "$GITHUB_OUTPUT"
|
||||
{
|
||||
echo "## ❓ Auto-promote refused — history diverged"
|
||||
echo
|
||||
echo "| Field | Value |"
|
||||
echo "|---|---|"
|
||||
echo "| Target SHA | \`$TARGET_SHA\` |"
|
||||
echo "| Current :latest revision | \`$CURRENT_REVISION\` |"
|
||||
echo "| GitHub compare status | \`diverged\` |"
|
||||
echo
|
||||
echo "Likely cause: force-push rewrote main's history, leaving the previous"
|
||||
echo "\`:latest\` revision orphaned. Needs human review before \`:latest\` advances."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
exit 1
|
||||
;;
|
||||
error|*)
|
||||
echo "decision=error" >> "$GITHUB_OUTPUT"
|
||||
{
|
||||
echo "## ❌ Auto-promote aborted — ancestry-check API error"
|
||||
echo
|
||||
echo "\`gh api repos/${REPO}/compare/${CURRENT_REVISION}...${TARGET_SHA}\` returned unexpected status: \`$STATUS\`"
|
||||
echo
|
||||
echo "Manual dispatch with the target sha bypasses this check."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
- name: Retag platform :staging-<sha> → :latest
|
||||
if: steps.gate.outputs.proceed == 'true'
|
||||
run: |
|
||||
crane tag "${IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest
|
||||
|
||||
- name: Retag tenant :staging-<sha> → :latest
|
||||
if: steps.gate.outputs.proceed == 'true'
|
||||
run: |
|
||||
crane tag "${TENANT_IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest
|
||||
|
||||
- name: Summary
|
||||
if: steps.gate.outputs.proceed == 'true'
|
||||
run: |
|
||||
{
|
||||
echo "## :latest promoted to ${{ steps.sha.outputs.short }}"
|
||||
echo
|
||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||
echo "- Trigger: manual dispatch"
|
||||
else
|
||||
echo "- Upstream: \`${{ github.event.workflow_run.name }}\` ([run](${{ github.event.workflow_run.html_url }}))"
|
||||
fi
|
||||
echo "- platform:staging-${{ steps.sha.outputs.short }} → :latest"
|
||||
echo "- platform-tenant:staging-${{ steps.sha.outputs.short }} → :latest"
|
||||
echo
|
||||
echo "Tenant fleet auto-pulls within 5 min via IMAGE_AUTO_REFRESH=true."
|
||||
echo "Force immediate fanout: dispatch redeploy-tenants-on-main.yml."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
492
.github/workflows/auto-promote-staging.yml
vendored
Normal file
492
.github/workflows/auto-promote-staging.yml
vendored
Normal file
@ -0,0 +1,492 @@
|
||||
name: Auto-promote staging → main
|
||||
|
||||
# Fires after any of the staging-branch quality gates complete. When ALL
|
||||
# required gates are green on the same staging SHA, opens (or re-uses)
|
||||
# a PR `staging → main` and schedules Gitea auto-merge so the PR lands
|
||||
# automatically once approval + status checks are satisfied.
|
||||
#
|
||||
# ============================================================
|
||||
# What this workflow does
|
||||
# ============================================================
|
||||
#
|
||||
# 1. On a workflow_run completion event for one of the staging gate
|
||||
# workflows (CI, E2E Staging Canvas, E2E API Smoke, CodeQL),
|
||||
# checks if the combined status on the staging head SHA is green.
|
||||
# 2. If green, opens (or re-uses) a PR `head: staging → base: main`
|
||||
# via Gitea REST `POST /api/v1/repos/.../pulls`.
|
||||
# 3. Schedules auto-merge via `POST /api/v1/repos/.../pulls/{index}/merge`
|
||||
# with `merge_when_checks_succeed: true`. Gitea waits for the
|
||||
# approval requirement on `main` (`required_approvals: 1`) and
|
||||
# the status-check gates, then merges.
|
||||
# 4. The merge commit lands on `main` and fires
|
||||
# `publish-workspace-server-image.yml` naturally via its
|
||||
# `on: push: branches: [main]` trigger — no explicit dispatch
|
||||
# needed (see "Why no workflow_dispatch tail" below).
|
||||
#
|
||||
# `auto-sync-main-to-staging.yml` is the reverse-direction
|
||||
# counterpart (main → staging, fast-forward push). Together they
|
||||
# keep the staging-superset-of-main invariant tight.
|
||||
#
|
||||
# ============================================================
|
||||
# Why Gitea REST (and not `gh pr create`)
|
||||
# ============================================================
|
||||
#
|
||||
# Pre-2026-05-06 this workflow used `gh pr create`, `gh pr merge --auto`,
|
||||
# `gh run list`, and `gh workflow run` against GitHub. After the
|
||||
# GitHub→Gitea cutover those calls fail because:
|
||||
#
|
||||
# - `gh pr create / merge / view / list` route to GitHub GraphQL
|
||||
# (`/api/graphql`). Gitea does not expose a GraphQL endpoint;
|
||||
# every call returns `HTTP 405 Method Not Allowed` — same root
|
||||
# cause as #65 (auto-sync) which PR #66 fixed by dropping `gh`
|
||||
# entirely.
|
||||
# - `gh run list --workflow=...` GitHub-shape; Gitea has the
|
||||
# simpler `GET /repos/.../commits/{ref}/status` combined-status
|
||||
# endpoint instead.
|
||||
# - `gh workflow run X.yml` calls `POST /repos/.../actions/workflows/{id}/dispatches`,
|
||||
# which does NOT exist on Gitea 1.22.6 (verified via swagger.v1.json).
|
||||
#
|
||||
# So this workflow uses direct `curl` calls to Gitea REST. No `gh`
|
||||
# CLI dependency, no GraphQL, no missing-endpoint footgun.
|
||||
#
|
||||
# ============================================================
|
||||
# Why no workflow_dispatch tail (was load-bearing on GitHub, dead on Gitea)
|
||||
# ============================================================
|
||||
#
|
||||
# The GitHub-era version had a 60-line polling step that waited for
|
||||
# the promote PR to merge, then explicitly dispatched
|
||||
# `publish-workspace-server-image.yml` on `--ref main`. That step
|
||||
# existed because GitHub's GITHUB_TOKEN-initiated merges suppress
|
||||
# downstream `on: push` workflows (the documented "no recursion" rule
|
||||
# — https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow).
|
||||
# The explicit dispatch was the workaround.
|
||||
#
|
||||
# Gitea Actions does NOT have this no-recursion rule. PR #66's auto-
|
||||
# sync merge to main fired `auto-promote-staging` on the next push
|
||||
# trigger naturally. So the cascade fires on the natural push event;
|
||||
# the explicit dispatch is dead code. (And even if we wanted to
|
||||
# preserve it, Gitea has no `workflow_dispatch` REST endpoint.)
|
||||
#
|
||||
# Removed in this rewrite. If we ever observe the cascade misfire,
|
||||
# operator can push an empty commit to `main` to wake it.
|
||||
#
|
||||
# ============================================================
|
||||
# Why open a PR (and not direct push)
|
||||
# ============================================================
|
||||
#
|
||||
# `main` branch protection has `enable_push: false` with NO
|
||||
# `push_whitelist_usernames`. Direct push is impossible for any
|
||||
# persona, including admins. PR-mediated merge is the only path,
|
||||
# which is intentional: prod state mutations (and staging→main IS a
|
||||
# prod mutation, since the next deploy fans out to tenants) require
|
||||
# Hongming's approval per `feedback_prod_apply_needs_hongming_chat_go`.
|
||||
#
|
||||
# The auto-merge schedule preserves this gate: `merge_when_checks_succeed`
|
||||
# does NOT bypass `required_approvals: 1`. Gitea waits for BOTH
|
||||
# approval AND green checks before merging. Hongming reviews via the
|
||||
# canvas/chat-handle of the PR notification, approves, and Gitea
|
||||
# auto-merges within seconds.
|
||||
#
|
||||
# ============================================================
|
||||
# Identity + token (anti-bot-ring per saved-memory
|
||||
# `feedback_per_agent_gitea_identity_default`)
|
||||
# ============================================================
|
||||
#
|
||||
# This workflow uses `secrets.AUTO_SYNC_TOKEN` — a personal access
|
||||
# token issued to the `devops-engineer` Gitea persona. NOT the
|
||||
# founder PAT. The bot-ring fingerprint that triggered the GitHub
|
||||
# org suspension on 2026-05-06 was characterised by founder PAT
|
||||
# acting as CI at machine speed.
|
||||
#
|
||||
# Token scope: `push: true` (read+write) on this repo. The persona
|
||||
# can: open PRs, comment on PRs, schedule auto-merge. The persona
|
||||
# CANNOT bypass main's branch protection (`required_approvals: 1`
|
||||
# still applies — only Hongming's review unblocks merge).
|
||||
#
|
||||
# Authorship: the PR is opened by `devops-engineer`; the merge
|
||||
# commit credits Hongming-as-approver and `devops-engineer` as
|
||||
# the merger.
|
||||
#
|
||||
# ============================================================
|
||||
# Failure modes & operational notes
|
||||
# ============================================================
|
||||
#
|
||||
# A — staging gates not all green at trigger time:
|
||||
# - The combined-status check returns `state: pending|failure`.
|
||||
# Workflow exits 0 with a step-summary "not all green; staying
|
||||
# on current main". Re-fires on the next gate completion.
|
||||
#
|
||||
# B — Gitea PR-create returns non-201 (e.g. 422 already-exists):
|
||||
# - Idempotent: the workflow first GETs the existing open
|
||||
# staging→main PR. If found, reuse it; if not, POST a new one.
|
||||
# 422 should never surface; if it does (race), step summary
|
||||
# captures the body and the next workflow_run picks up.
|
||||
#
|
||||
# C — `merge_when_checks_succeed` schedule fails:
|
||||
# - 422 with "Pull request is not mergeable" if there are
|
||||
# conflicts or stale base. Step summary surfaces it; operator
|
||||
# (or `auto-sync-main-to-staging`) needs to bring staging up
|
||||
# to date with main first. Workflow exits 1 to surface red.
|
||||
#
|
||||
# D — `AUTO_SYNC_TOKEN` rotated / wrong scope:
|
||||
# - 401/403 on first REST call. Step summary surfaces it.
|
||||
# Re-issue the token from `~/.molecule-ai/personas/` on the
|
||||
# operator host and update the repo Actions secret.
|
||||
#
|
||||
# ============================================================
|
||||
# Loop safety
|
||||
# ============================================================
|
||||
#
|
||||
# When the promote PR merges to main, `auto-sync-main-to-staging.yml`
|
||||
# fires (on:push:main) and pushes the merge commit back to staging.
|
||||
# That push to staging is by `devops-engineer`, NOT this workflow's
|
||||
# token, and triggers the staging gate workflows. When they all
|
||||
# complete, we end up back here — but the tree-diff guard catches
|
||||
# it: staging tree == main tree (the merge commit changes nothing),
|
||||
# so we skip and the cycle terminates.
|
||||
|
||||
on:
|
||||
workflow_run:
|
||||
workflows:
|
||||
- CI
|
||||
- E2E Staging Canvas (Playwright)
|
||||
- E2E API Smoke Test
|
||||
- CodeQL
|
||||
types: [completed]
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
force:
|
||||
description: "Force promote even when AUTO_PROMOTE_ENABLED is unset (manual override)"
|
||||
required: false
|
||||
default: "false"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
|
||||
# Serialize auto-promote runs. Multiple staging gate completions can land
|
||||
# in quick succession (CI + E2E + CodeQL all finish within seconds of
|
||||
# each other on a green PR) — without this, two parallel runs both:
|
||||
# 1. Would race the GET-or-POST PR step.
|
||||
# 2. Would both call merge-schedule (idempotent — fine on Gitea).
|
||||
# cancel-in-progress: false because the second run on a fresh staging
|
||||
# tip should NOT kill the first which has already opened the PR.
|
||||
concurrency:
|
||||
group: auto-promote-staging
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
check-all-gates-green:
|
||||
# Only consider staging pushes. PRs into staging don't promote.
|
||||
if: >
|
||||
(github.event_name == 'workflow_run' &&
|
||||
github.event.workflow_run.head_branch == 'staging' &&
|
||||
github.event.workflow_run.event == 'push')
|
||||
|| github.event_name == 'workflow_dispatch'
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
all_green: ${{ steps.gates.outputs.all_green }}
|
||||
head_sha: ${{ steps.gates.outputs.head_sha }}
|
||||
steps:
|
||||
# Skip empty-tree promotes (the perpetual auto-promote↔auto-sync
|
||||
# cycle observed pre-cutover on GitHub). On Gitea the cycle shape
|
||||
# is different (auto-sync uses fast-forward, no merge commit),
|
||||
# but the tree-diff guard is cheap insurance and protects against
|
||||
# any future merge-style regression.
|
||||
- name: Checkout for tree-diff check
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: staging
|
||||
|
||||
- name: Skip if staging tree == main tree (cycle-break safety)
|
||||
id: tree-diff
|
||||
env:
|
||||
HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
|
||||
run: |
|
||||
set -eu
|
||||
git fetch origin main --depth=50 || { echo "::warning::git fetch main failed — proceeding (fail-open)"; exit 0; }
|
||||
if git diff --quiet origin/main "$HEAD_SHA" -- 2>/dev/null; then
|
||||
{
|
||||
echo "## Skipped — no code to promote"
|
||||
echo
|
||||
echo "staging tip (\`${HEAD_SHA:0:8}\`) and \`main\` have identical trees."
|
||||
echo "Skipping to avoid opening an empty promote PR."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "::notice::auto-promote: staging tree == main tree — no code to promote, skipping"
|
||||
echo "skip=true" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "skip=false" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
- name: Check combined status on staging head
|
||||
if: steps.tree-diff.outputs.skip != 'true'
|
||||
id: gates
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
|
||||
HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
|
||||
REPO: ${{ github.repository }}
|
||||
GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Gitea-native combined-status endpoint aggregates every
|
||||
# check context attached to a SHA. This is structurally
|
||||
# cleaner than the GitHub-era per-workflow `gh run list`
|
||||
# loop because:
|
||||
#
|
||||
# 1. There's no risk of "workflow name collision" (the
|
||||
# GitHub-era code had to switch from `--workflow=NAME`
|
||||
# to `--workflow=FILE.YML` to disambiguate "CodeQL"
|
||||
# between the explicit workflow and GitHub's UI-
|
||||
# configured default setup; Gitea has no such
|
||||
# duplicate-name surface).
|
||||
# 2. Gitea's combined state already encodes the AND
|
||||
# across all contexts: success only if EVERY context
|
||||
# is success. Pending or failure on any context
|
||||
# produces non-success state.
|
||||
#
|
||||
# See https://docs.gitea.com/api/1.22 for the schema —
|
||||
# `state` is one of: success, pending, failure, error.
|
||||
|
||||
echo "head_sha=${HEAD_SHA}" >> "$GITHUB_OUTPUT"
|
||||
echo "Checking combined status on SHA ${HEAD_SHA}"
|
||||
|
||||
# `set +e` for the http-code capture pattern; restore
|
||||
# immediately. Pattern hardened per `feedback_curl_status_capture_pollution`.
|
||||
BODY_FILE=$(mktemp)
|
||||
set +e
|
||||
STATUS=$(curl -sS \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||
-H "Accept: application/json" \
|
||||
-o "${BODY_FILE}" \
|
||||
-w "%{http_code}" \
|
||||
"${GITEA_HOST}/api/v1/repos/${REPO}/commits/${HEAD_SHA}/status")
|
||||
CURL_RC=$?
|
||||
set -e
|
||||
|
||||
if [ "${CURL_RC}" -ne 0 ] || [ "${STATUS}" != "200" ]; then
|
||||
echo "::error::combined-status fetch failed: curl=${CURL_RC} http=${STATUS}"
|
||||
cat "${BODY_FILE}" | head -c 500 || true
|
||||
rm -f "${BODY_FILE}"
|
||||
echo "all_green=false" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
STATE=$(jq -r '.state // "missing"' < "${BODY_FILE}")
|
||||
TOTAL=$(jq -r '.total_count // 0' < "${BODY_FILE}")
|
||||
rm -f "${BODY_FILE}"
|
||||
|
||||
echo "Combined status: state=${STATE} total_count=${TOTAL}"
|
||||
|
||||
if [ "${STATE}" = "success" ] && [ "${TOTAL}" -gt 0 ]; then
|
||||
echo "all_green=true" >> "$GITHUB_OUTPUT"
|
||||
echo "::notice::All gates green on ${HEAD_SHA} (${TOTAL} contexts)"
|
||||
else
|
||||
echo "all_green=false" >> "$GITHUB_OUTPUT"
|
||||
{
|
||||
echo "## Not promoting — combined status not green"
|
||||
echo
|
||||
echo "- SHA: \`${HEAD_SHA:0:8}\`"
|
||||
echo "- Combined state: \`${STATE}\`"
|
||||
echo "- Context count: ${TOTAL}"
|
||||
echo
|
||||
echo "Will re-fire on the next gate completion. Investigate any red gate via the Actions UI."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "::notice::auto-promote: combined status is ${STATE} on ${HEAD_SHA} — staying on current main"
|
||||
fi
|
||||
|
||||
promote:
|
||||
needs: check-all-gates-green
|
||||
if: needs.check-all-gates-green.outputs.all_green == 'true'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check rollout gate
|
||||
env:
|
||||
AUTO_PROMOTE_ENABLED: ${{ vars.AUTO_PROMOTE_ENABLED }}
|
||||
FORCE_INPUT: ${{ github.event.inputs.force }}
|
||||
run: |
|
||||
set -eu
|
||||
# Repo variable AUTO_PROMOTE_ENABLED=true flips this on. While
|
||||
# it's unset, the workflow dry-runs (logs what it would have
|
||||
# done) but doesn't open the promote PR. Set the variable in
|
||||
# Settings → Actions → Variables.
|
||||
if [ "${AUTO_PROMOTE_ENABLED:-}" != "true" ] && [ "${FORCE_INPUT:-false}" != "true" ]; then
|
||||
{
|
||||
echo "## Auto-promote disabled"
|
||||
echo
|
||||
echo "Repo variable \`AUTO_PROMOTE_ENABLED\` is not set to \`true\`."
|
||||
echo "All gates are green on staging; would have opened a promote PR to \`main\`."
|
||||
echo
|
||||
echo "To enable: Settings → Actions → Variables → \`AUTO_PROMOTE_ENABLED=true\`."
|
||||
echo "To test once manually: workflow_dispatch with \`force=true\`."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "::notice::auto-promote disabled — dry run only"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
- name: Open or reuse promote PR + schedule auto-merge
|
||||
if: ${{ vars.AUTO_PROMOTE_ENABLED == 'true' || github.event.inputs.force == 'true' }}
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
|
||||
REPO: ${{ github.repository }}
|
||||
TARGET_SHA: ${{ needs.check-all-gates-green.outputs.head_sha }}
|
||||
GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
API="${GITEA_HOST}/api/v1/repos/${REPO}"
|
||||
AUTH=(-H "Authorization: token ${GITEA_TOKEN}" -H "Accept: application/json")
|
||||
|
||||
# http_get BODY_FILE URL
|
||||
# Writes the response body to BODY_FILE and echoes the HTTP code. Curl status
|
||||
# capture pattern per `feedback_curl_status_capture_pollution`:
|
||||
# http_code goes to its own tempfile-equivalent (-w), body to
|
||||
# another tempfile, set +e/-e bracket protects pipeline state.
|
||||
http_get() {
|
||||
local body_file="$1"; shift
|
||||
local url="$1"; shift
|
||||
set +e
|
||||
local code
|
||||
code=$(curl -sS "${AUTH[@]}" -o "${body_file}" -w "%{http_code}" "${url}")
|
||||
local rc=$?
|
||||
set -e
|
||||
if [ "${rc}" -ne 0 ]; then
|
||||
echo "::error::curl GET failed (rc=${rc}) on ${url}"
|
||||
return 99
|
||||
fi
|
||||
echo "${code}"
|
||||
}
|
||||
http_post_json() {
|
||||
local body_file="$1"; shift
|
||||
local data="$1"; shift
|
||||
local url="$1"; shift
|
||||
set +e
|
||||
local code
|
||||
code=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \
|
||||
-X POST -d "${data}" -o "${body_file}" -w "%{http_code}" "${url}")
|
||||
local rc=$?
|
||||
set -e
|
||||
if [ "${rc}" -ne 0 ]; then
|
||||
echo "::error::curl POST failed (rc=${rc}) on ${url}"
|
||||
return 99
|
||||
fi
|
||||
echo "${code}"
|
||||
}
|
||||
|
||||
# Step 1: look for an existing open staging→main promote PR
|
||||
# (idempotent on workflow re-run). Gitea doesn't have a
|
||||
# head/base filter on the list endpoint that's as ergonomic
|
||||
# as gh's, but the dedicated `/pulls/{base}/{head}` lookup
|
||||
# works.
|
||||
BODY=$(mktemp)
|
||||
STATUS=$(http_get "${BODY}" "${API}/pulls/main/staging") || true
|
||||
|
||||
PR_NUM=""
|
||||
if [ "${STATUS}" = "200" ]; then
|
||||
STATE=$(jq -r '.state // "missing"' < "${BODY}")
|
||||
if [ "${STATE}" = "open" ]; then
|
||||
PR_NUM=$(jq -r '.number // ""' < "${BODY}")
|
||||
echo "::notice::Re-using existing open promote PR #${PR_NUM}"
|
||||
fi
|
||||
fi
|
||||
rm -f "${BODY}"
|
||||
|
||||
# Step 2: if no open PR, create one.
|
||||
if [ -z "${PR_NUM}" ]; then
|
||||
TITLE="staging → main: auto-promote ${TARGET_SHA:0:7}"
|
||||
BODY_TEXT=$(cat <<EOFBODY
|
||||
Automated promotion of \`staging\` (\`${TARGET_SHA:0:8}\`) to \`main\`. All required staging gates are green at this SHA (combined status reported success).
|
||||
|
||||
This PR is auto-generated by \`.github/workflows/auto-promote-staging.yml\` whenever every required gate completes green on the same staging SHA.
|
||||
|
||||
**Approval gate:** \`main\` branch protection requires 1 approval before this can land. Once approved, Gitea will auto-merge (the workflow scheduled \`merge_when_checks_succeed: true\` immediately after open).
|
||||
|
||||
The reverse-direction sync (the merge commit on \`main\` → \`staging\`) is handled automatically by \`auto-sync-main-to-staging.yml\` after this PR lands.
|
||||
|
||||
---
|
||||
- Source: staging at \`${TARGET_SHA}\`
|
||||
- Opened by: \`devops-engineer\` persona (anti-bot-ring; never founder PAT)
|
||||
- Refs: #65, #73, #195
|
||||
EOFBODY
|
||||
)
|
||||
REQ=$(jq -n \
|
||||
--arg title "${TITLE}" \
|
||||
--arg body "${BODY_TEXT}" \
|
||||
--arg base "main" \
|
||||
--arg head "staging" \
|
||||
'{title:$title, body:$body, base:$base, head:$head}')
|
||||
|
||||
BODY=$(mktemp)
|
||||
STATUS=$(http_post_json "${BODY}" "${REQ}" "${API}/pulls")
|
||||
|
||||
if [ "${STATUS}" = "201" ]; then
|
||||
PR_NUM=$(jq -r '.number // ""' < "${BODY}")
|
||||
echo "::notice::Opened promote PR #${PR_NUM}"
|
||||
else
|
||||
echo "::error::Failed to create promote PR: HTTP ${STATUS}"
|
||||
jq -r '.message // .' < "${BODY}" | head -c 500
|
||||
rm -f "${BODY}"
|
||||
exit 1
|
||||
fi
|
||||
rm -f "${BODY}"
|
||||
fi
|
||||
|
||||
# Step 3: schedule auto-merge. merge_when_checks_succeed
|
||||
# tells Gitea to wait for both:
|
||||
# - all required status checks to pass
|
||||
# - the required-approvals gate (1 approval on main)
|
||||
# before merging. On approval+green, Gitea merges within
|
||||
# seconds. On any check failing or approval being denied,
|
||||
# the schedule stays armed but doesn't fire.
|
||||
#
|
||||
# Idempotent: re-arming on an already-armed PR is a no-op.
|
||||
REQ=$(jq -n '{Do:"merge", merge_when_checks_succeed:true}')
|
||||
BODY=$(mktemp)
|
||||
STATUS=$(http_post_json "${BODY}" "${REQ}" "${API}/pulls/${PR_NUM}/merge")
|
||||
|
||||
# Gitea returns:
|
||||
# - 200/204 on successful immediate merge (gates already green AND approved)
|
||||
# - 405 "Please try again later" when scheduled successfully but waiting
|
||||
# - 422 on "Pull request is not mergeable" (conflict, stale base, etc.)
|
||||
#
|
||||
# 405 here is benign — Gitea's way of saying "scheduled, not merging now".
|
||||
# We treat 200/204/405 as success, anything else as failure.
|
||||
case "${STATUS}" in
|
||||
200|204)
|
||||
MERGE_OUTCOME="merged-immediately"
|
||||
echo "::notice::Promote PR #${PR_NUM} merged immediately (gates+approval already green)"
|
||||
;;
|
||||
405)
|
||||
MERGE_OUTCOME="auto-merge-scheduled"
|
||||
echo "::notice::Promote PR #${PR_NUM}: auto-merge scheduled (Gitea will land on approval+green)"
|
||||
;;
|
||||
422)
|
||||
MERGE_OUTCOME="not-mergeable"
|
||||
echo "::warning::Promote PR #${PR_NUM}: not mergeable (conflict, stale base, or already merging)."
|
||||
jq -r '.message // .' < "${BODY}" | head -c 500
|
||||
;;
|
||||
*)
|
||||
echo "::error::Unexpected status ${STATUS} on merge schedule"
|
||||
jq -r '.message // .' < "${BODY}" | head -c 500
|
||||
rm -f "${BODY}"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
rm -f "${BODY}"
|
||||
|
||||
{
|
||||
echo "## Auto-promote PR opened"
|
||||
echo
|
||||
echo "- Source: staging at \`${TARGET_SHA:0:8}\`"
|
||||
echo "- PR: #${PR_NUM}"
|
||||
echo "- Outcome: \`${MERGE_OUTCOME}\`"
|
||||
echo
|
||||
if [ "${MERGE_OUTCOME}" = "auto-merge-scheduled" ]; then
|
||||
echo "Gitea will auto-merge once Hongming approves and all checks are green. No human action needed beyond approval."
|
||||
elif [ "${MERGE_OUTCOME}" = "merged-immediately" ]; then
|
||||
echo "Merged immediately. \`publish-workspace-server-image.yml\` will fire naturally on the resulting \`main\` push."
|
||||
else
|
||||
echo "PR is not auto-merging. Operator may need to bring staging up to date with main, then re-trigger this workflow via workflow_dispatch."
|
||||
fi
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
83
.github/workflows/auto-promote-stale-alarm.yml
vendored
Normal file
83
.github/workflows/auto-promote-stale-alarm.yml
vendored
Normal file
@ -0,0 +1,83 @@
|
||||
name: auto-promote-stale-alarm
|
||||
|
||||
# Hourly cron + on-demand alarm for the silent-block failure mode that
|
||||
# motivated issue #2975:
|
||||
# - The auto-promote-staging.yml workflow opened a PR + armed
|
||||
# auto-merge, but main's branch protection requires a human review
|
||||
# (reviewDecision=REVIEW_REQUIRED). The PR sat BLOCKED with no
|
||||
# surface-up-the-stack for 12+ hours, holding 25 commits hostage
|
||||
# including the Memory v2 redesign and a reno-stars data-loss fix.
|
||||
#
|
||||
# This workflow runs `scripts/check-stale-promote-pr.sh` against the
|
||||
# repo's open auto-promote PRs (base=main head=staging). When a PR has
|
||||
# been BLOCKED on REVIEW_REQUIRED for >4h, it:
|
||||
# 1. Emits a workflow-level warning (visible in run summary + the
|
||||
# Actions UI feed).
|
||||
# 2. Posts a comment on the PR (idempotent — one alarm per PR).
|
||||
#
|
||||
# The detection logic lives in scripts/check-stale-promote-pr.sh so
|
||||
# it's unit-testable with stubbed `gh` (see test-check-stale-promote-pr.sh).
|
||||
# This file is the schedule + invocation surface only — SSOT for the
|
||||
# detector itself.
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# Hourly. Cheap (one `gh pr list` + jq), and 1h granularity is
|
||||
# plenty for a 4h staleness threshold — operators see the alarm
|
||||
# within at most 1h of crossing the threshold.
|
||||
- cron: "27 * * * *" # at :27 to dodge the cron herd at :00
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
stale_hours:
|
||||
description: "Hours after which a BLOCKED+REVIEW_REQUIRED PR is stale (default 4)"
|
||||
required: false
|
||||
default: "4"
|
||||
post_comment:
|
||||
description: "Post a comment on stale PRs (default true)"
|
||||
required: false
|
||||
default: "true"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write # post comments on stale PRs
|
||||
|
||||
# Serialize so the on-demand and scheduled runs don't double-comment
|
||||
# the same PR. cancel-in-progress=false because the script is idempotent
|
||||
# (existing comment marker prevents dupes), but a scheduled run firing
|
||||
# while a manual one runs would just re-list the same PR set.
|
||||
concurrency:
|
||||
group: auto-promote-stale-alarm
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
scan:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout (need scripts/ only)
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
sparse-checkout: |
|
||||
scripts/check-stale-promote-pr.sh
|
||||
sparse-checkout-cone-mode: false
|
||||
- name: Run stale-PR detector
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
GITHUB_REPOSITORY: ${{ github.repository }}
|
||||
STALE_HOURS: ${{ inputs.stale_hours || '4' }}
|
||||
POST_COMMENT: ${{ inputs.post_comment || 'true' }}
|
||||
run: |
|
||||
# The script's exit code reflects the count of stale PRs.
|
||||
# We don't want a stale finding to fail the workflow run —
|
||||
# the warning + comment are the signal, the green/red is
|
||||
# noise. So convert any non-zero exit to a workflow notice
|
||||
# and exit 0.
|
||||
set +e
|
||||
bash scripts/check-stale-promote-pr.sh
|
||||
rc=$?
|
||||
set -e
|
||||
if [ "$rc" -ne 0 ]; then
|
||||
echo "::notice::Stale PR detector found $rc PR(s) needing attention. See warnings above + comments on the PRs."
|
||||
fi
|
||||
# Always succeed — operator-facing surface is the warning,
|
||||
# not the workflow status.
|
||||
exit 0
|
||||
404
.github/workflows/auto-sync-canary.yml
vendored
Normal file
404
.github/workflows/auto-sync-canary.yml
vendored
Normal file
@ -0,0 +1,404 @@
|
||||
name: Auto-sync canary — AUTO_SYNC_TOKEN rotation drift
|
||||
|
||||
# Synthetic health check for the AUTO_SYNC_TOKEN secret consumed by
|
||||
# auto-sync-main-to-staging.yml (PR #66) and publish-workspace-server-image.yml.
|
||||
#
|
||||
# ============================================================
|
||||
# Why this workflow exists
|
||||
# ============================================================
|
||||
#
|
||||
# PR #66 fixed auto-sync (replaced GitHub-era `gh pr create` — which
|
||||
# 405s on Gitea's GraphQL endpoint — with a direct git push from the
|
||||
# `devops-engineer` persona's `AUTO_SYNC_TOKEN`). Hostile self-review
|
||||
# weakest spot #3 of that PR:
|
||||
#
|
||||
# "Token rotation silently breaks auto-sync. If AUTO_SYNC_TOKEN is
|
||||
# rotated without updating the repo secret, every push to main
|
||||
# fails red on the auto-sync push step. The workflow surfaces the
|
||||
# failure mode in the step summary (failure mode B in the header),
|
||||
# but there's no proactive monitoring."
|
||||
#
|
||||
# Detection latency under the status quo: rotation is only caught on
|
||||
# the next push to `main`. During quiet periods (no main push for
|
||||
# many hours) the staging-superset-of-main invariant silently breaks.
|
||||
#
|
||||
# This workflow closes the gap: every 6 hours, it fires the auth
|
||||
# surface that auto-sync depends on and emits a red workflow status
|
||||
# if AUTO_SYNC_TOKEN has drifted out of validity.
|
||||
#
|
||||
# ============================================================
|
||||
# What this checks (Option B — read-only verify)
|
||||
# ============================================================
|
||||
#
|
||||
# 1. `GET /api/v1/user` against Gitea with the token → validates the
|
||||
# token authenticates AND resolves to `devops-engineer` (catches
|
||||
# the case where the token was regenerated under a different
|
||||
# persona by mistake).
|
||||
# 2. `GET /api/v1/repos/molecule-ai/molecule-core` with the token →
|
||||
# validates the token has `read:repository` scope on this repo
|
||||
# (the v2 scope contract — see saved memory
|
||||
# `reference_persona_token_v2_scope`).
|
||||
# 3. `git push --dry-run` of the current staging SHA back to
|
||||
# `refs/heads/staging` via `https://oauth2:<token>@<gitea>/...`
|
||||
# → validates the EXACT HTTPS basic-auth path that
|
||||
# `actions/checkout` + `git push origin staging` use inside
|
||||
# auto-sync-main-to-staging.yml. NOP by construction (push the
|
||||
# current tip to itself = "Everything up-to-date"); auth is
|
||||
# checked at the smart-protocol handshake BEFORE the empty-diff
|
||||
# computation, so bad token → exit 128 with "Authentication
|
||||
# failed". `git ls-remote` is NOT used here because Gitea
|
||||
# falls back to anonymous read on public repos and would
|
||||
# silently green-light a rotated token.
|
||||
#
|
||||
# Each step exits non-zero with an actionable error message if it
|
||||
# fails. The workflow status itself is the operator-facing surface.
|
||||
#
|
||||
# ============================================================
|
||||
# What this does NOT check (intentional)
|
||||
# ============================================================
|
||||
#
|
||||
# - **Branch-protection authz** (failure mode C in auto-sync header):
|
||||
# would require an actual write to staging. Already monitored by
|
||||
# `branch-protection-drift.yml` daily. Don't duplicate.
|
||||
# - **Conflict resolution** (failure mode A): a real conflict is data-
|
||||
# driven, not auth-driven; can't synthesise it without polluting
|
||||
# staging. Already surfaces immediately on the next main push.
|
||||
# - **Concurrency** (failure mode D): handled by workflow concurrency
|
||||
# group on auto-sync, not a credential issue.
|
||||
#
|
||||
# ============================================================
|
||||
# Why Option B (read-only) and not the alternatives
|
||||
# ============================================================
|
||||
#
|
||||
# Considered + rejected (see issue #72 for full write-up):
|
||||
#
|
||||
# - **Option A — full auto-sync on schedule**: every run creates a
|
||||
# no-op merge commit on staging when main hasn't advanced. 4 noise
|
||||
# commits/day. And races the real `push:` trigger when main has
|
||||
# advanced. Rejected.
|
||||
#
|
||||
# - **Option C — push to dedicated `auto-sync-canary` branch**: would
|
||||
# exercise authz too, but adds branch noise on Gitea AND requires
|
||||
# maintaining a second branch protection (or expanding staging's
|
||||
# whitelist to a junk branch). Authz already covered by
|
||||
# `branch-protection-drift.yml`. Rejected.
|
||||
#
|
||||
# Prior art for the chosen Option B shape:
|
||||
# - Cloudflare's `/user/tokens/verify` endpoint (read-only auth
|
||||
# probe explicitly designed for credential canaries).
|
||||
# - AWS Secrets Manager rotation Lambda's `testSecret` step (auth
|
||||
# probe before promoting AWSPENDING → AWSCURRENT).
|
||||
# - HashiCorp Vault's `vault token lookup` for renewal canaries.
|
||||
#
|
||||
# ============================================================
|
||||
# Operator runbook — what to do when this workflow goes RED
|
||||
# ============================================================
|
||||
#
|
||||
# 1. **Identify which step failed**:
|
||||
# - Step "Verify token authenticates as devops-engineer" red →
|
||||
# token is invalid OR resolves to wrong persona.
|
||||
# - Step "Verify token has repo read scope" red → token valid but
|
||||
# stripped of `read:repository` scope (or repo perms changed).
|
||||
# - Step "Verify git HTTPS auth path via no-op dry-run push to
|
||||
# staging" red → token rotated/revoked OR Gitea git-HTTPS
|
||||
# surface is broken (rare). Auth check happens on the
|
||||
# smart-protocol handshake, separate from the API path.
|
||||
#
|
||||
# 2. **Re-issue the token** on the operator host:
|
||||
# ```
|
||||
# ssh root@5.78.80.188 'docker exec --user git molecule-gitea-1 \
|
||||
# gitea admin user generate-access-token \
|
||||
# --username devops-engineer \
|
||||
# --token-name persona-devops-engineer-vN \
|
||||
# --scopes "read:repository,write:repository,read:user,read:organization,read:issue,write:issue,read:notification,read:misc"'
|
||||
# ```
|
||||
# Update `/etc/molecule-bootstrap/agent-secrets.env` in place
|
||||
# (per `feedback_unified_credentials_file`). The previous token
|
||||
# file lands at `.bak.<date>`.
|
||||
#
|
||||
# 3. **Update the repo Actions secret** at:
|
||||
# Settings → Secrets and variables → Actions → AUTO_SYNC_TOKEN
|
||||
# Paste the new token. (Don't echo it in chat — but per
|
||||
# `feedback_passwords_in_chat_are_burned`, a paste in a 1:1
|
||||
# Claude session is within trust boundary.)
|
||||
#
|
||||
# 4. **Re-run this canary** via workflow_dispatch. Confirm GREEN.
|
||||
#
|
||||
# 5. **Backfill any missed main → staging syncs** by re-running
|
||||
# `auto-sync-main-to-staging.yml` from its workflow_dispatch
|
||||
# surface, OR by pushing an empty commit to main (if you'd
|
||||
# rather force a real trigger).
|
||||
#
|
||||
# ============================================================
|
||||
# Security notes
|
||||
# ============================================================
|
||||
#
|
||||
# - Token usage: read-only (`GET /api/v1/user`, `GET /api/v1/repos/...`,
|
||||
# `git ls-remote` + `git push --dry-run`). No write paths. Same blast-radius profile as
|
||||
# `actions/checkout` on a public repo.
|
||||
# - The token NEVER appears in logs: every `curl` uses a header
|
||||
# variable, never inline; the git remote URL builds the
|
||||
# `oauth2:$TOKEN@host` form into a single env var that's not
|
||||
# echoed. GitHub Actions secret-masking covers anything that does
|
||||
# slip through.
|
||||
# - No new token introduced — same `AUTO_SYNC_TOKEN` the workflow
|
||||
# under monitor uses. Per least-privilege we deliberately do NOT
|
||||
# broaden scope for the canary.
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# Every 6 hours at :17 (offsets the cron herd at :00). Justification
|
||||
# from issue #72: cheap to run (~5s wall-clock, no quota), 3h average
|
||||
# detection latency, 6h max. 1h would be 6× the runs for marginal
|
||||
# benefit; daily would be 4× longer latency and worse than status
|
||||
# quo on a quiet-main day.
|
||||
- cron: '17 */6 * * *'
|
||||
workflow_dispatch:
|
||||
|
||||
# No concurrency group needed — the canary is read-only and idempotent.
|
||||
# Two parallel runs (e.g. operator dispatch during a scheduled tick) are
|
||||
# harmless: same result, doubled HTTPS calls, no shared state.
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
verify-token:
|
||||
name: Verify AUTO_SYNC_TOKEN validity
|
||||
runs-on: ubuntu-latest
|
||||
# 2 min surfaces hangs (Gitea API stall, DNS issue) within one
|
||||
# cron interval. Realistic worst case is ~10s: 2 curls + 1 git
|
||||
# ls-remote + dry-run push, each capped by the explicit timeouts below.
|
||||
timeout-minutes: 2
|
||||
|
||||
env:
|
||||
# Pinned in env so individual steps can read it without
|
||||
# repeating the secret reference. GitHub masks the value in
|
||||
# logs automatically.
|
||||
AUTO_SYNC_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
|
||||
# MUST stay in sync with auto-sync-main-to-staging.yml's
|
||||
# `git config user.name "devops-engineer"` line. Renaming the
|
||||
# devops-engineer persona requires updating both files (and
|
||||
# the staging branch protection's `push_whitelist_usernames`).
|
||||
EXPECTED_PERSONA: devops-engineer
|
||||
GITEA_HOST: git.moleculesai.app
|
||||
REPO_PATH: molecule-ai/molecule-core
|
||||
|
||||
steps:
|
||||
- name: Verify AUTO_SYNC_TOKEN secret is configured
|
||||
# Schedule-vs-dispatch behaviour split, per
|
||||
# `feedback_schedule_vs_dispatch_secrets_hardening`:
|
||||
#
|
||||
# - schedule: hard-fail when the secret is missing. The
|
||||
# whole point of the canary is to surface drift; soft-
|
||||
# skipping on missing-secret would make the canary
|
||||
# itself drift-invisible (sweep-cf-orphans #2088 lesson).
|
||||
# - workflow_dispatch: hard-fail too — there's no scenario
|
||||
# where an operator wants this canary to silently no-op.
|
||||
# The workflow has no other ad-hoc utility; if you ran
|
||||
# it, you wanted the answer.
|
||||
run: |
|
||||
if [ -z "${AUTO_SYNC_TOKEN}" ]; then
|
||||
echo "::error::AUTO_SYNC_TOKEN secret is not set on this repo." >&2
|
||||
echo "::error::Set it at Settings → Secrets and variables → Actions." >&2
|
||||
echo "::error::Without it, auto-sync-main-to-staging.yml will fail every push to main." >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "AUTO_SYNC_TOKEN is configured (value masked)."
|
||||
|
||||
- name: Verify token authenticates as ${{ env.EXPECTED_PERSONA }}
|
||||
# Calls Gitea's `/api/v1/user` — the canonical
|
||||
# auth-probe-with-no-side-effects endpoint (mirrors
|
||||
# Cloudflare's /user/tokens/verify).
|
||||
#
|
||||
# Failure surfaces:
|
||||
# - HTTP 401: token invalid (rotated, revoked, or never
|
||||
# correctly registered).
|
||||
# - HTTP 200 but username != devops-engineer: token was
|
||||
# regenerated under the wrong persona — this would let
|
||||
# auth pass but commit attribution would be wrong, and
|
||||
# branch-protection authz would fail because only
|
||||
# `devops-engineer` is whitelisted.
|
||||
run: |
|
||||
set -euo pipefail
|
||||
response_file="$(mktemp)"
|
||||
code_file="$(mktemp)"
|
||||
# `--max-time 30`: full call ceiling. `--connect-timeout 10`:
|
||||
# DNS + TCP. `-w "%{http_code}"` routed to a tempfile so curl's
|
||||
# exit code can't pollute the captured status — see
|
||||
# feedback_curl_status_capture_pollution + the
|
||||
# `lint-curl-status-capture.yml` gate that rejects the unsafe
|
||||
# `$(curl ... || echo "000")` shape.
|
||||
set +e
|
||||
curl -sS -o "$response_file" \
|
||||
--max-time 30 --connect-timeout 10 \
|
||||
-w "%{http_code}" \
|
||||
-H "Authorization: token ${AUTO_SYNC_TOKEN}" \
|
||||
-H "Accept: application/json" \
|
||||
"https://${GITEA_HOST}/api/v1/user" >"$code_file" 2>/dev/null
|
||||
set -e
|
||||
status=$(cat "$code_file" 2>/dev/null || true)
|
||||
[ -z "$status" ] && status="000"
|
||||
|
||||
if [ "$status" != "200" ]; then
|
||||
echo "::error::Token rotation suspected: GET /api/v1/user returned HTTP $status (expected 200)." >&2
|
||||
echo "::error::Likely cause: AUTO_SYNC_TOKEN has been rotated/revoked on Gitea but the repo Actions secret was not updated." >&2
|
||||
echo "::error::Runbook: see header comment of this workflow file." >&2
|
||||
# Print response body but redact anything that looks like a token.
|
||||
sed -E 's/[A-Fa-f0-9]{32,}/<redacted>/g' "$response_file" >&2 || true
|
||||
exit 1
|
||||
fi
|
||||
|
||||
username=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('login',''))" "$response_file")
|
||||
if [ "$username" != "${EXPECTED_PERSONA}" ]; then
|
||||
echo "::error::Token resolves to user '$username', expected '${EXPECTED_PERSONA}'." >&2
|
||||
echo "::error::AUTO_SYNC_TOKEN must be the devops-engineer persona PAT (not founder PAT, not another persona)." >&2
|
||||
echo "::error::Auto-sync push will fail because only 'devops-engineer' is whitelisted on staging branch protection." >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "Token authenticates as: $username ✓"
|
||||
|
||||
- name: Verify token has repo read scope
|
||||
# `GET /api/v1/repos/<owner>/<repo>` requires `read:repository`
|
||||
# on the persona's v2 scope contract. If the scope was
|
||||
# narrowed/dropped on rotation we catch it here, before the
|
||||
# next main push reveals it via a checkout failure.
|
||||
run: |
|
||||
set -euo pipefail
|
||||
response_file="$(mktemp)"
|
||||
code_file="$(mktemp)"
|
||||
# See first probe step for the rationale on the tempfile-routed
|
||||
# `-w "%{http_code}"` pattern — the unsafe `|| echo "000"` shape
|
||||
# is rejected by lint-curl-status-capture.yml.
|
||||
set +e
|
||||
curl -sS -o "$response_file" \
|
||||
--max-time 30 --connect-timeout 10 \
|
||||
-w "%{http_code}" \
|
||||
-H "Authorization: token ${AUTO_SYNC_TOKEN}" \
|
||||
-H "Accept: application/json" \
|
||||
"https://${GITEA_HOST}/api/v1/repos/${REPO_PATH}" >"$code_file" 2>/dev/null
|
||||
set -e
|
||||
status=$(cat "$code_file" 2>/dev/null || true)
|
||||
[ -z "$status" ] && status="000"
|
||||
|
||||
if [ "$status" != "200" ]; then
|
||||
echo "::error::Token lacks read:repository scope on ${REPO_PATH}: HTTP $status." >&2
|
||||
echo "::error::Auto-sync's actions/checkout step will fail with this token." >&2
|
||||
echo "::error::Re-issue with v2 scope contract: read:repository,write:repository,read:user,read:organization,read:issue,write:issue,read:notification,read:misc" >&2
|
||||
sed -E 's/[A-Fa-f0-9]{32,}/<redacted>/g' "$response_file" >&2 || true
|
||||
exit 1
|
||||
fi
|
||||
echo "Token has read:repository on ${REPO_PATH} ✓"
|
||||
|
||||
- name: Verify git HTTPS auth path via no-op dry-run push to staging
|
||||
# Final probe: exercise the EXACT auth path that
|
||||
# `actions/checkout` + `git push origin staging` use in
|
||||
# auto-sync-main-to-staging.yml. Gitea's API and git-HTTPS
|
||||
# surfaces share the token-lookup code path internally but
|
||||
# the wire-level error shapes differ — historically (#173)
|
||||
# the API path was healthy while git-HTTPS rejected, so
|
||||
# checking only the API would have given false-green.
|
||||
#
|
||||
# IMPORTANT: `git ls-remote` on a public repo (which
|
||||
# molecule-core is) succeeds even with a junk token because
|
||||
# Gitea falls back to anonymous-read. `ls-remote` therefore
|
||||
# CANNOT validate auth on this surface. We use
|
||||
# `git push --dry-run` instead — push is auth-gated even on
|
||||
# public repos.
|
||||
#
|
||||
# NOP shape: read the current staging SHA via authenticated
|
||||
# ls-remote (the SHA itself is public; auth is incidental
|
||||
# here, used only to colocate the discovery in one step), then
|
||||
# `git push --dry-run <SHA>:refs/heads/staging`. Pushing the
|
||||
# current tip back to itself is "Everything up-to-date" with
|
||||
# exit 0 when auth succeeds. With a bad token Gitea returns
|
||||
# HTTP 401 in the smart-protocol handshake and git exits 128
|
||||
# with "Authentication failed".
|
||||
#
|
||||
# The dry-run never reaches Gitea's pre-receive hook (which
|
||||
# is where branch-protection authz runs), so this probe does
|
||||
# not validate failure mode C. That's intentional —
|
||||
# branch-protection-drift.yml owns authz monitoring; this
|
||||
# canary owns auth.
|
||||
env:
|
||||
# Don't hang waiting for a password prompt if auth fails on a
|
||||
# terminal-attached run. (In Actions there's no terminal,
|
||||
# but the env-var hardens against an interactive runner
|
||||
# config.)
|
||||
GIT_TERMINAL_PROMPT: "0"
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# Token is in $AUTO_SYNC_TOKEN (job-level env). Compose the
|
||||
# URL as a local var that's never echoed.
|
||||
url="https://oauth2:${AUTO_SYNC_TOKEN}@${GITEA_HOST}/${REPO_PATH}"
|
||||
|
||||
# Step a: read current staging SHA. ~1KB; auth-gated only
|
||||
# on private repos but always works on public — used here
|
||||
# only to discover the SHA, not to validate auth.
|
||||
staging_ref=$(timeout 30s git ls-remote --refs "$url" refs/heads/staging 2>&1) || {
|
||||
redacted=$(echo "$staging_ref" | sed -E "s|oauth2:[^@]+@|oauth2:<redacted>@|g")
|
||||
echo "::error::ls-remote against staging failed (network/DNS issue):" >&2
|
||||
echo "$redacted" >&2
|
||||
exit 1
|
||||
}
|
||||
if ! echo "$staging_ref" | grep -qE '^[0-9a-f]{40}[[:space:]]+refs/heads/staging$'; then
|
||||
echo "::error::ls-remote returned unexpected shape:" >&2
|
||||
echo "$staging_ref" | sed -E "s|oauth2:[^@]+@|oauth2:<redacted>@|g" >&2
|
||||
exit 1
|
||||
fi
|
||||
staging_sha=$(echo "$staging_ref" | awk '{print $1}')
|
||||
|
||||
# Step b: spin up an ephemeral local repo. `git push` always
|
||||
# requires a local repo even when pushing a remote SHA that
|
||||
# isn't in the local object DB (the protocol negotiates and
|
||||
# discovers we don't need to send any objects). We don't use
|
||||
# `actions/checkout` for this — it would clone the whole
|
||||
# repo (~hundreds of MB) for what's essentially `git init`.
|
||||
tmp_repo="$(mktemp -d)"
|
||||
trap 'rm -rf "$tmp_repo"' EXIT
|
||||
git -C "$tmp_repo" init -q
|
||||
# Author identity is set defensively (git only needs it when creating
|
||||
# commits); values are arbitrary because nothing gets committed here.
|
||||
git -C "$tmp_repo" config user.email canary@auto-sync.local
|
||||
git -C "$tmp_repo" config user.name auto-sync-canary
|
||||
|
||||
# Step c: dry-run push the current staging SHA back to
|
||||
# staging. NOP by construction — the remote tip equals the
|
||||
# SHA we're pushing, so "Everything up-to-date" is the
|
||||
# success path.
|
||||
#
|
||||
# Authentication is checked at the smart-protocol handshake,
|
||||
# BEFORE the dry-run can compute an empty diff. Bad token
|
||||
# → "Authentication failed", exit 128. Good token → exit 0.
|
||||
set +e
|
||||
push_out=$(timeout 30s git -C "$tmp_repo" push --dry-run "$url" "${staging_sha}:refs/heads/staging" 2>&1)
|
||||
push_rc=$?
|
||||
set -e
|
||||
|
||||
if [ "$push_rc" -ne 0 ]; then
|
||||
redacted=$(echo "$push_out" | sed -E "s|oauth2:[^@]+@|oauth2:<redacted>@|g")
|
||||
echo "::error::Token rotation suspected: git push --dry-run against staging failed via the AUTO_SYNC_TOKEN HTTPS auth path (exit $push_rc)." >&2
|
||||
echo "::error::This is the EXACT auth path that actions/checkout + git push use in auto-sync-main-to-staging.yml." >&2
|
||||
echo "::error::Likely cause: AUTO_SYNC_TOKEN was rotated/revoked on Gitea but the repo Actions secret was not updated. Runbook: see header." >&2
|
||||
echo "$redacted" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "git HTTPS auth path: NOP push --dry-run to staging → ${staging_sha:0:8} ✓"
|
||||
|
||||
- name: Summarise canary result
|
||||
# Everything passed — surface a green summary. (Failures
|
||||
# already wrote ::error:: lines and exited above; if we got
|
||||
# here, all three probes passed.)
|
||||
run: |
|
||||
{
|
||||
echo "## Auto-sync canary: GREEN"
|
||||
echo ""
|
||||
echo "AUTO_SYNC_TOKEN is healthy:"
|
||||
echo "- Authenticates as \`${EXPECTED_PERSONA}\` ✓"
|
||||
echo "- Has \`read:repository\` scope on \`${REPO_PATH}\` ✓"
|
||||
echo "- Git HTTPS auth path: no-op dry-run push to \`refs/heads/staging\` succeeds ✓"
|
||||
echo ""
|
||||
echo "Auto-sync main → staging will succeed on the next push to main."
|
||||
echo "If this canary ever goes RED, see the runbook in this workflow's header."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
255
.github/workflows/auto-sync-main-to-staging.yml
vendored
Normal file
255
.github/workflows/auto-sync-main-to-staging.yml
vendored
Normal file
@ -0,0 +1,255 @@
|
||||
name: Auto-sync main → staging
|
||||
|
||||
# Reflects every push to `main` back onto `staging` so the
|
||||
# staging-as-superset-of-main invariant holds.
|
||||
#
|
||||
# ============================================================
|
||||
# What this workflow does
|
||||
# ============================================================
|
||||
#
|
||||
# On every push to `main`:
|
||||
# 1. Checks if staging already contains main → no-op.
|
||||
# 2. Fetches both branches, merges main into staging in the
|
||||
# runner workspace (fast-forward if possible, else
|
||||
# `--no-ff` merge commit).
|
||||
# 3. Pushes staging directly to origin via the
|
||||
# `devops-engineer` persona's `AUTO_SYNC_TOKEN`.
|
||||
#
|
||||
# Authoritative path: a single `git push origin staging` from
|
||||
# inside this workflow is the SSOT for advancing staging after
|
||||
# a main push. No PR, no merge queue, no human approval —
|
||||
# staging is mechanically maintained as a superset of main.
|
||||
#
|
||||
# `auto-promote-staging.yml` is the reverse-direction
|
||||
# counterpart (staging → main, gated on green CI). Together
|
||||
# they keep the staging-superset-of-main invariant tight.
|
||||
#
|
||||
# ============================================================
|
||||
# Why direct push (and not "open a PR")
|
||||
# ============================================================
|
||||
#
|
||||
# Pre-2026-05-06 the canonical SCM was GitHub.com, where:
|
||||
# - The `staging` branch had a `merge_queue` ruleset that
|
||||
# blocked ALL direct pushes (no bypass even for org
|
||||
# admins or the GitHub Actions integration).
|
||||
# - Therefore this workflow opened a PR via `gh pr create`
|
||||
# and let auto-merge land it through the queue.
|
||||
#
|
||||
# Post-2026-05-06 the canonical SCM is Gitea
|
||||
# (`git.moleculesai.app/molecule-ai/molecule-core`). Gitea:
|
||||
# - Has no `merge_queue` concept.
|
||||
# - Allows direct push to protected branches via per-user
|
||||
# `push_whitelist_usernames` on the branch protection.
|
||||
# - Does not expose a GraphQL endpoint, so `gh pr create`
|
||||
# returns `HTTP 405 Method Not Allowed
|
||||
# (https://git.moleculesai.app/api/graphql)` — the
|
||||
# pre-suspension architecture cannot work on Gitea.
|
||||
#
|
||||
# The molecule-ai/molecule-core staging branch protection
|
||||
# (verified via `GET /api/v1/repos/.../branch_protections`)
|
||||
# whitelists `devops-engineer` for direct push. So the
|
||||
# correct Gitea-shape architecture is: authenticate as
|
||||
# `devops-engineer`, merge locally, push staging directly.
|
||||
#
|
||||
# This is structurally simpler than the GitHub-era PR dance
|
||||
# and removes the dependence on `gh` CLI / GraphQL entirely.
|
||||
#
|
||||
# ============================================================
|
||||
# Identity + token (anti-bot-ring per saved-memory
|
||||
# `feedback_per_agent_gitea_identity_default`)
|
||||
# ============================================================
|
||||
#
|
||||
# This workflow uses `secrets.AUTO_SYNC_TOKEN`, which is a
|
||||
# personal access token issued to the `devops-engineer`
|
||||
# persona on Gitea — NOT the founder PAT. The bot-ring
|
||||
# fingerprint that triggered the GitHub org suspension on
|
||||
# 2026-05-06 was characterised by founder PAT acting as CI
|
||||
# at machine speed; per-persona identities split the
|
||||
# attribution honestly.
|
||||
#
|
||||
# Token scope on Gitea: repo write. Push target restricted
|
||||
# to `staging` (this workflow is the only writer; main is
|
||||
# untouched). Compromise blast radius: bounded to staging
|
||||
# branch + this repo's read surface.
|
||||
#
|
||||
# Commits are authored by the persona email
|
||||
# `devops-engineer@agents.moleculesai.app` so commit history
|
||||
# reflects which automation produced the merge.
|
||||
#
|
||||
# ============================================================
|
||||
# Failure modes & operational notes
|
||||
# ============================================================
|
||||
#
|
||||
# A — staging has commits main doesn't, and the merge
|
||||
# conflicts:
|
||||
# - The `--no-ff` merge step exits non-zero. Workflow
|
||||
# fails red. Operator (devops-engineer or human)
|
||||
# resolves manually:
|
||||
# git fetch origin
|
||||
# git checkout staging
|
||||
# git merge --no-ff origin/main
|
||||
# # resolve conflicts
|
||||
# git push origin staging
|
||||
# - Step summary surfaces the conflict so the failed run
|
||||
# is self-explanatory.
|
||||
#
|
||||
# B — `AUTO_SYNC_TOKEN` rotated / wrong scope:
|
||||
# - `git push` step exits non-zero with `HTTP 401` /
|
||||
# `403`. Step summary surfaces the failed push.
|
||||
# - Re-issue the token from `~/.molecule-ai/personas/`
|
||||
# on the operator host and update the repo Actions
|
||||
# secret. Re-run the workflow.
|
||||
#
|
||||
# C — staging branch protection no longer whitelists
|
||||
# `devops-engineer`:
|
||||
# - `git push` exits non-zero with a Gitea protected-
|
||||
# branch rejection. Step summary surfaces it.
|
||||
# - Re-add `devops-engineer` to
|
||||
# `push_whitelist_usernames` on the staging
|
||||
# protection (Settings → Branches → staging).
|
||||
#
|
||||
# D — concurrent push to main while a sync is in flight:
|
||||
# - The `concurrency` group below serialises runs.
|
||||
# The second waits for the first; if main advances
|
||||
# again while we're syncing, the second run picks
|
||||
# up the new tip on its own fetch.
|
||||
#
|
||||
# ============================================================
|
||||
# Loop safety
|
||||
# ============================================================
|
||||
#
|
||||
# The push to staging from this workflow does NOT itself
|
||||
# fire a `push: branches: [main]` event (different branch),
|
||||
# so there's no risk of self-recursion. `auto-promote-staging.yml`
|
||||
# fires on `workflow_run` of CI etc. — it sees the new
|
||||
# staging tip on its next gate-completion event, NOT on this
|
||||
# push directly. No loop.
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
# workflow_dispatch lets operators manually backfill a
|
||||
# missed sync (e.g. if AUTO_SYNC_TOKEN was rotated and a
|
||||
# main push slipped through while the secret was stale).
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
concurrency:
|
||||
group: auto-sync-main-to-staging
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
sync-staging:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout staging (with devops-engineer push token)
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: staging
|
||||
# AUTO_SYNC_TOKEN authenticates as the
|
||||
# `devops-engineer` Gitea persona — the only
|
||||
# identity whitelisted for direct push to
|
||||
# staging. See header comment for context.
|
||||
token: ${{ secrets.AUTO_SYNC_TOKEN }}
|
||||
|
||||
- name: Configure git author
|
||||
run: |
|
||||
# Per-persona identity, NOT founder PAT.
|
||||
# `feedback_per_agent_gitea_identity_default`.
|
||||
git config user.name "devops-engineer"
|
||||
git config user.email "devops-engineer@agents.moleculesai.app"
|
||||
|
||||
- name: Check if staging already contains main
|
||||
id: check
|
||||
run: |
|
||||
set -euo pipefail
|
||||
git fetch origin main
|
||||
if git merge-base --is-ancestor origin/main HEAD; then
|
||||
echo "needs_sync=false" >> "$GITHUB_OUTPUT"
|
||||
{
|
||||
echo "## No-op"
|
||||
echo
|
||||
echo "staging already contains \`origin/main\` ($(git rev-parse --short=8 origin/main))."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
else
|
||||
echo "needs_sync=true" >> "$GITHUB_OUTPUT"
|
||||
MAIN_SHORT=$(git rev-parse --short=8 origin/main)
|
||||
echo "main_short=${MAIN_SHORT}" >> "$GITHUB_OUTPUT"
|
||||
echo "::notice::staging is missing main's tip (${MAIN_SHORT}) — merging in-runner and pushing"
|
||||
fi
|
||||
|
||||
- name: Merge main into staging (in-runner)
|
||||
if: steps.check.outputs.needs_sync == 'true'
|
||||
id: merge
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# Already on staging from checkout. Try fast-forward
|
||||
# first (cleanest history); fall back to merge commit
|
||||
# if staging has commits main doesn't.
|
||||
if git merge --ff-only origin/main; then
|
||||
echo "did_ff=true" >> "$GITHUB_OUTPUT"
|
||||
echo "::notice::Fast-forwarded staging to origin/main"
|
||||
else
|
||||
echo "did_ff=false" >> "$GITHUB_OUTPUT"
|
||||
if ! git merge --no-ff origin/main \
|
||||
-m "chore: sync main → staging (auto, ${{ steps.check.outputs.main_short }})"; then
|
||||
# Hygiene: leave the work tree clean before failing.
|
||||
git merge --abort || true
|
||||
{
|
||||
echo "## Conflict"
|
||||
echo
|
||||
echo "Auto-merge \`main → staging\` failed with conflicts."
|
||||
echo "A human (or devops-engineer persona) needs to resolve manually:"
|
||||
echo
|
||||
echo '```'
|
||||
echo "git fetch origin"
|
||||
echo "git checkout staging"
|
||||
echo "git merge --no-ff origin/main"
|
||||
echo "# resolve conflicts"
|
||||
echo "git push origin staging"
|
||||
echo '```'
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
- name: Push staging to origin
|
||||
if: steps.check.outputs.needs_sync == 'true'
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# Direct push to staging. devops-engineer persona is
|
||||
# whitelisted for direct push on the staging branch
|
||||
# protection (Settings → Branches → staging).
|
||||
#
|
||||
# No --force / --force-with-lease: a fast-forward or
|
||||
# legitimate merge commit on top of current staging
|
||||
# is the only thing we'd ever push. If origin/staging
|
||||
# advanced under us (concurrent merge), the push
|
||||
# legitimately rejects and the next run picks up the
|
||||
# new state.
|
||||
if ! git push origin staging; then
|
||||
{
|
||||
echo "## Push rejected"
|
||||
echo
|
||||
echo "Direct push to \`staging\` failed. Likely causes:"
|
||||
echo "- \`AUTO_SYNC_TOKEN\` rotated / wrong scope (HTTP 401/403)"
|
||||
echo "- \`devops-engineer\` no longer in"
|
||||
echo " \`push_whitelist_usernames\` on the staging"
|
||||
echo " branch protection (HTTP 422)"
|
||||
echo "- staging advanced concurrently — re-running this"
|
||||
echo " workflow on the new main tip will pick it up"
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
{
|
||||
echo "## Auto-sync succeeded"
|
||||
echo
|
||||
echo "- staging advanced to: \`$(git rev-parse --short=8 HEAD)\`"
|
||||
echo "- main tip: \`${{ steps.check.outputs.main_short }}\`"
|
||||
echo "- Strategy: $([ "${{ steps.merge.outputs.did_ff }}" = "true" ] && echo "fast-forward" || echo "merge commit")"
|
||||
echo "- Pushed by: \`devops-engineer\` (per-agent persona, anti-bot-ring)"
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
82
.github/workflows/canary-staging.yml
vendored
82
.github/workflows/canary-staging.yml
vendored
@ -20,19 +20,6 @@ on:
|
||||
# a few minutes under load — that's fine for a canary.
|
||||
- cron: '*/30 * * * *'
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
keep_on_failure:
|
||||
description: >-
|
||||
Skip teardown when the canary fails (debugging only). The
|
||||
tenant org + EC2 + CF tunnel + DNS stay alive so an operator
|
||||
can SSM into the workspace EC2 and capture docker logs of the
|
||||
failing claude-code container. REMEMBER to manually delete
|
||||
via DELETE /cp/admin/tenants/<slug> when done so the org
|
||||
doesn't accumulate cost. Only honored on workflow_dispatch;
|
||||
cron runs always tear down (we don't want unattended cron
|
||||
to leak resources).
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
# Serialise with the full-SaaS workflow so they don't contend for the
|
||||
# same org-create quota on staging. Different group key from
|
||||
@ -93,14 +80,6 @@ jobs:
|
||||
# is "Token Plan only" but cheap-per-token and fast.
|
||||
E2E_MODEL_SLUG: MiniMax-M2.7-highspeed
|
||||
E2E_RUN_ID: "canary-${{ github.run_id }}"
|
||||
# Debug-only: when an operator dispatches with keep_on_failure=true,
|
||||
# the canary script's E2E_KEEP_ORG=1 path skips teardown so the
|
||||
# tenant org + EC2 stay alive for SSM-based log capture. Cron runs
|
||||
# never set this (the input only exists on workflow_dispatch) so
|
||||
# unattended cron always tears down. See molecule-core#129
|
||||
# failure mode #1 — capturing the actual exception requires
|
||||
# docker logs from the live container.
|
||||
E2E_KEEP_ORG: ${{ github.event.inputs.keep_on_failure == 'true' && '1' || '0' }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
@ -158,28 +137,27 @@ jobs:
|
||||
id: canary
|
||||
run: bash tests/e2e/test_staging_full_saas.sh
|
||||
|
||||
# Alerting: open a sticky issue on the FIRST failure; comment on
|
||||
# subsequent failures; auto-close on next green. Comment-on-existing
|
||||
# de-duplicates so a single open issue accumulates the streak —
|
||||
# ops sees one issue with N comments rather than N issues.
|
||||
# Alerting: open an issue only after THREE consecutive failures so
|
||||
# transient flakes (Cloudflare DNS hiccup, AWS API blip) don't spam
|
||||
# the issue list. If an issue is already open, we still comment on
|
||||
# every failure so ops sees the streak. Auto-close on next green.
|
||||
#
|
||||
# Why no consecutive-failures threshold (e.g., wait 3 runs before
|
||||
# filing): the prior threshold check used
|
||||
# `github.rest.actions.listWorkflowRuns()` which Gitea 1.22.6 does
|
||||
# not expose (returns 404). On Gitea Actions the threshold call
|
||||
# ALWAYS failed, breaking the entire alerting step and going days
|
||||
# silent on real regressions (38h+ chronic red on 2026-05-07/08
|
||||
# before this fix; tracked in molecule-core#129). Filing on first
|
||||
# failure is also better UX — we want to know about the first red,
|
||||
# not wait 90 min for it to "count." Real flakes get one issue +
|
||||
# a quick close-on-green; persistent reds accumulate comments.
|
||||
# Threshold rationale: canary fires every 30 min, so 3 failures =
|
||||
# ~90 min of consecutive red — well past any single-run flake but
|
||||
# still tight enough that a real outage gets surfaced before the
|
||||
# next deploy window.
|
||||
- name: Open issue on failure
|
||||
if: failure()
|
||||
uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
|
||||
env:
|
||||
# Inject the workflow path explicitly — context.workflow is
|
||||
# the *name*, not the file path the actions API needs.
|
||||
WORKFLOW_PATH: '.github/workflows/canary-staging.yml'
|
||||
CONSECUTIVE_THRESHOLD: '3'
|
||||
with:
|
||||
script: |
|
||||
const title = '🔴 Canary failing: staging SaaS smoke';
|
||||
const runURL = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
|
||||
const runURL = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
|
||||
|
||||
// Find an existing open canary issue (stable title match).
|
||||
// If one exists, this isn't a "first failure" — comment and exit.
|
||||
@ -199,12 +177,32 @@ jobs:
|
||||
return;
|
||||
}
|
||||
|
||||
// No open issue yet — file one on this first failure. The
|
||||
// comment-on-existing branch above means subsequent failures
|
||||
// accumulate as comments on this same issue, so we don't
|
||||
// spam new issues per run.
|
||||
// No open issue yet — check the last N-1 runs' conclusions.
|
||||
// We open the issue only if the last (THRESHOLD-1) runs ALSO
|
||||
// failed (so this is the 3rd consecutive red).
|
||||
const threshold = parseInt(process.env.CONSECUTIVE_THRESHOLD, 10);
|
||||
const { data: runs } = await github.rest.actions.listWorkflowRuns({
|
||||
owner: context.repo.owner, repo: context.repo.repo,
|
||||
workflow_id: process.env.WORKFLOW_PATH,
|
||||
status: 'completed',
|
||||
per_page: threshold,
|
||||
// The status: 'completed' filter above excludes the current in-progress run.
|
||||
});
|
||||
// listWorkflowRuns returns recent first. We need (threshold-1)
|
||||
// prior failures (current run is the threshold-th).
|
||||
const priorFailures = (runs.workflow_runs || [])
|
||||
.slice(0, threshold - 1)
|
||||
.filter(r => r.id !== context.runId)
|
||||
.filter(r => r.conclusion === 'failure')
|
||||
.length;
|
||||
if (priorFailures < threshold - 1) {
|
||||
core.info(`Below threshold: ${priorFailures + 1}/${threshold} consecutive failures — not filing yet`);
|
||||
return;
|
||||
}
|
||||
|
||||
const body =
|
||||
`Canary run failed at ${new Date().toISOString()}.\n\n` +
|
||||
`Canary run failed at ${new Date().toISOString()}, ` +
|
||||
`${threshold} consecutive runs red.\n\n` +
|
||||
`Run: ${runURL}\n\n` +
|
||||
`This issue auto-closes on the next green canary run. ` +
|
||||
`Consecutive failures add a comment here rather than a new issue.`;
|
||||
@ -213,7 +211,7 @@ jobs:
|
||||
title, body,
|
||||
labels: ['canary-staging', 'bug'],
|
||||
});
|
||||
core.info('Opened canary failure issue (first red)');
|
||||
core.info(`Opened canary failure issue (${threshold} consecutive reds)`);
|
||||
|
||||
- name: Auto-close canary issue on success
|
||||
if: success()
|
||||
|
||||
2
.github/workflows/canary-verify.yml
vendored
2
.github/workflows/canary-verify.yml
vendored
@ -108,7 +108,7 @@ jobs:
|
||||
echo
|
||||
echo "One or more canary secrets are unset (\`CANARY_TENANT_URLS\`, \`CANARY_ADMIN_TOKENS\`, \`CANARY_CP_SHARED_SECRET\`)."
|
||||
echo "Phase 2 canary fleet has not been stood up yet —"
|
||||
echo "see [canary-tenants.md](https://git.moleculesai.app/molecule-ai/molecule-controlplane/blob/main/docs/canary-tenants.md)."
|
||||
echo "see [canary-tenants.md](https://github.com/molecule-ai/molecule-controlplane/blob/main/docs/canary-tenants.md)."
|
||||
echo
|
||||
echo "**Skipped — promote-to-latest will NOT auto-fire.** Dispatch \`promote-latest.yml\` manually when ready."
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
1
.github/workflows/ci.yml
vendored
1
.github/workflows/ci.yml
vendored
@ -218,6 +218,7 @@ jobs:
|
||||
uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
|
||||
with:
|
||||
node-version: '22'
|
||||
token: ${{ secrets.GITEA_TOKEN }}
|
||||
- if: needs.changes.outputs.canvas == 'true'
|
||||
run: rm -f package-lock.json && npm install
|
||||
- if: needs.changes.outputs.canvas == 'true'
|
||||
|
||||
130
.github/workflows/e2e-api.yml
vendored
130
.github/workflows/e2e-api.yml
vendored
@ -12,59 +12,6 @@ name: E2E API Smoke Test
|
||||
# spending CI cycles. See the in-job comment on the `e2e-api` job for
|
||||
# why this is one job (not two-jobs-sharing-name) and the 2026-04-29
|
||||
# PR #2264 incident that drove the consolidation.
|
||||
#
|
||||
# Parallel-safety (Class B Hongming-owned CICD red sweep, 2026-05-08)
|
||||
# -------------------------------------------------------------------
|
||||
# Same substrate hazard as PR #98 (handlers-postgres-integration). Our
|
||||
# Gitea act_runner runs with `container.network: host` (operator host
|
||||
# `/opt/molecule/runners/config.yaml`), which means:
|
||||
#
|
||||
# * Two concurrent runs both try to bind their `-p 15432:5432` /
|
||||
# `-p 16379:6379` host ports — the second postgres/redis FATALs
|
||||
# with `Address in use` and `docker run` returns exit 125 with
|
||||
# `Conflict. The container name "/molecule-ci-postgres" is already
|
||||
# in use by container ...`. Verified in run a7/2727 on 2026-05-07.
|
||||
# * The fixed container names `molecule-ci-postgres` / `-redis` (the
|
||||
# pre-fix shape) collide on name AS WELL AS port. The cleanup-with-
|
||||
# `docker rm -f` at the start of the second job KILLS the first
|
||||
# job's still-running postgres/redis.
|
||||
#
|
||||
# Fix shape (mirrors PR #98's bridge-net pattern, adapted because
|
||||
# platform-server is a Go binary on the host, not a containerised
|
||||
# step):
|
||||
#
|
||||
# 1. Unique container names per run:
|
||||
# pg-e2e-api-${RUN_ID}-${RUN_ATTEMPT}
|
||||
# redis-e2e-api-${RUN_ID}-${RUN_ATTEMPT}
|
||||
# `${RUN_ID}-${RUN_ATTEMPT}` is unique even across reruns of the
|
||||
# same run_id.
|
||||
# 2. Ephemeral host port per run (`-p 0:5432`), then read the actual
|
||||
# bound port via `docker port` and export DATABASE_URL/REDIS_URL
|
||||
# pointing at it. No fixed host-port → no port collision.
|
||||
# 3. `127.0.0.1` (NOT `localhost`) in URLs — IPv6 first-resolve was
|
||||
# the original flake fixed in #92 and the script's still IPv6-
|
||||
# enabled.
|
||||
# 4. `if: always()` cleanup so containers don't leak when test steps
|
||||
# fail.
|
||||
#
|
||||
# Issue #94 items #2 + #3 (also fixed here):
|
||||
# * Pre-pull `alpine:latest` so the platform-server's provisioner
|
||||
# (`internal/handlers/container_files.go`) can stand up its
|
||||
# ephemeral token-write helper without a docker.io registry round-trip.
|
||||
# * Create `molecule-monorepo-net` bridge network if missing so the
|
||||
# provisioner's container.HostConfig {NetworkMode: ...} attach
|
||||
# succeeds.
|
||||
# Item #1 (timeouts) — evidence on recent runs (77/3191, ae/4270, 0e/
|
||||
# 2318) shows Postgres ready in 3s, Redis in 1s, Platform in 1s when
|
||||
# they DO come up. Timeouts are not the bottleneck; not bumped.
|
||||
#
|
||||
# Item explicitly NOT fixed here: failing test `Status back online`
|
||||
# fails because the platform's langgraph workspace template image
|
||||
# (ghcr.io/molecule-ai/workspace-template-langgraph:latest) returns
|
||||
# 403 Forbidden post-2026-05-06 GitHub org suspension. That is a
|
||||
# template-registry resolution issue (ADR-002 / local-build mode) and
|
||||
# belongs in a separate change that touches workspace-server, not
|
||||
# this workflow file.
|
||||
|
||||
on:
|
||||
push:
|
||||
@ -131,14 +78,11 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 15
|
||||
env:
|
||||
# Unique per-run container names so concurrent runs on the host-
|
||||
# network act_runner don't collide on name OR port.
|
||||
# `${RUN_ID}-${RUN_ATTEMPT}` stays unique across reruns of the
|
||||
# same run_id. PORT is set later (after docker port lookup) since
|
||||
# we let Docker assign an ephemeral host port.
|
||||
PG_CONTAINER: pg-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
REDIS_CONTAINER: redis-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
DATABASE_URL: postgres://dev:dev@localhost:15432/molecule?sslmode=disable
|
||||
REDIS_URL: redis://localhost:16379
|
||||
PORT: "8080"
|
||||
PG_CONTAINER: molecule-ci-postgres
|
||||
REDIS_CONTAINER: molecule-ci-redis
|
||||
steps:
|
||||
- name: No-op pass (paths filter excluded this commit)
|
||||
if: needs.detect-changes.outputs.api != 'true'
|
||||
@ -153,53 +97,11 @@ jobs:
|
||||
go-version: 'stable'
|
||||
cache: true
|
||||
cache-dependency-path: workspace-server/go.sum
|
||||
- name: Pre-pull alpine + ensure provisioner network (Issue #94 items #2 + #3)
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
# Provisioner uses alpine:latest for ephemeral token-write
|
||||
# containers (workspace-server/internal/handlers/container_files.go).
|
||||
# Pre-pull so the first provision in test_api.sh doesn't race
|
||||
# the daemon's pull cache. Idempotent — `docker pull` is a no-op
|
||||
# when the image is already present.
|
||||
docker pull alpine:latest >/dev/null
|
||||
# Provisioner attaches workspace containers to
|
||||
# molecule-monorepo-net (workspace-server/internal/provisioner/
|
||||
# provisioner.go::DefaultNetwork). The bridge already exists on
|
||||
# the operator host's docker daemon — `network create` is
|
||||
# idempotent via `|| true`.
|
||||
docker network create molecule-monorepo-net >/dev/null 2>&1 || true
|
||||
echo "alpine:latest pre-pulled; molecule-monorepo-net ensured."
|
||||
- name: Start Postgres (docker)
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
# Defensive cleanup — only matches THIS run's container name,
|
||||
# so it cannot kill a sibling run's postgres. (Pre-fix the
|
||||
# name was static and this rm hit other runs' containers.)
|
||||
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
|
||||
# `-p 0:5432` requests an ephemeral host port; we read it back
|
||||
# below and export DATABASE_URL.
|
||||
docker run -d --name "$PG_CONTAINER" \
|
||||
-e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule \
|
||||
-p 0:5432 postgres:16 >/dev/null
|
||||
# Resolve the host-side port assignment. `docker port` prints
|
||||
# `0.0.0.0:NNNN` (and on host-net runners may also print an
|
||||
# IPv6 line — take the first IPv4 line).
|
||||
PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
|
||||
if [ -z "$PG_PORT" ]; then
|
||||
# Fallback: any first line. Some Docker versions print only
|
||||
# one line.
|
||||
PG_PORT=$(docker port "$PG_CONTAINER" 5432/tcp | head -1 | awk -F: '{print $NF}')
|
||||
fi
|
||||
if [ -z "$PG_PORT" ]; then
|
||||
echo "::error::Could not resolve host port for $PG_CONTAINER"
|
||||
docker port "$PG_CONTAINER" 5432/tcp || true
|
||||
docker logs "$PG_CONTAINER" || true
|
||||
exit 1
|
||||
fi
|
||||
# 127.0.0.1 (NOT localhost) — IPv6 first-resolve flake (#92).
|
||||
echo "PG_PORT=${PG_PORT}" >> "$GITHUB_ENV"
|
||||
echo "DATABASE_URL=postgres://dev:dev@127.0.0.1:${PG_PORT}/molecule?sslmode=disable" >> "$GITHUB_ENV"
|
||||
echo "Postgres host port: ${PG_PORT}"
|
||||
docker run -d --name "$PG_CONTAINER" -e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule -p 15432:5432 postgres:16
|
||||
for i in $(seq 1 30); do
|
||||
if docker exec "$PG_CONTAINER" pg_isready -U dev >/dev/null 2>&1; then
|
||||
echo "Postgres ready after ${i}s"
|
||||
@ -214,20 +116,7 @@ jobs:
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
|
||||
docker run -d --name "$REDIS_CONTAINER" -p 0:6379 redis:7 >/dev/null
|
||||
REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | awk -F: '/^0\.0\.0\.0:/ {print $2; exit}')
|
||||
if [ -z "$REDIS_PORT" ]; then
|
||||
REDIS_PORT=$(docker port "$REDIS_CONTAINER" 6379/tcp | head -1 | awk -F: '{print $NF}')
|
||||
fi
|
||||
if [ -z "$REDIS_PORT" ]; then
|
||||
echo "::error::Could not resolve host port for $REDIS_CONTAINER"
|
||||
docker port "$REDIS_CONTAINER" 6379/tcp || true
|
||||
docker logs "$REDIS_CONTAINER" || true
|
||||
exit 1
|
||||
fi
|
||||
echo "REDIS_PORT=${REDIS_PORT}" >> "$GITHUB_ENV"
|
||||
echo "REDIS_URL=redis://127.0.0.1:${REDIS_PORT}" >> "$GITHUB_ENV"
|
||||
echo "Redis host port: ${REDIS_PORT}"
|
||||
docker run -d --name "$REDIS_CONTAINER" -p 16379:6379 redis:7
|
||||
for i in $(seq 1 15); do
|
||||
if docker exec "$REDIS_CONTAINER" redis-cli ping 2>/dev/null | grep -q PONG; then
|
||||
echo "Redis ready after ${i}s"
|
||||
@ -246,15 +135,13 @@ jobs:
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
working-directory: workspace-server
|
||||
run: |
|
||||
# DATABASE_URL + REDIS_URL exported by the start-postgres /
|
||||
# start-redis steps point at this run's per-run host ports.
|
||||
./platform-server > platform.log 2>&1 &
|
||||
echo $! > platform.pid
|
||||
- name: Wait for /health
|
||||
if: needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
for i in $(seq 1 30); do
|
||||
if curl -sf http://127.0.0.1:8080/health > /dev/null; then
|
||||
if curl -sf http://localhost:8080/health > /dev/null; then
|
||||
echo "Platform up after ${i}s"
|
||||
exit 0
|
||||
fi
|
||||
@ -298,9 +185,6 @@ jobs:
|
||||
kill "$(cat workspace-server/platform.pid)" 2>/dev/null || true
|
||||
fi
|
||||
- name: Stop service containers
|
||||
# always() so containers don't leak when test steps fail. The
|
||||
# cleanup is best-effort: if the container is already gone
|
||||
# (e.g. concurrent rerun race), don't fail the job.
|
||||
if: always() && needs.detect-changes.outputs.api == 'true'
|
||||
run: |
|
||||
docker rm -f "$PG_CONTAINER" 2>/dev/null || true
|
||||
|
||||
5
.github/workflows/e2e-staging-canvas.yml
vendored
5
.github/workflows/e2e-staging-canvas.yml
vendored
@ -22,9 +22,9 @@ on:
|
||||
# spending CI cycles. See e2e-api.yml for the rationale on why this
|
||||
# is a single job rather than two-jobs-sharing-name.
|
||||
push:
|
||||
branches: [main]
|
||||
branches: [main, staging]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
branches: [main, staging]
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# Weekly on Sunday 08:00 UTC — catches Chrome / Playwright / Next.js
|
||||
@ -124,6 +124,7 @@ jobs:
|
||||
node-version: '20'
|
||||
cache: 'npm'
|
||||
cache-dependency-path: canvas/package-lock.json
|
||||
token: ${{ secrets.GITEA_TOKEN }}
|
||||
|
||||
- name: Install canvas deps
|
||||
if: needs.detect-changes.outputs.canvas == 'true'
|
||||
|
||||
4
.github/workflows/e2e-staging-external.yml
vendored
4
.github/workflows/e2e-staging-external.yml
vendored
@ -32,7 +32,7 @@ name: E2E Staging External Runtime
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
branches: [staging, main]
|
||||
paths:
|
||||
- 'workspace-server/internal/handlers/workspace.go'
|
||||
- 'workspace-server/internal/handlers/registry.go'
|
||||
@ -44,7 +44,7 @@ on:
|
||||
- 'tests/e2e/test_staging_external_runtime.sh'
|
||||
- '.github/workflows/e2e-staging-external.yml'
|
||||
pull_request:
|
||||
branches: [main]
|
||||
branches: [staging, main]
|
||||
paths:
|
||||
- 'workspace-server/internal/handlers/workspace.go'
|
||||
- 'workspace-server/internal/handlers/registry.go'
|
||||
|
||||
13
.github/workflows/e2e-staging-saas.yml
vendored
13
.github/workflows/e2e-staging-saas.yml
vendored
@ -20,12 +20,13 @@ name: E2E Staging SaaS (full lifecycle)
|
||||
# via the same paths watcher that e2e-api.yml uses)
|
||||
|
||||
on:
|
||||
# Trunk-based (Phase 3 of internal#81): main is the only branch.
|
||||
# Previously this fired on staging push too because staging was a
|
||||
# superset of main and ran the gate ahead of auto-promote; with no
|
||||
# staging branch, main is where E2E gates the deploy.
|
||||
# Fire on staging push too — previously this only ran on main, which
|
||||
# meant the most thorough end-to-end test caught regressions AFTER
|
||||
# they shipped to staging (and then to the auto-promote PR). Running
|
||||
# on staging push catches them BEFORE the staging→main promotion
|
||||
# opens, so a green canary into auto-promote is more meaningful.
|
||||
push:
|
||||
branches: [main]
|
||||
branches: [staging, main]
|
||||
paths:
|
||||
- 'workspace-server/internal/handlers/registry.go'
|
||||
- 'workspace-server/internal/handlers/workspace_provision.go'
|
||||
@ -35,7 +36,7 @@ on:
|
||||
- 'tests/e2e/test_staging_full_saas.sh'
|
||||
- '.github/workflows/e2e-staging-saas.yml'
|
||||
pull_request:
|
||||
branches: [main]
|
||||
branches: [staging, main]
|
||||
paths:
|
||||
- 'workspace-server/internal/handlers/registry.go'
|
||||
- 'workspace-server/internal/handlers/workspace_provision.go'
|
||||
|
||||
139
.github/workflows/handlers-postgres-integration.yml
vendored
139
.github/workflows/handlers-postgres-integration.yml
vendored
@ -14,42 +14,12 @@ name: Handlers Postgres Integration
|
||||
# self-review caught it took 2 minutes to set up and would have caught
|
||||
# the bug at PR-time.
|
||||
#
|
||||
# Why this workflow does NOT use `services: postgres:` (Class B fix)
|
||||
# ------------------------------------------------------------------
|
||||
# Our act_runner config has `container.network: host` (operator host
|
||||
# /opt/molecule/runners/config.yaml), which act_runner applies to BOTH
|
||||
# the job container AND every service container. With host-net, two
|
||||
# concurrent runs of this workflow both try to bind 0.0.0.0:5432 — the
|
||||
# second postgres FATALs with `could not create any TCP/IP sockets:
|
||||
# Address in use`, and Docker auto-removes it (act_runner sets
|
||||
# AutoRemove:true on service containers). By the time the migrations
|
||||
# step runs `psql`, the postgres container is gone, hence
|
||||
# `Connection refused` then `failed to remove container: No such
|
||||
# container` at cleanup time.
|
||||
# This job spins a Postgres service container, applies the migration,
|
||||
# and runs `go test -tags=integration` against a live DB. Required
|
||||
# check on staging branch protection — backend handler PRs cannot
|
||||
# merge without a real-DB regression gate.
|
||||
#
|
||||
# Per-job `container.network` override is silently ignored by
|
||||
# act_runner — `--network and --net in the options will be ignored.`
|
||||
# appears in the runner log. Documented constraint.
|
||||
#
|
||||
# So we sidestep `services:` entirely. The job container still uses
|
||||
# host-net (inherited from runner config; required for cache server
|
||||
# discovery on the bridge IP 172.18.0.17:42631). We launch a sibling
|
||||
# postgres on the existing `molecule-monorepo-net` bridge with a
|
||||
# UNIQUE name per run — `pg-handlers-${RUN_ID}-${RUN_ATTEMPT}` — and
|
||||
# read its bridge IP via `docker inspect`. A host-net job container
|
||||
# can reach a bridge-net container directly via the bridge IP (verified
|
||||
# manually on operator host 2026-05-08).
|
||||
#
|
||||
# Trade-offs vs. the original `services:` shape:
|
||||
# + No host-port collision; N parallel runs share the bridge cleanly
|
||||
# + `if: always()` cleanup runs even on test-step failure
|
||||
# - One more step in the workflow (+~3 lines)
|
||||
# - Requires `molecule-monorepo-net` to exist on the operator host
|
||||
# (it does; declared in docker-compose.yml + docker-compose.infra.yml)
|
||||
#
|
||||
# Class B Hongming-owned CICD red sweep, 2026-05-08.
|
||||
#
|
||||
# Cost: ~30s job (postgres pull from cache + go build + 4 tests).
|
||||
# Cost: ~30s job (postgres pull from GH cache + go build + 4 tests).
|
||||
|
||||
on:
|
||||
push:
|
||||
@ -89,14 +59,20 @@ jobs:
|
||||
name: Handlers Postgres Integration
|
||||
needs: detect-changes
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
# Unique name per run so concurrent jobs don't collide on the
|
||||
# bridge network. ${RUN_ID}-${RUN_ATTEMPT} is unique even across
|
||||
# workflow_dispatch reruns of the same run_id.
|
||||
PG_NAME: pg-handlers-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
# Bridge network already exists on the operator host (declared
|
||||
# in docker-compose.yml + docker-compose.infra.yml).
|
||||
PG_NETWORK: molecule-monorepo-net
|
||||
services:
|
||||
postgres:
|
||||
image: postgres:15-alpine
|
||||
env:
|
||||
POSTGRES_PASSWORD: test
|
||||
POSTGRES_DB: molecule
|
||||
ports:
|
||||
- 5432:5432
|
||||
# GHA spins this with --health-cmd built in for postgres images.
|
||||
options: >-
|
||||
--health-cmd pg_isready
|
||||
--health-interval 5s
|
||||
--health-timeout 5s
|
||||
--health-retries 10
|
||||
defaults:
|
||||
run:
|
||||
working-directory: workspace-server
|
||||
@ -113,57 +89,16 @@ jobs:
|
||||
with:
|
||||
go-version: 'stable'
|
||||
|
||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Start sibling Postgres on bridge network
|
||||
working-directory: .
|
||||
run: |
|
||||
# Sanity: the bridge network must exist on the operator host.
|
||||
# Hard-fail loud if it doesn't — easier to spot than a silent
|
||||
# auto-create that diverges from the rest of the stack.
|
||||
if ! docker network inspect "${PG_NETWORK}" >/dev/null 2>&1; then
|
||||
echo "::error::Bridge network '${PG_NETWORK}' missing on operator host. Re-run docker-compose.infra.yml or check ops handbook."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# If a stale container with the same name exists (rerun on
|
||||
# the same run_id), wipe it first.
|
||||
docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
|
||||
|
||||
docker run -d \
|
||||
--name "${PG_NAME}" \
|
||||
--network "${PG_NETWORK}" \
|
||||
--health-cmd "pg_isready -U postgres" \
|
||||
--health-interval 5s \
|
||||
--health-timeout 5s \
|
||||
--health-retries 10 \
|
||||
-e POSTGRES_PASSWORD=test \
|
||||
-e POSTGRES_DB=molecule \
|
||||
postgres:15-alpine >/dev/null
|
||||
|
||||
# Read back the bridge IP. Always present immediately after
|
||||
# `docker run -d` for bridge networks.
|
||||
PG_HOST=$(docker inspect "${PG_NAME}" \
|
||||
--format "{{(index .NetworkSettings.Networks \"${PG_NETWORK}\").IPAddress}}")
|
||||
if [ -z "${PG_HOST}" ]; then
|
||||
echo "::error::Could not resolve PG_HOST for ${PG_NAME} on ${PG_NETWORK}"
|
||||
docker logs "${PG_NAME}" || true
|
||||
exit 1
|
||||
fi
|
||||
echo "PG_HOST=${PG_HOST}" >> "$GITHUB_ENV"
|
||||
echo "INTEGRATION_DB_URL=postgres://postgres:test@${PG_HOST}:5432/molecule?sslmode=disable" >> "$GITHUB_ENV"
|
||||
echo "Started ${PG_NAME} at ${PG_HOST}:5432"
|
||||
|
||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Apply migrations to Postgres service
|
||||
env:
|
||||
PGPASSWORD: test
|
||||
run: |
|
||||
# Wait for postgres to actually accept connections. Docker's
|
||||
# health-cmd handles container-side readiness, but the wire
|
||||
# to the bridge IP is best-tested with pg_isready directly.
|
||||
# Wait for postgres to actually accept connections (the
|
||||
# GHA --health-cmd is best-effort but psql can still race).
|
||||
for i in {1..15}; do
|
||||
if pg_isready -h "${PG_HOST}" -p 5432 -U postgres -q; then break; fi
|
||||
echo "waiting for postgres at ${PG_HOST}:5432..."; sleep 2
|
||||
if pg_isready -h localhost -p 5432 -U postgres -q; then break; fi
|
||||
echo "waiting for postgres..."; sleep 2
|
||||
done
|
||||
|
||||
# Apply every .up.sql in lexicographic order with
|
||||
@ -196,7 +131,7 @@ jobs:
|
||||
# not fine once a cross-table atomicity test came in.
|
||||
set +e
|
||||
for migration in $(ls migrations/*.sql 2>/dev/null | grep -v '\.down\.sql$' | sort); do
|
||||
if psql -h "${PG_HOST}" -U postgres -d molecule -v ON_ERROR_STOP=1 \
|
||||
if psql -h localhost -U postgres -d molecule -v ON_ERROR_STOP=1 \
|
||||
-f "$migration" >/dev/null 2>&1; then
|
||||
echo "✓ $(basename "$migration")"
|
||||
else
|
||||
@ -210,7 +145,7 @@ jobs:
|
||||
# fail if any didn't land — that would be a real regression we
|
||||
# want loud.
|
||||
for tbl in delegations workspaces activity_logs pending_uploads; do
|
||||
if ! psql -h "${PG_HOST}" -U postgres -d molecule -tA \
|
||||
if ! psql -h localhost -U postgres -d molecule -tA \
|
||||
-c "SELECT 1 FROM information_schema.tables WHERE table_name = '$tbl'" \
|
||||
| grep -q 1; then
|
||||
echo "::error::$tbl table missing after migration replay — handler integration tests would be meaningless"
|
||||
@ -221,32 +156,16 @@ jobs:
|
||||
|
||||
- if: needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Run integration tests
|
||||
env:
|
||||
INTEGRATION_DB_URL: postgres://postgres:test@localhost:5432/molecule?sslmode=disable
|
||||
run: |
|
||||
# INTEGRATION_DB_URL is exported by the start-postgres step;
|
||||
# points at the per-run bridge IP, not 127.0.0.1, so concurrent
|
||||
# workflow runs don't fight over a host-net 5432 port.
|
||||
go test -tags=integration -timeout 5m -v ./internal/handlers/ -run "^TestIntegration_"
|
||||
|
||||
- if: failure() && needs.detect-changes.outputs.handlers == 'true'
|
||||
- if: needs.detect-changes.outputs.handlers == 'true' && failure()
|
||||
name: Diagnostic dump on failure
|
||||
env:
|
||||
PGPASSWORD: test
|
||||
run: |
|
||||
echo "::group::postgres container status"
|
||||
docker ps -a --filter "name=${PG_NAME}" --format '{{.Status}} {{.Names}}' || true
|
||||
docker logs "${PG_NAME}" 2>&1 | tail -50 || true
|
||||
echo "::endgroup::"
|
||||
echo "::group::delegations table state"
|
||||
psql -h "${PG_HOST}" -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true
|
||||
psql -h localhost -U postgres -d molecule -c "SELECT * FROM delegations LIMIT 50;" || true
|
||||
echo "::endgroup::"
|
||||
|
||||
- if: always() && needs.detect-changes.outputs.handlers == 'true'
|
||||
name: Stop sibling Postgres
|
||||
working-directory: .
|
||||
run: |
|
||||
# always() so containers don't leak when migrations or tests
|
||||
# fail. The cleanup is best-effort: if the container is
|
||||
# already gone (e.g. concurrent rerun race), don't fail the job.
|
||||
docker rm -f "${PG_NAME}" >/dev/null 2>&1 || true
|
||||
echo "Cleaned up ${PG_NAME}"
|
||||
|
||||
|
||||
11
.github/workflows/harness-replays.yml
vendored
11
.github/workflows/harness-replays.yml
vendored
@ -119,17 +119,6 @@ jobs:
|
||||
# symptom, different root cause: staging still has the in-image
|
||||
# clone path, hits the auth error directly).
|
||||
#
|
||||
# 2026-05-08 sub-finding (#192): the clone step ALSO fails when
|
||||
# any referenced workspace-template repo is private and the
|
||||
# AUTO_SYNC_TOKEN bearer (devops-engineer persona) lacks read
|
||||
# access. Root cause: 5 of 9 workspace-template repos
|
||||
# (openclaw, codex, crewai, deepagents, gemini-cli) had been
|
||||
# marked private with no team grant. Resolution: flipped them
|
||||
# to public per `feedback_oss_first_repo_visibility_default`
|
||||
# (the OSS surface should be public). Layer-3 (customer-private +
|
||||
# marketplace third-party repos) tracked separately in
|
||||
# internal#102.
|
||||
#
|
||||
# Token shape matches publish-workspace-server-image.yml: AUTO_SYNC_TOKEN
|
||||
# is the devops-engineer persona PAT, NOT the founder PAT (per
|
||||
# `feedback_per_agent_gitea_identity_default`). clone-manifest.sh
|
||||
|
||||
59
.github/workflows/pr-guards.yml
vendored
59
.github/workflows/pr-guards.yml
vendored
@ -1,25 +1,14 @@
|
||||
name: pr-guards
|
||||
|
||||
# PR-time guards. Today the only guard is "disable auto-merge when a
|
||||
# new commit is pushed after auto-merge was enabled" — added 2026-04-27
|
||||
# after PR #2174 auto-merged with only its first commit because the
|
||||
# second commit was pushed after the merge queue had locked the PR's
|
||||
# SHA.
|
||||
# Thin caller that delegates to the molecule-ci reusable guard. Today
|
||||
# the guard is just "disable auto-merge when a new commit is pushed
|
||||
# after auto-merge was enabled" — added 2026-04-27 after PR #2174
|
||||
# auto-merged with only its first commit because the second commit
|
||||
# was pushed after the merge queue had locked the PR's SHA.
|
||||
#
|
||||
# Why this is inlined (not delegated to molecule-ci's reusable
|
||||
# workflow): the reusable workflow uses `gh pr merge --disable-auto`,
|
||||
# which calls GitHub's GraphQL API. Gitea has no GraphQL endpoint and
|
||||
# returns HTTP 405 on /api/graphql, so the job failed on every Gitea
|
||||
# PR push since the 2026-05-06 migration. Gitea also has no `--auto`
|
||||
# merge primitive that this job could be acting on, so the right
|
||||
# behaviour on Gitea is "no-op + green status" — not a 405.
|
||||
#
|
||||
# Inlining (vs. an `if:` on the `uses:` line) keeps the job ALWAYS
|
||||
# running, which matters for branch protection: required-check names
|
||||
# need a job that emits SUCCESS terminal state, not SKIPPED. See
|
||||
# `feedback_branch_protection_check_name_parity` and `feedback_pr_merge_safety_guards`.
|
||||
#
|
||||
# Issue #88 item 1.
|
||||
# When more PR-time guards land in molecule-ci, add them here as
|
||||
# additional jobs that share the same pull_request:synchronize
|
||||
# trigger.
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
@ -30,34 +19,4 @@ permissions:
|
||||
|
||||
jobs:
|
||||
disable-auto-merge-on-push:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
# Detect Gitea Actions. act_runner sets GITEA_ACTIONS=true in the
|
||||
# step env on every job. Belt-and-suspenders: also check the repo
|
||||
# url's host, which is independent of any runner-side env config
|
||||
# (covers a future Gitea host where the env var is forgotten).
|
||||
- name: Detect runner host
|
||||
id: host
|
||||
run: |
|
||||
if [[ "${GITEA_ACTIONS:-}" == "true" ]] || [[ "${{ github.server_url }}" == *moleculesai.app* ]] || [[ "${{ github.event.repository.html_url }}" == *moleculesai.app* ]]; then
|
||||
echo "is_gitea=true" >> "$GITHUB_OUTPUT"
|
||||
echo "::notice::Gitea Actions detected — auto-merge gating is not applicable here (Gitea has no --auto merge primitive). Job will no-op."
|
||||
else
|
||||
echo "is_gitea=false" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
- name: Disable auto-merge (GitHub only)
|
||||
if: steps.host.outputs.is_gitea != 'true'
|
||||
env:
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
PR: ${{ github.event.pull_request.number }}
|
||||
REPO: ${{ github.repository }}
|
||||
NEW_SHA: ${{ github.sha }}
|
||||
run: |
|
||||
set -eu
|
||||
gh pr merge "$PR" --disable-auto -R "$REPO" || true
|
||||
gh pr comment "$PR" -R "$REPO" --body "🔒 Auto-merge disabled — new commit (\`${NEW_SHA:0:7}\`) pushed after auto-merge was enabled. The merge queue locks SHAs at entry, so subsequent pushes can race. Verify the new commit and re-enable with \`gh pr merge --auto\`."
|
||||
|
||||
- name: Gitea no-op
|
||||
if: steps.host.outputs.is_gitea == 'true'
|
||||
run: echo "Gitea Actions — auto-merge gating not applicable; no-op (job intentionally green so branch protection's required-check name lands SUCCESS)."
|
||||
uses: molecule-ai/molecule-ci/.github/workflows/disable-auto-merge-on-push.yml@main
|
||||
|
||||
177
.github/workflows/publish-runtime.yml
vendored
177
.github/workflows/publish-runtime.yml
vendored
@ -282,33 +282,42 @@ jobs:
|
||||
echo "::error::Refusing to fan out cascade against stale or corrupt PyPI surfaces."
|
||||
exit 1
|
||||
|
||||
- name: Fan out via push to .runtime-version
|
||||
- name: Fan out repository_dispatch
|
||||
env:
|
||||
# Gitea PAT with write:repository scope on the 8 cascade-active
|
||||
# template repos. Used here for `git push` (NOT for an API
|
||||
# dispatch — Gitea 1.22.6 has no repository_dispatch endpoint;
|
||||
# empirically verified across 6 candidate paths in molecule-
|
||||
# core#20 issuecomment-913). The push trips each template's
|
||||
# existing `on: push: branches: [main]` trigger on
|
||||
# publish-image.yml, which then reads the updated
|
||||
# .runtime-version via its resolve-version job.
|
||||
DISPATCH_TOKEN: ${{ secrets.DISPATCH_TOKEN }}
|
||||
# Fine-grained PAT with `actions:write` on the 8 template repos.
|
||||
# GITHUB_TOKEN can't fire dispatches across repos — needs an explicit
|
||||
# token. Stored as a repo secret; rotate per the standard schedule.
|
||||
DISPATCH_TOKEN: ${{ secrets.TEMPLATE_DISPATCH_TOKEN }}
|
||||
# Single source of truth: the publish job's output, which handles
|
||||
# tag/manual-input/auto-bump uniformly. The previous fallback
|
||||
# (`steps.version.outputs.version` from inside the cascade job)
|
||||
# was a dead reference — different job, no shared step scope.
|
||||
RUNTIME_VERSION: ${{ needs.publish.outputs.version }}
|
||||
run: |
|
||||
set +e # don't abort on a single repo failure — collect them all
|
||||
|
||||
# Soft-skip on workflow_dispatch when the token is missing
|
||||
# (operator ad-hoc test); hard-fail on push so unattended
|
||||
# publishes can't silently skip the cascade. Same shape as
|
||||
# the original v1, intentional split per the schedule-vs-
|
||||
# dispatch hardening 2026-04-28.
|
||||
# Schedule-vs-dispatch behaviour split (hardened 2026-04-28
|
||||
# after the sweep-cf-orphans soft-skip incident — same class
|
||||
# of bug):
|
||||
#
|
||||
# The earlier "skipping cascade. templates will pick up the
|
||||
# new version on their own next rebuild" message was wrong —
|
||||
# templates only build on this dispatch trigger; without it
|
||||
# they stay pinned to whatever runtime version they last saw.
|
||||
# A silent skip here means "PyPI is current, templates are
|
||||
# not" and the gap is invisible until someone notices a
|
||||
# template still on the old version weeks later.
|
||||
#
|
||||
# - push → exit 1 (red CI surfaces the gap)
|
||||
# - workflow_dispatch → exit 0 with a warning (operator
|
||||
# ran this ad-hoc; let them rerun
|
||||
# after fixing the secret)
|
||||
if [ -z "$DISPATCH_TOKEN" ]; then
|
||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||
echo "::warning::DISPATCH_TOKEN secret not set — skipping cascade."
|
||||
echo "::warning::TEMPLATE_DISPATCH_TOKEN secret not set — skipping cascade."
|
||||
echo "::warning::set it at Settings → Secrets and Variables → Actions, then rerun. Templates will stay on the prior runtime version until either this token is set or each template is rebuilt manually."
|
||||
exit 0
|
||||
fi
|
||||
echo "::error::DISPATCH_TOKEN secret missing — cascade cannot fan out."
|
||||
echo "::error::TEMPLATE_DISPATCH_TOKEN secret missing — cascade cannot fan out."
|
||||
echo "::error::PyPI was published, but the 8 template repos will NOT pick up the new version until this token is restored and a republish dispatches the cascade."
|
||||
echo "::error::set it at Settings → Secrets and Variables → Actions; then re-trigger publish-runtime via workflow_dispatch."
|
||||
exit 1
|
||||
@ -318,119 +327,37 @@ jobs:
|
||||
echo "::error::publish job did not expose a version output — cascade cannot fan out"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# All 9 workspace templates declared in manifest.json. The list
|
||||
# MUST stay aligned with manifest.json's workspace_templates —
|
||||
# cascade-list-drift-gate.yml enforces this in CI per the
|
||||
# codex-stuck-on-stale-runtime invariant from PR #2556.
|
||||
# Long-term goal: derive this list from manifest.json so it
|
||||
# can't drift even on a manifest edit (RFC #388 Phase-1).
|
||||
#
|
||||
# Per-template publish-image.yml presence is checked at
|
||||
# cascade-time below: codex doesn't ship one today, so the
|
||||
# cascade soft-skips it with an informational message rather
|
||||
# than dropping it from this list (which would re-introduce
|
||||
# the drift the gate exists to catch).
|
||||
GITEA_URL="${GITEA_URL:-https://git.moleculesai.app}"
|
||||
# All 9 active workspace template repos. The PR #2536 pruning
|
||||
# ("deprecated, no shipping images") was empirically wrong:
|
||||
# continuous-synth-e2e.yml defaults to langgraph as its primary
|
||||
# canary (line 44), and every excluded template had successful
|
||||
# publish-image runs as of 2026-05-03 — none were dormant.
|
||||
# Symptom of the prune: today's a2a-sdk strict-mode fix
|
||||
# (#2566 / commit e1628c4) cascaded to 4 templates but never
|
||||
# reached langgraph, so the synth-E2E correctly canary'd a fix
|
||||
# that had landed but not deployed. Re-added the 5 templates.
|
||||
# Long-term: derive this list from manifest.json so cascade
|
||||
# scope can't drift from E2E scope — tracked in RFC #388 as a
|
||||
# Phase-1 invariant.
|
||||
TEMPLATES="claude-code hermes openclaw codex langgraph crewai autogen deepagents gemini-cli"
|
||||
FAILED=""
|
||||
SKIPPED=""
|
||||
|
||||
# Configure git identity once. The persona owning DISPATCH_TOKEN
|
||||
# is the same identity that authored this commit on each
|
||||
# template; using a generic "publish-runtime cascade" co-author
|
||||
# trailer in the message keeps the audit trail honest about the
|
||||
# workflow-driven origin.
|
||||
git config --global user.name "publish-runtime cascade"
|
||||
git config --global user.email "publish-runtime@moleculesai.app"
|
||||
|
||||
WORKDIR="$(mktemp -d)"
|
||||
for tpl in $TEMPLATES; do
|
||||
REPO="molecule-ai/molecule-ai-workspace-template-$tpl"
|
||||
CLONE="$WORKDIR/$tpl"
|
||||
|
||||
# Pre-check: skip templates without a publish-image.yml.
|
||||
# The cascade's job is to trip the template's on-push
|
||||
# rebuild — if there's no rebuild workflow, pushing a
|
||||
# .runtime-version commit is just noise on the target
|
||||
# repo. Use the Gitea contents API (no clone required for
|
||||
# the probe). 200 = present; 404 = absent.
|
||||
HTTP=$(curl -sS -o /dev/null -w "%{http_code}" \
|
||||
-H "Authorization: token $DISPATCH_TOKEN" \
|
||||
"$GITEA_URL/api/v1/repos/$REPO/contents/.github/workflows/publish-image.yml")
|
||||
if [ "$HTTP" = "404" ]; then
|
||||
echo "↷ $tpl has no publish-image.yml — soft-skip (informational; manifest still tracks it)"
|
||||
SKIPPED="$SKIPPED $tpl"
|
||||
continue
|
||||
fi
|
||||
if [ "$HTTP" != "200" ]; then
|
||||
echo "::warning::$tpl publish-image.yml probe returned HTTP $HTTP — proceeding anyway, push will surface the real failure if any"
|
||||
fi
|
||||
|
||||
# Use a per-template attempt loop so a transient race (e.g.
|
||||
# human pushing to the same template at the same instant)
|
||||
# doesn't lose the cascade. Bounded retries (3) — beyond
|
||||
# that we surface the failure and let the operator retry.
|
||||
attempt=0
|
||||
success=false
|
||||
while [ $attempt -lt 3 ]; do
|
||||
attempt=$((attempt + 1))
|
||||
rm -rf "$CLONE"
|
||||
if ! git clone --depth=1 \
|
||||
"https://x-access-token:${DISPATCH_TOKEN}@${GITEA_URL#https://}/$REPO.git" \
|
||||
"$CLONE" >/tmp/clone.log 2>&1; then
|
||||
echo "::warning::clone $tpl attempt $attempt failed: $(tail -n3 /tmp/clone.log)"
|
||||
sleep 2
|
||||
continue
|
||||
fi
|
||||
|
||||
cd "$CLONE"
|
||||
echo "$VERSION" > .runtime-version
|
||||
|
||||
# Idempotency guard: if the file already matches, this
|
||||
# publish is a re-run for a version already cascaded.
|
||||
# Don't push a no-op commit (would spuriously re-trip the
|
||||
# template's on-push and rebuild for nothing).
|
||||
if git diff --quiet -- .runtime-version; then
|
||||
echo "✓ $tpl already at $VERSION — no commit needed (idempotent)"
|
||||
success=true
|
||||
cd - >/dev/null
|
||||
break
|
||||
fi
|
||||
|
||||
git add .runtime-version
|
||||
git commit -m "chore: pin runtime to $VERSION (publish-runtime cascade)" \
|
||||
-m "Co-Authored-By: publish-runtime cascade <publish-runtime@moleculesai.app>" \
|
||||
>/dev/null
|
||||
|
||||
if git push origin HEAD:main >/tmp/push.log 2>&1; then
|
||||
echo "✓ $tpl pushed $VERSION on attempt $attempt"
|
||||
success=true
|
||||
cd - >/dev/null
|
||||
break
|
||||
fi
|
||||
|
||||
# Likely a non-fast-forward — pull-rebase and retry.
|
||||
# Don't force-push: that would silently overwrite a racing
|
||||
# human/cascade commit.
|
||||
echo "::warning::push $tpl attempt $attempt failed, pull-rebasing: $(tail -n3 /tmp/push.log)"
|
||||
git pull --rebase origin main >/tmp/rebase.log 2>&1 || true
|
||||
cd - >/dev/null
|
||||
done
|
||||
|
||||
if [ "$success" != "true" ]; then
|
||||
STATUS=$(curl -sS -o /tmp/dispatch.out -w "%{http_code}" \
|
||||
-X POST "https://api.github.com/repos/$REPO/dispatches" \
|
||||
-H "Authorization: Bearer $DISPATCH_TOKEN" \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
-d "{\"event_type\":\"runtime-published\",\"client_payload\":{\"runtime_version\":\"$VERSION\"}}")
|
||||
if [ "$STATUS" = "204" ]; then
|
||||
echo "✓ dispatched $tpl ($VERSION)"
|
||||
else
|
||||
echo "::warning::✗ failed to dispatch $tpl: HTTP $STATUS — $(cat /tmp/dispatch.out)"
|
||||
FAILED="$FAILED $tpl"
|
||||
fi
|
||||
done
|
||||
rm -rf "$WORKDIR"
|
||||
|
||||
if [ -n "$FAILED" ]; then
|
||||
echo "::error::Cascade incomplete after 3 retries each. Failed templates:$FAILED"
|
||||
echo "::error::PyPI publish succeeded; failed templates lag the new version. Re-run this workflow_dispatch with the same version to retry only the laggers (idempotent — already-cascaded templates skip)."
|
||||
exit 1
|
||||
fi
|
||||
if [ -n "$SKIPPED" ]; then
|
||||
echo "Cascade complete: pinned $VERSION on cascade-active templates. Soft-skipped (no publish-image.yml):$SKIPPED"
|
||||
else
|
||||
echo "Cascade complete: $VERSION pinned across all manifest workspace_templates."
|
||||
echo "::warning::Cascade incomplete. Failed templates:$FAILED"
|
||||
# Don't fail the whole job — PyPI publish already succeeded;
|
||||
# operators can retry the failed templates manually.
|
||||
fi
|
||||
|
||||
@ -36,7 +36,7 @@ on:
|
||||
workflow_run:
|
||||
workflows: ['publish-workspace-server-image']
|
||||
types: [completed]
|
||||
branches: [main]
|
||||
branches: [staging]
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
target_tag:
|
||||
|
||||
276
.github/workflows/retarget-main-to-staging.yml
vendored
Normal file
276
.github/workflows/retarget-main-to-staging.yml
vendored
Normal file
@ -0,0 +1,276 @@
|
||||
name: Retarget main PRs to staging
|
||||
|
||||
# Mechanical enforcement of SHARED_RULES rule 8 ("Staging-first
|
||||
# workflow, no exceptions"). When a bot opens a PR against `main`,
|
||||
# retarget it to `staging` automatically and leave an explanatory
|
||||
# comment. Human / CEO-authored PRs (the staging→main promotion
|
||||
# PRs, etc.) are left alone — they're the authorised exception
|
||||
# to the rule.
|
||||
#
|
||||
# ============================================================
|
||||
# What this workflow does
|
||||
# ============================================================
|
||||
#
|
||||
# On `pull_request_target` opened/reopened against `main`:
|
||||
# 1. If the PR head is `staging`, skip (the auto-promote PRs
|
||||
# MUST stay base=main).
|
||||
# 2. If the PR author is a bot, retarget the PR base to
|
||||
# `staging` via Gitea REST `PATCH /pulls/{N}` body
|
||||
# `{"base":"staging"}`.
|
||||
# 3. If the retarget returns 422 "pull request already exists
|
||||
# for base branch 'staging'" (issue #1884 case: another PR
|
||||
# on the same head already targets staging), close the
|
||||
# now-redundant main-PR via Gitea REST instead of failing
|
||||
# red.
|
||||
# 4. Post an explainer comment on the retargeted PR via
|
||||
# Gitea REST `POST /issues/{N}/comments`.
|
||||
#
|
||||
# ============================================================
|
||||
# Why Gitea REST (and not `gh api / gh pr close / gh pr comment`)
|
||||
# ============================================================
|
||||
#
|
||||
# Pre-2026-05-06 this workflow used `gh api -X PATCH "repos/{owner}/{repo}/pulls/{N}" -f base=staging`
|
||||
# plus `gh pr close` and `gh pr comment`. After the GitHub→Gitea
|
||||
# cutover those calls fail because:
|
||||
#
|
||||
# - `gh` CLI defaults to `api.github.com`. Even with `GH_HOST`
|
||||
# pointing at Gitea, `gh pr close / comment` route through
|
||||
# GraphQL (`/api/graphql`) which Gitea does not expose.
|
||||
# Empirical: every `gh pr *` call returns
|
||||
# `HTTP 405 Method Not Allowed (https://git.moleculesai.app/api/graphql)`
|
||||
# — same root cause as #65 (auto-sync, fixed in PR #66) and
|
||||
# #73/#195 (auto-promote, fixed in PR #78).
|
||||
# - `gh api -X PATCH /pulls/{N}` happens to use a REST path
|
||||
# that Gitea also has, but the `gh` host-resolution layer
|
||||
# and pagination/retry logic don't always hit Gitea cleanly,
|
||||
# and the cost of switching to direct `curl` is one extra
|
||||
# line of code.
|
||||
#
|
||||
# So this workflow uses direct `curl` calls to Gitea REST. No
|
||||
# `gh` CLI dependency, no GraphQL, no flaky host-resolution.
|
||||
#
|
||||
# ============================================================
|
||||
# Identity + token (anti-bot-ring per saved-memory
|
||||
# `feedback_per_agent_gitea_identity_default`)
|
||||
# ============================================================
|
||||
#
|
||||
# Pre-fix this workflow used the per-job ephemeral
|
||||
# `secrets.GITHUB_TOKEN`. On Gitea Actions that token has
|
||||
# narrow scope and unpredictable cross-PR write capability.
|
||||
#
|
||||
# Post-fix: `secrets.AUTO_SYNC_TOKEN` (the `devops-engineer`
|
||||
# Gitea persona). Same persona used by `auto-sync-main-to-staging.yml`
|
||||
# (PR #66) and `auto-promote-staging.yml` (PR #78). Token scope:
|
||||
# `push: true` repo write, sufficient for PR-edit + close + comment.
|
||||
#
|
||||
# Why this token does NOT need branch-protection bypass:
|
||||
# patching a PR's base ref is a PR-level operation that does not
|
||||
# require push perms on either branch (the PR's own commits stay
|
||||
# put; only the metadata changes).
|
||||
#
|
||||
# ============================================================
|
||||
# Failure modes & operational notes
|
||||
# ============================================================
|
||||
#
|
||||
# A — PATCH base→staging returns 422 "pull request already exists"
|
||||
# (issue #1884 case):
|
||||
# - Detected by string-match on response body. Workflow
|
||||
# falls through to closing the now-redundant main-PR
|
||||
# (Gitea REST `PATCH /pulls/{N}` with `state: closed`)
|
||||
# and posts an explanation comment. Step summary surfaces.
|
||||
#
|
||||
# B — `AUTO_SYNC_TOKEN` rotated / wrong scope:
|
||||
# - First REST call returns 401/403. Step summary surfaces.
|
||||
# Re-issue token from `~/.molecule-ai/personas/` on the
|
||||
# operator host and update repo Actions secret.
|
||||
#
|
||||
# C — PR was deleted between trigger and run:
|
||||
# - REST call returns 404. The retarget step treats this like any
|
||||
# other non-duplicate PATCH failure: it logs `::error::` and exits 1.
|
||||
#
|
||||
# D — author is not actually a bot but the filter mis-fires:
|
||||
# - Filter is conservative: only triggers on
|
||||
# `user.type == 'Bot'`, `login` ends with `[bot]`, or
|
||||
# known bot logins (`molecule-ai[bot]`, `app/molecule-ai`, `devops-engineer`).
|
||||
# Human PRs slip through unaffected. If a NEW bot login
|
||||
# starts shipping main-PRs, add it to the filter.
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened, reopened]
|
||||
branches: [main]
|
||||
|
||||
permissions:
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
retarget:
|
||||
name: Retarget to staging
|
||||
runs-on: ubuntu-latest
|
||||
# Only fire for bot-authored PRs. Human CEO PRs (staging→main
|
||||
# promotion) are intentional and pass through.
|
||||
#
|
||||
# Head-ref guard: never retarget a PR whose head IS `staging`
|
||||
# — those are the auto-promote staging→main PRs (opened by
|
||||
# `devops-engineer` since PR #78 / #195 fix). Retargeting
|
||||
# head=staging onto base=staging fails with HTTP 422 "no new
|
||||
# commits between base 'staging' and head 'staging'", which
|
||||
# would surface as a noisy red workflow run on every
|
||||
# auto-promote (caught 2026-05-03 on the GitHub-era PR #2588).
|
||||
if: >-
|
||||
github.event.pull_request.head.ref != 'staging'
|
||||
&& (
|
||||
github.event.pull_request.user.type == 'Bot'
|
||||
|| endsWith(github.event.pull_request.user.login, '[bot]')
|
||||
|| github.event.pull_request.user.login == 'app/molecule-ai'
|
||||
|| github.event.pull_request.user.login == 'molecule-ai[bot]'
|
||||
|| github.event.pull_request.user.login == 'devops-engineer'
|
||||
)
|
||||
steps:
|
||||
- name: Retarget PR base to staging via Gitea REST
|
||||
id: retarget
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
|
||||
GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }}
|
||||
REPO: ${{ github.repository }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
PR_AUTHOR: ${{ github.event.pull_request.user.login }}
|
||||
# Issue #1884 case: when the bot opens a PR against main
|
||||
# and there's already another PR on the same head branch
|
||||
# targeting staging, Gitea's PATCH returns 422 with a
|
||||
# body mentioning "pull request already exists for base
|
||||
# branch 'staging'" (the Gitea message wording is
|
||||
# slightly different from GitHub's; the substring match
|
||||
# below covers both for forward/back compat).
|
||||
# The retarget can't proceed — but the right response is
|
||||
# to close the now-redundant main-PR, not to fail the
|
||||
# workflow noisily. Detect that specific 422 and close
|
||||
# instead.
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
API="${GITEA_HOST}/api/v1/repos/${REPO}"
|
||||
AUTH=(-H "Authorization: token ${GITEA_TOKEN}" -H "Accept: application/json")
|
||||
|
||||
echo "Retargeting PR #${PR_NUMBER} (author: ${PR_AUTHOR}) from main → staging"
|
||||
|
||||
# Curl-status-capture pattern per `feedback_curl_status_capture_pollution`:
|
||||
# http_code via -w to its own scalar, body to a tempfile, set +e/-e
|
||||
# bracket so curl's non-zero-on-4xx doesn't pollute the script's exit chain.
|
||||
BODY_FILE=$(mktemp)
|
||||
REQ='{"base":"staging"}'
|
||||
|
||||
set +e
|
||||
STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \
|
||||
-X PATCH -d "${REQ}" \
|
||||
-o "${BODY_FILE}" -w "%{http_code}" \
|
||||
"${API}/pulls/${PR_NUMBER}")
|
||||
CURL_RC=$?
|
||||
set -e
|
||||
|
||||
if [ "${CURL_RC}" -ne 0 ]; then
|
||||
echo "::error::curl PATCH failed (rc=${CURL_RC})"
|
||||
rm -f "${BODY_FILE}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ "${STATUS}" = "201" ] || [ "${STATUS}" = "200" ]; then
|
||||
NEW_BASE=$(jq -r '.base.ref // "?"' < "${BODY_FILE}")
|
||||
rm -f "${BODY_FILE}"
|
||||
if [ "${NEW_BASE}" = "staging" ]; then
|
||||
echo "::notice::Retargeted PR #${PR_NUMBER} → staging"
|
||||
echo "outcome=retargeted" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
echo "::error::PATCH returned ${STATUS} but base.ref is '${NEW_BASE}', not 'staging'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Specifically match the 422 duplicate-base/head error so
|
||||
# any OTHER PATCH failure (auth, deleted PR, etc.) still
|
||||
# surfaces as a real workflow failure.
|
||||
BODY=$(cat "${BODY_FILE}" || true)
|
||||
rm -f "${BODY_FILE}"
|
||||
|
||||
if [ "${STATUS}" = "422" ] && echo "${BODY}" | grep -qE "(pull request already exists for base branch 'staging'|already exists.*base.*staging)"; then
|
||||
echo "::notice::PR #${PR_NUMBER}: duplicate target-staging PR exists on same head — closing this main-PR as redundant."
|
||||
|
||||
# Close the now-redundant main-PR via Gitea REST
|
||||
# (PATCH state=closed). Post comment explaining
|
||||
# rationale BEFORE close so the comment lands on the
|
||||
# PR (commenting on a closed PR works on Gitea, but
|
||||
# historically caused notification ordering surprises).
|
||||
|
||||
CLOSE_BODY_FILE=$(mktemp)
|
||||
CMT_REQ=$(jq -n '{body:"[retarget-bot] Closing — another PR on the same head branch already targets `staging`. This PR is redundant. See issue #1884 for the rationale."}')
|
||||
set +e
|
||||
CMT_STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \
|
||||
-X POST -d "${CMT_REQ}" \
|
||||
-o "${CLOSE_BODY_FILE}" -w "%{http_code}" \
|
||||
"${API}/issues/${PR_NUMBER}/comments")
|
||||
set -e
|
||||
if [ "${CMT_STATUS}" != "201" ]; then
|
||||
echo "::warning::dup-close comment POST returned ${CMT_STATUS}; continuing to close anyway"
|
||||
cat "${CLOSE_BODY_FILE}" | head -c 300 || true
|
||||
fi
|
||||
rm -f "${CLOSE_BODY_FILE}"
|
||||
|
||||
CLOSE_REQ='{"state":"closed"}'
|
||||
CLOSE_RESP=$(mktemp)
|
||||
set +e
|
||||
CL_STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \
|
||||
-X PATCH -d "${CLOSE_REQ}" \
|
||||
-o "${CLOSE_RESP}" -w "%{http_code}" \
|
||||
"${API}/pulls/${PR_NUMBER}")
|
||||
set -e
|
||||
if [ "${CL_STATUS}" = "201" ] || [ "${CL_STATUS}" = "200" ]; then
|
||||
echo "::notice::Closed PR #${PR_NUMBER} as redundant"
|
||||
echo "outcome=closed-as-duplicate" >> "$GITHUB_OUTPUT"
|
||||
rm -f "${CLOSE_RESP}"
|
||||
exit 0
|
||||
fi
|
||||
echo "::error::Failed to close redundant PR: HTTP ${CL_STATUS}"
|
||||
cat "${CLOSE_RESP}" | head -c 300 || true
|
||||
rm -f "${CLOSE_RESP}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "::error::Retarget PATCH failed and was NOT a duplicate-base error: HTTP ${STATUS}"
|
||||
echo "${BODY}" | head -c 500 >&2
|
||||
exit 1
|
||||
|
||||
- name: Post explainer comment
|
||||
if: steps.retarget.outputs.outcome == 'retargeted'
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
|
||||
GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }}
|
||||
REPO: ${{ github.repository }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
API="${GITEA_HOST}/api/v1/repos/${REPO}"
|
||||
AUTH=(-H "Authorization: token ${GITEA_TOKEN}" -H "Accept: application/json")
|
||||
|
||||
# PR comments live on the issue endpoint in Gitea
|
||||
# (PRs ARE issues — same endpoint, different sub-resources
|
||||
# for diffs/files/etc.). The body uses jq to safely
|
||||
# encode the multi-line markdown without shell-quote
|
||||
# nightmares.
|
||||
# The jq program is wrapped in a single-quoted shell string, so the body
# cannot contain an ASCII apostrophe. Use the typographic apostrophe (’)
# instead of a backtick: Markdown pairs stray backticks into an inline
# code span, which would swallow the closing `**` of the last paragraph.
REQ=$(jq -n '{body:"[retarget-bot] This PR was opened against `main` and has been retargeted to `staging` automatically.\n\n**Why:** per [SHARED_RULES rule 8](https://git.moleculesai.app/molecule-ai/molecule-ai-org-template-molecule-dev/src/branch/main/SHARED_RULES.md), all feature work targets `staging` first; the CEO promotes `staging → main` separately.\n\n**What changed:** just the base branch — no code change. CI will re-run against `staging`. If you get merge conflicts, rebase on `staging`.\n\n**If this PR is the CEO’s staging→main promotion:** the Action skipped you (only bot-authored PRs are retargeted, head=staging is also exempted). If you see this comment on your CEO PR, that’s a bug — please tag @hongmingwang."}')
|
||||
|
||||
BODY_FILE=$(mktemp)
|
||||
set +e
|
||||
STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \
|
||||
-X POST -d "${REQ}" \
|
||||
-o "${BODY_FILE}" -w "%{http_code}" \
|
||||
"${API}/issues/${PR_NUMBER}/comments")
|
||||
set -e
|
||||
|
||||
if [ "${STATUS}" = "201" ]; then
|
||||
echo "::notice::Posted explainer comment on PR #${PR_NUMBER}"
|
||||
else
|
||||
echo "::warning::Failed to post explainer (HTTP ${STATUS}) — retarget itself succeeded"
|
||||
cat "${BODY_FILE}" | head -c 300 || true
|
||||
fi
|
||||
rm -f "${BODY_FILE}"
|
||||
@ -110,7 +110,7 @@ causing a render loop when any node position changed.
|
||||
|
||||
1. **Repo-wide:** "Automatically delete head branches" is on. Once a PR merges, the branch is deleted server-side. Any subsequent `git push` to that branch fails with `remote rejected — no such branch`.
|
||||
|
||||
2. **CI:** the `pr-guards` workflow (calling [molecule-ci `disable-auto-merge-on-push`](https://git.moleculesai.app/molecule-ai/molecule-ci/src/branch/main/.github/workflows/disable-auto-merge-on-push.yml)) fires on every push to an open PR. If auto-merge was already enabled, it's disabled and a comment is posted. You must explicitly re-enable after verifying the new commit.
|
||||
2. **CI:** the `pr-guards` workflow (calling [molecule-ci `disable-auto-merge-on-push`](https://git.moleculesai.app/molecule-ai/molecule-ci/blob/main/.github/workflows/disable-auto-merge-on-push.yml)) fires on every push to an open PR. If auto-merge was already enabled, it's disabled and a comment is posted. You must explicitly re-enable after verifying the new commit.
|
||||
|
||||
**Workflow rules that follow from the guards:**
|
||||
- Push **all** commits before running `gh pr merge --auto`.
|
||||
|
||||
28
Makefile
28
Makefile
@ -1,28 +0,0 @@
|
||||
# Top-level Makefile — convenience wrappers around docker compose.
|
||||
#
|
||||
# Most molecule-core dev work happens via these shortcuts. CI doesn't
|
||||
# use this Makefile; CI calls docker compose / go test directly so the
|
||||
# Makefile can evolve without breaking the build.
|
||||
|
||||
.PHONY: help dev up down logs build test
|
||||
|
||||
help: ## Show this help.
|
||||
@grep -E '^[a-zA-Z_-]+:.*?## ' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-12s\033[0m %s\n", $$1, $$2}'
|
||||
|
||||
dev: ## Start the full stack with air hot-reload for the platform service.
|
||||
docker compose -f docker-compose.yml -f docker-compose.dev.yml up
|
||||
|
||||
up: ## Start the full stack in production-shape mode (no air, normal Dockerfile).
|
||||
docker compose up
|
||||
|
||||
down: ## Stop the stack and remove containers (volumes preserved).
|
||||
docker compose down
|
||||
|
||||
logs: ## Tail logs from all services (Ctrl-C to detach).
|
||||
docker compose logs -f
|
||||
|
||||
build: ## Force a fresh build of the platform image (no cache).
|
||||
docker compose build --no-cache platform
|
||||
|
||||
test: ## Run Go unit tests in workspace-server/.
|
||||
cd workspace-server && go test -race ./...
|
||||
69
README.md
69
README.md
@ -1,7 +1,7 @@
|
||||
<div align="center">
|
||||
|
||||
<p>
|
||||
<img src="./docs/assets/branding/molecule-icon.svg" alt="Molecule AI" width="160" />
|
||||
<img src="./docs/assets/branding/molecule-icon.png" alt="Molecule AI Icon Logo" width="160" />
|
||||
</p>
|
||||
|
||||
<p>
|
||||
@ -53,8 +53,8 @@ Molecule AI is the most powerful way to govern an AI agent organization in produ
|
||||
It combines the parts that are usually scattered across demos, internal glue code, and framework-specific tooling into one product:
|
||||
|
||||
- one org-native control plane for teams, roles, hierarchy, and lifecycle
|
||||
- one runtime layer that lets **eight** agent runtimes — LangGraph, DeepAgents, Claude Code, CrewAI, AutoGen, **Hermes**, **Gemini CLI**, and OpenClaw — run side by side behind one workspace contract
|
||||
- one memory model that keeps recall, sharing, and skill evolution aligned with organizational boundaries (Memory v2 backed by pgvector for semantic recall)
|
||||
- one runtime layer that lets LangGraph, DeepAgents, Claude Code, CrewAI, AutoGen, and OpenClaw run side by side
|
||||
- one memory model that keeps recall, sharing, and skill evolution aligned with organizational boundaries
|
||||
- one operational surface for observing, pausing, restarting, inspecting, and improving live workspaces
|
||||
|
||||
Most teams can build a workflow, a strong single agent, a coding agent, or a custom multi-agent graph.
|
||||
@ -75,7 +75,7 @@ You do not wire collaboration paths by hand. Hierarchy defines the default commu
|
||||
|
||||
### 3. Runtime choice stops being a dead-end decision
|
||||
|
||||
LangGraph, DeepAgents, Claude Code, CrewAI, AutoGen, Hermes, Gemini CLI, and OpenClaw can all plug into the same workspace abstraction. Teams can standardize governance without forcing every group onto one runtime.
|
||||
LangGraph, DeepAgents, Claude Code, CrewAI, AutoGen, and OpenClaw can all plug into the same workspace abstraction. Teams can standardize governance without forcing every group onto one runtime.
|
||||
|
||||
### 4. Memory is treated like infrastructure
|
||||
|
||||
@ -117,8 +117,6 @@ Molecule AI is not trying to replace the frameworks below. It is the system that
|
||||
| **Claude Code** | Shipping on `main` | Real coding workflows, CLI-native continuity | Secure workspace abstraction, A2A delegation, org boundaries, shared control plane |
|
||||
| **CrewAI** | Shipping on `main` | Role-based crews | Persistent workspace identity, policy consistency, shared canvas and registry |
|
||||
| **AutoGen** | Shipping on `main` | Assistant/tool orchestration | Standardized deployment, hierarchy-aware collaboration, shared ops plane |
|
||||
| **Hermes 4** | Shipping on `main` | Hybrid reasoning, native tools, json_schema (NousResearch/hermes-agent) | Option B upstream hook, A2A bridge to OpenAI-compat API, multi-provider provider derivation |
|
||||
| **Gemini CLI** | Shipping on `main` | Google Gemini CLI continuity | Workspace lifecycle, A2A, hierarchy-aware collaboration, shared ops plane |
|
||||
| **OpenClaw** | Shipping on `main` | CLI-native runtime with its own session model | Workspace lifecycle, templates, activity logs, topology-aware collaboration |
|
||||
| **NemoClaw** | WIP on `feat/nemoclaw-t4-docker` | NVIDIA-oriented runtime path | Planned to join the same abstraction once merged; not yet part of `main` |
|
||||
|
||||
@ -184,10 +182,9 @@ The result is not just “an agent that learns.” It is **an organization that
|
||||
|
||||
## What Ships In `main`
|
||||
|
||||
### Canvas (v4)
|
||||
### Canvas
|
||||
|
||||
- Next.js 15 + React Flow + Zustand
|
||||
- **warm-paper theme system** — light / dark / follow-system, SSR cookie + nonce'd boot script + ThemeProvider; terminal + code surfaces stay dark unconditionally
|
||||
- drag-to-nest team building
|
||||
- empty-state deployment + onboarding wizard
|
||||
- template palette
|
||||
@ -196,9 +193,8 @@ The result is not just “an agent that learns.” It is **an organization that
|
||||
|
||||
### Platform
|
||||
|
||||
- Go 1.25 / Gin control plane (80+ HTTP endpoints + Gorilla WebSocket fanout)
|
||||
- workspace CRUD and provisioning (pluggable Provisioner — Docker locally, EC2 + SSM in production)
|
||||
- **A2A response path is a typed discriminated union (RFC #2967)** — frozen dataclasses + total parser; 100% unit + adversarial fuzz coverage
|
||||
- Go/Gin control plane
|
||||
- workspace CRUD and provisioning
|
||||
- registry and heartbeats
|
||||
- browser-safe A2A proxy
|
||||
- team expansion/collapse
|
||||
@ -208,10 +204,10 @@ The result is not just “an agent that learns.” It is **an organization that
|
||||
|
||||
### Runtime
|
||||
|
||||
- unified `workspace/` image; thin AMI in production (us-east-2)
|
||||
- adapter-driven execution across **8 runtimes** (Claude Code, Hermes, Gemini CLI, LangGraph, DeepAgents, CrewAI, AutoGen, OpenClaw)
|
||||
- unified `workspace/` image
|
||||
- adapter-driven execution
|
||||
- Agent Card registration
|
||||
- awareness-backed memory integration; **Memory v2 backed by pgvector** for semantic recall
|
||||
- awareness-backed memory integration
|
||||
- plugin-mounted shared rules/skills
|
||||
- hot-reloadable local skills
|
||||
- coordinator-only delegation path
|
||||
@ -225,21 +221,6 @@ The result is not just “an agent that learns.” It is **an organization that
|
||||
- runtime tiers
|
||||
- direct workspace inspection through terminal and files
|
||||
|
||||
### SaaS (via [`molecule-controlplane`](https://git.moleculesai.app/molecule-ai/molecule-controlplane))
|
||||
|
||||
- multi-tenant on AWS EC2 + Neon (per-tenant Postgres branch) + Cloudflare Tunnels (per-tenant, no public ports)
|
||||
- WorkOS AuthKit + Stripe Checkout + Customer Portal
|
||||
- AWS KMS envelope encryption (DB / Redis connection strings); AWS Secrets Manager for tenant bootstrap
|
||||
- `tenant_resources` audit table + 30-min boot-event-aware reconciler — every CF / AWS lifecycle event recorded, claim vs live state diffed
|
||||
|
||||
### Bring your own Claude Code session (via [`molecule-mcp-claude-channel`](https://git.moleculesai.app/molecule-ai/molecule-mcp-claude-channel))
|
||||
|
||||
- Claude Code plugin that bridges Molecule A2A traffic into a local Claude Code session via MCP
|
||||
- subscribe to one or more workspaces; peer messages surface as conversation turns; replies route back through Molecule's A2A
|
||||
- no tunnel, no public endpoint — the plugin self-registers each watched workspace as `delivery_mode=poll` and long-polls `/activity?since_id=…`
|
||||
- multi-tenant friendly: one plugin install can watch workspaces across multiple Molecule tenants (`MOLECULE_PLATFORM_URLS` per-workspace)
|
||||
- install via the standard marketplace flow: `/plugin marketplace add Molecule-AI/molecule-mcp-claude-channel` → `/plugin install molecule-channel@molecule-mcp-claude-channel`
|
||||
|
||||
## Built For Teams That Need More Than A Demo
|
||||
|
||||
Molecule AI is especially strong when you need to run:
|
||||
@ -252,30 +233,24 @@ Molecule AI is especially strong when you need to run:
|
||||
## Architecture
|
||||
|
||||
```text
|
||||
Canvas (Next.js 15, warm-paper :3000) <--HTTP / WS--> Platform (Go 1.25 :8080) <---> Postgres + Redis
|
||||
| |
|
||||
| +--> Provisioner: Docker (local) / EC2 + SSM (prod)
|
||||
| +--> bundles · templates · secrets · KMS
|
||||
Canvas (Next.js :3000) <--HTTP / WS--> Platform (Go :8080) <---> Postgres + Redis
|
||||
| |
|
||||
| +--> Docker provisioner / bundles / templates / secrets
|
||||
|
|
||||
+------------------------- shows ------------------------> workspaces, teams, tasks, traces, events
|
||||
+-------------------- shows --------------------> workspaces, teams, tasks, traces, events
|
||||
|
||||
Workspace Runtime (Python ≥3.11, image with adapters)
|
||||
- 8 adapters: LangGraph / DeepAgents / Claude Code / CrewAI / AutoGen / Hermes / Gemini CLI / OpenClaw
|
||||
- Agent Card + A2A server (typed-SSOT response path, RFC #2967)
|
||||
- heartbeat + activity + awareness-backed memory (Memory v2 — pgvector semantic recall)
|
||||
Workspace Runtime (Python image with adapters)
|
||||
- LangGraph / DeepAgents / Claude Code / CrewAI / AutoGen / OpenClaw
|
||||
- Agent Card + A2A server
|
||||
- heartbeat + activity + awareness-backed memory
|
||||
- skills + plugins + hot reload
|
||||
|
||||
SaaS Control Plane (molecule-controlplane, private)
|
||||
- per-tenant EC2 + Neon (Postgres branch) + Cloudflare Tunnel
|
||||
- WorkOS · Stripe · KMS · AWS Secrets Manager
|
||||
- tenant_resources audit + 30-min reconciler
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-core.git
|
||||
cd molecule-core
|
||||
cd molecule-monorepo
|
||||
|
||||
cp .env.example .env
|
||||
# Defaults boot the stack locally out of the box. See .env.example for
|
||||
@ -328,11 +303,7 @@ Then open `http://localhost:3000`:
|
||||
|
||||
## Current Scope
|
||||
|
||||
The current `main` branch ships the core platform, Canvas v4 (warm-paper themed), Memory v2 (pgvector semantic recall), the typed-SSOT A2A response path (RFC #2967), **eight production adapters** (Claude Code, Hermes, Gemini CLI, LangGraph, DeepAgents, CrewAI, AutoGen, OpenClaw), skill lifecycle, and operational surfaces.
|
||||
|
||||
The companion private repo [`molecule-controlplane`](https://git.moleculesai.app/molecule-ai/molecule-controlplane) provides the SaaS surface — multi-tenant orchestration on EC2 + Neon + Cloudflare Tunnels, KMS envelope encryption, WorkOS auth, Stripe billing, and a `tenant_resources` audit table with a 30-min reconciler.
|
||||
|
||||
Adjacent runtime work such as **NemoClaw** remains branch-level until merged, and this README keeps that distinction explicit on purpose.
|
||||
The current `main` branch already includes the core platform, canvas, memory model, six production adapters, skill lifecycle, and operational surfaces. Adjacent runtime work such as **NemoClaw** remains branch-level until merged, and this README keeps that distinction explicit on purpose.
|
||||
|
||||
## License
|
||||
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
<div align="center">
|
||||
|
||||
<p>
|
||||
<img src="./docs/assets/branding/molecule-icon.svg" alt="Molecule AI" width="160" />
|
||||
<img src="./docs/assets/branding/molecule-icon.png" alt="Molecule AI 图案 Logo" width="160" />
|
||||
</p>
|
||||
|
||||
<p>
|
||||
@ -52,8 +52,8 @@ Molecule AI 是目前最强的 AI Agent 组织治理方案之一,用来把 age
|
||||
它把过去分散在 demo、内部胶水代码和各类 framework 私有工具里的关键能力,收敛成一个产品:
|
||||
|
||||
- 一套组织原生 control plane,管理团队、角色、层级和生命周期
|
||||
- 一套 runtime abstraction,让 **8 个** agent runtime —— LangGraph、DeepAgents、Claude Code、CrewAI、AutoGen、**Hermes**、**Gemini CLI**、OpenClaw —— 共用一套 workspace 契约
|
||||
- 一套与组织边界对齐的 memory 模型,把 recall、sharing 和 skill evolution 放进同一体系(Memory v2 由 pgvector 支撑语义召回)
|
||||
- 一套 runtime abstraction,让 LangGraph、DeepAgents、Claude Code、CrewAI、AutoGen、OpenClaw 并存运行
|
||||
- 一套与组织边界对齐的 memory 模型,把 recall、sharing 和 skill evolution 放进同一体系
|
||||
- 一套面向线上 workspace 的运维面,统一完成观测、暂停、重启、检查和持续改进
|
||||
|
||||
今天很多团队能做好 workflow、单 agent、coding agent,或者自定义 multi-agent graph 中的一种。
|
||||
@ -74,7 +74,7 @@ Molecule AI 填的就是这个空白。
|
||||
|
||||
### 3. Runtime 选择不再是死路
|
||||
|
||||
LangGraph、DeepAgents、Claude Code、CrewAI、AutoGen、Hermes、Gemini CLI、OpenClaw 都可以挂到同一个 workspace abstraction 下。团队可以统一治理方式,而不必统一到底层 runtime。
|
||||
LangGraph、DeepAgents、Claude Code、CrewAI、AutoGen、OpenClaw 都可以挂到同一个 workspace abstraction 下。团队可以统一治理方式,而不必统一到底层 runtime。
|
||||
|
||||
### 4. Memory 被当成基础设施来做
|
||||
|
||||
@ -116,8 +116,6 @@ Molecule AI 并不是要替代下面这些 framework,而是把它们纳入更
|
||||
| **Claude Code** | `main` 已支持 | 真实编码工作流、CLI-native continuity | 安全 workspace 抽象、A2A delegation、组织边界、共享 control plane |
|
||||
| **CrewAI** | `main` 已支持 | 角色型 crew 模式清晰 | 持久 workspace 身份、统一策略、共享 Canvas 和 registry |
|
||||
| **AutoGen** | `main` 已支持 | assistant/tool orchestration | 统一部署、层级协作、共享运维平面 |
|
||||
| **Hermes 4** | `main` 已支持 | 混合推理、原生工具调用、json_schema 输出(NousResearch/hermes-agent) | Option B 上游 hook、A2A 桥接 OpenAI 兼容 API、多 provider 自动派生 |
|
||||
| **Gemini CLI** | `main` 已支持 | Google Gemini CLI 持续会话 | workspace 生命周期、A2A、层级感知协作、共享运维平面 |
|
||||
| **OpenClaw** | `main` 已支持 | CLI-native runtime,自有 session 模型 | workspace 生命周期、templates、activity logs、拓扑感知协作 |
|
||||
| **NemoClaw** | `feat/nemoclaw-t4-docker` 分支 WIP | NVIDIA 方向 runtime 路线 | 计划并入同一抽象层,但当前还不是 `main` 已合并能力 |
|
||||
|
||||
@ -183,10 +181,9 @@ Molecule AI 并不是要替代下面这些 framework,而是把它们纳入更
|
||||
|
||||
## `main` 分支已经具备什么
|
||||
|
||||
### Canvas(v4)
|
||||
### Canvas
|
||||
|
||||
- Next.js 15 + React Flow + Zustand
|
||||
- **warm-paper 主题系统** —— light / dark / 跟随系统;SSR cookie + nonce'd boot 脚本 + ThemeProvider;终端与代码面板始终保持深色
|
||||
- drag-to-nest 团队构建
|
||||
- empty state + onboarding wizard
|
||||
- template palette
|
||||
@ -195,9 +192,8 @@ Molecule AI 并不是要替代下面这些 framework,而是把它们纳入更
|
||||
|
||||
### Platform
|
||||
|
||||
- Go 1.25 / Gin control plane(80+ HTTP 端点 + Gorilla WebSocket fanout)
|
||||
- workspace CRUD 和 provisioning(可插拔 Provisioner —— 本地 Docker、生产 EC2 + SSM)
|
||||
- **A2A 响应路径已收敛为类型化的判别联合(RFC #2967)** —— 冻结 dataclass + 全量 parser;100% 单元测试 + 对抗性 fuzz 覆盖
|
||||
- Go/Gin control plane
|
||||
- workspace CRUD 和 provisioning
|
||||
- registry 与 heartbeat
|
||||
- 浏览器安全的 A2A proxy
|
||||
- team expansion/collapse
|
||||
@ -207,10 +203,10 @@ Molecule AI 并不是要替代下面这些 framework,而是把它们纳入更
|
||||
|
||||
### Runtime
|
||||
|
||||
- 统一 `workspace/` 镜像;生产环境采用 thin AMI(us-east-2)
|
||||
- adapter 驱动执行,覆盖 **8 个 runtime**(Claude Code、Hermes、Gemini CLI、LangGraph、DeepAgents、CrewAI、AutoGen、OpenClaw)
|
||||
- 统一 `workspace/` 镜像
|
||||
- adapter 驱动执行
|
||||
- Agent Card 注册
|
||||
- awareness-backed memory;**Memory v2 由 pgvector 支撑**语义召回
|
||||
- awareness-backed memory
|
||||
- plugin 挂载共享 rules/skills
|
||||
- 本地 skills 热加载
|
||||
- coordinator-only delegation 路径
|
||||
@ -224,21 +220,6 @@ Molecule AI 并不是要替代下面这些 framework,而是把它们纳入更
|
||||
- runtime tiers
|
||||
- 终端与文件层面的 workspace 直接排障
|
||||
|
||||
### SaaS(由 [`molecule-controlplane`](https://git.moleculesai.app/molecule-ai/molecule-controlplane) 提供)
|
||||
|
||||
- 多租户运行在 AWS EC2 + Neon(每租户一个 Postgres branch)+ Cloudflare Tunnels(每租户一条隧道,对外不开任何端口)
|
||||
- WorkOS AuthKit + Stripe Checkout + Customer Portal
|
||||
- AWS KMS 信封加密(DB / Redis 连接串);AWS Secrets Manager 负责租户 bootstrap
|
||||
- `tenant_resources` 审计表 + 30 分钟 boot-event-aware reconciler —— 每个 CF / AWS lifecycle 事件都有记录,每 30 分钟比对 claim 与实际状态
|
||||
|
||||
### 在 Claude Code 里直接接入(由 [`molecule-mcp-claude-channel`](https://git.moleculesai.app/molecule-ai/molecule-mcp-claude-channel) 提供)
|
||||
|
||||
- 把 Molecule A2A 流量桥接到本地 Claude Code 会话的 MCP 插件
|
||||
- 订阅一个或多个 workspace;peer 的消息会以 user-turn 出现,回复会经 Molecule A2A 路由出去
|
||||
- 无需公网隧道、无需公开端点 —— 插件启动时自动把每个 watched workspace 注册成 `delivery_mode=poll`,长轮询 `/activity?since_id=…`
|
||||
- 多租户友好:单次安装即可同时 watch 跨多个 Molecule 租户的 workspace(`MOLECULE_PLATFORM_URLS` 按 workspace 配置)
|
||||
- 通过标准 marketplace 流程安装:`/plugin marketplace add Molecule-AI/molecule-mcp-claude-channel` → `/plugin install molecule-channel@molecule-mcp-claude-channel`
|
||||
|
||||
## 适合什么团队
|
||||
|
||||
Molecule AI 特别适合下面这些场景:
|
||||
@ -251,23 +232,17 @@ Molecule AI 特别适合下面这些场景:
|
||||
## 架构总览
|
||||
|
||||
```text
|
||||
Canvas (Next.js 15, warm-paper :3000) <--HTTP / WS--> Platform (Go 1.25 :8080) <---> Postgres + Redis
|
||||
| |
|
||||
| +--> Provisioner: Docker (本地) / EC2 + SSM (生产)
|
||||
| +--> bundles · templates · secrets · KMS
|
||||
Canvas (Next.js :3000) <--HTTP / WS--> Platform (Go :8080) <---> Postgres + Redis
|
||||
| |
|
||||
| +--> Docker provisioner / bundles / templates / secrets
|
||||
|
|
||||
+------------------------- 展示 ------------------------> workspaces, teams, tasks, traces, events
|
||||
+-------------------- 展示 --------------------> workspaces, teams, tasks, traces, events
|
||||
|
||||
Workspace Runtime (Python ≥3.11,含 adapter 集合的镜像)
|
||||
- 8 个 adapter: LangGraph / DeepAgents / Claude Code / CrewAI / AutoGen / Hermes / Gemini CLI / OpenClaw
|
||||
- Agent Card + A2A server(typed-SSOT 响应路径,RFC #2967)
|
||||
- heartbeat + activity + awareness-backed memory(Memory v2 —— pgvector 语义召回)
|
||||
Workspace Runtime (Python image with adapters)
|
||||
- LangGraph / DeepAgents / Claude Code / CrewAI / AutoGen / OpenClaw
|
||||
- Agent Card + A2A server
|
||||
- heartbeat + activity + awareness-backed memory
|
||||
- skills + plugins + hot reload
|
||||
|
||||
SaaS Control Plane (molecule-controlplane,私有)
|
||||
- 每租户 EC2 + Neon (Postgres branch) + Cloudflare Tunnel
|
||||
- WorkOS · Stripe · KMS · AWS Secrets Manager
|
||||
- tenant_resources 审计 + 30 分钟 reconciler
|
||||
```
|
||||
|
||||
## 快速开始
|
||||
@ -321,11 +296,7 @@ npm run dev
|
||||
|
||||
## 当前范围说明
|
||||
|
||||
当前 `main` 已经包含核心平台、Canvas v4(warm-paper 主题)、Memory v2(pgvector 语义召回)、typed-SSOT A2A 响应路径(RFC #2967)、**8 个正式 adapter**(Claude Code、Hermes、Gemini CLI、LangGraph、DeepAgents、CrewAI、AutoGen、OpenClaw)、skill lifecycle,以及主要运维面。
|
||||
|
||||
配套的私有仓库 [`molecule-controlplane`](https://git.moleculesai.app/molecule-ai/molecule-controlplane) 提供 SaaS 层 —— 多租户编排(EC2 + Neon + Cloudflare Tunnels)、KMS 信封加密、WorkOS 鉴权、Stripe 计费,以及 `tenant_resources` 审计表加 30 分钟 reconciler。
|
||||
|
||||
像 **NemoClaw** 这样的相邻 runtime 路线仍然属于分支级工作,只有合并后才会进入正式支持列表,这里会明确区分。
|
||||
当前 `main` 已经包含核心平台、Canvas、memory model、6 个正式 adapter、skill lifecycle 和主要运维面。像 **NemoClaw** 这样的相邻 runtime 路线仍然属于分支级工作,只有合并后才会进入正式支持列表,这里会明确区分。
|
||||
|
||||
## License
|
||||
|
||||
|
||||
@ -1,10 +0,0 @@
|
||||
# Excluded from `docker build` context. Without this, the COPY . . step in
|
||||
# canvas/Dockerfile clobbers the freshly-installed node_modules with the
|
||||
# host's (potentially broken / wrong-arch) copy — the @tailwindcss/oxide
|
||||
# native binary disagreed and broke `next build`.
|
||||
node_modules
|
||||
.next
|
||||
.git
|
||||
*.log
|
||||
.env*
|
||||
!.env.example
|
||||
@ -1,11 +1,7 @@
|
||||
FROM node:22-alpine AS builder
|
||||
WORKDIR /app
|
||||
COPY package.json package-lock.json* ./
|
||||
# `npm ci` (not `install`) for lockfile-exact reproducibility.
|
||||
# `--include=optional` ensures the platform-specific @tailwindcss/oxide
|
||||
# native binary lands — without it, postcss fails with "Cannot read
|
||||
# properties of undefined (reading 'All')" at build time.
|
||||
RUN npm ci --include=optional
|
||||
RUN npm install
|
||||
COPY . .
|
||||
ARG NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080
|
||||
ARG NEXT_PUBLIC_WS_URL=ws://localhost:8080/ws
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
{
|
||||
"name": "molecule-monorepo-canvas",
|
||||
"name": "molecule-core-canvas",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"scripts": {
|
||||
|
||||
@ -41,7 +41,7 @@ export default function PricingPage() {
|
||||
<p className="mt-2 text-ink-mid">
|
||||
We publish the{" "}
|
||||
<a
|
||||
href="https://git.moleculesai.app/molecule-ai/molecule-monorepo"
|
||||
href="https://git.moleculesai.app/molecule-ai/molecule-core"
|
||||
className="text-accent underline hover:text-accent"
|
||||
>
|
||||
full source on GitHub
|
||||
|
||||
@ -13,6 +13,7 @@ import { AttachmentPreview } from "./chat/AttachmentPreview";
|
||||
import { extractFilesFromTask } from "./chat/message-parser";
|
||||
import { AgentCommsPanel } from "./chat/AgentCommsPanel";
|
||||
import { appendActivityLine } from "./chat/activityLog";
|
||||
import { activityRowToMessages, type ActivityRowForHydration } from "./chat/historyHydration";
|
||||
import { runtimeDisplayName } from "@/lib/runtime-names";
|
||||
import { ConfirmDialog } from "@/components/ConfirmDialog";
|
||||
|
||||
@ -49,12 +50,38 @@ interface A2AResponse {
|
||||
};
|
||||
}
|
||||
|
||||
// Internal-self-message filtering moved server-side in RFC #2945
|
||||
// PR-C/D — the platform's /chat-history endpoint applies the
|
||||
// IsInternalSelfMessage predicate before returning rows, so the
|
||||
// client no longer needs the local backstop on the history path.
|
||||
// The proper fix is still X-Workspace-ID header (source_id=workspace_id);
|
||||
// the platform-side prefix filter handles the residual cases.
|
||||
/** Detect activity-log rows that the workspace's own runtime fired
|
||||
* against itself but were misclassified as canvas-source. The proper
|
||||
* fix is the X-Workspace-ID header from `self_source_headers()` in
|
||||
* workspace/platform_auth.py, which makes the platform record
|
||||
* source_id = workspace_id. But three failure modes still leak a
|
||||
* self-message into "My Chat":
|
||||
*
|
||||
* 1. Historical rows already in the DB with source_id=NULL.
|
||||
* 2. Workspace containers running pre-fix heartbeat.py / main.py
|
||||
* (the fix only takes effect after an image rebuild + redeploy).
|
||||
* 3. Future internal triggers added without the helper.
|
||||
*
|
||||
* This client-side filter recognises the heartbeat trigger by its
|
||||
* exact prefix — the heartbeat assembles
|
||||
*
|
||||
* "Delegation results are ready. Review them and take appropriate
|
||||
* action:\n" + summary_lines + report_instruction
|
||||
*
|
||||
* in workspace/heartbeat.py. The prefix is template-fixed so a
|
||||
* string match is reliable. If the heartbeat copy ever changes,
|
||||
* update this constant in the same commit.
|
||||
*
|
||||
* This is a backstop, not the primary defence — the X-Workspace-ID
|
||||
* header is. Filtering content is fragile to copy edits, so keep
|
||||
* the list narrow. */
|
||||
const INTERNAL_SELF_MESSAGE_PREFIXES = [
|
||||
"Delegation results are ready. Review them and take appropriate action",
|
||||
];
|
||||
|
||||
function isInternalSelfMessage(text: string): boolean {
|
||||
return INTERNAL_SELF_MESSAGE_PREFIXES.some((p) => text.startsWith(p));
|
||||
}
|
||||
|
||||
// extractReplyText pulls the agent's text reply out of an A2A response.
|
||||
// Concatenates ALL text parts (joined with "\n") rather than returning
|
||||
@ -107,19 +134,8 @@ const INITIAL_HISTORY_LIMIT = 10;
|
||||
const OLDER_HISTORY_BATCH = 20;
|
||||
|
||||
/**
|
||||
* Load chat history from the platform's typed /chat-history endpoint.
|
||||
*
|
||||
* Server-side rendering of activity_logs rows into ChatMessage shape
|
||||
* lives in workspace-server/internal/messagestore/postgres_store.go
|
||||
* (RFC #2945 PR-C/D). The server already applies the canvas-source
|
||||
* filter, the internal-self-message predicate, the role decision
|
||||
* (status=error vs agent-error prefix → system), and the v0/v1
|
||||
* file-shape extraction. Canvas just renders what it receives.
|
||||
*
|
||||
* Wire shape (mirrors ChatMessage exactly, no per-row mapping needed):
|
||||
*
|
||||
* GET /workspaces/:id/chat-history?limit=N&before_ts=T
|
||||
* 200 → {"messages": ChatMessage[], "reached_end": boolean}
|
||||
* Load chat history from the activity_logs database via the platform API.
|
||||
* Uses source=canvas to only get user-initiated messages (not agent-to-agent).
|
||||
*
|
||||
* Pagination:
|
||||
* - Pass `limit` to bound the page size (newest-first from server).
|
||||
@ -127,10 +143,10 @@ const OLDER_HISTORY_BATCH = 20;
|
||||
* timestamp. Combined with limit, this yields the next-older page
|
||||
* when scrolling backward through history.
|
||||
*
|
||||
* `reachedEnd` is propagated from the server. The server computes it
|
||||
* by comparing rowCount vs limit so a partial last page is correctly
|
||||
* detected even when the row→bubble fan-out is non-1:1 (each row
|
||||
* produces 1-2 bubbles).
|
||||
* `reachedEnd` is true when the server returned fewer rows than asked
|
||||
* for — caller uses this to disable further older-batch fetches.
|
||||
* (Counts row-level returns, not chat-bubble count: each row may
|
||||
* produce 1-2 bubbles.)
|
||||
*/
|
||||
async function loadMessagesFromDB(
|
||||
workspaceId: string,
|
||||
@ -138,23 +154,25 @@ async function loadMessagesFromDB(
|
||||
beforeTs?: string,
|
||||
): Promise<{ messages: ChatMessage[]; error: string | null; reachedEnd: boolean }> {
|
||||
try {
|
||||
const params = new URLSearchParams({ limit: String(limit) });
|
||||
const params = new URLSearchParams({
|
||||
type: "a2a_receive",
|
||||
source: "canvas",
|
||||
limit: String(limit),
|
||||
});
|
||||
if (beforeTs) params.set("before_ts", beforeTs);
|
||||
const resp = await api.get<{ messages: ChatMessage[]; reached_end: boolean }>(
|
||||
`/workspaces/${workspaceId}/chat-history?${params.toString()}`,
|
||||
const activities = await api.get<ActivityRowForHydration[]>(
|
||||
`/workspaces/${workspaceId}/activity?${params.toString()}`,
|
||||
);
|
||||
|
||||
// Server emits oldest-first within the page (RFC #2945 PR-C-2
|
||||
// post-fix: server reverses row-aware before returning so the
|
||||
// wire is display-ready). Canvas appends/prepends without
|
||||
// reordering — this avoids the pair-flip bug a naive flat
|
||||
// reverse causes when each row produces a (user, agent) pair
|
||||
// with the same timestamp.
|
||||
return {
|
||||
messages: resp.messages ?? [],
|
||||
error: null,
|
||||
reachedEnd: resp.reached_end,
|
||||
};
|
||||
const messages: ChatMessage[] = [];
|
||||
// Activities are newest-first, reverse for chronological order.
|
||||
// Per-row mapping lives in chat/historyHydration.ts so it can be
|
||||
// unit-tested without spinning up the full ChatTab component
|
||||
// (regression cover for the timestamp-collapse bug).
|
||||
for (const a of [...activities].reverse()) {
|
||||
messages.push(...activityRowToMessages(a, isInternalSelfMessage));
|
||||
}
|
||||
return { messages, error: null, reachedEnd: activities.length < limit };
|
||||
} catch (err) {
|
||||
return {
|
||||
messages: [],
|
||||
|
||||
@ -21,39 +21,20 @@ interface Props {
|
||||
// --- Agent Card Section ---
|
||||
|
||||
function AgentCardSection({ workspaceId }: { workspaceId: string }) {
|
||||
// Initial card value comes from the canvas store — node.data.agentCard
|
||||
// is hydrated by the platform stream when the workspace appears in the
|
||||
// graph, so reading it here avoids a duplicate `GET /workspaces/${id}`
|
||||
// (the parent ConfigTab.loadConfig already fetches workspace metadata,
|
||||
// and refetching here adds a serialised RTT to the panel-open path —
|
||||
// contributed to the ~20s detail-panel load reported in core#11).
|
||||
// Local state still tracks the edited/saved value so the editor flow
|
||||
// is unchanged.
|
||||
const storeCard = useCanvasStore((s) => {
|
||||
// Defensive against test mocks that omit `nodes` (some test files
|
||||
// stub the store with a minimal shape). In production `nodes` is
|
||||
// always an array — empty or not — so the optional chaining only
|
||||
// matters for the test path.
|
||||
const node = s.nodes?.find?.((n) => n.id === workspaceId);
|
||||
return (node?.data.agentCard as
|
||||
| Record<string, unknown>
|
||||
| null
|
||||
| undefined) ?? null;
|
||||
});
|
||||
const [card, setCard] = useState<Record<string, unknown> | null>(storeCard);
|
||||
const [card, setCard] = useState<Record<string, unknown> | null>(null);
|
||||
const [loading, setLoading] = useState(true);
|
||||
const [editing, setEditing] = useState(false);
|
||||
const [draft, setDraft] = useState("");
|
||||
const [saving, setSaving] = useState(false);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
const [success, setSuccess] = useState(false);
|
||||
|
||||
// If the store updates while this section is mounted (another tab
|
||||
// pushed an update via the platform event stream), reflect that —
|
||||
// unless the user is mid-edit, in which case we don't clobber their
|
||||
// unsaved draft.
|
||||
useEffect(() => {
|
||||
if (!editing) setCard(storeCard);
|
||||
}, [storeCard, editing]);
|
||||
api.get<Record<string, unknown>>(`/workspaces/${workspaceId}`)
|
||||
.then((ws) => setCard((ws.agent_card as Record<string, unknown>) || null))
|
||||
.catch(() => {})
|
||||
.finally(() => setLoading(false));
|
||||
}, [workspaceId]);
|
||||
|
||||
const handleSave = async () => {
|
||||
setError(null);
|
||||
@ -72,7 +53,9 @@ function AgentCardSection({ workspaceId }: { workspaceId: string }) {
|
||||
|
||||
return (
|
||||
<Section title="Agent Card" defaultOpen={false}>
|
||||
{editing ? (
|
||||
{loading ? (
|
||||
<div className="text-[10px] text-ink-soft">Loading...</div>
|
||||
) : editing ? (
|
||||
<div className="space-y-2">
|
||||
<textarea
|
||||
aria-label="Agent card JSON editor"
|
||||
@ -238,51 +221,47 @@ export function ConfigTab({ workspaceId }: Props) {
|
||||
setLoading(true);
|
||||
setError(null);
|
||||
|
||||
// Load workspace metadata (runtime + model + provider) in parallel.
|
||||
// These are independent GETs against three workspace-server endpoints
|
||||
// and used to be awaited serially — for SaaS workspaces each call
|
||||
// round-trips through an EIC SSH tunnel, so the previous serial
|
||||
// pattern stacked 3-5s of tunnel-setup latency per call (core#11).
|
||||
// Promise.all overlaps them; the per-call cost stays the same but
|
||||
// wall time drops to max() instead of sum().
|
||||
//
|
||||
// Each leg has its own .catch handler that yields a sentinel value,
|
||||
// matching the previous semantics:
|
||||
// - /workspaces/${id}: required source-of-truth for runtime+tier;
|
||||
// fall back to YAML if the GET fails (rare, network-class only).
|
||||
// - /workspaces/${id}/model: non-fatal; empty model lets the form
|
||||
// fall through to YAML runtime_config.model.
|
||||
// - /workspaces/${id}/provider: non-fatal; old workspace-servers
|
||||
// return 404, in which case provider="" and Save skips the PUT.
|
||||
//
|
||||
// See GH #1894 for the workspace-row-as-source-of-truth rationale
|
||||
// that motivated splitting from a single config.yaml read.
|
||||
const [wsRes, modelRes, providerRes] = await Promise.all([
|
||||
api.get<{ runtime?: string; tier?: number }>(`/workspaces/${workspaceId}`)
|
||||
.catch(() => ({} as { runtime?: string; tier?: number })),
|
||||
api.get<{ model?: string }>(`/workspaces/${workspaceId}/model`)
|
||||
.catch(() => ({} as { model?: string })),
|
||||
api.get<{ provider?: string }>(`/workspaces/${workspaceId}/provider`)
|
||||
.catch(() => null),
|
||||
]);
|
||||
const wsMetadataRuntime = (wsRes.runtime || "").trim();
|
||||
const wsMetadataModel = (modelRes.model || "").trim();
|
||||
const wsMetadataTier: number | null =
|
||||
typeof wsRes.tier === "number" ? wsRes.tier : null;
|
||||
if (providerRes !== null) {
|
||||
const loadedProvider = (providerRes.provider || "").trim();
|
||||
setProvider(loadedProvider);
|
||||
setOriginalProvider(loadedProvider);
|
||||
} else {
|
||||
setProvider("");
|
||||
setOriginalProvider("");
|
||||
}
|
||||
// ALWAYS load workspace metadata first (runtime + model). These are the
|
||||
// source of truth regardless of whether the runtime uses our config.yaml
|
||||
// template. Without this the form falls back to empty/default values on
|
||||
// a hermes workspace (which doesn't use our template), creating the
|
||||
// appearance that the saved runtime is unset — and worse, clicking Save
|
||||
// would silently flip `runtime` from `hermes` back to the dropdown
|
||||
// default `LangGraph`. See GH #1894.
|
||||
let wsMetadataRuntime = "";
|
||||
let wsMetadataModel = "";
|
||||
let wsMetadataTier: number | null = null;
|
||||
try {
|
||||
const ws = await api.get<{ runtime?: string; tier?: number }>(`/workspaces/${workspaceId}`);
|
||||
wsMetadataRuntime = (ws.runtime || "").trim();
|
||||
if (typeof ws.tier === "number") wsMetadataTier = ws.tier;
|
||||
} catch { /* fall back to config.yaml */ }
|
||||
try {
|
||||
const m = await api.get<{ model?: string }>(`/workspaces/${workspaceId}/model`);
|
||||
wsMetadataModel = (m.model || "").trim();
|
||||
} catch { /* non-fatal */ }
|
||||
// originalModel is set further down once the YAML has been parsed —
|
||||
// we want it to reflect what the form ACTUALLY rendered, which may
|
||||
// be the YAML's runtime_config.model fallback when MODEL_PROVIDER
|
||||
// is empty. Setting it here from wsMetadataModel alone would be
|
||||
// wrong for hermes/pre-#240 workspaces.
|
||||
|
||||
// Load explicit provider override (Option B PR-5). Endpoint returns
|
||||
// {provider: "", source: "default"} when no override is set, so the
|
||||
// empty string is the legitimate "auto-derive" signal — don't treat
|
||||
// it as a load error. Non-fatal: an older workspace-server that
|
||||
// predates PR-2 returns 404 here; the form falls back to "" and
|
||||
// Save just won't PUT the provider field.
|
||||
try {
|
||||
const p = await api.get<{ provider?: string }>(`/workspaces/${workspaceId}/provider`);
|
||||
const loadedProvider = (p.provider || "").trim();
|
||||
setProvider(loadedProvider);
|
||||
setOriginalProvider(loadedProvider);
|
||||
} catch {
|
||||
setProvider("");
|
||||
setOriginalProvider("");
|
||||
}
|
||||
|
||||
// Skip the config.yaml fetch entirely for runtimes that manage
|
||||
// their own config (external, hermes, etc.) — they don't have a
|
||||
// platform-side template, so the GET would 404. The catch block
|
||||
|
||||
@ -1,11 +1,13 @@
|
||||
// @vitest-environment jsdom
|
||||
//
|
||||
// Pins the lazy-loading chat-history pagination.
|
||||
// Pins the lazy-loading chat-history pagination added 2026-05-05.
|
||||
//
|
||||
// PR-C-2 (RFC #2945): canvas was migrated from /activity?type=a2a_receive
|
||||
// to /chat-history. Server now returns typed ChatMessage[] in
|
||||
// display-ready oldest-first order. These tests guard the canvas-side
|
||||
// pagination invariants against the new endpoint surface.
|
||||
// Pre-fix: ChatTab fetched the newest 50 messages on every mount and
|
||||
// scrolled to bottom, paying full DOM cost up-front even when the user
|
||||
// only wanted to read the last few bubbles. Post-fix: initial load is
|
||||
// bounded to 10 newest, and an IntersectionObserver on a top sentinel
|
||||
// triggers loadOlder() (batch of 20 with `before_ts` cursor) when the
|
||||
// user scrolls up.
|
||||
//
|
||||
// Pinned branches:
|
||||
// 1. Initial fetch carries `limit=10` and NO before_ts (newest-first
|
||||
@ -18,10 +20,11 @@
|
||||
// asserting the rendered bubble count matches the full page).
|
||||
// 4. The retry button after a failed initial load uses the same
|
||||
// INITIAL_HISTORY_LIMIT (10), not the legacy 50.
|
||||
// 5. before_ts cursor is the OLDEST timestamp from the current page,
|
||||
// passed verbatim to walk backward.
|
||||
// 6. Inflight guard rejects duplicate IO triggers while a loadOlder
|
||||
// fetch is in flight.
|
||||
//
|
||||
// IntersectionObserver / scroll-anchor restoration is exercised by the
|
||||
// E2E synth-canary suite — pinning it in jsdom would require mocking
|
||||
// the observer and faking layout, which is brittler than trusting a
|
||||
// live-DOM canary against the staging tenant.
|
||||
|
||||
import { describe, it, expect, vi, afterEach, beforeEach } from "vitest";
|
||||
import { render, screen, cleanup, waitFor, fireEvent } from "@testing-library/react";
|
||||
@ -30,31 +33,24 @@ import React from "react";
|
||||
afterEach(cleanup);
|
||||
|
||||
// Both ChatTab sub-panels (MyChat + AgentComms) mount simultaneously so
|
||||
// keyboard tab order and aria-controls land on a real DOM. MyChat's
|
||||
// loadMessagesFromDB hits /chat-history; AgentComms's polling hits a
|
||||
// different URL. Route the mock by URL so each gets a sensible default
|
||||
// and only MyChat's calls land in the assertion array.
|
||||
const myChatHistoryCalls: string[] = [];
|
||||
let myChatNextResponse:
|
||||
| { ok: true; messages: unknown[]; reachedEnd?: boolean }
|
||||
| { ok: false; err: Error } = { ok: true, messages: [] };
|
||||
|
||||
// keyboard tab order and aria-controls land on a real DOM. Both fire
|
||||
// /activity GETs on mount: MyChat's hits `type=a2a_receive&source=canvas`,
|
||||
// AgentComms's hits a different filter. Route the mock by URL so each
|
||||
// gets a sensible default and only MyChat's call is what the assertions
|
||||
// scrutinise.
|
||||
const myChatActivityCalls: string[] = [];
|
||||
let myChatNextResponse: { ok: true; rows: unknown[] } | { ok: false; err: Error } = {
|
||||
ok: true,
|
||||
rows: [],
|
||||
};
|
||||
const apiGet = vi.fn((path: string): Promise<unknown> => {
|
||||
if (path.includes("/chat-history")) {
|
||||
myChatHistoryCalls.push(path);
|
||||
if (myChatNextResponse.ok) {
|
||||
const reached_end =
|
||||
myChatNextResponse.reachedEnd !== undefined
|
||||
? myChatNextResponse.reachedEnd
|
||||
: myChatNextResponse.messages.length < 10;
|
||||
return Promise.resolve({
|
||||
messages: myChatNextResponse.messages,
|
||||
reached_end,
|
||||
});
|
||||
}
|
||||
if (path.includes("type=a2a_receive") && path.includes("source=canvas")) {
|
||||
myChatActivityCalls.push(path);
|
||||
if (myChatNextResponse.ok) return Promise.resolve(myChatNextResponse.rows);
|
||||
return Promise.reject(myChatNextResponse.err);
|
||||
}
|
||||
// AgentComms / heartbeat / anything else — empty array safe default.
|
||||
// AgentComms / heartbeat / anything else — empty array is a safe
|
||||
// default that won't blow up the corresponding component's .then().
|
||||
return Promise.resolve([]);
|
||||
});
|
||||
const apiPost = vi.fn();
|
||||
@ -88,8 +84,8 @@ const ioInstances: IOInstance[] = [];
|
||||
beforeEach(() => {
|
||||
apiGet.mockClear();
|
||||
apiPost.mockReset();
|
||||
myChatHistoryCalls.length = 0;
|
||||
myChatNextResponse = { ok: true, messages: [] };
|
||||
myChatActivityCalls.length = 0;
|
||||
myChatNextResponse = { ok: true, rows: [] };
|
||||
ioInstances.length = 0;
|
||||
class FakeIO {
|
||||
private inst: IOInstance;
|
||||
@ -105,12 +101,20 @@ beforeEach(() => {
|
||||
this.inst.disconnected = true;
|
||||
}
|
||||
}
|
||||
// Install on every reachable global — different bundlers / module
|
||||
// graphs can resolve `IntersectionObserver` via `window`, `globalThis`,
|
||||
// or the bare global. Without all three, jsdom's own (pre-existing)
|
||||
// stub silently wins and ioInstances stays empty.
|
||||
(window as unknown as { IntersectionObserver: unknown }).IntersectionObserver = FakeIO;
|
||||
(globalThis as unknown as { IntersectionObserver: unknown }).IntersectionObserver = FakeIO;
|
||||
// jsdom doesn't implement scrollIntoView; ChatTab calls it after every
|
||||
// messages update.
|
||||
Element.prototype.scrollIntoView = vi.fn();
|
||||
});
|
||||
|
||||
function triggerIntersection(instanceIdx = -1) {
|
||||
// -1 → the latest observer (the live one). Tests targeting an old
|
||||
// (disconnected) instance pass a positive index.
|
||||
const inst = ioInstances.at(instanceIdx);
|
||||
if (!inst) throw new Error(`no IO instance at ${instanceIdx}`);
|
||||
inst.callback(
|
||||
@ -121,30 +125,25 @@ function triggerIntersection(instanceIdx = -1) {
|
||||
|
||||
import { ChatTab } from "../ChatTab";
|
||||
|
||||
// makeMessagePair returns a (user, agent) pair sharing a timestamp,
|
||||
// matching the wire shape /chat-history emits per activity_logs row.
|
||||
// Server-side reverseRowChunks ensures the wire is oldest-first across
|
||||
// rows but [user, agent] within each row.
|
||||
function makeMessagePair(seq: number): unknown[] {
|
||||
// Zero-pad seq into the minute slot so seq=10 produces a valid
|
||||
// timestamp (00:10:00Z, not 00:010:00Z).
|
||||
function makeActivityRow(seq: number): Record<string, unknown> {
|
||||
// Zero-pad seq into the minute slot so "seq=10" doesn't produce
|
||||
// the invalid timestamp "00:010:00Z" (caught by the loadOlder URL
|
||||
// assertion below — first version of the helper used `0${seq}` and
|
||||
// the test failed on `before_ts` having an extra digit).
|
||||
const mm = String(seq).padStart(2, "0");
|
||||
const ts = `2026-05-05T00:${mm}:00Z`;
|
||||
return [
|
||||
{ id: `u-${seq}`, role: "user", content: `user msg ${seq}`, timestamp: ts },
|
||||
{ id: `a-${seq}`, role: "agent", content: `agent reply ${seq}`, timestamp: ts },
|
||||
];
|
||||
return {
|
||||
activity_type: "a2a_receive",
|
||||
status: "ok",
|
||||
created_at: `2026-05-05T00:${mm}:00Z`,
|
||||
request_body: { params: { message: { parts: [{ kind: "text", text: `user msg ${seq}` }] } } },
|
||||
response_body: { result: `agent reply ${seq}` },
|
||||
};
|
||||
}
|
||||
|
||||
// pageOldestFirst builds a wire-shape page (oldest-first within page)
|
||||
// of `count` row-pairs starting at seq=`start`. Mirrors the server's
|
||||
// post-reverseRowChunks emission order.
|
||||
function pageOldestFirst(start: number, count: number): unknown[] {
|
||||
const out: unknown[] = [];
|
||||
for (let i = 0; i < count; i++) {
|
||||
out.push(...makeMessagePair(start + i));
|
||||
}
|
||||
return out;
|
||||
// Server returns newest-first; the helper builds a server-shape page
|
||||
// so the order in the rendered messages array matches production.
|
||||
function newestFirstPage(start: number, count: number): unknown[] {
|
||||
return Array.from({ length: count }, (_, i) => makeActivityRow(start + count - 1 - i));
|
||||
}
|
||||
|
||||
const minimalData = {
|
||||
@ -154,30 +153,28 @@ const minimalData = {
|
||||
} as unknown as Parameters<typeof ChatTab>[0]["data"];
|
||||
|
||||
describe("ChatTab lazy history pagination", () => {
|
||||
it("initial fetch carries limit=10 (not the legacy 50) and hits /chat-history", async () => {
|
||||
myChatNextResponse = { ok: true, messages: makeMessagePair(1) };
|
||||
it("initial fetch carries limit=10 (not the legacy 50)", async () => {
|
||||
myChatNextResponse = { ok: true, rows: [makeActivityRow(1)] };
|
||||
render(<ChatTab workspaceId="ws-1" data={minimalData} />);
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
|
||||
const url = myChatHistoryCalls[0];
|
||||
expect(url).toContain("/chat-history");
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
|
||||
const url = myChatActivityCalls[0];
|
||||
expect(url).toContain("limit=10");
|
||||
expect(url).not.toContain("limit=50");
|
||||
// before_ts should NOT be set on the initial fetch — that's the
|
||||
// newest-first slice the user lands on.
|
||||
expect(url).not.toContain("before_ts");
|
||||
// /chat-history filters source-canvas server-side; client should
|
||||
// NOT pass type/source params (they belonged to /activity).
|
||||
expect(url).not.toContain("type=a2a_receive");
|
||||
expect(url).not.toContain("source=canvas");
|
||||
});
|
||||
|
||||
it("hides the top sentinel when initial fetch returns fewer than the limit", async () => {
|
||||
// 3 < 10 → server says "no more older history exists"; sentinel
|
||||
// should NOT mount and the "Loading older messages…" line should
|
||||
// never appear.
|
||||
myChatNextResponse = { ok: true, messages: pageOldestFirst(1, 3) };
|
||||
// never appear (it can't, since the sentinel is what triggers it).
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
rows: [makeActivityRow(1), makeActivityRow(2), makeActivityRow(3)],
|
||||
};
|
||||
render(<ChatTab workspaceId="ws-2" data={minimalData} />);
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
|
||||
await waitFor(() => {
|
||||
expect(screen.queryByText(/Loading chat history/i)).toBeNull();
|
||||
});
|
||||
@ -185,15 +182,15 @@ describe("ChatTab lazy history pagination", () => {
|
||||
});
|
||||
|
||||
it("renders all messages when initial fetch returns exactly the limit", async () => {
|
||||
// limit=10 row-pairs → 20 ChatMessages. reachedEnd should be FALSE
|
||||
// so the sentinel mounts. Verified by bubble counts.
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
messages: pageOldestFirst(1, 10),
|
||||
reachedEnd: false,
|
||||
};
|
||||
// 10 == limit → server might have more older rows; sentinel SHOULD
|
||||
// mount so the IO observer can fire loadOlder() on scroll-up. We
|
||||
// verify by checking the rendered bubble count — if hasMore stayed
|
||||
// true the sentinel render path doesn't crash and all 10 rows
|
||||
// produced their pair of bubbles.
|
||||
const fullPage = Array.from({ length: 10 }, (_, i) => makeActivityRow(i + 1));
|
||||
myChatNextResponse = { ok: true, rows: fullPage };
|
||||
render(<ChatTab workspaceId="ws-3" data={minimalData} />);
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
|
||||
await waitFor(() => {
|
||||
expect(screen.queryByText(/Loading chat history/i)).toBeNull();
|
||||
});
|
||||
@ -205,67 +202,54 @@ describe("ChatTab lazy history pagination", () => {
|
||||
myChatNextResponse = { ok: false, err: new Error("network down") };
|
||||
render(<ChatTab workspaceId="ws-4" data={minimalData} />);
|
||||
const retry = await screen.findByText(/Retry/);
|
||||
myChatNextResponse = { ok: true, messages: makeMessagePair(1) };
|
||||
myChatNextResponse = { ok: true, rows: [makeActivityRow(1)] };
|
||||
fireEvent.click(retry);
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(2));
|
||||
const retryUrl = myChatHistoryCalls[1];
|
||||
expect(retryUrl).toContain("/chat-history");
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(2));
|
||||
const retryUrl = myChatActivityCalls[1];
|
||||
expect(retryUrl).toContain("limit=10");
|
||||
expect(retryUrl).not.toContain("limit=50");
|
||||
});
|
||||
|
||||
it("loadOlder fetches limit=20 with before_ts=oldest.timestamp", async () => {
|
||||
// Initial page = 10 row-pairs in oldest-first order (seq 1..10).
|
||||
// The oldest (and so the cursor for loadOlder) is seq=1's
|
||||
// timestamp 2026-05-05T00:01:00Z.
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
messages: pageOldestFirst(1, 10),
|
||||
reachedEnd: false,
|
||||
};
|
||||
// Initial page = 10 rows in newest-first order (seq 10..1). After
|
||||
// the component reverses to oldest-first for display, messages[0]
|
||||
// is built from seq=1 — the oldest — and its timestamp is what
|
||||
// before_ts should carry.
|
||||
myChatNextResponse = { ok: true, rows: newestFirstPage(1, 10) };
|
||||
render(<ChatTab workspaceId="ws-load-older" data={minimalData} />);
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
|
||||
await waitFor(() => expect(ioInstances.length).toBeGreaterThan(0));
|
||||
|
||||
// Stage older-batch response, then fire IO callback.
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
messages: pageOldestFirst(0, 1),
|
||||
reachedEnd: true,
|
||||
};
|
||||
// Stage the older-batch response, then fire the IO callback.
|
||||
myChatNextResponse = { ok: true, rows: newestFirstPage(0, 1) };
|
||||
triggerIntersection();
|
||||
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(2));
|
||||
const olderUrl = myChatHistoryCalls[1];
|
||||
expect(olderUrl).toContain("/chat-history");
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(2));
|
||||
const olderUrl = myChatActivityCalls[1];
|
||||
expect(olderUrl).toContain("limit=20");
|
||||
expect(olderUrl).toContain("before_ts=");
|
||||
expect(decodeURIComponent(olderUrl)).toContain("before_ts=2026-05-05T00:01:00Z");
|
||||
});
|
||||
|
||||
it("inflight guard rejects a second IO trigger while first loadOlder is in flight", async () => {
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
messages: pageOldestFirst(1, 10),
|
||||
reachedEnd: false,
|
||||
};
|
||||
myChatNextResponse = { ok: true, rows: newestFirstPage(1, 10) };
|
||||
render(<ChatTab workspaceId="ws-inflight" data={minimalData} />);
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
|
||||
await waitFor(() => expect(ioInstances.length).toBeGreaterThan(0));
|
||||
|
||||
// Hold the next loadOlder fetch open with a manual deferred so we
|
||||
// can fire the second trigger while the first is in-flight.
|
||||
let release!: (resp: unknown) => void;
|
||||
const deferred = new Promise<unknown>((res) => {
|
||||
let release!: (rows: unknown[]) => void;
|
||||
const deferred = new Promise<unknown[]>((res) => {
|
||||
release = res;
|
||||
});
|
||||
apiGet.mockImplementationOnce((path: string): Promise<unknown> => {
|
||||
myChatHistoryCalls.push(path);
|
||||
myChatActivityCalls.push(path);
|
||||
return deferred;
|
||||
});
|
||||
|
||||
triggerIntersection(); // start loadOlder #1
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(2));
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(2));
|
||||
|
||||
// Second IO trigger lands while #1 is still pending.
|
||||
triggerIntersection();
|
||||
@ -274,62 +258,79 @@ describe("ChatTab lazy history pagination", () => {
|
||||
// Without the inflight guard, each of these would have started a
|
||||
// new fetch. With the guard, none of them do — call count stays 2.
|
||||
await new Promise((r) => setTimeout(r, 10));
|
||||
expect(myChatHistoryCalls.length).toBe(2);
|
||||
expect(myChatActivityCalls.length).toBe(2);
|
||||
|
||||
// Release the first fetch with a valid wire response shape.
|
||||
release({ messages: [], reached_end: true });
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(2));
|
||||
// Release the first fetch. Inflight clears in the finally block;
|
||||
// a subsequent IO trigger is permitted again (verified by checking
|
||||
// we can fire a follow-up after release without hanging the test).
|
||||
release([]);
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(2));
|
||||
});
|
||||
|
||||
it("empty older response clears the scroll anchor and unmounts the sentinel", async () => {
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
messages: pageOldestFirst(1, 10),
|
||||
reachedEnd: false,
|
||||
};
|
||||
// The bug we're pinning: if loadOlder returns 0 rows, the
|
||||
// scrollAnchorRef must be cleared so the next paint doesn't try to
|
||||
// restore against a no-op prepend (which would fight the natural
|
||||
// bottom-pin for any subsequent live message). hasMore flipping to
|
||||
// false is the same flag-flip path; sentinel disappearing is the
|
||||
// observable proxy.
|
||||
myChatNextResponse = { ok: true, rows: newestFirstPage(1, 10) };
|
||||
render(<ChatTab workspaceId="ws-anchor" data={minimalData} />);
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
|
||||
await waitFor(() => expect(ioInstances.length).toBeGreaterThan(0));
|
||||
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
messages: [],
|
||||
reachedEnd: true,
|
||||
};
|
||||
myChatNextResponse = { ok: true, rows: [] }; // empty → reachedEnd
|
||||
triggerIntersection();
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(2));
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(2));
|
||||
|
||||
// After reachedEnd the sentinel unmounts (hasMore=false). We can't
|
||||
// peek scrollAnchorRef directly, but we can assert the consequence:
|
||||
// scrollIntoView (the bottom-pin for live appends) is not blocked
|
||||
// by a stale anchor. Trigger a re-render via an unrelated state
|
||||
// change… in practice the safest assertion here is that the
|
||||
// sentinel disappeared (proving the empty response propagated to
|
||||
// hasMore correctly, which is the same flag-flip path as anchor
|
||||
// clearing).
|
||||
await waitFor(() => {
|
||||
expect(screen.queryByText(/Loading older messages/i)).toBeNull();
|
||||
});
|
||||
});
|
||||
|
||||
it("IntersectionObserver does not churn when older messages prepend", async () => {
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
messages: pageOldestFirst(1, 10),
|
||||
reachedEnd: false,
|
||||
};
|
||||
// Whole-PR perf invariant: prepending older history (the load-bearing
|
||||
// user gesture) must NOT tear down + re-arm the IO observer.
|
||||
// Triggering loadOlder is the cleanest way to drive a messages
|
||||
// mutation from inside the test, since live agent push goes through
|
||||
// a Zustand store that's harder to drive reliably from jsdom.
|
||||
//
|
||||
// Pre-fix, loadOlder depended on `messages`, so every prepend
|
||||
// recreated loadOlder → re-ran the IO effect → new observer. Each
|
||||
// call to triggerIntersection() produced a fresh disconnected
|
||||
// observer + a new live one. Post-fix, the observer survives.
|
||||
myChatNextResponse = { ok: true, rows: newestFirstPage(1, 10) };
|
||||
render(<ChatTab workspaceId="ws-stable-io" data={minimalData} />);
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(1));
|
||||
await waitFor(() => expect(myChatActivityCalls.length).toBe(1));
|
||||
await waitFor(() => expect(ioInstances.length).toBeGreaterThan(0));
|
||||
|
||||
// Snapshot the observer instance after first paint stabilises.
|
||||
const observerBefore = ioInstances.at(-1);
|
||||
expect(observerBefore).toBeDefined();
|
||||
expect(observerBefore!.disconnected).toBe(false);
|
||||
|
||||
// Trigger three older-batch prepends. Each batch returns the full
|
||||
// OLDER_HISTORY_BATCH (20 row-pairs = 40 messages) so reachedEnd
|
||||
// stays false and the sentinel keeps mounting.
|
||||
// OLDER_HISTORY_BATCH (20 rows) so reachedEnd stays false and the
|
||||
// sentinel keeps mounting. Pre-fix, each prepend mutated `messages`
|
||||
// → recreated loadOlder → re-ran the IO effect → new observer.
|
||||
for (let batch = 0; batch < 3; batch++) {
|
||||
myChatNextResponse = {
|
||||
ok: true,
|
||||
messages: pageOldestFirst(-(batch + 1) * 20, 20),
|
||||
reachedEnd: false,
|
||||
rows: newestFirstPage(-(batch + 1) * 20, 20),
|
||||
};
|
||||
const callsBefore = myChatHistoryCalls.length;
|
||||
const callsBefore = myChatActivityCalls.length;
|
||||
triggerIntersection();
|
||||
await waitFor(() => expect(myChatHistoryCalls.length).toBe(callsBefore + 1));
|
||||
await waitFor(() =>
|
||||
expect(myChatActivityCalls.length).toBe(callsBefore + 1),
|
||||
);
|
||||
}
|
||||
|
||||
// The original observer is still the live one — no churn.
|
||||
|
||||
@ -9,7 +9,6 @@
|
||||
// AttachmentLightbox).
|
||||
|
||||
import { useState, useEffect, useRef } from "react";
|
||||
import { platformAuthHeaders } from "@/lib/api";
|
||||
import type { ChatAttachment } from "./types";
|
||||
import { isPlatformAttachment, resolveAttachmentHref } from "./uploads";
|
||||
import { AttachmentChip } from "./AttachmentViews";
|
||||
@ -44,8 +43,13 @@ export function AttachmentAudio({ workspaceId, attachment, onDownload, tone }: P
|
||||
void (async () => {
|
||||
try {
|
||||
const href = resolveAttachmentHref(workspaceId, attachment.uri);
|
||||
const headers: Record<string, string> = {};
|
||||
const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
|
||||
const slug = getTenantSlug();
|
||||
if (slug) headers["X-Molecule-Org-Slug"] = slug;
|
||||
const res = await fetch(href, {
|
||||
headers: platformAuthHeaders(),
|
||||
headers,
|
||||
credentials: "include",
|
||||
signal: AbortSignal.timeout(60_000),
|
||||
});
|
||||
@ -112,5 +116,9 @@ export function AttachmentAudio({ workspaceId, attachment, onDownload, tone }: P
|
||||
);
|
||||
}
|
||||
|
||||
// Local getTenantSlug() removed — auth-header construction now goes
|
||||
// through platformAuthHeaders() from @/lib/api (#178).
|
||||
function getTenantSlug(): string | null {
|
||||
if (typeof window === "undefined") return null;
|
||||
const host = window.location.hostname;
|
||||
const m = host.match(/^([^.]+)\.moleculesai\.app$/);
|
||||
return m ? m[1] : null;
|
||||
}
|
||||
|
||||
@ -35,7 +35,6 @@
|
||||
// downscale via canvas, but defer that to v2.
|
||||
|
||||
import { useState, useEffect, useRef } from "react";
|
||||
import { platformAuthHeaders } from "@/lib/api";
|
||||
import type { ChatAttachment } from "./types";
|
||||
import { isPlatformAttachment, resolveAttachmentHref } from "./uploads";
|
||||
import { AttachmentLightbox } from "./AttachmentLightbox";
|
||||
@ -76,14 +75,22 @@ export function AttachmentImage({ workspaceId, attachment, onDownload, tone }: P
|
||||
}
|
||||
|
||||
// Platform-auth path: identical to downloadChatFile but we keep
|
||||
// the blob (don't trigger a Save-As). Auth headers come from the
|
||||
// shared `platformAuthHeaders()` helper — one source of truth for
|
||||
// every authenticated raw fetch in the canvas (#178).
|
||||
// the blob (don't trigger a Save-As). Use the same headers it does
|
||||
// by going through it indirectly — no, downloadChatFile triggers a
|
||||
// Save-As. Need a separate fetch.
|
||||
void (async () => {
|
||||
try {
|
||||
const href = resolveAttachmentHref(workspaceId, attachment.uri);
|
||||
const headers: Record<string, string> = {};
|
||||
// Read the same env var downloadChatFile reads — single source
|
||||
// of truth would be cleaner; refactor opportunity for PR-2 if
|
||||
// we add the same path to AttachmentVideo.
|
||||
const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
|
||||
const slug = getTenantSlug();
|
||||
if (slug) headers["X-Molecule-Org-Slug"] = slug;
|
||||
const res = await fetch(href, {
|
||||
headers: platformAuthHeaders(),
|
||||
headers,
|
||||
credentials: "include",
|
||||
signal: AbortSignal.timeout(30_000),
|
||||
});
|
||||
@ -177,7 +184,15 @@ export function AttachmentImage({ workspaceId, attachment, onDownload, tone }: P
|
||||
);
|
||||
}
|
||||
|
||||
// Local getTenantSlug() removed — auth-header construction now goes
|
||||
// through platformAuthHeaders() from @/lib/api which uses the canonical
|
||||
// getTenantSlug() from @/lib/tenant. This eliminates the duplicate
|
||||
// hostname-regex + the duplicate bearer-token-attach pattern (#178).
|
||||
// Internal helper — duplicated from uploads.ts (it's not exported
|
||||
// there). Kept local so this component doesn't reach into private
|
||||
// surface; if AttachmentVideo / AttachmentPDF in PR-2/PR-3 also need
|
||||
// it, lift to an exported helper at that point (the third-caller
|
||||
// rule).
|
||||
function getTenantSlug(): string | null {
|
||||
if (typeof window === "undefined") return null;
|
||||
const host = window.location.hostname;
|
||||
// Tenant subdomain shape: <slug>.moleculesai.app
|
||||
const m = host.match(/^([^.]+)\.moleculesai\.app$/);
|
||||
return m ? m[1] : null;
|
||||
}
|
||||
|
||||
@ -33,7 +33,6 @@
|
||||
// timeout, swap to chip. Implemented as a 3-second watchdog.
|
||||
|
||||
import { useState, useEffect, useRef } from "react";
|
||||
import { platformAuthHeaders } from "@/lib/api";
|
||||
import type { ChatAttachment } from "./types";
|
||||
import { isPlatformAttachment, resolveAttachmentHref } from "./uploads";
|
||||
import { AttachmentLightbox } from "./AttachmentLightbox";
|
||||
@ -70,8 +69,13 @@ export function AttachmentPDF({ workspaceId, attachment, onDownload, tone }: Pro
|
||||
void (async () => {
|
||||
try {
|
||||
const href = resolveAttachmentHref(workspaceId, attachment.uri);
|
||||
const headers: Record<string, string> = {};
|
||||
const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
|
||||
const slug = getTenantSlug();
|
||||
if (slug) headers["X-Molecule-Org-Slug"] = slug;
|
||||
const res = await fetch(href, {
|
||||
headers: platformAuthHeaders(),
|
||||
headers,
|
||||
credentials: "include",
|
||||
signal: AbortSignal.timeout(60_000),
|
||||
});
|
||||
@ -185,5 +189,9 @@ function PdfGlyph() {
|
||||
);
|
||||
}
|
||||
|
||||
// Local getTenantSlug() removed — auth-header construction now goes
|
||||
// through platformAuthHeaders() from @/lib/api (#178).
|
||||
function getTenantSlug(): string | null {
|
||||
if (typeof window === "undefined") return null;
|
||||
const host = window.location.hostname;
|
||||
const m = host.match(/^([^.]+)\.moleculesai\.app$/);
|
||||
return m ? m[1] : null;
|
||||
}
|
||||
|
||||
@ -26,7 +26,6 @@
|
||||
// to download the full file.
|
||||
|
||||
import { useState, useEffect } from "react";
|
||||
import { platformAuthHeaders } from "@/lib/api";
|
||||
import type { ChatAttachment } from "./types";
|
||||
import { isPlatformAttachment, resolveAttachmentHref } from "./uploads";
|
||||
import { AttachmentChip } from "./AttachmentViews";
|
||||
@ -58,13 +57,13 @@ export function AttachmentTextPreview({ workspaceId, attachment, onDownload, ton
|
||||
void (async () => {
|
||||
try {
|
||||
const href = resolveAttachmentHref(workspaceId, attachment.uri);
|
||||
// Only attach platform auth headers for in-platform URIs —
|
||||
// off-platform URLs (HTTP/HTTPS attachments) MUST NOT receive
|
||||
// our bearer token (it would leak the admin token to a third
|
||||
// party). The branch is preserved with the new shared helper.
|
||||
const headers: Record<string, string> = isPlatformAttachment(attachment.uri)
|
||||
? platformAuthHeaders()
|
||||
: {};
|
||||
const headers: Record<string, string> = {};
|
||||
if (isPlatformAttachment(attachment.uri)) {
|
||||
const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
|
||||
const slug = getTenantSlug();
|
||||
if (slug) headers["X-Molecule-Org-Slug"] = slug;
|
||||
}
|
||||
const res = await fetch(href, {
|
||||
headers,
|
||||
credentials: "include",
|
||||
@ -183,5 +182,9 @@ export function AttachmentTextPreview({ workspaceId, attachment, onDownload, ton
|
||||
);
|
||||
}
|
||||
|
||||
// Local getTenantSlug() removed — auth-header construction now goes
|
||||
// through platformAuthHeaders() from @/lib/api (#178).
|
||||
function getTenantSlug(): string | null {
|
||||
if (typeof window === "undefined") return null;
|
||||
const host = window.location.hostname;
|
||||
const m = host.match(/^([^.]+)\.moleculesai\.app$/);
|
||||
return m ? m[1] : null;
|
||||
}
|
||||
|
||||
@ -25,7 +25,6 @@
|
||||
// fetch via service worker. v2 if measured-needed.
|
||||
|
||||
import { useState, useEffect, useRef } from "react";
|
||||
import { platformAuthHeaders } from "@/lib/api";
|
||||
import type { ChatAttachment } from "./types";
|
||||
import { isPlatformAttachment, resolveAttachmentHref } from "./uploads";
|
||||
import { AttachmentChip } from "./AttachmentViews";
|
||||
@ -62,8 +61,13 @@ export function AttachmentVideo({ workspaceId, attachment, onDownload, tone }: P
|
||||
void (async () => {
|
||||
try {
|
||||
const href = resolveAttachmentHref(workspaceId, attachment.uri);
|
||||
const headers: Record<string, string> = {};
|
||||
const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
|
||||
const slug = getTenantSlug();
|
||||
if (slug) headers["X-Molecule-Org-Slug"] = slug;
|
||||
const res = await fetch(href, {
|
||||
headers: platformAuthHeaders(),
|
||||
headers,
|
||||
credentials: "include",
|
||||
// Videos are larger than images on average; give the request
|
||||
// more headroom. The server's per-request body cap (50MB) is
|
||||
@ -143,5 +147,11 @@ export function AttachmentVideo({ workspaceId, attachment, onDownload, tone }: P
|
||||
);
|
||||
}
|
||||
|
||||
// Local getTenantSlug() removed — auth-header construction now goes
|
||||
// through platformAuthHeaders() from @/lib/api (#178).
|
||||
// Internal helper — same shape as AttachmentImage's. Lifted to a
|
||||
// shared util in PR-2.5 if a third caller needs it (PDF, audio).
|
||||
function getTenantSlug(): string | null {
|
||||
if (typeof window === "undefined") return null;
|
||||
const host = window.location.hostname;
|
||||
const m = host.match(/^([^.]+)\.moleculesai\.app$/);
|
||||
return m ? m[1] : null;
|
||||
}
|
||||
|
||||
@ -1,16 +1,12 @@
|
||||
import { PLATFORM_URL, platformAuthHeaders } from "@/lib/api";
|
||||
import { PLATFORM_URL } from "@/lib/api";
|
||||
import { getTenantSlug } from "@/lib/tenant";
|
||||
import type { ChatAttachment } from "./types";
|
||||
|
||||
/** Chat attachments are intentionally uploaded via a direct fetch()
|
||||
* instead of the `api.post` helper — `api.post` JSON-stringifies the
|
||||
* body, which would 500 on a Blob. Auth headers (tenant slug, admin
|
||||
* token, credentials) come from `platformAuthHeaders()` — the same
|
||||
* helper `request()` uses, so a missing bearer surfaces as a single
|
||||
* fix site instead of N copies. We deliberately do NOT set
|
||||
* Content-Type so the browser writes the multipart boundary into the
|
||||
* header; setting it manually would yield a multipart body the server
|
||||
* can't parse. See lib/api.ts platformAuthHeaders() for the full
|
||||
* rationale on why this pair must stay matched. */
|
||||
* body, which would 500 on a Blob. Mirrors the header plumbing
|
||||
* (tenant slug, admin token, credentials) so SaaS + self-hosted
|
||||
* callers work the same way. */
|
||||
export async function uploadChatFiles(
|
||||
workspaceId: string,
|
||||
files: File[],
|
||||
@ -20,12 +16,18 @@ export async function uploadChatFiles(
|
||||
const form = new FormData();
|
||||
for (const f of files) form.append("files", f, f.name);
|
||||
|
||||
const headers: Record<string, string> = {};
|
||||
const slug = getTenantSlug();
|
||||
if (slug) headers["X-Molecule-Org-Slug"] = slug;
|
||||
const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
|
||||
|
||||
// Uploads legitimately take a while on cold cache (tar write +
|
||||
// docker cp into the container). 60s is comfortable for the 25MB/
|
||||
// 50MB caps the server enforces.
|
||||
const res = await fetch(`${PLATFORM_URL}/workspaces/${workspaceId}/chat/uploads`, {
|
||||
method: "POST",
|
||||
headers: platformAuthHeaders(),
|
||||
headers,
|
||||
body: form,
|
||||
credentials: "include",
|
||||
signal: AbortSignal.timeout(60_000),
|
||||
@ -141,8 +143,14 @@ export async function downloadChatFile(
|
||||
return;
|
||||
}
|
||||
|
||||
const headers: Record<string, string> = {};
|
||||
const slug = getTenantSlug();
|
||||
if (slug) headers["X-Molecule-Org-Slug"] = slug;
|
||||
const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
|
||||
|
||||
const res = await fetch(href, {
|
||||
headers: platformAuthHeaders(),
|
||||
headers,
|
||||
credentials: "include",
|
||||
signal: AbortSignal.timeout(60_000),
|
||||
});
|
||||
|
||||
@ -1,97 +0,0 @@
|
||||
// @vitest-environment jsdom
|
||||
import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
|
||||
|
||||
// Tests for platformAuthHeaders — the shared helper extracted in #178
|
||||
// to consolidate the bearer-token-attach + tenant-slug-attach pattern
|
||||
// that was previously duplicated across 7 raw-fetch callsites in the
|
||||
// canvas (uploads + 5 Attachment* components + the api.ts request()
|
||||
// function).
|
||||
//
|
||||
// What we pin here:
|
||||
// - Returns a fresh object each call (so callers can mutate without
|
||||
// leaking into each other).
|
||||
// - Empty result on a non-tenant host with no admin token (the
|
||||
// localhost / self-hosted shape).
|
||||
// - Bearer attached when NEXT_PUBLIC_ADMIN_TOKEN is set.
|
||||
// - X-Molecule-Org-Slug attached when window.location.hostname is a
|
||||
// tenant subdomain (<slug>.moleculesai.app).
|
||||
// - Both attached when both apply (the production SaaS shape).
|
||||
//
|
||||
// Why jsdom: getTenantSlug() reads window.location.hostname. Node-only
|
||||
// environment yields no window and getTenantSlug returns null
|
||||
// unconditionally — wouldn't exercise the slug branch.
|
||||
|
||||
import { platformAuthHeaders } from "../api";
|
||||
|
||||
describe("platformAuthHeaders", () => {
|
||||
let originalAdminToken: string | undefined;
|
||||
|
||||
beforeEach(() => {
|
||||
originalAdminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
delete process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
if (originalAdminToken === undefined) delete process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
else process.env.NEXT_PUBLIC_ADMIN_TOKEN = originalAdminToken;
|
||||
// jsdom resets hostname between tests via the @vitest-environment
|
||||
// pragma's per-test isolation. No explicit reset needed.
|
||||
});
|
||||
|
||||
it("returns an empty object on a non-tenant host with no admin token", () => {
|
||||
// jsdom default hostname is "localhost" — not a tenant slug, so
|
||||
// getTenantSlug() returns null and no X-Molecule-Org-Slug is added.
|
||||
const headers = platformAuthHeaders();
|
||||
expect(headers).toEqual({});
|
||||
});
|
||||
|
||||
it("attaches Authorization when NEXT_PUBLIC_ADMIN_TOKEN is set", () => {
|
||||
process.env.NEXT_PUBLIC_ADMIN_TOKEN = "local-dev-admin";
|
||||
const headers = platformAuthHeaders();
|
||||
expect(headers).toEqual({ Authorization: "Bearer local-dev-admin" });
|
||||
});
|
||||
|
||||
it("does NOT attach Authorization when NEXT_PUBLIC_ADMIN_TOKEN is empty string", () => {
|
||||
// Empty-string env is the JS-side shape of `KEY=` in .env.
|
||||
// Treating it as unset matches the matched-pair guard in
|
||||
// next.config.ts (admin-token-pair.test.ts) — symmetric semantics.
|
||||
process.env.NEXT_PUBLIC_ADMIN_TOKEN = "";
|
||||
const headers = platformAuthHeaders();
|
||||
expect(headers).toEqual({});
|
||||
});
|
||||
|
||||
it("attaches X-Molecule-Org-Slug on a tenant subdomain", () => {
|
||||
Object.defineProperty(window, "location", {
|
||||
value: { hostname: "reno-stars.moleculesai.app" },
|
||||
writable: true,
|
||||
});
|
||||
const headers = platformAuthHeaders();
|
||||
expect(headers).toEqual({ "X-Molecule-Org-Slug": "reno-stars" });
|
||||
});
|
||||
|
||||
it("attaches both when both apply (production SaaS shape)", () => {
|
||||
Object.defineProperty(window, "location", {
|
||||
value: { hostname: "reno-stars.moleculesai.app" },
|
||||
writable: true,
|
||||
});
|
||||
process.env.NEXT_PUBLIC_ADMIN_TOKEN = "tenant-bearer";
|
||||
const headers = platformAuthHeaders();
|
||||
// Pin exact-equality on the full shape — substring/contains
|
||||
// assertions would also pass for an extra-header bug.
|
||||
expect(headers).toEqual({
|
||||
"X-Molecule-Org-Slug": "reno-stars",
|
||||
Authorization: "Bearer tenant-bearer",
|
||||
});
|
||||
});
|
||||
|
||||
it("returns a fresh object each call (callers can mutate safely)", () => {
|
||||
process.env.NEXT_PUBLIC_ADMIN_TOKEN = "tok";
|
||||
const a = platformAuthHeaders();
|
||||
const b = platformAuthHeaders();
|
||||
expect(a).not.toBe(b); // distinct refs
|
||||
expect(a).toEqual(b); // same content
|
||||
a["Content-Type"] = "application/json";
|
||||
// Mutation on `a` does not leak into `b`.
|
||||
expect(b["Content-Type"]).toBeUndefined();
|
||||
});
|
||||
});
|
||||
@ -21,45 +21,6 @@ export interface RequestOptions {
|
||||
timeoutMs?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the platform auth header set used by every authenticated fetch
|
||||
* from the canvas. Returns a fresh object so callers can mutate (e.g.
|
||||
* append `Content-Type` for JSON requests, omit it for FormData).
|
||||
*
|
||||
* SaaS cross-origin shape:
|
||||
* - `X-Molecule-Org-Slug` — derived from `window.location.hostname`
|
||||
* by `getTenantSlug()`. Control plane uses it for fly-replay
|
||||
* routing. Empty on localhost / non-tenant hosts — safe to omit.
|
||||
* - `Authorization: Bearer <token>` — `NEXT_PUBLIC_ADMIN_TOKEN` baked
|
||||
* into the canvas build (see canvas/Dockerfile L8/L11). Required by
|
||||
* the workspace-server when `ADMIN_TOKEN` is set on the server side
|
||||
* (Tier-2b AdminAuth gate, wsauth_middleware.go ~L245). Empty when
|
||||
* no admin token was provisioned — the Tier-1 session-cookie path
|
||||
* handles that case via `credentials:"include"`.
|
||||
*
|
||||
* Why a shared helper: the two-line "read env, attach bearer; read
|
||||
* slug, attach header" pattern was duplicated across `request()` and
|
||||
* 7 raw-fetch callsites (chat uploads/download + 5 Attachment*
|
||||
* components) before this consolidation. A new poller or raw fetch
|
||||
* that forgets one of the two headers silently 401s against
|
||||
* workspace-server when ADMIN_TOKEN is set — the exact bug shape
|
||||
* called out in #178 / closes the post-#176 self-review gap.
|
||||
*
|
||||
* Callers that want JSON Content-Type should spread this and add it
|
||||
* themselves; FormData callers should NOT add Content-Type (the
|
||||
* browser sets the multipart boundary). Centralizing the auth pair
|
||||
* but leaving Content-Type up to the caller is the minimum viable
|
||||
* shared shape.
|
||||
*/
|
||||
export function platformAuthHeaders(): Record<string, string> {
|
||||
const headers: Record<string, string> = {};
|
||||
const slug = getTenantSlug();
|
||||
if (slug) headers["X-Molecule-Org-Slug"] = slug;
|
||||
const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
|
||||
return headers;
|
||||
}
|
||||
|
||||
async function request<T>(
|
||||
method: string,
|
||||
path: string,
|
||||
@ -67,16 +28,17 @@ async function request<T>(
|
||||
retryCount = 0,
|
||||
options?: RequestOptions,
|
||||
): Promise<T> {
|
||||
// JSON-bodied request — Content-Type is JSON. Auth pair comes from
|
||||
// the shared helper; see its doc comment for the SaaS-shape rationale.
|
||||
const headers: Record<string, string> = {
|
||||
"Content-Type": "application/json",
|
||||
...platformAuthHeaders(),
|
||||
};
|
||||
// Re-read slug locally for the 401 handler below — `headers` already
|
||||
// has it, but the 401 branch needs the bare value to gate the
|
||||
// session-probe + redirect logic on tenant context.
|
||||
// SaaS cross-origin shape:
|
||||
// - X-Molecule-Org-Slug: derived from window.location.hostname by
|
||||
// getTenantSlug(). Control plane uses it for fly-replay routing.
|
||||
// Empty on localhost / non-tenant hosts — safe to omit.
|
||||
// - credentials:"include": sends the session cookie cross-origin.
|
||||
// Cookie's Domain=.moleculesai.app attribute + cp's CORS allow this.
|
||||
const headers: Record<string, string> = { "Content-Type": "application/json" };
|
||||
const slug = getTenantSlug();
|
||||
if (slug) headers["X-Molecule-Org-Slug"] = slug;
|
||||
const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
|
||||
|
||||
const res = await fetch(`${PLATFORM_URL}${path}`, {
|
||||
method,
|
||||
|
||||
@ -7,32 +7,6 @@ export default defineConfig({
|
||||
test: {
|
||||
environment: 'node',
|
||||
exclude: ['e2e/**', 'node_modules/**', '**/dist/**'],
|
||||
// CI-conditional test timeout (issue #96).
|
||||
//
|
||||
// Vitest's 5000ms default is too tight for the first test in any
|
||||
// file under our CI shape: `npx vitest run --coverage` on the
|
||||
// self-hosted Gitea Actions Docker runner. The cold-start cost
|
||||
// (v8 coverage instrumentation init + JSDOM bootstrap + module-
|
||||
// graph import for @/components/* and @/lib/* + first React
|
||||
// render) consistently consumes 5-7 seconds for the first
|
||||
// synchronous test in heavyweight component files
|
||||
// (ActivityTab.test.tsx, CreateWorkspaceDialog.test.tsx,
|
||||
// ConfigTab.provider.test.tsx) — even though every subsequent
|
||||
// test in the same file completes in 100-1500ms.
|
||||
//
|
||||
// Empirically the worst observed first-test was 6453ms in a
|
||||
// single file (CreateWorkspaceDialog). 30000ms gives ~5x
|
||||
// headroom over that on CI; we still keep 5000ms locally so
|
||||
// genuine waitFor races / hung promises stay sensitive in dev.
|
||||
//
|
||||
// Same vitest pattern documented at:
|
||||
// https://vitest.dev/config/testtimeout
|
||||
// https://vitest.dev/guide/coverage#profiling-test-performance
|
||||
//
|
||||
// Per-test duration is still emitted to the CI log; if a test
|
||||
// ever silently approaches 25-30s under this raised ceiling that
|
||||
// will surface as a duration regression and we revisit.
|
||||
testTimeout: process.env.CI ? 30000 : 5000,
|
||||
// Coverage is instrumented but NOT yet a CI gate — first land
|
||||
// observability so we can see the baseline, then dial in
|
||||
// thresholds + a hard gate in a follow-up PR (#1815). Today's
|
||||
|
||||
@ -1,43 +0,0 @@
|
||||
# docker-compose.dev.yml — overlay over docker-compose.yml for local dev
|
||||
# with air-driven live reload of the platform (workspace-server) service.
|
||||
#
|
||||
# Usage:
|
||||
# docker compose -f docker-compose.yml -f docker-compose.dev.yml up
|
||||
# (or `make dev` shorthand from repo root)
|
||||
#
|
||||
# What this overlay changes vs docker-compose.yml alone:
|
||||
# - Platform service uses workspace-server/Dockerfile.dev (air on top of
|
||||
# golang:1.25-alpine) instead of the multi-stage prod Dockerfile.
|
||||
# - Platform service bind-mounts the host's workspace-server/ source
|
||||
# into /app/workspace-server so air sees source edits live.
|
||||
# - Other services (postgres, redis, langfuse, etc.) inherit unchanged
|
||||
# from docker-compose.yml.
|
||||
#
|
||||
# What stays the same:
|
||||
# - All env vars, volumes, depends_on, healthchecks from docker-compose.yml.
|
||||
# - Network topology + ports.
|
||||
# - Postgres/Redis as service containers (no in-process replacements).
|
||||
|
||||
services:
|
||||
platform:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: workspace-server/Dockerfile.dev
|
||||
# Rebind source: edits under host's workspace-server/ propagate live.
|
||||
# The named volume on go-build-cache speeds up first build per container.
|
||||
volumes:
|
||||
- ./workspace-server:/app/workspace-server
|
||||
- go-build-cache:/root/.cache/go-build
|
||||
- go-mod-cache:/go/pkg/mod
|
||||
# Air signals the running binary on rebuild; ensure shell stops cleanly.
|
||||
init: true
|
||||
# Mark the service as dev-mode so the platform can short-circuit any
|
||||
# behavior that's incompatible with hot-reload (e.g. background
|
||||
# cron-style watchers that don't survive process restart). No-op
|
||||
# today; reserved for future flag use.
|
||||
environment:
|
||||
MOLECULE_DEV_HOT_RELOAD: "1"
|
||||
|
||||
volumes:
|
||||
go-build-cache:
|
||||
go-mod-cache:
|
||||
@ -13,7 +13,6 @@ services:
|
||||
- pgdata:/var/lib/postgresql/data
|
||||
networks:
|
||||
- molecule-monorepo-net
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-dev}"]
|
||||
interval: 2s
|
||||
@ -51,7 +50,6 @@ services:
|
||||
- redisdata:/data
|
||||
networks:
|
||||
- molecule-monorepo-net
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "ping"]
|
||||
interval: 2s
|
||||
@ -128,10 +126,6 @@ services:
|
||||
REDIS_URL: redis://redis:6379
|
||||
PORT: "${PLATFORM_PORT:-8080}"
|
||||
PLATFORM_URL: "http://platform:${PLATFORM_PORT:-8080}"
|
||||
# Container network namespace is already isolated; "all interfaces"
|
||||
# inside the container = the bridge interface only. The fail-open
|
||||
# default (127.0.0.1) would block host-to-container access.
|
||||
BIND_ADDR: "${BIND_ADDR:-0.0.0.0}"
|
||||
# Default MOLECULE_ENV=development so the WorkspaceAuth / AdminAuth
|
||||
# middleware fail-open path activates when ADMIN_TOKEN is unset —
|
||||
# otherwise the canvas (which runs without a bearer in pure local
|
||||
@ -201,28 +195,12 @@ services:
|
||||
# App private key — read-only bind-mount. The host-side path is
|
||||
# gitignored per .gitignore rules (/.secrets/ + *.pem).
|
||||
- ./.secrets/github-app.pem:/secrets/github-app.pem:ro
|
||||
# Per-role persona credentials (molecule-core#242 local surface).
|
||||
# Sourced at workspace creation time by org_import.go::loadPersonaEnvFile
|
||||
# when a workspace.yaml carries `role: <name>`. The host-side dir is
|
||||
# populated by the operator-host bootstrap kit (28 dev-tree personas);
|
||||
# /etc/molecule-bootstrap/personas is the in-container path the
|
||||
# platform expects (matches the prod tenant-EC2 path so the same code
|
||||
# works in both modes).
|
||||
#
|
||||
# Read-only mount — workspace-server only reads, never writes here.
|
||||
# If the host dir is empty/missing the platform's loadPersonaEnvFile
|
||||
# silently no-ops per its existing semantics, so this mount is safe
|
||||
# even on a fresh machine that hasn't run the bootstrap kit yet.
|
||||
- ${MOLECULE_PERSONA_ROOT_HOST:-${HOME}/.molecule-ai/personas}:/etc/molecule-bootstrap/personas:ro
|
||||
ports:
|
||||
- "${PLATFORM_PUBLISH_PORT:-8080}:${PLATFORM_PORT:-8080}"
|
||||
networks:
|
||||
- molecule-monorepo-net
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
# Plain GET — `--spider` would issue HEAD, which returns 404 because
|
||||
# /health is registered as GET only.
|
||||
test: ["CMD-SHELL", "wget -qO /dev/null --tries=1 http://localhost:${PLATFORM_PORT:-8080}/health || exit 1"]
|
||||
test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:${PLATFORM_PORT:-8080}/health || exit 1"]
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
@ -234,8 +212,8 @@ services:
|
||||
# docker compose pull canvas && docker compose up -d canvas
|
||||
# First-time local setup or testing unreleased changes — build from source:
|
||||
# docker compose build canvas && docker compose up -d canvas
|
||||
# Note: ECR images require AWS auth — `aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 153263036946.dkr.ecr.us-east-2.amazonaws.com` before pull.
|
||||
image: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/canvas:latest
|
||||
# Note: GHCR images are private — `docker login ghcr.io` required before pull.
|
||||
image: ghcr.io/molecule-ai/canvas:latest
|
||||
build:
|
||||
context: ./canvas
|
||||
dockerfile: Dockerfile
|
||||
@ -260,7 +238,7 @@ services:
|
||||
networks:
|
||||
- molecule-monorepo-net
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -qO /dev/null --tries=1 http://127.0.0.1:${CANVAS_PORT:-3000} || exit 1"]
|
||||
test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://127.0.0.1:${CANVAS_PORT:-3000} || exit 1"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
**Status:** living document — update when you ship a feature that touches one backend.
|
||||
**Owner:** workspace-server + controlplane teams.
|
||||
**Last audit:** 2026-05-07 (plugin install/uninstall closed for EC2 backend via EIC SSH push to the bind-mounted `/configs/plugins/<name>/`, mirroring the Files API PR #1702 pattern).
|
||||
**Last audit:** 2026-05-05 (Claude agent — `provisionWorkspaceAuto` / `StopWorkspaceAuto` / `HasProvisioner` SoT pattern landed in PRs #2811 + #2824).
|
||||
|
||||
## Why this exists
|
||||
|
||||
@ -54,7 +54,7 @@ For "do we have any backend?", use `HasProvisioner()`, never bare `h.provisioner
|
||||
| **Files API** | | | | |
|
||||
| List / Read / Write / Replace / Delete | `container_files.go`, `template_import.go` | `docker exec` + tar `CopyToContainer` | SSH via EIC tunnel (PR #1702) | ✅ parity as of 2026-04-22 (previously docker-only) |
|
||||
| **Plugins** | | | | |
|
||||
| Install / uninstall / list | `plugins_install.go` + `plugins_install_eic.go` | `deliverToContainer()` → exec+`CopyToContainer` on local container | `instance_id` set → EIC SSH push of the staged tarball into the EC2's bind-mounted `/configs/plugins/<name>/` (per `workspaceFilePathPrefix`), `chown 1000:1000`, restart | ✅ parity |
|
||||
| Install / uninstall / list | `plugins_install.go` | `deliverToContainer()` + volume rm | **gap — no live plugin delivery** | 🔴 **docker-only** |
|
||||
| **Terminal (WebSocket)** | | | | |
|
||||
| Dispatch | `terminal.go:90-105` | `instance_id=""` → `handleLocalConnect` → `docker attach` | `instance_id` set → `handleRemoteConnect` → EIC SSH + `docker exec` | ✅ parity (different implementations, same UX) |
|
||||
| **A2A proxy** | | | | |
|
||||
|
||||
@ -4,7 +4,7 @@ How a workspace-server code change reaches the prod tenant fleet — and how to
|
||||
|
||||
> **⚠️ State note (2026-04-22):** this doc describes the **intended design**. As of this writing, the canary fleet described below is **not actually running** — no canary tenants are provisioned, `CANARY_TENANT_URLS` / `CANARY_ADMIN_TOKENS` / `CANARY_CP_SHARED_SECRET` are empty in repo secrets, and `canary-verify.yml` fails every run.
|
||||
>
|
||||
> Current merges gate on manual `promote-latest.yml` dispatches, not canary. See [molecule-controlplane/docs/canary-tenants.md](https://git.moleculesai.app/molecule-ai/molecule-controlplane/src/branch/main/docs/canary-tenants.md) for the Phase 1 code work that's already shipped + the Phase 2 plan for actually standing up the fleet + a "should we even do this now?" decision framework.
|
||||
> Current merges gate on manual `promote-latest.yml` dispatches, not canary. See [molecule-controlplane/docs/canary-tenants.md](https://git.moleculesai.app/molecule-ai/molecule-controlplane/blob/main/docs/canary-tenants.md) for the Phase 1 code work that's already shipped + the Phase 2 plan for actually standing up the fleet + a "should we even do this now?" decision framework.
|
||||
>
|
||||
> **Account-specific identifiers (AWS account ID, IAM role name) referenced below in the original design have been redacted from this public doc.** The actual values — if they exist — are in `Molecule-AI/internal/runbooks/canary-fleet.md`. If you're implementing Phase 2, start there.
|
||||
>
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# Molecule AI — Comprehensive Technical Documentation
|
||||
|
||||
> Definitive technical reference for the Molecule AI Agent Team platform.
|
||||
> Based on a full non-invasive scan of the [molecule-monorepo](https://git.moleculesai.app/molecule-ai/molecule-monorepo) repository.
|
||||
> Based on a full non-invasive scan of the [molecule-core](https://git.moleculesai.app/molecule-ai/molecule-core) repository.
|
||||
|
||||
---
|
||||
|
||||
@ -1149,11 +1149,11 @@ Molecule AI's workspace abstraction is **runtime-agnostic by design**. A workspa
|
||||
|
||||
## Links
|
||||
|
||||
- **GitHub**: https://git.moleculesai.app/molecule-ai/molecule-monorepo
|
||||
- **Architecture Docs**: https://git.moleculesai.app/molecule-ai/molecule-monorepo/src/branch/main/docs/architecture
|
||||
- **API Protocol**: https://git.moleculesai.app/molecule-ai/molecule-monorepo/src/branch/main/docs/api-protocol
|
||||
- **Agent Runtime**: https://git.moleculesai.app/molecule-ai/molecule-monorepo/src/branch/main/docs/agent-runtime
|
||||
- **Product Docs**: https://git.moleculesai.app/molecule-ai/molecule-monorepo/src/branch/main/docs/product
|
||||
- **GitHub**: https://git.moleculesai.app/molecule-ai/molecule-core
|
||||
- **Architecture Docs**: https://git.moleculesai.app/molecule-ai/molecule-core/tree/main/docs/architecture
|
||||
- **API Protocol**: https://git.moleculesai.app/molecule-ai/molecule-core/tree/main/docs/api-protocol
|
||||
- **Agent Runtime**: https://git.moleculesai.app/molecule-ai/molecule-core/tree/main/docs/agent-runtime
|
||||
- **Product Docs**: https://git.moleculesai.app/molecule-ai/molecule-core/tree/main/docs/product
|
||||
|
||||
---
|
||||
|
||||
|
||||
@ -79,7 +79,7 @@ For SOC2 / ISO 27001 / customer security questionnaires:
|
||||
|
||||
## Pointers
|
||||
|
||||
- KMS envelope code: [`molecule-controlplane/internal/crypto/kms.go`](https://git.moleculesai.app/molecule-ai/molecule-controlplane/src/branch/main/internal/crypto/kms.go)
|
||||
- Static-key fallback: [`molecule-controlplane/internal/crypto/aes.go`](https://git.moleculesai.app/molecule-ai/molecule-controlplane/src/branch/main/internal/crypto/aes.go)
|
||||
- KMS envelope code: [`molecule-controlplane/internal/crypto/kms.go`](https://git.moleculesai.app/molecule-ai/molecule-controlplane/blob/main/internal/crypto/kms.go)
|
||||
- Static-key fallback: [`molecule-controlplane/internal/crypto/aes.go`](https://git.moleculesai.app/molecule-ai/molecule-controlplane/blob/main/internal/crypto/aes.go)
|
||||
- Tenant secrets handler: [`workspace-server/internal/crypto/aes.go`](../../workspace-server/internal/crypto/aes.go)
|
||||
- Tenant secrets schema: [database-schema.md](./database-schema.md#workspace_secrets)
|
||||
|
||||
@ -1,28 +0,0 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 64 64">
|
||||
<style>
|
||||
.bg { fill: #0a1120; }
|
||||
.accent { fill: #7fe8d6; }
|
||||
.accent-stroke { stroke: #7fe8d6; }
|
||||
@media (prefers-color-scheme: light) {
|
||||
.bg { fill: #f5f7fa; }
|
||||
.accent { fill: #1a8a72; }
|
||||
.accent-stroke { stroke: #1a8a72; }
|
||||
}
|
||||
</style>
|
||||
<rect class="bg" width="64" height="64" rx="14"/>
|
||||
<g class="accent-stroke" stroke-width="2.4" stroke-linecap="round" fill="none">
|
||||
<line x1="32" y1="32" x2="12" y2="14"/>
|
||||
<line x1="32" y1="32" x2="52" y2="18"/>
|
||||
<line x1="32" y1="32" x2="10" y2="40"/>
|
||||
<line x1="32" y1="32" x2="54" y2="44"/>
|
||||
<line x1="32" y1="32" x2="32" y2="56"/>
|
||||
</g>
|
||||
<g class="accent">
|
||||
<circle cx="32" cy="32" r="6.5"/>
|
||||
<circle cx="12" cy="14" r="3.5"/>
|
||||
<circle cx="52" cy="18" r="3.5"/>
|
||||
<circle cx="10" cy="40" r="3.5"/>
|
||||
<circle cx="54" cy="44" r="3.5"/>
|
||||
<circle cx="32" cy="56" r="3.5"/>
|
||||
</g>
|
||||
</svg>
|
||||
|
Before Width: | Height: | Size: 957 B |
@ -1,17 +0,0 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 64 64" role="img" aria-label="Molecule AI">
|
||||
<g stroke="#7fe8d6" stroke-width="2.6" stroke-linecap="round" fill="none">
|
||||
<line x1="32" y1="32" x2="12" y2="14"/>
|
||||
<line x1="32" y1="32" x2="52" y2="18"/>
|
||||
<line x1="32" y1="32" x2="10" y2="40"/>
|
||||
<line x1="32" y1="32" x2="54" y2="44"/>
|
||||
<line x1="32" y1="32" x2="32" y2="56"/>
|
||||
</g>
|
||||
<g fill="#7fe8d6">
|
||||
<circle cx="32" cy="32" r="7"/>
|
||||
<circle cx="12" cy="14" r="3.6"/>
|
||||
<circle cx="52" cy="18" r="3.6"/>
|
||||
<circle cx="10" cy="40" r="3.6"/>
|
||||
<circle cx="54" cy="44" r="3.6"/>
|
||||
<circle cx="32" cy="56" r="3.6"/>
|
||||
</g>
|
||||
</svg>
|
||||
|
Before Width: | Height: | Size: 662 B |
@ -303,4 +303,4 @@ Or use the Canvas UI: Workspace → Config → MCP Servers → Add browser MCP s
|
||||
|
||||
---
|
||||
|
||||
*Have a browser automation use case you want to see covered? File an issue with the `enhancement` label on the [molecule-core issue tracker](https://git.moleculesai.app/molecule-ai/molecule-core/issues).*
|
||||
*Have a browser automation use case you want to see covered? Open a discussion on [GitHub Discussions](https://git.moleculesai.app/molecule-ai/molecule-core/discussions) — or file an issue with the `enhancement` label.*
|
||||
|
||||
@ -133,4 +133,4 @@ With protocol-native A2A, you get:
|
||||
|
||||
Molecule AI's external agent registration is production-ready. Documentation is live at [External Agent Registration Guide](https://docs.molecule.ai/docs/guides/external-agent-registration). The npm package for the MCP server is available at [`@molecule-ai/mcp-server`](https://www.npmjs.com/package/@molecule-ai/mcp-server).
|
||||
|
||||
Read the full [A2A v1.0 protocol spec](https://git.moleculesai.app/molecule-ai/molecule-core/src/branch/main/docs/api-protocol/a2a-protocol.md) on GitHub.
|
||||
Read the full [A2A v1.0 protocol spec](https://git.moleculesai.app/molecule-ai/molecule-core/blob/main/docs/api-protocol/a2a-protocol.md) on GitHub.
|
||||
@ -215,7 +215,7 @@ Push mode (this guide) works today but requires an inbound-reachable URL — whi
|
||||
|
||||
Your agent makes only outbound HTTPS calls to the platform, pulling messages from an inbox queue and posting replies back. Works behind any NAT/firewall, tolerates offline laptops, no tunnel needed.
|
||||
|
||||
See the [design doc](https://git.moleculesai.app/molecule-ai/internal/src/branch/main/product/external-workspaces-polling.md) (internal) and the implementation tracking issue (search `polling+mode` on the [molecule-core issue tracker](https://git.moleculesai.app/molecule-ai/molecule-core/issues)).
|
||||
See the [design doc](https://git.moleculesai.app/molecule-ai/internal/blob/main/product/external-workspaces-polling.md) (internal) and the [implementation tracking issue](https://git.moleculesai.app/molecule-ai/molecule-core/issues?q=polling+mode) once it is opened.
|
||||
|
||||
---
|
||||
|
||||
|
||||
@ -144,4 +144,4 @@ The agent appears on the canvas with a **purple REMOTE badge** within seconds. F
|
||||
|
||||
- **[External Agent Registration Guide →](/docs/guides/external-agent-registration)** — full endpoint reference, Python + Node.js examples, troubleshooting
|
||||
- **[molecule-sdk-python →](https://git.moleculesai.app/molecule-ai/molecule-sdk-python)** — SDK source, `RemoteAgentClient` API docs
|
||||
- **[SDK Examples →](https://git.moleculesai.app/molecule-ai/molecule-sdk-python/src/branch/main/examples/remote-agent)** — `run.py` demo script, annotated walkthrough
|
||||
- **[SDK Examples →](https://git.moleculesai.app/molecule-ai/molecule-sdk-python/tree/main/examples/remote-agent)** — `run.py` demo script, annotated walkthrough
|
||||
|
||||
@ -64,7 +64,7 @@ When opencode connects to the Molecule MCP endpoint, the agent gains access to:
|
||||
"tool": "delegate_task",
|
||||
"arguments": {
|
||||
"target": "research-lead",
|
||||
"task": "Summarise the last 7 days of commits in Molecule-AI/molecule-monorepo"
|
||||
"task": "Summarise the last 7 days of commits in molecule-ai/molecule-core"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# Internal content policy
|
||||
|
||||
The `Molecule-AI/molecule-monorepo` repo is **public**. Anything internal
|
||||
The `molecule-ai/molecule-core` repo is **public**. Anything internal
|
||||
(positioning, competitive briefs, sales playbooks, PMM/press drip, draft
|
||||
campaigns, raw research notes, ops runbooks, retrospectives) lives in
|
||||
**`Molecule-AI/internal`**.
|
||||
@ -18,10 +18,10 @@ This page is the canonical decision tree.
|
||||
| Draft campaign asset (still iterating, not yet customer-visible) | `Molecule-AI/internal/marketing/campaigns/` |
|
||||
| Roadmap discussion, planning doc, retrospective | `Molecule-AI/internal/PLAN.md` or `Molecule-AI/internal/retrospectives/` |
|
||||
| Runbook, ops procedure, incident postmortem | `Molecule-AI/internal/runbooks/` |
|
||||
| **Public-ready** blog post (final draft, ready to ship to docs site) | `Molecule-AI/molecule-monorepo/docs/blog/` |
|
||||
| **Public-ready** tutorial / quickstart | `Molecule-AI/molecule-monorepo/docs/tutorials/` |
|
||||
| Public DevRel content (code samples, demos for users) | `Molecule-AI/molecule-monorepo/docs/devrel/` |
|
||||
| API reference, architecture docs for external developers | `Molecule-AI/molecule-monorepo/docs/api/` |
|
||||
| **Public-ready** blog post (final draft, ready to ship to docs site) | `molecule-ai/molecule-core/docs/blog/` |
|
||||
| **Public-ready** tutorial / quickstart | `molecule-ai/molecule-core/docs/tutorials/` |
|
||||
| Public DevRel content (code samples, demos for users) | `molecule-ai/molecule-core/docs/devrel/` |
|
||||
| API reference, architecture docs for external developers | `molecule-ai/molecule-core/docs/api/` |
|
||||
| Code, tests, infrastructure | wherever is appropriate inside this repo |
|
||||
|
||||
**Rule of thumb:** *"Would I be comfortable if a competitor / journalist / customer
|
||||
|
||||
@ -19,8 +19,8 @@ import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
mclient "github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/client"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/contract"
|
||||
mclient "github.com/go.moleculesai.app/core/platform/internal/memory/client"
|
||||
"github.com/go.moleculesai.app/core/platform/internal/memory/contract"
|
||||
)
|
||||
|
||||
func TestMyPlugin_FullRoundTrip(t *testing.T) {
|
||||
|
||||
@ -17,7 +17,7 @@ This path is aligned to the current repository and current UI. It gets you from
|
||||
## The one-command path
|
||||
|
||||
```bash
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-monorepo.git
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-core.git
|
||||
cd molecule-core
|
||||
./scripts/dev-start.sh
|
||||
```
|
||||
@ -42,7 +42,7 @@ If you'd rather run each component yourself — useful when you're iterating on
|
||||
### Step 1: Clone the repository
|
||||
|
||||
```bash
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-monorepo.git
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-core.git
|
||||
cd molecule-core
|
||||
```
|
||||
|
||||
|
||||
@ -1,137 +0,0 @@
|
||||
# Runbook — Handlers Postgres Integration port-collision substrate
|
||||
|
||||
**Status:** Resolved 2026-05-08 (PR for class B Hongming-owned CICD red sweep).
|
||||
|
||||
## Symptom
|
||||
|
||||
`Handlers Postgres Integration` workflow fails on staging push and PRs.
|
||||
Step `Apply migrations to Postgres service` shows:
|
||||
|
||||
```
|
||||
psql: error: connection to server at "127.0.0.1", port 5432 failed: Connection refused
|
||||
```
|
||||
|
||||
Job-cleanup step further down logs:
|
||||
|
||||
```
|
||||
Cleaning up services for job Handlers Postgres Integration
|
||||
failed to remove container: Error response from daemon: No such container: <id>
|
||||
```
|
||||
|
||||
…confirming the postgres service container was already gone before
|
||||
cleanup ran.
|
||||
|
||||
## Root cause
|
||||
|
||||
Our Gitea act_runner (operator host `5.78.80.188`,
|
||||
`/opt/molecule/runners/config.yaml`) sets:
|
||||
|
||||
```yaml
|
||||
container:
|
||||
network: host
|
||||
```
|
||||
|
||||
…which act_runner applies to BOTH the job container AND every
|
||||
`services:` container in a workflow. Multiple workflow instances
|
||||
running concurrently across the 16 parallel runners each try to bind
|
||||
postgres on `0.0.0.0:5432`. The first wins; subsequent instances exit
|
||||
immediately with:
|
||||
|
||||
```
|
||||
LOG: could not bind IPv4 address "0.0.0.0": Address in use
|
||||
HINT: Is another postmaster already running on port 5432?
|
||||
FATAL: could not create any TCP/IP sockets
|
||||
```
|
||||
|
||||
act_runner sets `AutoRemove:true` on service containers, so Docker
|
||||
garbage-collects them as soon as they exit. By the time the migrations
|
||||
step runs `pg_isready` / `psql`, the container is gone and the connection is
|
||||
refused.
|
||||
|
||||
Reproduction (operator host):
|
||||
|
||||
```bash
|
||||
docker run --rm -d --name pg-A --network host \
|
||||
-e POSTGRES_PASSWORD=test postgres:15-alpine
|
||||
docker run -d --name pg-B --network host \
|
||||
-e POSTGRES_PASSWORD=test postgres:15-alpine
|
||||
docker logs pg-B # FATAL: could not create any TCP/IP sockets
|
||||
```
|
||||
|
||||
## Why per-job override doesn't work
|
||||
|
||||
The natural fix — per-job `container.network` override — is silently
|
||||
ignored by act_runner. The runner log emits:
|
||||
|
||||
```
|
||||
--network and --net in the options will be ignored.
|
||||
```
|
||||
|
||||
This is a documented act_runner constraint: container network is a
|
||||
runner-wide setting, not per-job. Source: gitea/act_runner config docs
|
||||
+ vegardit/docker-gitea-act-runner issue #7.
|
||||
|
||||
Flipping the global `container.network` to `bridge` would break every
|
||||
other workflow in the repo (cache server discovery,
|
||||
`molecule-monorepo-net` peer access during integration tests, etc.) —
|
||||
unacceptable blast radius for a per-test bug.
|
||||
|
||||
## Fix shape
|
||||
|
||||
`handlers-postgres-integration.yml` no longer uses `services: postgres:`.
|
||||
It launches a sibling postgres container manually on the existing
|
||||
`molecule-monorepo-net` bridge network with a per-run unique name:
|
||||
|
||||
```yaml
|
||||
env:
|
||||
PG_NAME: pg-handlers-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
PG_NETWORK: molecule-monorepo-net
|
||||
|
||||
steps:
|
||||
- name: Start sibling Postgres on bridge network
|
||||
run: |
|
||||
docker run -d --name "${PG_NAME}" --network "${PG_NETWORK}" \
|
||||
...
|
||||
postgres:15-alpine
|
||||
PG_HOST=$(docker inspect "${PG_NAME}" \
|
||||
--format "{{(index .NetworkSettings.Networks \"${PG_NETWORK}\").IPAddress}}")
|
||||
echo "PG_HOST=${PG_HOST}" >> "$GITHUB_ENV"
|
||||
|
||||
# … migrations + tests use ${PG_HOST}, not 127.0.0.1 …
|
||||
|
||||
- if: always() && …
|
||||
name: Stop sibling Postgres
|
||||
run: docker rm -f "${PG_NAME}" || true
|
||||
```
|
||||
|
||||
The host-net job container can reach a bridge-net container via the
|
||||
bridge IP directly (verified manually, 2026-05-08). Two parallel runs
|
||||
use different names + different bridge IPs — no collision.
|
||||
|
||||
## Future-proofing
|
||||
|
||||
Other workflows that hit the same shape (any `services:` with a
|
||||
fixed-port image) will exhibit the same failure mode under
|
||||
host-network runner config. Translate using this same pattern:
|
||||
|
||||
1. Drop the `services:` block.
|
||||
2. Use `${{ github.run_id }}-${{ github.run_attempt }}` for unique
|
||||
container name.
|
||||
3. Launch on `molecule-monorepo-net` (already trusted bridge in
|
||||
`docker-compose.infra.yml`).
|
||||
4. Read back the bridge IP via `docker inspect` and export it through `$GITHUB_ENV` so later steps can use it.
|
||||
5. `if: always()` cleanup step at the end.
|
||||
|
||||
If the count of such workflows grows, factor into a composite action
|
||||
(`./.github/actions/sibling-postgres`) so the substrate logic lives
|
||||
in one place.
|
||||
|
||||
## Related
|
||||
|
||||
- Issue #88 (closed by #92): localhost → 127.0.0.1 fix that unmasked
|
||||
this collision; the IPv6 fix is correct, port collision is the new
|
||||
layer.
|
||||
- Issue #94 created `molecule-monorepo-net` + `alpine:latest` as
|
||||
prereqs.
|
||||
- Saved memory `feedback_act_runner_github_server_url` documents
|
||||
another act_runner-vs-GHA divergence (server URL).
|
||||
@ -244,7 +244,7 @@ correctness before pushing a `runtime-v*` tag.
|
||||
## Writing a new adapter
|
||||
|
||||
Use the GitHub template repo
|
||||
[`molecule-ai/molecule-ai-workspace-template-starter`](https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-starter) (note: the starter repo did not survive the 2026-05-06 GitHub-org-suspension migration; recreation tracked at internal#41)
|
||||
[`Molecule-AI/molecule-ai-workspace-template-starter`](https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-starter)
|
||||
— it ships with the canonical Dockerfile + adapter.py skeleton + config.yaml
|
||||
schema + the `repository_dispatch: [runtime-published]` cascade receiver
|
||||
already wired up. No follow-up setup PR required.
|
||||
@ -256,7 +256,7 @@ gh repo create Molecule-AI/molecule-ai-workspace-template-<runtime> \
|
||||
--public \
|
||||
--description "Molecule AI workspace template: <runtime>"
|
||||
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-<runtime>.git
|
||||
git clone https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-<runtime>
|
||||
cd molecule-ai-workspace-template-<runtime>
|
||||
```
|
||||
|
||||
@ -286,7 +286,7 @@ After `git push`:
|
||||
If the canonical shape changes (e.g. `config.yaml` schema gets a new field,
|
||||
the `BaseAdapter` interface adds a method, the reusable CI workflow
|
||||
signature changes), update the
|
||||
[starter](https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-starter) (recreation pending — see note above)
|
||||
[starter](https://git.moleculesai.app/molecule-ai/molecule-ai-workspace-template-starter)
|
||||
**first**. Existing templates can either migrate at their own pace or be
|
||||
touched in a coordinated cleanup PR. Either way, future templates pick up
|
||||
the new shape from day one.
|
||||
|
||||
@ -1,46 +1,47 @@
|
||||
{
|
||||
"_comment": "OSS surface registry — every repo listed here MUST be public on git.moleculesai.app. Layer-3 customer/private templates are NOT registered here; they are handled at provision-time via the per-tenant credential resolver (see internal#102 RFC). 'main' refs are pinned to tags before broad rollout.",
|
||||
"_comment": "Pin refs to release tags for reproducible builds. 'main' is OK while all repos are internal.",
|
||||
"version": 1,
|
||||
"plugins": [
|
||||
{"name": "browser-automation", "repo": "molecule-ai/molecule-ai-plugin-browser-automation", "ref": "main"},
|
||||
{"name": "ecc", "repo": "molecule-ai/molecule-ai-plugin-ecc", "ref": "main"},
|
||||
{"name": "gh-identity", "repo": "molecule-ai/molecule-ai-plugin-gh-identity", "ref": "main"},
|
||||
{"name": "molecule-audit", "repo": "molecule-ai/molecule-ai-plugin-molecule-audit", "ref": "main"},
|
||||
{"name": "molecule-audit-trail", "repo": "molecule-ai/molecule-ai-plugin-molecule-audit-trail", "ref": "main"},
|
||||
{"name": "molecule-careful-bash", "repo": "molecule-ai/molecule-ai-plugin-molecule-careful-bash", "ref": "main"},
|
||||
{"name": "molecule-compliance", "repo": "molecule-ai/molecule-ai-plugin-molecule-compliance", "ref": "main"},
|
||||
{"name": "molecule-dev", "repo": "molecule-ai/molecule-ai-plugin-molecule-dev", "ref": "main"},
|
||||
{"name": "molecule-freeze-scope", "repo": "molecule-ai/molecule-ai-plugin-molecule-freeze-scope", "ref": "main"},
|
||||
{"name": "molecule-hitl", "repo": "molecule-ai/molecule-ai-plugin-molecule-hitl", "ref": "main"},
|
||||
{"name": "molecule-prompt-watchdog", "repo": "molecule-ai/molecule-ai-plugin-molecule-prompt-watchdog", "ref": "main"},
|
||||
{"name": "molecule-security-scan", "repo": "molecule-ai/molecule-ai-plugin-molecule-security-scan", "ref": "main"},
|
||||
{"name": "molecule-session-context", "repo": "molecule-ai/molecule-ai-plugin-molecule-session-context", "ref": "main"},
|
||||
{"name": "molecule-skill-code-review", "repo": "molecule-ai/molecule-ai-plugin-molecule-skill-code-review", "ref": "main"},
|
||||
{"name": "molecule-skill-cron-learnings", "repo": "molecule-ai/molecule-ai-plugin-molecule-skill-cron-learnings", "ref": "main"},
|
||||
{"name": "molecule-skill-cross-vendor-review", "repo": "molecule-ai/molecule-ai-plugin-molecule-skill-cross-vendor-review", "ref": "main"},
|
||||
{"name": "molecule-skill-llm-judge", "repo": "molecule-ai/molecule-ai-plugin-molecule-skill-llm-judge", "ref": "main"},
|
||||
{"name": "molecule-skill-update-docs", "repo": "molecule-ai/molecule-ai-plugin-molecule-skill-update-docs", "ref": "main"},
|
||||
{"name": "molecule-workflow-retro", "repo": "molecule-ai/molecule-ai-plugin-molecule-workflow-retro", "ref": "main"},
|
||||
{"name": "molecule-workflow-triage", "repo": "molecule-ai/molecule-ai-plugin-molecule-workflow-triage", "ref": "main"},
|
||||
{"name": "superpowers", "repo": "molecule-ai/molecule-ai-plugin-superpowers", "ref": "main"}
|
||||
{"name": "browser-automation", "repo": "Molecule-AI/molecule-ai-plugin-browser-automation", "ref": "main"},
|
||||
{"name": "ecc", "repo": "Molecule-AI/molecule-ai-plugin-ecc", "ref": "main"},
|
||||
{"name": "gh-identity", "repo": "Molecule-AI/molecule-ai-plugin-gh-identity", "ref": "main"},
|
||||
{"name": "molecule-audit", "repo": "Molecule-AI/molecule-ai-plugin-molecule-audit", "ref": "main"},
|
||||
{"name": "molecule-audit-trail", "repo": "Molecule-AI/molecule-ai-plugin-molecule-audit-trail", "ref": "main"},
|
||||
{"name": "molecule-careful-bash", "repo": "Molecule-AI/molecule-ai-plugin-molecule-careful-bash", "ref": "main"},
|
||||
{"name": "molecule-compliance", "repo": "Molecule-AI/molecule-ai-plugin-molecule-compliance", "ref": "main"},
|
||||
{"name": "molecule-dev", "repo": "Molecule-AI/molecule-ai-plugin-molecule-dev", "ref": "main"},
|
||||
{"name": "molecule-freeze-scope", "repo": "Molecule-AI/molecule-ai-plugin-molecule-freeze-scope", "ref": "main"},
|
||||
{"name": "molecule-hitl", "repo": "Molecule-AI/molecule-ai-plugin-molecule-hitl", "ref": "main"},
|
||||
{"name": "molecule-prompt-watchdog", "repo": "Molecule-AI/molecule-ai-plugin-molecule-prompt-watchdog", "ref": "main"},
|
||||
{"name": "molecule-security-scan", "repo": "Molecule-AI/molecule-ai-plugin-molecule-security-scan", "ref": "main"},
|
||||
{"name": "molecule-session-context", "repo": "Molecule-AI/molecule-ai-plugin-molecule-session-context", "ref": "main"},
|
||||
{"name": "molecule-skill-code-review", "repo": "Molecule-AI/molecule-ai-plugin-molecule-skill-code-review", "ref": "main"},
|
||||
{"name": "molecule-skill-cron-learnings", "repo": "Molecule-AI/molecule-ai-plugin-molecule-skill-cron-learnings", "ref": "main"},
|
||||
{"name": "molecule-skill-cross-vendor-review", "repo": "Molecule-AI/molecule-ai-plugin-molecule-skill-cross-vendor-review", "ref": "main"},
|
||||
{"name": "molecule-skill-llm-judge", "repo": "Molecule-AI/molecule-ai-plugin-molecule-skill-llm-judge", "ref": "main"},
|
||||
{"name": "molecule-skill-update-docs", "repo": "Molecule-AI/molecule-ai-plugin-molecule-skill-update-docs", "ref": "main"},
|
||||
{"name": "molecule-workflow-retro", "repo": "Molecule-AI/molecule-ai-plugin-molecule-workflow-retro", "ref": "main"},
|
||||
{"name": "molecule-workflow-triage", "repo": "Molecule-AI/molecule-ai-plugin-molecule-workflow-triage", "ref": "main"},
|
||||
{"name": "superpowers", "repo": "Molecule-AI/molecule-ai-plugin-superpowers", "ref": "main"}
|
||||
],
|
||||
"workspace_templates": [
|
||||
{"name": "claude-code-default", "repo": "molecule-ai/molecule-ai-workspace-template-claude-code", "ref": "main"},
|
||||
{"name": "hermes", "repo": "molecule-ai/molecule-ai-workspace-template-hermes", "ref": "main"},
|
||||
{"name": "openclaw", "repo": "molecule-ai/molecule-ai-workspace-template-openclaw", "ref": "main"},
|
||||
{"name": "codex", "repo": "molecule-ai/molecule-ai-workspace-template-codex", "ref": "main"},
|
||||
{"name": "langgraph", "repo": "molecule-ai/molecule-ai-workspace-template-langgraph", "ref": "main"},
|
||||
{"name": "crewai", "repo": "molecule-ai/molecule-ai-workspace-template-crewai", "ref": "main"},
|
||||
{"name": "autogen", "repo": "molecule-ai/molecule-ai-workspace-template-autogen", "ref": "main"},
|
||||
{"name": "deepagents", "repo": "molecule-ai/molecule-ai-workspace-template-deepagents", "ref": "main"},
|
||||
{"name": "gemini-cli", "repo": "molecule-ai/molecule-ai-workspace-template-gemini-cli", "ref": "main"}
|
||||
{"name": "claude-code-default", "repo": "Molecule-AI/molecule-ai-workspace-template-claude-code", "ref": "main"},
|
||||
{"name": "hermes", "repo": "Molecule-AI/molecule-ai-workspace-template-hermes", "ref": "main"},
|
||||
{"name": "openclaw", "repo": "Molecule-AI/molecule-ai-workspace-template-openclaw", "ref": "main"},
|
||||
{"name": "codex", "repo": "Molecule-AI/molecule-ai-workspace-template-codex", "ref": "main"},
|
||||
{"name": "langgraph", "repo": "Molecule-AI/molecule-ai-workspace-template-langgraph", "ref": "main"},
|
||||
{"name": "crewai", "repo": "Molecule-AI/molecule-ai-workspace-template-crewai", "ref": "main"},
|
||||
{"name": "autogen", "repo": "Molecule-AI/molecule-ai-workspace-template-autogen", "ref": "main"},
|
||||
{"name": "deepagents", "repo": "Molecule-AI/molecule-ai-workspace-template-deepagents", "ref": "main"},
|
||||
{"name": "gemini-cli", "repo": "Molecule-AI/molecule-ai-workspace-template-gemini-cli", "ref": "main"}
|
||||
],
|
||||
"org_templates": [
|
||||
{"name": "molecule-dev", "repo": "molecule-ai/molecule-ai-org-template-molecule-dev", "ref": "main"},
|
||||
{"name": "free-beats-all", "repo": "molecule-ai/molecule-ai-org-template-free-beats-all", "ref": "main"},
|
||||
{"name": "medo-smoke", "repo": "molecule-ai/molecule-ai-org-template-medo-smoke", "ref": "main"},
|
||||
{"name": "molecule-worker-gemini", "repo": "molecule-ai/molecule-ai-org-template-molecule-worker-gemini", "ref": "main"},
|
||||
{"name": "ux-ab-lab", "repo": "molecule-ai/molecule-ai-org-template-ux-ab-lab", "ref": "main"},
|
||||
{"name": "mock-bigorg", "repo": "molecule-ai/molecule-ai-org-template-mock-bigorg", "ref": "main"}
|
||||
{"name": "molecule-dev", "repo": "Molecule-AI/molecule-ai-org-template-molecule-dev", "ref": "main"},
|
||||
{"name": "free-beats-all", "repo": "Molecule-AI/molecule-ai-org-template-free-beats-all", "ref": "main"},
|
||||
{"name": "medo-smoke", "repo": "Molecule-AI/molecule-ai-org-template-medo-smoke", "ref": "main"},
|
||||
{"name": "molecule-worker-gemini", "repo": "Molecule-AI/molecule-ai-org-template-molecule-worker-gemini", "ref": "main"},
|
||||
{"name": "reno-stars", "repo": "Molecule-AI/molecule-ai-org-template-reno-stars", "ref": "main"},
|
||||
{"name": "ux-ab-lab", "repo": "Molecule-AI/molecule-ai-org-template-ux-ab-lab", "ref": "main"},
|
||||
{"name": "mock-bigorg", "repo": "Molecule-AI/molecule-ai-org-template-mock-bigorg", "ref": "main"}
|
||||
]
|
||||
}
|
||||
|
||||
@ -278,7 +278,7 @@ include = ["molecule_runtime*"]
|
||||
README_TEMPLATE = """\
|
||||
# molecule-ai-workspace-runtime
|
||||
|
||||
Shared workspace runtime for [Molecule AI](https://git.moleculesai.app/molecule-ai/molecule-core)
|
||||
Shared workspace runtime for [Molecule AI](https://github.com/Molecule-AI/molecule-core)
|
||||
agent adapters. Installed by every workspace template image
|
||||
(`workspace-template-claude-code`, `-langgraph`, `-hermes`, etc.) to provide
|
||||
A2A delegation, heartbeat, memory, plugin loading, and skill management.
|
||||
@ -376,7 +376,7 @@ hold:
|
||||
non-plugin-sourced server, which Claude Code rejects with
|
||||
`channel_enable requires a marketplace plugin`. Until the
|
||||
official `moleculesai/claude-code-plugin` marketplace lands
|
||||
(tracking [#2936](https://git.moleculesai.app/molecule-ai/molecule-core/issues/2936)),
|
||||
(tracking [#2936](https://github.com/Molecule-AI/molecule-core/issues/2936)),
|
||||
operators who want push must scaffold their own local marketplace
|
||||
under
|
||||
`~/.claude/marketplaces/molecule-local/` containing a
|
||||
@ -389,14 +389,14 @@ hold:
|
||||
Symptom of any condition failing: messages arrive but only via the
|
||||
poll path (every ~1–60s), not real-time. There's currently no
|
||||
diagnostic surfaced — `molecule-mcp doctor` (tracking
|
||||
[#2937](https://git.moleculesai.app/molecule-ai/molecule-core/issues/2937)) is
|
||||
[#2937](https://github.com/Molecule-AI/molecule-core/issues/2937)) is
|
||||
planned.
|
||||
|
||||
If you don't need real-time push, the default poll path works
|
||||
universally with no extra setup; both modes converge on the same
|
||||
`inbox_pop` ack so messages never duplicate.
|
||||
|
||||
See [`docs/workspace-runtime-package.md`](https://git.moleculesai.app/molecule-ai/molecule-core/src/branch/main/docs/workspace-runtime-package.md)
|
||||
See [`docs/workspace-runtime-package.md`](https://github.com/Molecule-AI/molecule-core/blob/main/docs/workspace-runtime-package.md)
|
||||
for the publish flow and architecture.
|
||||
"""
|
||||
|
||||
|
||||
@ -8,24 +8,27 @@
|
||||
# Requires: git, jq (lighter than python3 — ~2MB vs ~50MB in Alpine)
|
||||
#
|
||||
# Auth (optional):
|
||||
# Post-2026-05-08 (#192): every repo in manifest.json is public on
|
||||
# git.moleculesai.app. Anonymous clone works for the entire registered
|
||||
# set. The OSS-surface contract is recorded in manifest.json's _comment
|
||||
# — Layer-3 customer/private templates (e.g. reno-stars) are NOT in the
|
||||
# manifest; they are handled at provision-time via the per-tenant
|
||||
# credential resolver (internal#102 RFC).
|
||||
# When MOLECULE_GITEA_TOKEN is set, embed it as the basic-auth password so
|
||||
# private Gitea repos clone successfully. When unset, clone anonymously
|
||||
# (works only for repos that are public on git.moleculesai.app).
|
||||
#
|
||||
# MOLECULE_GITEA_TOKEN is therefore optional today. Kept supported for
|
||||
# two reasons: (a) historical CI configs that still inject
|
||||
# AUTO_SYNC_TOKEN remain harmless, (b) reserved for the case where a
|
||||
# private internal-only template is later registered via a ci-readonly
|
||||
# team grant — review must explicitly sign off on that, since it
|
||||
# violates the public-OSS-surface contract.
|
||||
# This is the path the publish-workspace-server-image.yml workflow uses:
|
||||
# it injects AUTO_SYNC_TOKEN (devops-engineer persona PAT, repo:read on
|
||||
# the molecule-ai org) so the in-CI pre-clone step succeeds for ALL
|
||||
# manifest entries — including the 5 private workspace-template-* repos
|
||||
# (codex, crewai, deepagents, gemini-cli, langgraph) and all 7
|
||||
# org-template-* repos.
|
||||
#
|
||||
# The token (when set) never enters the Docker image: this script runs
|
||||
# in the trusted CI context BEFORE `docker buildx build`, populates
|
||||
# The token never enters the Docker image: this script runs in the
|
||||
# trusted CI context BEFORE `docker buildx build`, populates
|
||||
# .tenant-bundle-deps/, then `Dockerfile.tenant` COPYs from there with
|
||||
# the .git directories already stripped (see line ~67 below).
|
||||
#
|
||||
# For backward compatibility — and so a fresh clone works without
|
||||
# secrets when (eventually) the workspace-template-* repos flip public —
|
||||
# the unset path remains a plain anonymous HTTPS clone. That path will
|
||||
# FAIL with "could not read Username" on private repos today; CI MUST
|
||||
# set MOLECULE_GITEA_TOKEN.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
@ -65,19 +68,22 @@ clone_category() {
|
||||
continue
|
||||
fi
|
||||
|
||||
# Post-2026-05-06 GitHub-org-suspension: clone from Gitea instead.
|
||||
# manifest.json paths still read "Molecule-AI/..." (the historic
|
||||
# github.com slug); Gitea lowercases the org part to "molecule-ai/".
|
||||
# Lowercase the org segment on the fly so we don't need to rewrite
|
||||
# every manifest entry.
|
||||
repo_gitea="$(echo "$repo" | awk -F/ '{ printf "%s", tolower($1); for (i=2; i<=NF; i++) printf "/%s", $i; print "" }')"
|
||||
|
||||
# Build the clone URL. When MOLECULE_GITEA_TOKEN is set (CI path)
|
||||
# embed it as basic-auth so private repos succeed. The username
|
||||
# part ("oauth2") is conventional and ignored by Gitea — only the
|
||||
# token-as-password is verified.
|
||||
#
|
||||
# manifest.json was migrated to lowercase org slugs on
|
||||
# 2026-05-07 (post-suspension reconciliation), so we use $repo
|
||||
# verbatim — no on-the-fly tolower transform needed.
|
||||
if [ -n "${MOLECULE_GITEA_TOKEN:-}" ]; then
|
||||
clone_url="https://oauth2:${MOLECULE_GITEA_TOKEN}@git.moleculesai.app/${repo}.git"
|
||||
display_url="https://oauth2:***@git.moleculesai.app/${repo}.git"
|
||||
clone_url="https://oauth2:${MOLECULE_GITEA_TOKEN}@git.moleculesai.app/${repo_gitea}.git"
|
||||
display_url="https://oauth2:***@git.moleculesai.app/${repo_gitea}.git"
|
||||
else
|
||||
clone_url="https://git.moleculesai.app/${repo}.git"
|
||||
clone_url="https://git.moleculesai.app/${repo_gitea}.git"
|
||||
display_url="$clone_url"
|
||||
fi
|
||||
|
||||
|
||||
@ -10,11 +10,11 @@
|
||||
# → PyPI auto-bumps molecule-ai-workspace-runtime patch version
|
||||
# → repository_dispatch fans out to 8 workspace-template-* repos
|
||||
# → each template repo rebuilds and re-tags
|
||||
# 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/workspace-template-<runtime>:latest
|
||||
# ghcr.io/molecule-ai/workspace-template-<runtime>:latest
|
||||
#
|
||||
# PATH 2: any merge to a workspace-template-* repo's main branch
|
||||
# → that repo's publish-image.yml fires
|
||||
# → 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/workspace-template-<runtime>:latest
|
||||
# → ghcr.io/molecule-ai/workspace-template-<runtime>:latest
|
||||
# gets re-tagged
|
||||
#
|
||||
# provisioner.go:296 RuntimeImages[runtime] reads `:latest` at every
|
||||
|
||||
@ -51,7 +51,7 @@ log "pulling latest images for: ${RUNTIMES[*]}"
|
||||
PULLED=()
|
||||
FAILED=()
|
||||
for rt in "${RUNTIMES[@]}"; do
|
||||
IMG="153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/workspace-template-$rt:latest"
|
||||
IMG="ghcr.io/molecule-ai/workspace-template-$rt:latest"
|
||||
if docker pull "$IMG" >/dev/null 2>&1; then
|
||||
log " ✓ $rt"
|
||||
PULLED+=("$rt")
|
||||
|
||||
@ -1,10 +1,9 @@
|
||||
#!/bin/bash
|
||||
# rollback-latest.sh — moves the :latest tag on the platform image
|
||||
# (and the matching tenant image) on AWS ECR back to a prior
|
||||
# :staging-<sha> digest without rebuilding anything. Prod tenants
|
||||
# auto-pull :latest every 5 min, so this is the fast path when a
|
||||
# canary-verified image turns out to have a runtime regression that
|
||||
# canary didn't catch.
|
||||
# rollback-latest.sh — moves the :latest tag on ghcr.io/molecule-ai/platform
|
||||
# (and the matching tenant image) back to a prior :staging-<sha> digest
|
||||
# without rebuilding anything. Prod tenants auto-pull :latest every 5
|
||||
# min, so this is the fast path when a canary-verified image turns out
|
||||
# to have a runtime regression that canary didn't catch.
|
||||
#
|
||||
# Usage:
|
||||
# scripts/rollback-latest.sh <sha>
|
||||
@ -13,14 +12,12 @@
|
||||
# Prereqs:
|
||||
# - crane on $PATH (brew install crane OR download from
|
||||
# https://github.com/google/go-containerregistry/releases)
|
||||
# - aws CLI authenticated for region us-east-2 with ECR pull/push
|
||||
# access to the molecule-ai/platform + platform-tenant repositories.
|
||||
# `aws sts get-caller-identity` should succeed.
|
||||
# - GHCR token exported as GITHUB_TOKEN with write:packages scope
|
||||
#
|
||||
# What it does (per image — platform + tenant):
|
||||
# crane digest <ecr>:<sha> # verify the target sha exists
|
||||
# crane tag <ecr>:<sha> latest # retag remotely, single API call
|
||||
# crane digest <ecr>:latest # confirm the move
|
||||
# crane digest ghcr.io/…:<sha> # verify the target sha exists
|
||||
# crane tag ghcr.io/…:<sha> latest # retag remotely, single API call
|
||||
# crane digest ghcr.io/…:latest # confirm the move
|
||||
#
|
||||
# Exit codes: 0 = both retagged, 1 = missing prerequisite (crane / registry credentials), tag missing, or crane error, 2 = bad args.
|
||||
|
||||
@ -33,23 +30,21 @@ if [ "${1:-}" = "" ]; then
|
||||
fi
|
||||
|
||||
TARGET_SHA="$1"
|
||||
ECR_HOST=153263036946.dkr.ecr.us-east-2.amazonaws.com
|
||||
PLATFORM=$ECR_HOST/molecule-ai/platform
|
||||
TENANT=$ECR_HOST/molecule-ai/platform-tenant
|
||||
PLATFORM=ghcr.io/molecule-ai/platform
|
||||
TENANT=ghcr.io/molecule-ai/platform-tenant
|
||||
|
||||
if ! command -v crane >/dev/null; then
|
||||
echo "ERROR: crane not installed. brew install crane" >&2
|
||||
exit 1
|
||||
fi
|
||||
if ! command -v aws >/dev/null; then
|
||||
echo "ERROR: aws CLI not installed. brew install awscli" >&2
|
||||
if [ -z "${GITHUB_TOKEN:-}" ]; then
|
||||
echo "ERROR: GITHUB_TOKEN unset. export it with write:packages scope." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Log in once. ECR auth is via short-lived password from `aws ecr
|
||||
# get-login-password`. crane stores creds in a config file keyed by
|
||||
# registry; re-running is cheap.
|
||||
aws ecr get-login-password --region us-east-2 | crane auth login "$ECR_HOST" -u AWS --password-stdin >/dev/null
|
||||
# Log in once. crane stores creds in a config file keyed by registry;
|
||||
# re-running is cheap.
|
||||
printf '%s\n' "$GITHUB_TOKEN" | crane auth login ghcr.io -u "${GITHUB_ACTOR:-$(whoami)}" --password-stdin >/dev/null
|
||||
|
||||
roll() {
|
||||
local image="$1"
|
||||
|
||||
@ -1,7 +1,5 @@
|
||||
# Production-shape local harness
|
||||
|
||||
<!-- Retrigger Harness Replays after Class G #168 + clone-manifest fix (#42). -->
|
||||
|
||||
The harness brings up the SaaS tenant topology on localhost using the
|
||||
same `Dockerfile.tenant` image that ships to production. Tests target
|
||||
the cf-proxy on `http://localhost:8080` and pass the tenant identity
|
||||
|
||||
@ -1,14 +0,0 @@
|
||||
# cf-proxy harness image — nginx + the harness's tenant-routing config baked
|
||||
# in at build time.
|
||||
#
|
||||
# Why bake (not bind-mount): on Gitea Actions / act_runner, the runner is a
|
||||
# container talking to the OUTER docker daemon over the host socket; runc
|
||||
# resolves bind-mount source paths on the outer host filesystem, where the
|
||||
# repo at `/workspace/.../tests/harness/cf-proxy/nginx.conf` is invisible.
|
||||
# Compose `configs:` (with `file:`) falls back to bind mounts when swarm is
|
||||
# not active, so it hits the same gap. A build-time COPY uploads the file
|
||||
# as part of the docker build context — the daemon receives the tarball
|
||||
# directly and never bind-mounts. See issue #88 item 2.
|
||||
FROM nginx:1.27-alpine
|
||||
|
||||
COPY nginx.conf /etc/nginx/nginx.conf
|
||||
@ -167,26 +167,15 @@ services:
|
||||
# Production shape: same single CF tunnel front-doors every tenant
|
||||
# subdomain — the Host header carries the tenant identity, not the
|
||||
# routing destination. Local cf-proxy mirrors this exactly.
|
||||
#
|
||||
# nginx.conf delivery: built into a custom image via cf-proxy/Dockerfile
|
||||
# (a thin nginx:1.27-alpine + COPY). NOT a bind mount and NOT a
|
||||
# compose `configs:` block, both of which break under Gitea's
|
||||
# act_runner: the runner talks to the OUTER docker daemon over the
|
||||
# host socket, and runc resolves bind sources on the outer host
|
||||
# filesystem, where `/workspace/.../tests/harness/cf-proxy/nginx.conf`
|
||||
# is invisible. Compose `configs:` falls back to bind mounts without
|
||||
# swarm, so it hits the same gap. A build context, by contrast, is
|
||||
# uploaded to the daemon as a tarball at build time — no bind. See
|
||||
# issue #88 item 2.
|
||||
cf-proxy:
|
||||
build:
|
||||
context: ./cf-proxy
|
||||
dockerfile: Dockerfile
|
||||
image: nginx:1.27-alpine
|
||||
depends_on:
|
||||
tenant-alpha:
|
||||
condition: service_healthy
|
||||
tenant-beta:
|
||||
condition: service_healthy
|
||||
volumes:
|
||||
- ./cf-proxy/nginx.conf:/etc/nginx/nginx.conf:ro
|
||||
# Bind to 127.0.0.1 only — hardcoded ADMIN_TOKENs make 0.0.0.0
|
||||
# exposure unsafe even on a local network.
|
||||
ports:
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
module github.com/Molecule-AI/molecule-monorepo/tests/harness/cp-stub
|
||||
module go.moleculesai.app/core/tests/harness/cp-stub
|
||||
|
||||
go 1.25
|
||||
|
||||
@ -18,7 +18,7 @@
|
||||
#
|
||||
# Or inline via curl:
|
||||
#
|
||||
# bash <(curl -fsSL https://git.moleculesai.app/molecule-ai/molecule-core/raw/branch/main/tools/check-template-parity.sh) \
|
||||
# bash <(curl -fsSL https://raw.githubusercontent.com/Molecule-AI/molecule-core/main/tools/check-template-parity.sh) \
|
||||
# install.sh start.sh
|
||||
#
|
||||
# Exit codes:
|
||||
|
||||
@ -1,49 +0,0 @@
|
||||
# air.toml — live-reload config for local docker-compose dev mode.
|
||||
#
|
||||
# Active when the platform service runs from workspace-server/Dockerfile.dev
|
||||
# (selected via docker-compose.dev.yml overlay). In production, the regular
|
||||
# Dockerfile builds a static binary; air is dev-only.
|
||||
#
|
||||
# Reference: https://github.com/air-verse/air
|
||||
|
||||
root = "."
|
||||
testdata_dir = "testdata"
|
||||
tmp_dir = "tmp"
|
||||
|
||||
[build]
|
||||
# Same build invocation as Dockerfile's builder stage minus the
|
||||
# CGO_ENABLED=0 toggle (CGO ok in dev for richer race detector output).
|
||||
cmd = "go build -o ./tmp/server ./cmd/server"
|
||||
bin = "tmp/server"
|
||||
full_bin = ""
|
||||
args_bin = []
|
||||
# Watch every .go, .yaml, and .tmpl file under workspace-server/.
|
||||
include_ext = ["go", "yaml", "tmpl"]
|
||||
# Don't watch tests, build artifacts, vendored deps, or migration .sql
|
||||
# (migrations need a clean DB anyway — handled by docker-compose down/up).
|
||||
exclude_dir = ["assets", "tmp", "vendor", "testdata", "node_modules"]
|
||||
exclude_file = []
|
||||
# _test.go and *_mock.go shouldn't trigger a rebuild — saves cycles.
|
||||
exclude_regex = ["_test\\.go$", "_mock\\.go$"]
|
||||
exclude_unchanged = true
|
||||
follow_symlink = false
|
||||
log = "build-errors.log"
|
||||
# Kill running binary 1s before starting new one.
|
||||
kill_delay = "1s"
|
||||
send_interrupt = true
|
||||
stop_on_error = true
|
||||
# Debounce: wait this long after last change before triggering rebuild.
|
||||
delay = 500
|
||||
|
||||
[log]
|
||||
time = false
|
||||
|
||||
[color]
|
||||
main = "magenta"
|
||||
watcher = "cyan"
|
||||
build = "yellow"
|
||||
runner = "green"
|
||||
|
||||
[misc]
|
||||
# Don't keep the tmp/ dir around between runs.
|
||||
clean_on_exit = true
|
||||
3
workspace-server/.gitignore
vendored
3
workspace-server/.gitignore
vendored
@ -1,5 +1,2 @@
|
||||
# The compiled binary, not the cmd/server package.
|
||||
/server
|
||||
|
||||
# air live-reload build cache (Dockerfile.dev + docker-compose.dev.yml).
|
||||
/tmp/
|
||||
|
||||
@ -23,7 +23,7 @@ COPY workspace-server/ .
|
||||
# GIT_SHA mirror of Dockerfile.tenant — see that file for the rationale.
|
||||
ARG GIT_SHA=dev
|
||||
RUN CGO_ENABLED=0 GOOS=linux go build \
|
||||
-ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \
|
||||
-ldflags "-X go.moleculesai.app/core/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \
|
||||
-o /platform ./cmd/server
|
||||
# Bundle the built-in memory-plugin-postgres binary so an operator can
|
||||
# activate Memory v2 by setting MEMORY_V2_CUTOVER=true + (default)
|
||||
@ -31,7 +31,7 @@ RUN CGO_ENABLED=0 GOOS=linux go build \
|
||||
# binary in the background; main /platform talks to it over loopback.
|
||||
# Stays inert until the operator flips the cutover env var.
|
||||
RUN CGO_ENABLED=0 GOOS=linux go build \
|
||||
-ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \
|
||||
-ldflags "-X go.moleculesai.app/core/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \
|
||||
-o /memory-plugin ./cmd/memory-plugin-postgres
|
||||
|
||||
FROM alpine:3.20
|
||||
|
||||
@ -1,44 +0,0 @@
|
||||
# Dockerfile.dev — local-development image with air-driven live reload.
|
||||
#
|
||||
# Selected by docker-compose.dev.yml (overlay over docker-compose.yml).
|
||||
# Production stays on workspace-server/Dockerfile (static binary, no air).
|
||||
#
|
||||
# Workflow:
|
||||
# 1. docker compose -f docker-compose.yml -f docker-compose.dev.yml up
|
||||
# 2. Edit any .go file under workspace-server/
|
||||
# 3. air detects, rebuilds, kills old binary, starts new one (~3-5s)
|
||||
# 4. No `docker compose up --build` needed
|
||||
#
|
||||
# Templates + plugins are NOT pre-cloned here — air-mode assumes the
|
||||
# developer's filesystem has the workspace-configs-templates/ + plugins/
|
||||
# dirs available, mounted at runtime via docker-compose.dev.yml.
|
||||
|
||||
FROM golang:1.25-alpine
|
||||
|
||||
# air + git (for go mod) + ca-certs (for TLS) + tzdata (for time-zone DB)
|
||||
# + docker-cli + docker-cli-buildx so the platform binary can shell out to
|
||||
# /var/run/docker.sock (bind-mounted from host) for local-build provisioning.
|
||||
# docker-cli alone is insufficient: alpine's docker-cli enables BuildKit by
|
||||
# default but ships without buildx, producing
|
||||
# `ERROR: BuildKit is enabled but the buildx component is missing or broken`
|
||||
# on every `docker build`. docker-cli-buildx provides the buildx subcommand.
|
||||
RUN apk add --no-cache git ca-certificates tzdata wget docker-cli docker-cli-buildx \
|
||||
&& go install github.com/air-verse/air@latest
|
||||
|
||||
WORKDIR /app/workspace-server
|
||||
|
||||
# Pre-fetch deps so the first `air` rebuild on a fresh container is fast.
|
||||
# These are bind-mount-overridden at runtime, so the COPY here is just
|
||||
# to warm the module cache.
|
||||
COPY workspace-server/go.mod workspace-server/go.sum ./
|
||||
RUN go mod download
|
||||
|
||||
# Source is bind-mounted at runtime (see docker-compose.dev.yml volumes
|
||||
# block) so the Dockerfile doesn't need to COPY it. air watches the
|
||||
# bind-mounted dir for changes.
|
||||
|
||||
ENV CGO_ENABLED=0
|
||||
ENV GOFLAGS="-buildvcs=false"
|
||||
|
||||
# Run air with the .air.toml in the bind-mounted source dir.
|
||||
CMD ["air", "-c", ".air.toml"]
|
||||
@ -53,14 +53,14 @@ COPY workspace-server/ .
|
||||
# fails closed — which is the correct fail-direction (#2395 root fix).
|
||||
ARG GIT_SHA=dev
|
||||
RUN CGO_ENABLED=0 GOOS=linux go build \
|
||||
-ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \
|
||||
-ldflags "-X go.moleculesai.app/core/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \
|
||||
-o /platform ./cmd/server
|
||||
# Memory v2 sidecar binary (Memory v2 #2728). Bundled so an operator
|
||||
# can activate cutover by flipping MEMORY_V2_CUTOVER=true without
|
||||
# provisioning a separate service. See entrypoint-tenant.sh for the
|
||||
# launch logic.
|
||||
RUN CGO_ENABLED=0 GOOS=linux go build \
|
||||
-ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \
|
||||
-ldflags "-X go.moleculesai.app/core/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \
|
||||
-o /memory-plugin ./cmd/memory-plugin-postgres
|
||||
|
||||
# ── Stage 2: Canvas Next.js standalone ────────────────────────────────
|
||||
|
||||
@ -30,9 +30,9 @@ import (
|
||||
|
||||
_ "github.com/lib/pq"
|
||||
|
||||
mclient "github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/client"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/contract"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/namespace"
|
||||
mclient "go.moleculesai.app/core/platform/internal/memory/client"
|
||||
"go.moleculesai.app/core/platform/internal/memory/contract"
|
||||
"go.moleculesai.app/core/platform/internal/memory/namespace"
|
||||
)
|
||||
|
||||
const defaultLimit = 1000000 // effectively unlimited; cap keeps SQL pageable
|
||||
|
||||
@ -10,8 +10,8 @@ import (
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/contract"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/namespace"
|
||||
"go.moleculesai.app/core/platform/internal/memory/contract"
|
||||
"go.moleculesai.app/core/platform/internal/memory/namespace"
|
||||
)
|
||||
|
||||
// stubBackfillPlugin records calls for assertions.
|
||||
|
||||
@ -20,8 +20,8 @@ import (
|
||||
"math/rand"
|
||||
"os"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/contract"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||
"go.moleculesai.app/core/platform/internal/memory/contract"
|
||||
"go.moleculesai.app/core/platform/internal/textutil"
|
||||
)
|
||||
|
||||
// verifyConfig is the typed dependency bundle for verifyParity.
|
||||
|
||||
@ -9,7 +9,7 @@ import (
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/contract"
|
||||
"go.moleculesai.app/core/platform/internal/memory/contract"
|
||||
)
|
||||
|
||||
// stubVerifyPlugin records search calls and returns canned results.
|
||||
|
||||
@ -45,8 +45,8 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
mclient "github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/client"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/contract"
|
||||
mclient "go.moleculesai.app/core/platform/internal/memory/client"
|
||||
"go.moleculesai.app/core/platform/internal/memory/contract"
|
||||
)
|
||||
|
||||
const (
|
||||
|
||||
@ -25,7 +25,7 @@ import (
|
||||
|
||||
_ "github.com/lib/pq"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/pgplugin"
|
||||
"go.moleculesai.app/core/platform/internal/memory/pgplugin"
|
||||
)
|
||||
|
||||
// migrationsFS bundles the .up.sql files into the binary at build time
|
||||
|
||||
@ -12,28 +12,28 @@ import (
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/channels"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/crypto"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/handlers"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/imagewatch"
|
||||
memwiring "github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/wiring"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/middleware"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/pendinguploads"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/registry"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/router"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/scheduler"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/supervised"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/ws"
|
||||
"go.moleculesai.app/core/platform/internal/channels"
|
||||
"go.moleculesai.app/core/platform/internal/crypto"
|
||||
"go.moleculesai.app/core/platform/internal/db"
|
||||
"go.moleculesai.app/core/platform/internal/events"
|
||||
"go.moleculesai.app/core/platform/internal/handlers"
|
||||
"go.moleculesai.app/core/platform/internal/imagewatch"
|
||||
memwiring "go.moleculesai.app/core/platform/internal/memory/wiring"
|
||||
"go.moleculesai.app/core/platform/internal/middleware"
|
||||
"go.moleculesai.app/core/platform/internal/pendinguploads"
|
||||
"go.moleculesai.app/core/platform/internal/provisioner"
|
||||
"go.moleculesai.app/core/platform/internal/registry"
|
||||
"go.moleculesai.app/core/platform/internal/router"
|
||||
"go.moleculesai.app/core/platform/internal/scheduler"
|
||||
"go.moleculesai.app/core/platform/internal/supervised"
|
||||
"go.moleculesai.app/core/platform/internal/ws"
|
||||
|
||||
// External plugins — each registers EnvMutator(s) that run at workspace
|
||||
// provision time. Loaded via soft-dep gates in main() so self-hosters
|
||||
// without per-agent identity configured keep working.
|
||||
ghidentity "github.com/Molecule-AI/molecule-ai-plugin-gh-identity/pluginloader"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/pkg/provisionhook"
|
||||
"go.moleculesai.app/core/platform/pkg/provisionhook"
|
||||
)
|
||||
|
||||
func main() {
|
||||
@ -249,19 +249,6 @@ func main() {
|
||||
})
|
||||
}
|
||||
|
||||
// CP-mode orphan sweeper — SaaS counterpart to the Docker sweeper
|
||||
// above. Re-issues cpProv.Stop for any workspace at status='removed'
|
||||
// with a non-NULL instance_id, healing the deprovision split-write
|
||||
// race documented in #2989: tenant marks status='removed' BEFORE
|
||||
// calling CP DELETE, so a transient CP failure leaves the EC2
|
||||
// running with no retry path. cpProv.Stop is idempotent against
|
||||
// already-terminated instances; on success we clear instance_id.
|
||||
if cpProv != nil {
|
||||
go supervised.RunWithRecover(ctx, "cp-orphan-sweeper", func(c context.Context) {
|
||||
registry.StartCPOrphanSweeper(c, cpProv)
|
||||
})
|
||||
}
|
||||
|
||||
// Pending-uploads GC sweep — deletes acked rows past their retention
|
||||
// window plus unacked rows past expires_at. Without this the
|
||||
// pending_uploads table grows unbounded; even with the 24h hard TTL,
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
module github.com/Molecule-AI/molecule-monorepo/platform
|
||||
module go.moleculesai.app/core/platform
|
||||
|
||||
go 1.25.0
|
||||
|
||||
|
||||
@ -8,7 +8,7 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/artifacts"
|
||||
"go.moleculesai.app/core/platform/internal/artifacts"
|
||||
)
|
||||
|
||||
// cfEnvelope wraps a result value in the Cloudflare v4 response envelope.
|
||||
@ -250,7 +250,7 @@ func TestImportRepo_Success(t *testing.T) {
|
||||
|
||||
client := newTestClient(t, mux)
|
||||
repo, err := client.ImportRepo(context.Background(), "imported", artifacts.ImportRepoRequest{
|
||||
URL: "https://github.com/Molecule-AI/molecule-core.git",
|
||||
URL: "https://git.moleculesai.app/molecule-ai/molecule-core.git",
|
||||
Branch: "main",
|
||||
Depth: 1,
|
||||
})
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
//
|
||||
// Set at link time:
|
||||
//
|
||||
// go build -ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=<sha>"
|
||||
// go build -ldflags "-X go.moleculesai.app/core/platform/internal/buildinfo.GitSHA=<sha>"
|
||||
//
|
||||
// CI passes ${{ github.sha }} via Dockerfile.tenant ARG GIT_SHA; local
|
||||
// dev builds default to "dev" so unset never reads as success.
|
||||
|
||||
@ -6,7 +6,7 @@ import (
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo"
|
||||
"go.moleculesai.app/core/platform/internal/buildinfo"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
|
||||
@ -11,8 +11,8 @@ import (
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
"go.moleculesai.app/core/platform/internal/db"
|
||||
"go.moleculesai.app/core/platform/internal/provisioner"
|
||||
"github.com/docker/docker/api/types/container"
|
||||
"github.com/docker/docker/client"
|
||||
"github.com/docker/docker/pkg/stdcopy"
|
||||
|
||||
@ -5,10 +5,10 @@ import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
"go.moleculesai.app/core/platform/internal/db"
|
||||
"go.moleculesai.app/core/platform/internal/events"
|
||||
"go.moleculesai.app/core/platform/internal/models"
|
||||
"go.moleculesai.app/core/platform/internal/provisioner"
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
|
||||
@ -6,7 +6,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"go.moleculesai.app/core/platform/internal/db"
|
||||
)
|
||||
|
||||
// ==================== Adapter Interface Tests ====================
|
||||
|
||||
@ -9,8 +9,8 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||
"go.moleculesai.app/core/platform/internal/db"
|
||||
"go.moleculesai.app/core/platform/internal/events"
|
||||
)
|
||||
|
||||
const (
|
||||
|
||||
@ -29,7 +29,7 @@ import (
|
||||
"encoding/base64"
|
||||
"strings"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/crypto"
|
||||
"go.moleculesai.app/core/platform/internal/crypto"
|
||||
)
|
||||
|
||||
// sensitiveFields is the set of channel_config keys that get encrypted at
|
||||
|
||||
@ -4,7 +4,7 @@ import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/crypto"
|
||||
"go.moleculesai.app/core/platform/internal/crypto"
|
||||
)
|
||||
|
||||
// withTestEncryptionKey installs a deterministic 32-byte key for the
|
||||
|
||||
@ -18,7 +18,7 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
const moduleInternalPrefix = "github.com/Molecule-AI/molecule-monorepo/platform/internal/"
|
||||
const moduleInternalPrefix = "go.moleculesai.app/core/platform/internal/"
|
||||
|
||||
func TestDBHasNoInternalDependencies(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@ -7,9 +7,9 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/ws"
|
||||
"go.moleculesai.app/core/platform/internal/db"
|
||||
"go.moleculesai.app/core/platform/internal/models"
|
||||
"go.moleculesai.app/core/platform/internal/ws"
|
||||
"github.com/redis/go-redis/v9"
|
||||
)
|
||||
|
||||
|
||||
@ -20,12 +20,12 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/registry"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
|
||||
"go.moleculesai.app/core/platform/internal/db"
|
||||
"go.moleculesai.app/core/platform/internal/events"
|
||||
"go.moleculesai.app/core/platform/internal/models"
|
||||
"go.moleculesai.app/core/platform/internal/provisioner"
|
||||
"go.moleculesai.app/core/platform/internal/registry"
|
||||
"go.moleculesai.app/core/platform/internal/wsauth"
|
||||
"github.com/gin-gonic/gin"
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
@ -13,10 +13,10 @@ import (
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
|
||||
"go.moleculesai.app/core/platform/internal/db"
|
||||
"go.moleculesai.app/core/platform/internal/events"
|
||||
"go.moleculesai.app/core/platform/internal/models"
|
||||
"go.moleculesai.app/core/platform/internal/wsauth"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
|
||||
@ -9,8 +9,8 @@ import (
|
||||
"testing"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
"go.moleculesai.app/core/platform/internal/models"
|
||||
"go.moleculesai.app/core/platform/internal/provisioner"
|
||||
)
|
||||
|
||||
// preflightLocalProv is a controllable LocalProvisionerAPI stub for the
|
||||
|
||||
@ -16,8 +16,8 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
"go.moleculesai.app/core/platform/internal/models"
|
||||
"go.moleculesai.app/core/platform/internal/provisioner"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
|
||||
@ -20,9 +20,9 @@ import (
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||
"go.moleculesai.app/core/platform/internal/db"
|
||||
"go.moleculesai.app/core/platform/internal/events"
|
||||
"go.moleculesai.app/core/platform/internal/textutil"
|
||||
)
|
||||
|
||||
// extractIdempotencyKey pulls params.message.messageId out of an A2A JSON-RPC
|
||||
|
||||
@ -42,8 +42,8 @@ import (
|
||||
"log"
|
||||
"net/http"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
|
||||
"go.moleculesai.app/core/platform/internal/db"
|
||||
"go.moleculesai.app/core/platform/internal/wsauth"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
|
||||
@ -19,7 +19,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"go.moleculesai.app/core/platform/internal/db"
|
||||
"github.com/alicebob/miniredis/v2"
|
||||
)
|
||||
|
||||
|
||||
@ -12,8 +12,8 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||
"go.moleculesai.app/core/platform/internal/db"
|
||||
"go.moleculesai.app/core/platform/internal/events"
|
||||
"github.com/gin-gonic/gin"
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user