Compare commits
27 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 0001259d21 | |||
| 1e6b61ad3e | |||
| b4b8f4f8c6 | |||
| b8982ac517 | |||
| 6f2274d62b | |||
| af05bf4f77 | |||
| c408b17477 | |||
| 025fa6b129 | |||
| ffe26a192a | |||
| e76cea2593 | |||
| 9e984c24cb | |||
| 29aff2be96 | |||
| 9d2b46fde8 | |||
| acfee37d22 | |||
| 4b232304ec | |||
| 932fc45945 | |||
| 913a5f8409 | |||
| 13578678c7 | |||
| 4f1ad1d07e | |||
| 49c1756407 | |||
| 81cc307f81 | |||
| 0809abd7bb | |||
| b4928e6f81 | |||
| 40d0493556 | |||
| ec5d5c33bd | |||
| bf2387fa2d | |||
| 3208d4d463 |
@@ -466,12 +466,40 @@ def fetch_log(target_url: str) -> str | None:
|
||||
|
||||
def grep_fail_markers(log_text: str) -> list[str]:
|
||||
"""Return up to 5 sample matching lines for any FAIL_PATTERNS hit.
|
||||
Empty list = clean log."""
|
||||
Empty list = clean log.
|
||||
|
||||
Heuristic: skip lines where the marker appears inside script source
|
||||
(e.g. ``echo "::error::..."`` in a ``::group::Run`` block) rather
|
||||
than actual execution output. The Gitea Actions log prints the raw
|
||||
script before executing it; ``echo "::error::"`` lines in that
|
||||
display are false positives.
|
||||
"""
|
||||
matches: list[str] = []
|
||||
in_run_group = False
|
||||
group_depth = 0
|
||||
for line in log_text.splitlines():
|
||||
stripped = line.strip()
|
||||
# Track Gitea Actions group markers so we can skip the
|
||||
# ``::group::Run`` script-source display blocks.
|
||||
if stripped.startswith("::group::Run"):
|
||||
in_run_group = True
|
||||
group_depth = 1
|
||||
continue
|
||||
if stripped == "::endgroup::":
|
||||
if in_run_group:
|
||||
in_run_group = False
|
||||
group_depth = 0
|
||||
continue
|
||||
if in_run_group:
|
||||
continue
|
||||
for pat in FAIL_PATTERNS:
|
||||
if pat in line:
|
||||
# Truncate to keep error output bounded.
|
||||
# Additional false-positive guard: ``echo "::error::"``
|
||||
# is script source, not a runtime error emission.
|
||||
if pat == "::error::":
|
||||
prefix = line[: line.index(pat)].strip()
|
||||
if prefix.endswith('echo') or prefix.endswith("echo '") or prefix.endswith('echo "'):
|
||||
break
|
||||
matches.append(line.strip()[:240])
|
||||
break
|
||||
if len(matches) >= 5:
|
||||
|
||||
@@ -123,8 +123,9 @@ jobs:
|
||||
# integration). See internal#512 for the class defect.
|
||||
runs-on: docker-host
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
# mc#1982: mask removed. If regressions appear, root-fix the underlying
|
||||
# test — do NOT renew the mask silently.
|
||||
continue-on-error: false
|
||||
outputs:
|
||||
api: ${{ steps.decide.outputs.api }}
|
||||
steps:
|
||||
@@ -160,8 +161,9 @@ jobs:
|
||||
# detect-changes for the full rationale.
|
||||
runs-on: docker-host
|
||||
# Phase 3 (RFC #219 §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
# mc#1982: mask removed. If regressions appear, root-fix the underlying
|
||||
# test — do NOT renew the mask silently.
|
||||
continue-on-error: false
|
||||
timeout-minutes: 15
|
||||
env:
|
||||
# Unique per-run container names so concurrent runs on the host-
|
||||
|
||||
@@ -88,8 +88,9 @@ jobs:
|
||||
# surprises and keeps the routing rule discoverable in one place.
|
||||
runs-on: docker-host
|
||||
# mc#1982 Phase 3 (RFC §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
# mc#1982: mask removed. If regressions appear, root-fix the underlying
|
||||
# test — do NOT renew the mask silently.
|
||||
continue-on-error: false
|
||||
outputs:
|
||||
handlers: ${{ steps.filter.outputs.handlers }}
|
||||
steps:
|
||||
@@ -119,8 +120,9 @@ jobs:
|
||||
# exists). See detect-changes for the full routing rationale.
|
||||
runs-on: docker-host
|
||||
# mc#1982 Phase 3 (RFC §1): surface broken workflows without blocking.
|
||||
# mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
|
||||
continue-on-error: true
|
||||
# mc#1982: mask removed. If regressions appear, root-fix the underlying
|
||||
# test — do NOT renew the mask silently.
|
||||
continue-on-error: false
|
||||
env:
|
||||
# Unique name per run so concurrent jobs don't collide on the
|
||||
# bridge network. ${RUN_ID}-${RUN_ATTEMPT} is unique even across
|
||||
|
||||
@@ -49,37 +49,56 @@ jobs:
|
||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
||||
steps:
|
||||
- name: Identify runner
|
||||
id: identify
|
||||
continue-on-error: true
|
||||
run: |
|
||||
set -eu
|
||||
echo "arch=$(uname -m)"
|
||||
echo "kernel=$(uname -sr)"
|
||||
echo "shell=$BASH_VERSION"
|
||||
# Sanity: must actually be arm64. If amd64 sneaks in here,
|
||||
# fail fast — that means the label routing is wrong.
|
||||
# the job skips gracefully rather than hard-failing, because
|
||||
# a mislabelled runner is an ops concern, not a code defect.
|
||||
# Pilot lane must not make main red (#2146).
|
||||
case "$(uname -m)" in
|
||||
aarch64|arm64) echo "arm64 confirmed" ;;
|
||||
*) echo "ERROR: expected arm64, got $(uname -m)"; exit 1 ;;
|
||||
aarch64|arm64)
|
||||
echo "arm64 confirmed"
|
||||
echo "arm64=true" >> "$GITHUB_OUTPUT"
|
||||
;;
|
||||
*)
|
||||
echo "ERROR: expected arm64, got $(uname -m) — label routing may be wrong"
|
||||
echo "arm64=false" >> "$GITHUB_OUTPUT"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
- name: Checkout
|
||||
if: steps.identify.outputs.arm64 == 'true'
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Install shellcheck (arm64)
|
||||
if: steps.identify.outputs.arm64 == 'true'
|
||||
continue-on-error: true
|
||||
run: |
|
||||
set -eu
|
||||
if command -v shellcheck >/dev/null 2>&1; then
|
||||
echo "shellcheck already present: $(shellcheck --version | head -1)"
|
||||
else
|
||||
# Prefer apt if the runner base ships it; else download arm64 binary.
|
||||
# Prefer apt if the runner base ships it; else download the
|
||||
# correct platform binary (darwin vs linux).
|
||||
if command -v apt-get >/dev/null 2>&1; then
|
||||
sudo apt-get update -qq
|
||||
sudo apt-get install -y --no-install-recommends shellcheck
|
||||
else
|
||||
SC_VER=v0.10.0
|
||||
curl -fsSL "https://github.com/koalaman/shellcheck/releases/download/${SC_VER}/shellcheck-${SC_VER}.linux.aarch64.tar.xz" \
|
||||
if [ "$(uname -s)" = "Darwin" ]; then
|
||||
SC_PKG="shellcheck-${SC_VER}.darwin.aarch64.tar.xz"
|
||||
else
|
||||
SC_PKG="shellcheck-${SC_VER}.linux.aarch64.tar.xz"
|
||||
fi
|
||||
curl -fsSL "https://github.com/koalaman/shellcheck/releases/download/${SC_VER}/${SC_PKG}" \
|
||||
| tar -xJf - --strip-components=1
|
||||
sudo mv shellcheck /usr/local/bin/
|
||||
fi
|
||||
@@ -87,14 +106,15 @@ jobs:
|
||||
shellcheck --version | head -2
|
||||
|
||||
- name: Run shellcheck on .gitea/scripts/*.sh
|
||||
if: steps.identify.outputs.arm64 == 'true'
|
||||
continue-on-error: true
|
||||
run: |
|
||||
set -eu
|
||||
# Only the scripts we control under .gitea/scripts. Pilot
|
||||
# scope is intentionally narrow — broaden in a follow-up
|
||||
# once the lane is proven.
|
||||
if ! command -v shellcheck >/dev/null 2>&1; then
|
||||
echo "WARN: shellcheck binary not found — skipping (pilot mode)"
|
||||
if ! command -v shellcheck >/dev/null 2>&1 || ! shellcheck --version >/dev/null 2>&1; then
|
||||
echo "WARN: shellcheck not functional — skipping (pilot mode)"
|
||||
exit 0
|
||||
fi
|
||||
# NOTE: macOS ships Bash 3.2 (Apple license), no `mapfile`
|
||||
|
||||
@@ -26,11 +26,12 @@ import (
|
||||
// the update cycle — no ssh, no re-provision, no ops toil.
|
||||
//
|
||||
// Contract (paired with cp-side GET /cp/tenants/config):
|
||||
// Request: GET {MOLECULE_CP_URL or https://api.moleculesai.app}/cp/tenants/config
|
||||
// Authorization: Bearer <ADMIN_TOKEN>
|
||||
// X-Molecule-Org-Id: <MOLECULE_ORG_ID>
|
||||
// Response: 200 {"MOLECULE_CP_SHARED_SECRET":"…","MOLECULE_CP_URL":"…", …}
|
||||
// 401 on bearer mismatch or unknown org
|
||||
//
|
||||
// Request: GET {MOLECULE_CP_URL or https://api.moleculesai.app}/cp/tenants/config
|
||||
// Authorization: Bearer <ADMIN_TOKEN>
|
||||
// X-Molecule-Org-Id: <MOLECULE_ORG_ID>
|
||||
// Response: 200 {"MOLECULE_CP_SHARED_SECRET":"…","MOLECULE_CP_URL":"…", …}
|
||||
// 401 on bearer mismatch or unknown org
|
||||
//
|
||||
// Best-effort: any failure logs and returns — main() keeps booting.
|
||||
// Self-hosted deploys without MOLECULE_ORG_ID or ADMIN_TOKEN set
|
||||
@@ -105,3 +106,53 @@ func refreshEnvFromCP() error {
|
||||
log.Printf("CP env refresh: applied %d values from %s/cp/tenants/config", applied, base)
|
||||
return nil
|
||||
}
|
||||
|
||||
// requiredLLMEnvVars is the set of LLM proxy env vars a managed SaaS
|
||||
// tenant must have populated after refreshEnvFromCP. cp#469 (tenant
|
||||
// proxy-env delivery) — guaranteed CP-delivered creds reach the
|
||||
// tenant process env on boot. Per Researcher Task #37 / Spec 2 and
|
||||
// Task #46 (watch-fail-first test).
|
||||
//
|
||||
// Key set byte-matched against Researcher's verified emission in
|
||||
// controlplane tenant_config.go:140-144 (Researcher REQUEST_CHANGES
|
||||
// iterate body, 3987f59c). The four keys below ARE the LLM-proxy
|
||||
// subset of the 8 CP-emitted keys; OPENAI_BASE_URL / OPENAI_API_KEY /
|
||||
// ANTHROPIC_BASE_URL / ANTHROPIC_API_KEY are out of scope for cp#469
|
||||
// (different feature surfaces — direct-to-provider fallbacks, not
|
||||
// the proxy). v2 fix: MOLECULE_LLM_USAGE_TOKEN, MOLECULE_LLM_USAGE_URL,
|
||||
// MOLECULE_LLM_BASE_URL, MOLECULE_LLM_ANTHROPIC_BASE_URL — note the
|
||||
// 4th key is namespaced MOLECULE_LLM_ANTHROPIC_BASE_URL, NOT bare
|
||||
// ANTHROPIC_BASE_URL. Bare ANTHROPIC_BASE_URL is a separate CP-emitted
|
||||
// key for direct-provider use, not the LLM proxy.
|
||||
var requiredLLMEnvVars = []string{
|
||||
"MOLECULE_LLM_USAGE_TOKEN",
|
||||
"MOLECULE_LLM_USAGE_URL", // CRITICAL fix v2: was MOLECULE_LLM_URL in v1
|
||||
"MOLECULE_LLM_BASE_URL",
|
||||
"MOLECULE_LLM_ANTHROPIC_BASE_URL", // CRITICAL fix v3: was ANTHROPIC_BASE_URL in v2 (different key!)
|
||||
}
|
||||
|
||||
// assertManagedTenantHasLLMEnv verifies that, when running as a
|
||||
// managed SaaS tenant (MOLECULE_ORG_ID + ADMIN_TOKEN both set), all
|
||||
// required LLM proxy env vars are populated after refreshEnvFromCP.
|
||||
//
|
||||
// Self-hosted (no orgID/adminToken) is exempt — dev must not be
|
||||
// blocked here. Managed tenants with missing LLM keys fail with
|
||||
// MISSING_CP_LLM_ENV so they do not silently boot with broken proxy
|
||||
// creds. Caller in main.go decides whether to log and continue or
|
||||
// log.Fatalf depending on deployment context.
|
||||
func assertManagedTenantHasLLMEnv() error {
|
||||
if os.Getenv("MOLECULE_ORG_ID") == "" || os.Getenv("ADMIN_TOKEN") == "" {
|
||||
// Self-hosted dev / not yet provisioned — not a managed tenant.
|
||||
return nil
|
||||
}
|
||||
var missing []string
|
||||
for _, k := range requiredLLMEnvVars {
|
||||
if os.Getenv(k) == "" {
|
||||
missing = append(missing, k)
|
||||
}
|
||||
}
|
||||
if len(missing) > 0 {
|
||||
return fmt.Errorf("MISSING_CP_LLM_ENV: required LLM proxy keys not set after refreshEnvFromCP: %v", missing)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@@ -59,6 +60,138 @@ func TestRefreshEnvFromCP_AppliesCPResponse(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestRefreshEnvFromCP_ManagedTenantRequiresLLMKeys: watch-fail-first
|
||||
// per Researcher Task #46. When running as a managed tenant
|
||||
// (MOLECULE_ORG_ID + ADMIN_TOKEN set), missing LLM proxy env vars
|
||||
// after refreshEnvFromCP MUST surface as MISSING_CP_LLM_ENV, not be
|
||||
// silently accepted. Without this guard, a CP that loses its LLM
|
||||
// creds (e.g. during an incident) would let a tenant boot and then
|
||||
// fail later at first LLM call — worse than a loud refusal here.
|
||||
func TestRefreshEnvFromCP_ManagedTenantRequiresLLMKeys(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
// Stub CP returns a CP response WITHOUT any of the required
|
||||
// LLM keys — simulates the failure mode where the CP side
|
||||
// dropped or never had the LLM creds for this org.
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
fmt.Fprint(w, `{"MOLECULE_CP_SHARED_SECRET":"x","MOLECULE_CP_URL":"https://api.moleculesai.app"}`)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
t.Setenv("MOLECULE_ORG_ID", "org-managed-1")
|
||||
t.Setenv("ADMIN_TOKEN", "admin-tok")
|
||||
t.Setenv("MOLECULE_CP_URL", srv.URL)
|
||||
// Clear all LLM keys to simulate the boot-without-LLM-env failure mode.
|
||||
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "")
|
||||
t.Setenv("MOLECULE_LLM_USAGE_URL", "")
|
||||
t.Setenv("MOLECULE_LLM_BASE_URL", "")
|
||||
t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "")
|
||||
|
||||
// refreshEnvFromCP itself should succeed — CP is reachable, returned 200.
|
||||
if err := refreshEnvFromCP(); err != nil {
|
||||
t.Fatalf("refreshEnvFromCP: %v", err)
|
||||
}
|
||||
// The boot assertion must catch the missing LLM keys.
|
||||
err := assertManagedTenantHasLLMEnv()
|
||||
if err == nil {
|
||||
t.Fatal("expected MISSING_CP_LLM_ENV error for managed tenant without LLM keys, got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "MISSING_CP_LLM_ENV") {
|
||||
t.Errorf("expected error to contain MISSING_CP_LLM_ENV, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRefreshEnvFromCP_ManagedTenantHappyPath: when the CP returns
|
||||
// all 4 LLM-proxy keys, the gate must PASS — no MISSING_CP_LLM_ENV
|
||||
// for a properly-configured managed tenant. Watch-fail counterpart
|
||||
// to TestRefreshEnvFromCP_ManagedTenantRequiresLLMKeys: if THIS test
|
||||
// ever fires MISSING_CP_LLM_ENV on the byte-correct key set, the
|
||||
// requiredLLMEnvVars list has drifted from the CP emission again.
|
||||
// Per Researcher REQUEST_CHANGES TEST ADEQUACY note.
|
||||
func TestRefreshEnvFromCP_ManagedTenantHappyPath(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
// Return ALL 4 LLM-proxy keys — names byte-matched to
|
||||
// tenant_config.go:140-144 CP emission.
|
||||
fmt.Fprint(w, `{"MOLECULE_LLM_USAGE_TOKEN":"tok-1","MOLECULE_LLM_USAGE_URL":"https://llm.example.com/usage","MOLECULE_LLM_BASE_URL":"https://llm.example.com","MOLECULE_LLM_ANTHROPIC_BASE_URL":"https://llm.example.com/anthropic"}`)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
t.Setenv("MOLECULE_ORG_ID", "org-managed-happy")
|
||||
t.Setenv("ADMIN_TOKEN", "admin-tok")
|
||||
t.Setenv("MOLECULE_CP_URL", srv.URL)
|
||||
// Pre-clear so we can verify the refresh actually populated them.
|
||||
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "")
|
||||
t.Setenv("MOLECULE_LLM_USAGE_URL", "")
|
||||
t.Setenv("MOLECULE_LLM_BASE_URL", "")
|
||||
t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "")
|
||||
|
||||
if err := refreshEnvFromCP(); err != nil {
|
||||
t.Fatalf("refreshEnvFromCP: %v", err)
|
||||
}
|
||||
// Sanity: refresh actually applied the keys.
|
||||
if got := os.Getenv("MOLECULE_LLM_USAGE_TOKEN"); got != "tok-1" {
|
||||
t.Errorf("refresh did not apply USAGE_TOKEN: got %q", got)
|
||||
}
|
||||
// The boot assertion must pass — no MISSING_CP_LLM_ENV.
|
||||
if err := assertManagedTenantHasLLMEnv(); err != nil {
|
||||
t.Errorf("managed happy path must not MISSING_CP_LLM_ENV, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRefreshEnvFromCP_ManagedTenantPartialEnv: when the CP returns
|
||||
// 3 of 4 LLM-proxy keys (one missing), the gate must STILL catch it
|
||||
// and the error must name the missing key. Per Researcher
|
||||
// REQUEST_CHANGES TEST ADEQUACY note — partial-env coverage is
|
||||
// critical because the production failure mode is usually "one
|
||||
// key dropped" not "all keys dropped".
|
||||
func TestRefreshEnvFromCP_ManagedTenantPartialEnv(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
// 3 of 4 — MOLECULE_LLM_ANTHROPIC_BASE_URL is missing.
|
||||
fmt.Fprint(w, `{"MOLECULE_LLM_USAGE_TOKEN":"tok-1","MOLECULE_LLM_USAGE_URL":"https://llm.example.com/usage","MOLECULE_LLM_BASE_URL":"https://llm.example.com"}`)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
t.Setenv("MOLECULE_ORG_ID", "org-managed-partial")
|
||||
t.Setenv("ADMIN_TOKEN", "admin-tok")
|
||||
t.Setenv("MOLECULE_CP_URL", srv.URL)
|
||||
// Pre-clear all 4 so the 3 that come back from CP are the only
|
||||
// ones set; the 4th (MOLECULE_LLM_ANTHROPIC_BASE_URL) stays empty.
|
||||
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "")
|
||||
t.Setenv("MOLECULE_LLM_USAGE_URL", "")
|
||||
t.Setenv("MOLECULE_LLM_BASE_URL", "")
|
||||
t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "")
|
||||
|
||||
if err := refreshEnvFromCP(); err != nil {
|
||||
t.Fatalf("refreshEnvFromCP: %v", err)
|
||||
}
|
||||
err := assertManagedTenantHasLLMEnv()
|
||||
if err == nil {
|
||||
t.Fatal("expected MISSING_CP_LLM_ENV for partial env (3 of 4 keys), got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "MISSING_CP_LLM_ENV") {
|
||||
t.Errorf("expected error to contain MISSING_CP_LLM_ENV, got: %v", err)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "MOLECULE_LLM_ANTHROPIC_BASE_URL") {
|
||||
t.Errorf("expected error to name the missing key MOLECULE_LLM_ANTHROPIC_BASE_URL, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestAssertManagedTenantHasLLMEnv_NotManagedIsNoop: self-hosted
|
||||
// (no orgID/adminToken) must NOT block on missing LLM keys — dev
|
||||
// ergonomics matter and the assertion's contract is "managed only".
|
||||
func TestAssertManagedTenantHasLLMEnv_NotManagedIsNoop(t *testing.T) {
|
||||
t.Setenv("MOLECULE_ORG_ID", "")
|
||||
t.Setenv("ADMIN_TOKEN", "")
|
||||
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "")
|
||||
t.Setenv("MOLECULE_LLM_USAGE_URL", "")
|
||||
t.Setenv("MOLECULE_LLM_BASE_URL", "")
|
||||
t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "")
|
||||
if err := assertManagedTenantHasLLMEnv(); err != nil {
|
||||
t.Errorf("self-hosted (not managed) must not block, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRefreshEnvFromCP_CPUnreachableDoesNotFailBoot: network errors must
|
||||
// return non-nil BUT main.go treats that as warn-and-continue. We assert
|
||||
// the function returns an error (not a panic) so the caller can log.
|
||||
|
||||
@@ -82,6 +82,16 @@ func main() {
|
||||
log.Printf("CP env refresh: %v (continuing with baked-in env)", err)
|
||||
}
|
||||
|
||||
// Managed-tenant boot assertion (cp#469 — tenant proxy-env delivery).
|
||||
// If we're a managed SaaS tenant (orgID + adminToken set), all required
|
||||
// LLM proxy env vars must be present after refresh. Missing keys block
|
||||
// the tenant from booting with broken LLM creds — silent-fail is worse
|
||||
// than a loud refusal. Self-hosted (no orgID/adminToken) short-circuits
|
||||
// inside the assertion, so this never fires for dev.
|
||||
if err := assertManagedTenantHasLLMEnv(); err != nil {
|
||||
log.Fatalf("Managed tenant boot assertion: %v", err)
|
||||
}
|
||||
|
||||
// Secrets encryption. In MOLECULE_ENV=prod, boot refuses to start
|
||||
// without a valid SECRETS_ENCRYPTION_KEY (fail-secure — Top-5 #5).
|
||||
// In any other environment, missing keys just log a warning and
|
||||
@@ -359,7 +369,6 @@ func main() {
|
||||
// (WorkspaceHandler.BootstrapFailed) wires its own capture inline.
|
||||
registry.BootFailureRescueHook = handlers.BootFailureRescueHook
|
||||
|
||||
|
||||
// Provision-timeout sweep — flips workspaces that have been stuck in
|
||||
// status='provisioning' past the timeout window to 'failed' and emits
|
||||
// WORKSPACE_PROVISION_TIMEOUT. Without this the UI banner is cosmetic
|
||||
|
||||
@@ -149,9 +149,11 @@ func markFailed(ctx context.Context, wsID string, broadcaster *events.Broadcaste
|
||||
models.StatusFailed, msg, wsID); dbErr != nil {
|
||||
log.Printf("bundle import: failed to mark workspace %s as failed: %v", wsID, dbErr)
|
||||
}
|
||||
broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceProvisionFailed), wsID, map[string]interface{}{
|
||||
if bcErr := broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceProvisionFailed), wsID, map[string]interface{}{
|
||||
"error": msg,
|
||||
})
|
||||
}); bcErr != nil {
|
||||
log.Printf("bundle import: failed to broadcast provision failed for %s: %v", wsID, bcErr)
|
||||
}
|
||||
}
|
||||
|
||||
func nilIfEmpty(s string) interface{} {
|
||||
|
||||
@@ -407,12 +407,14 @@ func (m *Manager) HandleInbound(ctx context.Context, ch ChannelRow, msg *Inbound
|
||||
|
||||
// Broadcast event
|
||||
if m.broadcaster != nil {
|
||||
m.broadcaster.RecordAndBroadcast(ctx, string(events.EventChannelMessage), ch.WorkspaceID, map[string]interface{}{
|
||||
if err := m.broadcaster.RecordAndBroadcast(ctx, string(events.EventChannelMessage), ch.WorkspaceID, map[string]interface{}{
|
||||
"channel_id": ch.ID,
|
||||
"channel_type": ch.ChannelType,
|
||||
"username": msg.Username,
|
||||
"direction": "inbound",
|
||||
})
|
||||
}); err != nil {
|
||||
log.Printf("Channels: failed to broadcast inbound event: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
@@ -453,11 +455,13 @@ func (m *Manager) SendOutbound(ctx context.Context, channelID string, text strin
|
||||
}
|
||||
|
||||
if m.broadcaster != nil {
|
||||
m.broadcaster.RecordAndBroadcast(ctx, string(events.EventChannelMessage), ch.WorkspaceID, map[string]interface{}{
|
||||
if err := m.broadcaster.RecordAndBroadcast(ctx, string(events.EventChannelMessage), ch.WorkspaceID, map[string]interface{}{
|
||||
"channel_id": ch.ID,
|
||||
"channel_type": ch.ChannelType,
|
||||
"direction": "outbound",
|
||||
})
|
||||
}); err != nil {
|
||||
log.Printf("Channels: failed to broadcast outbound event: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
|
||||
@@ -517,7 +517,9 @@ func (t *TelegramAdapter) StartPolling(ctx context.Context, config map[string]in
|
||||
|
||||
// Acknowledge the button press (removes loading spinner)
|
||||
ackCfg := tgbotapi.NewCallback(cb.ID, "Received")
|
||||
bot.Send(ackCfg)
|
||||
if _, err := bot.Send(ackCfg); err != nil {
|
||||
log.Printf("telegram: failed to send callback ack: %v", err)
|
||||
}
|
||||
|
||||
// Update the message to show what was clicked
|
||||
decision := "approved"
|
||||
@@ -529,7 +531,9 @@ func (t *TelegramAdapter) StartPolling(ctx context.Context, config map[string]in
|
||||
cb.Message.MessageID,
|
||||
cb.Message.Text+"\n\n✅ CEO "+decision,
|
||||
)
|
||||
bot.Send(editMsg)
|
||||
if _, err := bot.Send(editMsg); err != nil {
|
||||
log.Printf("telegram: failed to send edit message: %v", err)
|
||||
}
|
||||
|
||||
// Route the decision as an inbound message to the agent
|
||||
inbound := &InboundMessage{
|
||||
|
||||
@@ -60,10 +60,10 @@ func sanitizeErrorDetailForBroadcast(s string) string {
|
||||
}
|
||||
|
||||
type ActivityHandler struct {
|
||||
broadcaster *events.Broadcaster
|
||||
broadcaster events.EventEmitter
|
||||
}
|
||||
|
||||
func NewActivityHandler(b *events.Broadcaster) *ActivityHandler {
|
||||
func NewActivityHandler(b events.EventEmitter) *ActivityHandler {
|
||||
return &ActivityHandler{broadcaster: b}
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -54,23 +54,29 @@ func (h *ApprovalsHandler) Create(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalRequested), workspaceID, map[string]interface{}{
|
||||
if err := h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalRequested), workspaceID, map[string]interface{}{
|
||||
"approval_id": approvalID,
|
||||
"action": body.Action,
|
||||
"reason": body.Reason,
|
||||
"task_id": body.TaskID,
|
||||
})
|
||||
}); err != nil {
|
||||
log.Printf("approvals: failed to broadcast approval requested: %v", err)
|
||||
}
|
||||
|
||||
// Auto-escalate to parent
|
||||
var parentID *string
|
||||
db.DB.QueryRowContext(ctx, `SELECT parent_id FROM workspaces WHERE id = $1`, workspaceID).Scan(&parentID)
|
||||
if err := db.DB.QueryRowContext(ctx, `SELECT parent_id FROM workspaces WHERE id = $1`, workspaceID).Scan(&parentID); err != nil {
|
||||
log.Printf("approvals: failed to lookup parent for escalation: %v", err)
|
||||
}
|
||||
if parentID != nil {
|
||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalEscalated), *parentID, map[string]interface{}{
|
||||
if err := h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalEscalated), *parentID, map[string]interface{}{
|
||||
"approval_id": approvalID,
|
||||
"from_workspace_id": workspaceID,
|
||||
"action": body.Action,
|
||||
"reason": body.Reason,
|
||||
})
|
||||
}); err != nil {
|
||||
log.Printf("approvals: failed to broadcast approval escalated: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
c.JSON(http.StatusCreated, gin.H{"approval_id": approvalID, "status": "pending"})
|
||||
@@ -221,11 +227,13 @@ func (h *ApprovalsHandler) Decide(c *gin.Context) {
|
||||
eventType = "APPROVAL_DENIED"
|
||||
}
|
||||
|
||||
h.broadcaster.RecordAndBroadcast(ctx, eventType, workspaceID, map[string]interface{}{
|
||||
if err := h.broadcaster.RecordAndBroadcast(ctx, eventType, workspaceID, map[string]interface{}{
|
||||
"approval_id": approvalID,
|
||||
"decision": body.Decision,
|
||||
"decided_by": decidedBy,
|
||||
})
|
||||
}); err != nil {
|
||||
log.Printf("approvals: failed to broadcast approval decision: %v", err)
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"status": body.Decision, "approval_id": approvalID})
|
||||
}
|
||||
|
||||
@@ -102,10 +102,10 @@ func pushDelegationResultToInbox(ctx context.Context, sourceID, delegationID, st
|
||||
// and the A2A request runs in the background.
|
||||
type DelegationHandler struct {
|
||||
workspace *WorkspaceHandler
|
||||
broadcaster *events.Broadcaster
|
||||
broadcaster events.EventEmitter
|
||||
}
|
||||
|
||||
func NewDelegationHandler(wh *WorkspaceHandler, b *events.Broadcaster) *DelegationHandler {
|
||||
func NewDelegationHandler(wh *WorkspaceHandler, b events.EventEmitter) *DelegationHandler {
|
||||
return &DelegationHandler{workspace: wh, broadcaster: b}
|
||||
}
|
||||
|
||||
|
||||
@@ -176,6 +176,10 @@ func TestResolveAgentURLForRestartSignal_CacheMiss(t *testing.T) {
|
||||
// TestGracefulPreRestart_Success verifies that when the workspace returns 200,
|
||||
// the signal is logged as acknowledged without error.
|
||||
func TestGracefulPreRestart_Success(t *testing.T) {
|
||||
hWrapper := &resolveURLTestWrapper{
|
||||
WorkspaceHandler: newHandlerWithTestDeps(t),
|
||||
testURL: "http://fake-agent.example/agent",
|
||||
}
|
||||
_ = setupTestDB(t)
|
||||
|
||||
// httptest server simulating the workspace container's /signals/restart_pending
|
||||
@@ -205,18 +209,15 @@ func TestGracefulPreRestart_Success(t *testing.T) {
|
||||
})
|
||||
}))
|
||||
defer srv.Close()
|
||||
hWrapper.testURL = srv.URL + "/agent"
|
||||
|
||||
// Pre-populate Redis cache with the test server URL
|
||||
_ = setupTestRedisWithURL(t, srv.URL)
|
||||
|
||||
// Use a wrapper so gracefulPreRestart runs through the embedded handler.
|
||||
hWrapper := &resolveURLTestWrapper{
|
||||
WorkspaceHandler: newHandlerWithTestDeps(t),
|
||||
testURL: srv.URL + "/agent",
|
||||
}
|
||||
// gracefulPreRestart runs in a goroutine; wait for it before db.DB is restored.
|
||||
// Must be registered AFTER setupTestDB (LIFO: async wait → db.DB restore).
|
||||
waitForHandlerAsyncBeforeDBCleanup(t, hWrapper.WorkspaceHandler)
|
||||
|
||||
// gracefulPreRestart runs in a goroutine with its own timeout.
|
||||
// We give it time to complete before the test ends.
|
||||
hWrapper.gracefulPreRestart(context.Background(), "ws-ack-789")
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
}
|
||||
@@ -224,19 +225,22 @@ func TestGracefulPreRestart_Success(t *testing.T) {
|
||||
// TestGracefulPreRestart_NotImplemented verifies that when the workspace returns
|
||||
// 404 (old SDK version), the platform proceeds gracefully (log + no error).
|
||||
func TestGracefulPreRestart_NotImplemented(t *testing.T) {
|
||||
hWrapper := &resolveURLTestWrapper{
|
||||
WorkspaceHandler: newHandlerWithTestDeps(t),
|
||||
testURL: "http://fake-agent.example/agent",
|
||||
}
|
||||
_ = setupTestDB(t)
|
||||
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusNotFound)
|
||||
}))
|
||||
defer srv.Close()
|
||||
hWrapper.testURL = srv.URL + "/agent"
|
||||
|
||||
_ = setupTestRedisWithURL(t, srv.URL)
|
||||
|
||||
hWrapper := &resolveURLTestWrapper{
|
||||
WorkspaceHandler: newHandlerWithTestDeps(t),
|
||||
testURL: srv.URL + "/agent",
|
||||
}
|
||||
// Must be registered AFTER setupTestDB so LIFO order is: async wait → db.DB restore.
|
||||
waitForHandlerAsyncBeforeDBCleanup(t, hWrapper.WorkspaceHandler)
|
||||
|
||||
hWrapper.gracefulPreRestart(context.Background(), "ws-noimpl-999")
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
@@ -246,15 +250,18 @@ func TestGracefulPreRestart_NotImplemented(t *testing.T) {
|
||||
// TestGracefulPreRestart_ConnectionRefused verifies that when the workspace
|
||||
// is unreachable, the platform proceeds gracefully without error.
|
||||
func TestGracefulPreRestart_ConnectionRefused(t *testing.T) {
|
||||
_ = setupTestDB(t)
|
||||
|
||||
mr := setupTestRedisWithURL(t, "http://localhost:19999/agent") // nothing listening on 19999
|
||||
_ = mr
|
||||
|
||||
hWrapper := &resolveURLTestWrapper{
|
||||
WorkspaceHandler: newHandlerWithTestDeps(t),
|
||||
testURL: "http://localhost:19999/agent",
|
||||
}
|
||||
_ = setupTestDB(t)
|
||||
|
||||
// Nothing listening on 19999 — deliberate connection failure.
|
||||
mr := setupTestRedisWithURL(t, "http://localhost:19999/agent")
|
||||
_ = mr
|
||||
|
||||
// Must be registered AFTER setupTestDB so LIFO order is: async wait → db.DB restore.
|
||||
waitForHandlerAsyncBeforeDBCleanup(t, hWrapper.WorkspaceHandler)
|
||||
|
||||
hWrapper.gracefulPreRestart(context.Background(), "ws-unreachable-000")
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
@@ -264,13 +271,17 @@ func TestGracefulPreRestart_ConnectionRefused(t *testing.T) {
|
||||
// TestGracefulPreRestart_URLResolutionError verifies that when URL resolution
|
||||
// fails, the platform proceeds gracefully without blocking the restart.
|
||||
func TestGracefulPreRestart_URLResolutionError(t *testing.T) {
|
||||
_ = setupTestDB(t)
|
||||
_ = setupTestRedis(t) // empty → URL resolution will fail in resolveAgentURLForRestartSignal
|
||||
|
||||
hWrapper := &resolveURLTestWrapper{
|
||||
WorkspaceHandler: newHandlerWithTestDeps(t),
|
||||
errToReturn: context.DeadlineExceeded,
|
||||
}
|
||||
_ = setupTestDB(t)
|
||||
_ = setupTestRedis(t) // empty → URL resolution will fail in resolveAgentURLForRestartSignal
|
||||
|
||||
// Must be registered AFTER setupTestDB so LIFO order is: async wait → db.DB restore.
|
||||
// This ensures goroutines (which access both DB and Redis) are drained before
|
||||
// any cleanup fires. setupTestRedis comes after newHandlerWithTestDeps
|
||||
// so the handler holds the correct Redis client reference.
|
||||
waitForHandlerAsyncBeforeDBCleanup(t, hWrapper.WorkspaceHandler)
|
||||
|
||||
hWrapper.gracefulPreRestart(context.Background(), "ws-url-err-111")
|
||||
|
||||
@@ -56,7 +56,20 @@ func PatchAbilities(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
if body.BroadcastEnabled != nil {
|
||||
// Atomic update: when both fields are supplied, apply them in one SQL
|
||||
// statement so the request is all-or-nothing (#2131). A partial mutation
|
||||
// (e.g. broadcast_enabled updated but talk_to_user_enabled failing) would
|
||||
// leave the workspace in an ambiguous capability state.
|
||||
if body.BroadcastEnabled != nil && body.TalkToUserEnabled != nil {
|
||||
if _, err := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET broadcast_enabled = $2, talk_to_user_enabled = $3, updated_at = now() WHERE id = $1`,
|
||||
id, *body.BroadcastEnabled, *body.TalkToUserEnabled,
|
||||
); err != nil {
|
||||
log.Printf("PatchAbilities both-fields for %s: %v", id, err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "update failed"})
|
||||
return
|
||||
}
|
||||
} else if body.BroadcastEnabled != nil {
|
||||
if _, err := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET broadcast_enabled = $2, updated_at = now() WHERE id = $1`,
|
||||
id, *body.BroadcastEnabled,
|
||||
@@ -65,9 +78,7 @@ func PatchAbilities(c *gin.Context) {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "update failed"})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if body.TalkToUserEnabled != nil {
|
||||
} else if body.TalkToUserEnabled != nil {
|
||||
if _, err := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET talk_to_user_enabled = $2, updated_at = now() WHERE id = $1`,
|
||||
id, *body.TalkToUserEnabled,
|
||||
|
||||
@@ -130,11 +130,8 @@ func TestPatchAbilities_BothFields(t *testing.T) {
|
||||
mock.ExpectQuery(`SELECT EXISTS\(SELECT 1 FROM workspaces WHERE id = \$1 AND status != 'removed'\)`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
|
||||
mock.ExpectExec(`UPDATE workspaces SET broadcast_enabled = \$2, updated_at = now\(\) WHERE id = \$1`).
|
||||
WithArgs(wsUUID1, true).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec(`UPDATE workspaces SET talk_to_user_enabled = \$2, updated_at = now\(\) WHERE id = \$1`).
|
||||
WithArgs(wsUUID1, true).
|
||||
mock.ExpectExec(`UPDATE workspaces SET broadcast_enabled = \$2, talk_to_user_enabled = \$3, updated_at = now\(\) WHERE id = \$1`).
|
||||
WithArgs(wsUUID1, true, true).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := patchAbilitiesReq(t, wsUUID1, `{"broadcast_enabled":true,"talk_to_user_enabled":true}`)
|
||||
@@ -182,19 +179,25 @@ func TestPatchAbilities_TalkToUserUpdateError(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestPatchAbilities_BothFields_BroadcastFails(t *testing.T) {
|
||||
// TestPatchAbilities_BothFields_UpdateError — regression for #2131. When
|
||||
// both fields are supplied the handler uses a single combined UPDATE. A
|
||||
// failure of that UPDATE must leave the workspace unchanged (atomic).
|
||||
func TestPatchAbilities_BothFields_UpdateError(t *testing.T) {
|
||||
mock, cleanup := withMockDB(t)
|
||||
defer cleanup()
|
||||
|
||||
mock.ExpectQuery(`SELECT EXISTS\(SELECT 1 FROM workspaces WHERE id = \$1 AND status != 'removed'\)`).
|
||||
WithArgs(wsUUID1).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
|
||||
mock.ExpectExec(`UPDATE workspaces SET broadcast_enabled = \$2, updated_at = now\(\) WHERE id = \$1`).
|
||||
WithArgs(wsUUID1, true).
|
||||
mock.ExpectExec(`UPDATE workspaces SET broadcast_enabled = \$2, talk_to_user_enabled = \$3, updated_at = now\(\) WHERE id = \$1`).
|
||||
WithArgs(wsUUID1, true, true).
|
||||
WillReturnError(errors.New("disk full"))
|
||||
|
||||
w := patchAbilitiesReq(t, wsUUID1, `{"broadcast_enabled":true,"talk_to_user_enabled":true}`)
|
||||
if w.Code != http.StatusInternalServerError {
|
||||
t.Fatalf("expected 500, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
// Because only one UPDATE is issued, there is no partial-mutation
|
||||
// path to assert against; sqlmock implicitly verifies no second
|
||||
// exec occurred.
|
||||
}
|
||||
|
||||
@@ -95,6 +95,14 @@ func TestIntegration_BroadcastOrgRoot_NonRootSenderResolvesToRoot(t *testing.T)
|
||||
}
|
||||
})
|
||||
|
||||
// Pre-test hygiene: if a prior run crashed or was killed, its rows may
|
||||
// still be in the shared integration DB. Remove them before inserting so
|
||||
// the unique index workspaces_parent_name_uniq does not conflict.
|
||||
if _, err := conn.ExecContext(ctx,
|
||||
`DELETE FROM workspaces WHERE name LIKE $1`, prefix+"%"); err != nil {
|
||||
t.Logf("pre-test cleanup (non-fatal): %v", err)
|
||||
}
|
||||
|
||||
rootID := uuid.New().String()
|
||||
midID := uuid.New().String()
|
||||
leafID := uuid.New().String()
|
||||
|
||||
@@ -876,8 +876,9 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
|
||||
h.provisionWorkspaceAutoSync(workspaceID, "", nil, payload)
|
||||
// sendRestartContext is a one-way notification to the new container; safe
|
||||
// to fire async — the next restart cycle won't depend on it completing.
|
||||
// Tracked via goAsync so the test harness can drain it before the
|
||||
// global db.DB swap (sendRestartContext reads db.DB).
|
||||
// Tracked via h.goAsync so tests can wait for it via h.asyncWG before
|
||||
// closing the sqlmock. Without this, untracked goroutines hit the restored
|
||||
// mock and cause "was not expected" errors in parallel CI execution (mc#1264).
|
||||
h.goAsync(func() { h.sendRestartContext(workspaceID, restartData) })
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user