forked from molecule-ai/molecule-core
Merge pull request #2386 from Molecule-AI/staging
staging → main: auto-promote cdef893
This commit is contained in:
commit
319f85a4b4
55
.github/workflows/auto-promote-staging.yml
vendored
55
.github/workflows/auto-promote-staging.yml
vendored
@ -267,6 +267,32 @@ jobs:
|
||||
echo "promote_pr_num=${PR_NUM}" >> "$GITHUB_OUTPUT"
|
||||
id: promote_pr
|
||||
|
||||
# Mint a short-lived GitHub App installation token for the dispatch
|
||||
# step below. We CANNOT use `secrets.GITHUB_TOKEN` to dispatch the
|
||||
# downstream publish chain — workflow runs created by GITHUB_TOKEN
|
||||
# do not fire `workflow_run` triggers on completion (the
|
||||
# documented "no recursion" rule —
|
||||
# https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow).
|
||||
#
|
||||
# Symptom this caused (root-caused on 2026-04-30): publish-image
|
||||
# ran successfully twice (21313dc 14:41Z, 59dec57 15:21Z) but
|
||||
# canary-verify and redeploy-tenants-on-main never chained,
|
||||
# because the publish run's `triggering_actor` was
|
||||
# `github-actions[bot]` (i.e. GITHUB_TOKEN). A manual dispatch
|
||||
# earlier in the day with the operator's PAT (d850ec7 06:52Z) did
|
||||
# chain — same workflow file, only the actor differed.
|
||||
#
|
||||
# An App token's triggering_actor is the App user (e.g.
|
||||
# `molecule-ai[bot]`), which IS allowed to fire downstream
|
||||
# workflow_run cascades.
|
||||
- name: Mint App token for downstream dispatch
|
||||
if: steps.promote_pr.outputs.promote_pr_num != ''
|
||||
id: app-token
|
||||
uses: actions/create-github-app-token@1b10c78c7865c340bc4f6099eb2f838309f1e8c3 # v3.1.1
|
||||
with:
|
||||
app-id: ${{ secrets.MOLECULE_AI_APP_ID }}
|
||||
private-key: ${{ secrets.MOLECULE_AI_APP_PRIVATE_KEY }}
|
||||
|
||||
- name: Wait for promote merge, then dispatch publish + redeploy (#2357)
|
||||
# GITHUB_TOKEN-initiated merges suppress downstream `push` events
|
||||
# (https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow).
|
||||
@ -276,18 +302,20 @@ jobs:
|
||||
# tenants stay on stale code (issue #2357).
|
||||
#
|
||||
# Workaround: poll for the merge to land, then explicitly
|
||||
# `gh workflow run` publish-workspace-server-image. workflow_dispatch
|
||||
# is the documented exception to the GITHUB_TOKEN suppression rule —
|
||||
# dispatch DOES create a new workflow run. canary-verify chains via
|
||||
# workflow_run (no branch filter) and redeploys to fleet via the
|
||||
# existing chain.
|
||||
# `gh workflow run` publish-workspace-server-image. The dispatch
|
||||
# MUST authenticate as the molecule-ai App (App token minted
|
||||
# above) — not GITHUB_TOKEN — so that the resulting publish
|
||||
# run's completion event can fire the workflow_run cascade
|
||||
# into canary-verify + redeploy-tenants-on-main. See the prior
|
||||
# step's comment for the GITHUB_TOKEN no-recursion details.
|
||||
#
|
||||
# Long-term fix: switch the auto-merge call above to a GitHub App
|
||||
# token (actions/create-github-app-token) and remove this polling
|
||||
# tail step. Tracked in #2357.
|
||||
# Long-term fix: switch the auto-merge call above to use the
|
||||
# same App token, so the merge's push event fires
|
||||
# publish-workspace-server-image naturally and this polling tail
|
||||
# becomes unnecessary. Tracked in #2357.
|
||||
if: steps.promote_pr.outputs.promote_pr_num != ''
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
GH_TOKEN: ${{ steps.app-token.outputs.token }}
|
||||
REPO: ${{ github.repository }}
|
||||
PR_NUM: ${{ steps.promote_pr.outputs.promote_pr_num }}
|
||||
run: |
|
||||
@ -318,17 +346,18 @@ jobs:
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Dispatch publish on main. workflow_dispatch via GITHUB_TOKEN
|
||||
# IS allowed to create new workflow runs (per the linked docs).
|
||||
# Dispatch publish on main using the App token. App-initiated
|
||||
# workflow_dispatch DOES propagate the workflow_run cascade,
|
||||
# unlike GITHUB_TOKEN-initiated dispatch.
|
||||
# publish completes → canary-verify chains via workflow_run →
|
||||
# redeploy-tenants-on-main chains via workflow_run + branches:[main].
|
||||
if gh workflow run publish-workspace-server-image.yml \
|
||||
--repo "$REPO" --ref main 2>&1; then
|
||||
echo "::notice::Dispatched publish-workspace-server-image on ref=main — canary-verify and redeploy-tenants-on-main will chain via workflow_run."
|
||||
echo "::notice::Dispatched publish-workspace-server-image on ref=main as molecule-ai App — canary-verify and redeploy-tenants-on-main will chain via workflow_run."
|
||||
{
|
||||
echo "## 🚀 Tenant redeploy chain dispatched"
|
||||
echo
|
||||
echo "- publish-workspace-server-image (workflow_dispatch on \`main\`)"
|
||||
echo "- publish-workspace-server-image (workflow_dispatch on \`main\`, actor: \`molecule-ai[bot]\`)"
|
||||
echo "- canary-verify will chain on completion"
|
||||
echo "- redeploy-tenants-on-main will chain on canary green"
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
227
workspace-server/internal/db/workspace_status_enum_drift_test.go
Normal file
227
workspace-server/internal/db/workspace_status_enum_drift_test.go
Normal file
@ -0,0 +1,227 @@
|
||||
package db_test
|
||||
|
||||
// Static drift gate: every workspaces.status literal written in the Go
|
||||
// tree must exist in the workspace_status enum defined by the migrations.
|
||||
//
|
||||
// Why this exists: the `workspace_status` enum (migrations 043 + 046)
|
||||
// shipped without 'awaiting_agent' even though application code wrote
|
||||
// that value, and every UPDATE silently failed in production for five
|
||||
// days before the gap surfaced (see 046_workspace_status_awaiting_agent.up.sql).
|
||||
// The unit tests passed because sqlmock matches SQL by regex, not against
|
||||
// a live enum constraint.
|
||||
//
|
||||
// Approach: extract every Go string literal (non-test files only) that
// begins, after optional leading whitespace, with `UPDATE workspaces` or
// `INSERT INTO workspaces`. Read-side SQL and statements that merely
// reference workspaces ("FROM workspaces WHERE ... status", CTEs, JOINs)
// are out of scope because they cannot fail on enum drift. For each
// in-scope SQL fragment, pull the single-quoted status values out of
// `status =` / `!=` / `<>`, `status [NOT] IN`, `THEN`, and `ELSE`.
// Every value must be in the union of CREATE TYPE + ALTER TYPE ADD VALUE
// across all migrations.
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestWorkspaceStatusEnum_NoLiteralDrift(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
repoRoot := findRepoRoot(t)
|
||||
migrationsDir := filepath.Join(repoRoot, "workspace-server", "migrations")
|
||||
internalDir := filepath.Join(repoRoot, "workspace-server", "internal")
|
||||
|
||||
enum := loadWorkspaceStatusEnum(t, migrationsDir)
|
||||
if len(enum) == 0 {
|
||||
t.Fatalf("could not parse workspace_status enum from %s — gate is non-functional", migrationsDir)
|
||||
}
|
||||
|
||||
literals := collectWorkspacesStatusLiterals(t, internalDir)
|
||||
if len(literals) == 0 {
|
||||
t.Fatalf("found zero workspaces.status literals under %s — gate is non-functional", internalDir)
|
||||
}
|
||||
|
||||
var rogue []string
|
||||
for lit := range literals {
|
||||
if _, ok := enum[lit]; ok {
|
||||
continue
|
||||
}
|
||||
rogue = append(rogue, lit)
|
||||
}
|
||||
if len(rogue) > 0 {
|
||||
sort.Strings(rogue)
|
||||
t.Errorf(
|
||||
"workspaces.status literal(s) %v are written by Go code but not in the workspace_status enum.\n"+
|
||||
"Add a migration `ALTER TYPE workspace_status ADD VALUE 'X';` (see 046 for shape).\n"+
|
||||
"Enum currently is: %v",
|
||||
rogue, sortedKeys(enum),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// loadWorkspaceStatusEnum returns the set of workspace_status values
// declared by the *.up.sql files directly inside migrationsDir.
//
// Two statement shapes contribute values:
//
//	CREATE TYPE workspace_status AS ENUM ('a', 'b', ...)
//	ALTER TYPE workspace_status ADD VALUE [IF NOT EXISTS] 'X' [BEFORE|AFTER 'Y']
//
// The result is the union over every file, i.e. every value that any up
// migration introduces. For ADD VALUE only the first quoted literal (the
// value being added) is captured, not the BEFORE/AFTER anchor. The test is
// aborted via t.Fatalf if globbing or reading a file fails.
func loadWorkspaceStatusEnum(t *testing.T, migrationsDir string) map[string]struct{} {
	t.Helper()

	paths, err := filepath.Glob(filepath.Join(migrationsDir, "*.up.sql"))
	if err != nil {
		t.Fatalf("glob migrations: %v", err)
	}
	sort.Strings(paths)

	var (
		// Captures the parenthesised value list of the CREATE TYPE statement.
		createRE = regexp.MustCompile(`(?is)CREATE\s+TYPE\s+workspace_status\s+AS\s+ENUM\s*\(([^)]+)\)`)
		// Captures the single value added by an ALTER TYPE ... ADD VALUE.
		addValueRE = regexp.MustCompile(`(?i)ALTER\s+TYPE\s+workspace_status\s+ADD\s+VALUE(?:\s+IF\s+NOT\s+EXISTS)?\s+'([^']+)'`)
		// Splits a CREATE TYPE value list into its quoted members.
		literalRE = regexp.MustCompile(`'([^']+)'`)
	)

	values := make(map[string]struct{})
	for _, p := range paths {
		raw, err := os.ReadFile(p)
		if err != nil {
			t.Fatalf("read %s: %v", p, err)
		}
		sql := string(raw)

		for _, decl := range createRE.FindAllStringSubmatch(sql, -1) {
			for _, quoted := range literalRE.FindAllStringSubmatch(decl[1], -1) {
				values[quoted[1]] = struct{}{}
			}
		}
		for _, added := range addValueRE.FindAllStringSubmatch(sql, -1) {
			values[added[1]] = struct{}{}
		}
	}
	return values
}
|
||||
|
||||
// collectWorkspacesStatusLiterals walks every non-test .go file under root
// and returns the set of status values found in Go string literals that
// begin (after optional whitespace) with `UPDATE workspaces` or
// `INSERT INTO workspaces`.
//
// Why this scope: an UPDATE/INSERT against `workspaces` is the moment a
// status literal hits the enum-constrained column. Read-side SQL
// (SELECT ... WHERE status = 'X') cannot fail on enum drift, and JOINs to
// `workspaces` from other tables write a different table's status, so both
// are out of scope. Anchoring on the leading keyword identifies the writes
// we care about.
//
// Literal contents are matched exactly as they appear in the source text;
// escape sequences inside double-quoted literals are not decoded.
//
// NOTE(review): the `status =` pattern has no word boundary, so a column
// such as `foo_status = 'x'` inside an in-scope statement would presumably
// be harvested too — confirm that is acceptable.
func collectWorkspacesStatusLiterals(t *testing.T, root string) map[string]struct{} {
	t.Helper()

	// Go string literal extractors, run over the whole file body. Raw
	// (backtick) strings may span lines, hence the DOTALL flag.
	rawRE := regexp.MustCompile("(?s)`([^`]*?)`")
	dquoteRE := regexp.MustCompile(`"((?:[^"\\]|\\.)*)"`)
	literalREs := []*regexp.Regexp{rawRE, dquoteRE}

	// Scope filters: the fragment must start with one of the two write
	// statements. Case-insensitive; leading newline/indent is tolerated.
	updateWorkspacesRE := regexp.MustCompile(`(?is)^\s*UPDATE\s+workspaces\b`)
	insertWorkspacesRE := regexp.MustCompile(`(?is)^\s*INSERT\s+INTO\s+workspaces\b`)

	// Value extractors applied to an in-scope fragment:
	//   status = 'X' / != / <>  — SET assignment or WHERE filter
	//   status [NOT] IN (...)   — filter list
	//   THEN 'X' / ELSE 'X'     — CASE arm / CASE default
	statusEqRE := regexp.MustCompile(`(?i)status\s*(?:=|!=|<>)\s*'([a-z_]+)'`)
	statusInRE := regexp.MustCompile(`(?i)status\s+(?:NOT\s+)?IN\s*\(([^)]*)\)`)
	thenRE := regexp.MustCompile(`(?i)THEN\s+'([a-z_]+)'`)
	elseRE := regexp.MustCompile(`(?i)ELSE\s+'([a-z_]+)'`)
	inListLiteralRE := regexp.MustCompile(`'([a-z_]+)'`)

	// Patterns whose first capture group is directly one status value.
	singleValueREs := []*regexp.Regexp{statusEqRE, thenRE, elseRE}

	found := make(map[string]struct{})

	// harvest records every status value in sql, provided sql is a write
	// against workspaces; any other string literal is ignored.
	harvest := func(sql string) {
		inScope := updateWorkspacesRE.MatchString(sql) || insertWorkspacesRE.MatchString(sql)
		if !inScope {
			return
		}
		for _, re := range singleValueREs {
			for _, hit := range re.FindAllStringSubmatch(sql, -1) {
				found[hit[1]] = struct{}{}
			}
		}
		for _, list := range statusInRE.FindAllStringSubmatch(sql, -1) {
			for _, item := range inListLiteralRE.FindAllStringSubmatch(list[1], -1) {
				found[item[1]] = struct{}{}
			}
		}
	}

	walkErr := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		// Only regular production Go sources are scanned.
		isProdGo := strings.HasSuffix(path, ".go") && !strings.HasSuffix(path, "_test.go")
		if info.IsDir() || !isProdGo {
			return nil
		}

		src, readErr := os.ReadFile(path)
		if readErr != nil {
			return readErr
		}
		text := string(src)

		for _, re := range literalREs {
			for _, lit := range re.FindAllStringSubmatch(text, -1) {
				harvest(lit[1])
			}
		}
		return nil
	})
	if walkErr != nil {
		t.Fatalf("walk %s: %v", root, walkErr)
	}
	return found
}
|
||||
|
||||
// findRepoRoot walks upward from the current working directory and returns
// the first directory that contains workspace-server/migrations.
//
// The working directory and at most seven of its ancestors are examined;
// the search also stops early at the filesystem root. If nothing matches,
// the test is aborted via t.Fatalf, naming the directory the search
// started from.
func findRepoRoot(t *testing.T) string {
	t.Helper()

	start, err := os.Getwd()
	if err != nil {
		t.Fatalf("getwd: %v", err)
	}

	// Upper bound on how many directories are probed before giving up.
	const maxLevels = 8

	dir := start
	for i := 0; i < maxLevels; i++ {
		if _, err := os.Stat(filepath.Join(dir, "workspace-server", "migrations")); err == nil {
			return dir
		}
		parent := filepath.Dir(dir)
		if parent == dir {
			// filepath.Dir is a fixed point at the root: nowhere left to go.
			break
		}
		dir = parent
	}

	// Report the starting directory, not the last ancestor probed, so the
	// message matches its "from %s" wording.
	t.Fatalf("could not locate repo root with workspace-server/migrations from %s", start)
	return "" // unreachable after Fatalf; required for the compiler
}
|
||||
|
||||
// sortedKeys returns the keys of m in ascending lexical order. The result
// is always a non-nil slice, empty when m has no entries.
func sortedKeys(m map[string]struct{}) []string {
	keys := make([]string, len(m))
	next := 0
	for name := range m {
		keys[next] = name
		next++
	}
	sort.Strings(keys)
	return keys
}
|
||||
@ -259,24 +259,7 @@ func (h *ChatFilesHandler) Upload(c *gin.Context) {
|
||||
req.ContentLength = c.Request.ContentLength
|
||||
}
|
||||
|
||||
resp, err := h.httpClient.Do(req)
|
||||
if err != nil {
|
||||
log.Printf("chat_files Upload: forward to %s failed: %v", forwardURL, err)
|
||||
c.JSON(http.StatusBadGateway, gin.H{"error": "workspace unreachable"})
|
||||
return
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Stream response back. Copy headers we know are safe + the body.
|
||||
if ct := resp.Header.Get("Content-Type"); ct != "" {
|
||||
c.Header("Content-Type", ct)
|
||||
}
|
||||
c.Status(resp.StatusCode)
|
||||
if _, err := io.Copy(c.Writer, resp.Body); err != nil {
|
||||
// Mid-stream failure — too late to write a JSON error, just
|
||||
// log so ops can correlate with the workspace's logs.
|
||||
log.Printf("chat_files Upload: stream response back failed for %s: %v", workspaceID, err)
|
||||
}
|
||||
h.streamWorkspaceResponse(c, "upload", workspaceID, forwardURL, req, []string{"Content-Type"})
|
||||
}
|
||||
|
||||
// Download handles GET /workspaces/:id/chat/download?path=<abs path>.
|
||||
@ -351,27 +334,42 @@ func (h *ChatFilesHandler) Download(c *gin.Context) {
|
||||
}
|
||||
req.Header.Set("Authorization", "Bearer "+secret)
|
||||
|
||||
h.streamWorkspaceResponse(c, "download", workspaceID, forwardURL, req,
|
||||
[]string{"Content-Type", "Content-Length", "Content-Disposition"})
|
||||
}
|
||||
|
||||
// streamWorkspaceResponse executes the prepared forward request and
|
||||
// streams the workspace's response back to the inbound caller.
|
||||
// Forwards the named response headers verbatim. Centralizes the
|
||||
// "do request → check err → defer close → copy headers → set status →
|
||||
// io.Copy" tail that's identical between Upload and Download.
|
||||
//
|
||||
// op is the human-readable feature label ("upload"/"download") used
|
||||
// in log messages so operators can distinguish which feature ran.
|
||||
func (h *ChatFilesHandler) streamWorkspaceResponse(
|
||||
c *gin.Context,
|
||||
op, workspaceID, forwardURL string,
|
||||
req *http.Request,
|
||||
forwardHeaders []string,
|
||||
) {
|
||||
resp, err := h.httpClient.Do(req)
|
||||
if err != nil {
|
||||
log.Printf("chat_files Download: forward to %s failed: %v", forwardURL, err)
|
||||
log.Printf("chat_files %s: forward to %s failed: %v", op, forwardURL, err)
|
||||
c.JSON(http.StatusBadGateway, gin.H{"error": "workspace unreachable"})
|
||||
return
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Stream response back, including the workspace's headers so the
|
||||
// client gets the correct Content-Type + Content-Disposition (the
|
||||
// workspace constructs them from the actual file's extension +
|
||||
// basename — keeping that logic on the workspace side avoids a
|
||||
// double-source-of-truth on filename encoding rules).
|
||||
for _, hdr := range []string{"Content-Type", "Content-Length", "Content-Disposition"} {
|
||||
for _, hdr := range forwardHeaders {
|
||||
if v := resp.Header.Get(hdr); v != "" {
|
||||
c.Header(hdr, v)
|
||||
}
|
||||
}
|
||||
c.Status(resp.StatusCode)
|
||||
if _, err := io.Copy(c.Writer, resp.Body); err != nil {
|
||||
log.Printf("chat_files Download: stream response back failed for %s: %v", workspaceID, err)
|
||||
// Mid-stream failure — too late to write a JSON error, just
|
||||
// log so ops can correlate with the workspace's logs.
|
||||
log.Printf("chat_files %s: stream response back failed for %s: %v", op, workspaceID, err)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1,53 @@
|
||||
-- 046_workspace_status_awaiting_agent.down.sql
|
||||
--
|
||||
-- Reverse 046_workspace_status_awaiting_agent.up.sql.
|
||||
--
|
||||
-- Postgres does NOT support DROP VALUE on an enum. The standard rollback
|
||||
-- recipe is rename → recreate → cast → drop, which is intrusive (locks
|
||||
-- workspaces with ACCESS EXCLUSIVE the same way migration 043 did). We
|
||||
-- punt: only run this manually, and only if you're prepared to nuke and
|
||||
-- recreate the type. Application code WILL fail if the value disappears
|
||||
-- and any row currently has it; pre-flight with:
|
||||
--
|
||||
-- UPDATE workspaces SET status = 'offline'
|
||||
-- WHERE status = 'awaiting_agent';
|
||||
--
|
||||
-- before running the recipe below.
|
||||
|
||||
BEGIN;
|
||||
|
||||
SET LOCAL lock_timeout = '5s';
|
||||
|
||||
-- Convert any existing awaiting_agent / hibernating rows to a value the
|
||||
-- new enum will accept. 'offline' is the safest fallback for awaiting_agent
|
||||
-- (operator can re-register to bring them back online); 'hibernated' is
|
||||
-- the natural terminal of an in-flight 'hibernating'.
|
||||
UPDATE workspaces SET status = 'offline' WHERE status = 'awaiting_agent';
|
||||
UPDATE workspaces SET status = 'hibernated' WHERE status = 'hibernating';
|
||||
|
||||
ALTER TYPE workspace_status RENAME TO workspace_status_with_awaiting;
|
||||
|
||||
CREATE TYPE workspace_status AS ENUM (
|
||||
'provisioning',
|
||||
'online',
|
||||
'offline',
|
||||
'degraded',
|
||||
'failed',
|
||||
'removed',
|
||||
'paused',
|
||||
'hibernated'
|
||||
);
|
||||
|
||||
ALTER TABLE workspaces
|
||||
ALTER COLUMN status DROP DEFAULT;
|
||||
|
||||
ALTER TABLE workspaces
|
||||
ALTER COLUMN status TYPE workspace_status
|
||||
USING status::text::workspace_status;
|
||||
|
||||
ALTER TABLE workspaces
|
||||
ALTER COLUMN status SET DEFAULT 'provisioning'::workspace_status;
|
||||
|
||||
DROP TYPE workspace_status_with_awaiting;
|
||||
|
||||
COMMIT;
|
||||
@ -0,0 +1,60 @@
|
||||
-- 046_workspace_status_awaiting_agent.up.sql
|
||||
--
|
||||
-- Add the missing 'awaiting_agent' and 'hibernating' values to the
|
||||
-- workspace_status enum.
|
||||
--
|
||||
-- Migration 043 (2026-04-25) introduced the workspace_status enum but
|
||||
-- omitted two values that application code had already been writing:
|
||||
--
|
||||
-- 'awaiting_agent' — handlers/workspace.go:333 (since 2026-04-24,
|
||||
-- commit 1e8b5e01, "first-class BYO-compute"):
|
||||
-- external workspaces created without a URL are
|
||||
-- meant to park here until the agent registers.
|
||||
-- 'hibernating' — handlers/workspace_restart.go:271-272 (since
|
||||
-- 29_workspace_hibernation): intermediate state
|
||||
-- between 'online' and the final 'hibernated';
|
||||
-- acts as a DB-level claim while provisioner.Stop
|
||||
-- runs.
|
||||
--
|
||||
-- Every UPDATE that tried to write either value failed with
|
||||
-- `invalid input value for enum workspace_status: "..."`. The
|
||||
-- consequences were silent because both call sites either dropped or
|
||||
-- log-and-continued the error:
|
||||
--
|
||||
-- - workspace.go:333 discards the Exec result entirely. Canvas shows
|
||||
-- `awaiting_agent` in the response, but the row stays in
|
||||
-- 'provisioning' until first /registry/register.
|
||||
-- - workspace_restart.go:277 logs and returns; hibernation simply
|
||||
-- never happens. Idle workspaces consumed resources indefinitely
|
||||
-- for five days before this gate flagged it.
|
||||
--
|
||||
-- 2026-04-30 PR #2382 ("default external runtime to poll-mode +
|
||||
-- awaiting_agent") added two more silent-fail sites in
|
||||
-- registry/liveness.go and registry/healthsweep.go. Both log + continue.
|
||||
-- Result: liveness expiry no longer transitions runtime='external'
|
||||
-- rows, and the heartbeat-staleness sweep is a no-op for them.
|
||||
-- UI/canvas shows external workspaces stuck on 'online' or 'degraded'
|
||||
-- indefinitely after the agent disconnects.
|
||||
--
|
||||
-- Tests across all four sites used sqlmock, which matches SQL by regex
|
||||
-- but does not validate against the live enum constraint, so unit
|
||||
-- tests passed despite the prod-only failures. See
|
||||
-- feedback_mock_at_drifting_layer in operator memory.
|
||||
--
|
||||
-- This migration adds both enum values so all six call sites start
|
||||
-- succeeding. IF NOT EXISTS makes the migration idempotent across
|
||||
-- re-runs (RunMigrations in postgres.go re-applies migrations until
|
||||
-- schema_migrations records them; a future operator-driven re-run is
|
||||
-- harmless).
|
||||
--
|
||||
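-- ALTER TYPE ... ADD VALUE does NOT take a heavy lock on workspaces, so it
-- is safe to run during normal traffic. Transaction caveat: on
-- PostgreSQL 12+ the statement may run inside a transaction block, but the
-- new value cannot be used until that transaction has committed; on older
-- versions it cannot be executed inside a transaction block at all.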
|
||||
--
|
||||
-- A regression gate lives at internal/db/workspace_status_enum_drift_test.go:
|
||||
-- it parses every UPDATE/INSERT against `workspaces` in the Go tree
|
||||
-- and asserts every status literal is in the enum. That test would
|
||||
-- have caught both omissions on the day they shipped.
|
||||
|
||||
ALTER TYPE workspace_status ADD VALUE IF NOT EXISTS 'awaiting_agent';
|
||||
ALTER TYPE workspace_status ADD VALUE IF NOT EXISTS 'hibernating';
|
||||
54
workspace/tests/snapshots/platform_auth_signature.json
Normal file
54
workspace/tests/snapshots/platform_auth_signature.json
Normal file
@ -0,0 +1,54 @@
|
||||
{
|
||||
"functions": [
|
||||
{
|
||||
"is_abstract": false,
|
||||
"is_async": false,
|
||||
"name": "auth_headers",
|
||||
"parameters": [],
|
||||
"return_annotation": "dict[str, str]"
|
||||
},
|
||||
{
|
||||
"is_abstract": false,
|
||||
"is_async": false,
|
||||
"name": "get_token",
|
||||
"parameters": [],
|
||||
"return_annotation": "str | None"
|
||||
},
|
||||
{
|
||||
"is_abstract": false,
|
||||
"is_async": false,
|
||||
"name": "refresh_cache",
|
||||
"parameters": [],
|
||||
"return_annotation": "str | None"
|
||||
},
|
||||
{
|
||||
"is_abstract": false,
|
||||
"is_async": false,
|
||||
"name": "save_token",
|
||||
"parameters": [
|
||||
{
|
||||
"annotation": "str",
|
||||
"has_default": false,
|
||||
"kind": "POSITIONAL_OR_KEYWORD",
|
||||
"name": "token"
|
||||
}
|
||||
],
|
||||
"return_annotation": "None"
|
||||
},
|
||||
{
|
||||
"is_abstract": false,
|
||||
"is_async": false,
|
||||
"name": "self_source_headers",
|
||||
"parameters": [
|
||||
{
|
||||
"annotation": "str",
|
||||
"has_default": false,
|
||||
"kind": "POSITIONAL_OR_KEYWORD",
|
||||
"name": "workspace_id"
|
||||
}
|
||||
],
|
||||
"return_annotation": "dict[str, str]"
|
||||
}
|
||||
],
|
||||
"module": "platform_auth"
|
||||
}
|
||||
114
workspace/tests/test_platform_auth_signature.py
Normal file
114
workspace/tests/test_platform_auth_signature.py
Normal file
@ -0,0 +1,114 @@
|
||||
"""platform_auth public-API signature snapshot — drift gate.
|
||||
|
||||
``platform_auth`` is the workspace's auth-token store. Every outbound
|
||||
HTTP from the runtime — heartbeat, registry/register, A2A delegation,
|
||||
memory tool calls, chat uploads, temporal_workflow, molecule_ai_status
|
||||
— pulls credentials through one of these five module-level functions.
|
||||
|
||||
A grep of ``from platform_auth import`` across workspace/ shows it's
|
||||
imported by 14+ files in the runtime hot path:
|
||||
|
||||
- main.py (boot + token issuance)
|
||||
- heartbeat.py (every heartbeat loop fire)
|
||||
- a2a_client.py (every A2A peer call)
|
||||
- a2a_tools.py (delegate_task_async)
|
||||
- consolidation.py
|
||||
- events.py (canvas push)
|
||||
- executor_helpers.py (3 sites)
|
||||
- molecule_ai_status.py
|
||||
- builtin_tools/memory.py (3 sites)
|
||||
- builtin_tools/temporal_workflow.py (2 sites)
|
||||
|
||||
Renaming any of the five (e.g. ``auth_headers`` → ``bearer_headers``)
|
||||
would make every one of those imports raise ``ImportError`` at boot —
|
||||
the workspace fails to start with a confusing trace deep in
|
||||
heartbeat init, not at the rename site.
|
||||
|
||||
Same drift class as the BaseAdapter signature snapshot (#2378, #2380),
|
||||
skill_loader gate (#2381), and runtime_wedge gate (#2383). The
|
||||
shared ``_signature_snapshot.py`` helpers do the heavy lifting; this
|
||||
file just declares which functions are part of the contract.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
WORKSPACE_DIR = Path(__file__).parent.parent
|
||||
if str(WORKSPACE_DIR) not in sys.path:
|
||||
sys.path.insert(0, str(WORKSPACE_DIR))
|
||||
|
||||
from tests._signature_snapshot import ( # noqa: E402
|
||||
build_module_functions_record,
|
||||
compare_against_snapshot,
|
||||
)
|
||||
|
||||
SNAPSHOT_PATH = Path(__file__).parent / "snapshots" / "platform_auth_signature.json"
|
||||
|
||||
|
||||
def _build_full_snapshot() -> dict:
    """Build the signature record for platform_auth's contract functions.

    Exactly five functions are pinned — the ones runtime + adapters call.
    ``clear_cache`` is deliberately left out: it is a test-only helper, and
    production callers MUST NOT depend on it.
    """
    import platform_auth

    contract_functions = [
        "auth_headers",
        "self_source_headers",
        "get_token",
        "save_token",
        "refresh_cache",
    ]
    return build_module_functions_record(
        platform_auth,
        function_names=contract_functions,
    )
|
||||
|
||||
|
||||
def test_platform_auth_signature_matches_snapshot():
    """Compare the live platform_auth signatures with the stored snapshot."""
    current = _build_full_snapshot()
    compare_against_snapshot(current, SNAPSHOT_PATH)
|
||||
|
||||
|
||||
def test_snapshot_has_required_functions():
    """Defense-in-depth: even if both source and snapshot are updated
    together, removing any of the five contract functions requires an
    explicit edit here. The required set is the documented public
    contract — every workspace runtime import path depends on these.

    Skipped when the snapshot file does not exist yet. Fails when a
    required name is absent from the snapshot, or when any snapshot entry
    carries a truthy ``missing`` field.
    """
    if not SNAPSHOT_PATH.exists():
        pytest.skip(f"{SNAPSHOT_PATH.name} not generated yet")

    import json

    # Read explicitly as UTF-8 rather than relying on the locale's default
    # encoding, which varies by platform.
    snapshot = json.loads(SNAPSHOT_PATH.read_text(encoding="utf-8"))
    fn_names = {f["name"] for f in snapshot["functions"]}

    required = {
        # Every outbound httpx call merges this into headers
        "auth_headers",
        # A2A peer + self-message paths add X-Workspace-ID via this
        "self_source_headers",
        # main.py reads this on boot to decide register-vs-resume
        "get_token",
        # main.py persists the platform-issued token via this
        "save_token",
        # 401-retry path drops the in-process cache via this (#1877)
        "refresh_cache",
    }
    missing = required - fn_names
    if missing:
        pytest.fail(
            f"platform_auth snapshot is missing required functions: {sorted(missing)}.\n"
            "Either restore them on platform_auth.py, OR coordinate runtime "
            "module + adapter updates AND remove the entry from `required` in "
            "this test with a justification."
        )

    # An entry flagged as missing means the name no longer resolves to a
    # plain function on the module.
    for fn in snapshot["functions"]:
        if fn.get("missing"):
            pytest.fail(
                f"platform_auth.{fn['name']} resolved as a non-function — "
                "either it was replaced by a different kind of attribute "
                "(class? module-level alias?) which existing direct calls "
                "would break, OR it was removed entirely."
            )
|
||||
Loading…
Reference in New Issue
Block a user