diff --git a/.gitea/scripts/status-reaper.py b/.gitea/scripts/status-reaper.py index 7047a7fc..3c32eb6f 100644 --- a/.gitea/scripts/status-reaper.py +++ b/.gitea/scripts/status-reaper.py @@ -133,6 +133,9 @@ PUSH_COMPENSATION_DESCRIPTION = ( "Compensated by status-reaper (workflow has no push: trigger; " "Gitea 1.22.6 hardcoded-suffix bug — see .gitea/scripts/status-reaper.py)" ) +# Backward-compatible alias for older tests/tooling that predate the split +# between push-suffix compensation and pull-request-shadow compensation. +COMPENSATION_DESCRIPTION = PUSH_COMPENSATION_DESCRIPTION PR_SHADOW_COMPENSATION_DESCRIPTION = ( "Compensated by status-reaper (default-branch pull_request status " "shadowed by successful push status on same SHA; see " @@ -611,11 +614,10 @@ def list_recent_commit_shas(branch: str, limit: int) -> list[str]: (verified via vendor-truth probe 2026-05-11 against git.moleculesai.app — `feedback_smoke_test_vendor_truth_not_shape_match`). - Raises ApiError on non-2xx OR on unexpected response shape. This is - a HARD halt — without the commit list the sweep can't proceed. (The - per-SHA error isolation downstream is a different concern: tolerating - a transient 5xx on ONE commit's status is best-effort; losing the - commit list itself means we don't even know which commits to try.) + Raises ApiError on non-2xx OR on unexpected response shape. The + branch-level caller soft-skips this tick because the next scheduled + tick can safely retry the listing. Per-SHA status/write errors remain + separate and must not be mislabeled as commit-list outages. 
""" _, body = api( "GET", @@ -656,7 +658,27 @@ def reap_branch( - compensated_per_sha: {: [, ...]} — only SHAs that actually got at least one compensation are included """ - shas = list_recent_commit_shas(branch, limit) + try: + shas = list_recent_commit_shas(branch, limit) + except ApiError as e: + print( + "::warning::status-reaper skipped this tick because the " + f"commit list could not be read after retries: {e}" + ) + return { + "scanned_shas": 0, + "compensated": 0, + "preserved_real_push": 0, + "preserved_unknown": 0, + "preserved_non_failure": 0, + "preserved_non_push_suffix": 0, + "preserved_unparseable": 0, + "compensated_pr_shadowed_by_push_success": 0, + "preserved_pr_without_push_success": 0, + "compensated_per_sha": {}, + "skipped": True, + "skip_reason": "commit-list-api-error", + } aggregate: dict[str, Any] = { "scanned_shas": 0, diff --git a/.gitea/workflows/redeploy-tenants-on-main.yml b/.gitea/workflows/redeploy-tenants-on-main.yml index 0411e149..259df556 100644 --- a/.gitea/workflows/redeploy-tenants-on-main.yml +++ b/.gitea/workflows/redeploy-tenants-on-main.yml @@ -9,19 +9,17 @@ name: redeploy-tenants-on-main # - Workflow-level env.GITHUB_SERVER_URL pinned per # feedback_act_runner_github_server_url. # - `continue-on-error: true` on each job (RFC §1 contract). -# - ~~**Gitea workflow_run trigger limitation**~~ FIXED: replaced with -# push+paths filter per this PR. Gitea 1.22.6 does not support -# `workflow_run` (task #81). The push trigger fires on every -# commit to publish-workspace-server-image.yml which is the -# same signal (only successful runs commit to main). +# - Dropped unsupported `workflow_run` (task #81). +# - Later changed to manual-only after publish-workspace-server-image.yml +# gained an integrated ordered production deploy job. # -# Auto-refresh prod tenant EC2s after every main merge. +# Manual production tenant redeploy/rollback helper. 
# -# Why this workflow exists: publish-workspace-server-image builds and -# pushes a new platform-tenant : to ECR on every merge to main, -# but running tenants pulled their image once at boot and never re-pull. -# Users see stale code indefinitely. +# Why this workflow is manual-only: publish-workspace-server-image now owns +# the ordered build -> push -> production auto-deploy sequence in one workflow. +# A separate push-triggered redeploy workflow races before the new ECR image +# exists and can paint main red with a false deployment failure. # # This workflow closes the gap by calling the control-plane admin # endpoint that performs a canary-first, batched, health-gated rolling @@ -34,16 +32,11 @@ name: redeploy-tenants-on-main # Gitea suspension migration. The staging-verify.yml promote step now # uses the same redeploy-fleet endpoint (fixes the silent-GHCR gap). # -# Runtime ordering: -# 1. publish-workspace-server-image completes → new :staging- in ECR. -# 2. The merge that updates publish-workspace-server-image.yml triggers -# this push/path-filtered workflow, which calls redeploy-fleet with -# target_tag=staging-. No CDN propagation wait needed — ECR image -# manifest is consistent immediately after push. -# 3. Calls redeploy-fleet with canary_slug (if set) and a soak -# period. Canary proves the image boots; batches follow. -# 4. Any failure aborts the rollout and leaves older tenants on the -# prior image — safer default than half-and-half state. +# Runtime ordering for automatic deploys now lives in +# publish-workspace-server-image.yml: +# 1. build-and-push creates new :staging- images in ECR. +# 2. deploy-production waits for required push contexts on that SHA. +# 3. deploy-production calls redeploy-fleet canary-first. # # Rollback path: set PROD_MANUAL_REDEPLOY_TARGET_TAG as a repo/org # variable or secret, run workflow_dispatch, then unset it after the @@ -51,21 +44,14 @@ name: redeploy-tenants-on-main # re-pulling the pinned image on every tenant. 
on: - push: - branches: [main] - paths: - - '.gitea/workflows/publish-workspace-server-image.yml' workflow_dispatch: permissions: contents: read # No write scopes needed — the workflow hits an external CP endpoint, # not the GitHub API. -# Serialize redeploys so two rapid main pushes' redeploys don't overlap -# and cause confusing per-tenant SSM state. Without this, GitHub's -# implicit workflow_run queueing would *probably* serialize them, but -# the explicit block makes the invariant defensible. Mirrors the -# concurrency block on redeploy-tenants-on-staging.yml for shape parity. +# Serialize manual redeploys so two operator-triggered rollbacks do not +# overlap and cause confusing per-tenant SSM state. # # NOTE: cancel-in-progress: false removed (Rule 7 fix). Gitea 1.22.6 # cancels queued runs regardless of this setting, so it provides no @@ -81,18 +67,15 @@ env: jobs: # bp-exempt: production redeploy is a side-effect workflow, not a merge gate. redeploy: - # Gitea 1.22.6 does not support workflow_run. This workflow is now - # controlled by push/path triggers plus an explicit kill switch. - if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }} + if: ${{ github.event_name == 'workflow_dispatch' }} runs-on: ubuntu-latest # Phase 3 (RFC #219 §1): surface broken workflows without blocking. # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently. continue-on-error: true timeout-minutes: 25 env: - # Rule 9 fix: operational kill switch for auto-triggered deployments. - # Set repo variable or secret PROD_AUTO_DEPLOY_DISABLED=true to prevent - # this workflow from redeploying. Manual workflow_dispatch bypasses this. + # Rule 9 fix: keep the same operational kill switch surface as the + # integrated auto-deploy workflow. PROD_AUTO_DEPLOY_DISABLED: ${{ vars.PROD_AUTO_DEPLOY_DISABLED || secrets.PROD_AUTO_DEPLOY_DISABLED || '' }} steps: - name: Kill-switch guard @@ -114,13 +97,8 @@ jobs: # tag) → used verbatim. 
Lets ops pin `latest` for emergency # rollback to last canary-verified digest, or pin a specific # `staging-` to roll back to a known-good build. - # 2. Default → `staging-`. The just-published - # digest. Bypasses the `:latest` retag path that's currently - # dead (staging-verify soft-skips without canary fleet, so - # the only thing retagging `:latest` today is the manual - # promote-latest.yml — last run 2026-04-28). Auto-trigger - # from the main push uses github.sha; manual - # dispatch with no variable falls through to github.sha. + # 2. Default → `staging-` for manual reruns from + # the current default-branch SHA. env: PROD_MANUAL_REDEPLOY_TARGET_TAG: ${{ vars.PROD_MANUAL_REDEPLOY_TARGET_TAG || secrets.PROD_MANUAL_REDEPLOY_TARGET_TAG || '' }} HEAD_SHA: ${{ github.sha }} @@ -274,13 +252,11 @@ jobs: # fail the workflow, which is what `ok=true` should have # guaranteed all along. # - # When the redeploy was triggered by workflow_dispatch with a - # specific tag (target_tag != "latest"), the expected SHA may - # not equal ${{ github.sha }} — in that case we resolve via - # GHCR's manifest. For workflow_run (default :latest) the - # workflow_run.head_sha is the SHA that just published. + # When the redeploy is triggered manually with a specific tag + # (target_tag != "latest"), the expected SHA may not equal + # ${{ github.sha }}. env: - EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }} + EXPECTED_SHA: ${{ github.sha }} TARGET_TAG: ${{ steps.tag.outputs.target_tag }} # Tenant subdomain template — slugs from the response are # appended. 
Production CP issues `.moleculesai.app`; diff --git a/tests/test_status_reaper.py b/tests/test_status_reaper.py index 81327487..d7e4ed1f 100644 --- a/tests/test_status_reaper.py +++ b/tests/test_status_reaper.py @@ -495,7 +495,7 @@ def test_reap_required_check_pull_request_suffix_never_touched(sr_module, monkey } counters = sr_module.reap(workflow_map, combined, SHA, dry_run=False) assert counters["compensated"] == 0 - assert counters["preserved_non_push_suffix"] == 1 + assert counters["preserved_pr_without_push_success"] == 1 assert calls == [] @@ -1009,3 +1009,64 @@ def test_reap_continues_on_per_sha_apierror(sr_module, monkeypatch, capsys): captured = capsys.readouterr() assert "::warning::" in captured.out or "::notice::" in captured.out assert SHA_A[:10] in captured.out + + +def test_main_soft_skips_when_commit_listing_times_out(sr_module, monkeypatch, capsys): + """A transient outage while listing recent commits should not paint main red. + + Per-SHA status read failures are already isolated inside `reap_branch`. + The real 2026-05-14 failure was earlier: `/commits?sha=main&limit=30` + timed out after all retries, aborting the tick. The next 5-minute tick can + retry safely, so `main()` should emit an observable warning and return 0. 
+ """ + + monkeypatch.setattr(sr_module, "scan_workflows", lambda _: {"workflow-without-push": False}) + + def fake_list_recent_commit_shas(*args, **kwargs): + raise sr_module.ApiError( + "GET /repos/owner/repo/commits failed after 4 attempts: timed out" + ) + + monkeypatch.setattr(sr_module, "list_recent_commit_shas", fake_list_recent_commit_shas) + monkeypatch.setattr(sys, "argv", ["status-reaper.py"]) + + assert sr_module.main() == 0 + captured = capsys.readouterr() + assert "::warning::status-reaper skipped this tick" in captured.out + assert '"skipped": true' in captured.out + assert '"skip_reason": "commit-list-api-error"' in captured.out + + +def test_main_does_not_soft_skip_status_write_failures(sr_module, monkeypatch): + """Only commit-list read failures are soft-skipped. + + A compensation write failure means the reaper could not repair a red + status. That must still fail the job loudly instead of being mislabeled as + a transient commit-list outage. + """ + + monkeypatch.setattr(sr_module, "scan_workflows", lambda _: {"workflow-without-push": False}) + monkeypatch.setattr(sr_module, "list_recent_commit_shas", lambda *_args, **_kwargs: [SHA_A]) + monkeypatch.setattr( + sr_module, + "get_combined_status", + lambda _sha: { + "state": "failure", + "statuses": [ + { + "context": "workflow-without-push / job (push)", + "status": "failure", + "description": "stranded class-O red", + } + ], + }, + ) + + def fake_post_compensating_status(*args, **kwargs): + raise sr_module.ApiError("POST /statuses failed: 403") + + monkeypatch.setattr(sr_module, "post_compensating_status", fake_post_compensating_status) + monkeypatch.setattr(sys, "argv", ["status-reaper.py"]) + + with pytest.raises(sr_module.ApiError, match="POST /statuses failed"): + sr_module.main() diff --git a/workspace-server/internal/handlers/org_helpers_security_test.go b/workspace-server/internal/handlers/org_helpers_security_test.go new file mode 100644 index 00000000..33daedfa --- /dev/null +++ 
b/workspace-server/internal/handlers/org_helpers_security_test.go @@ -0,0 +1,316 @@ +package handlers + +import ( + "path/filepath" + "strings" + "testing" +) + +// org_helpers_security_test.go — security-critical path sanitization + role-name +// validation for org template processing. Covers OFFSEC-006-class attacks: +// path traversal via user-controlled files_dir / prompt_file refs, and role-name +// injection via the persona env loader. + +// ── resolveInsideRoot ────────────────────────────────────────────────────────── + +func TestResolveInsideRoot_EmptyUserPath(t *testing.T) { + _, err := resolveInsideRoot("/safe/root", "") + if err == nil { + t.Error("empty userPath: expected error, got nil") + } + if err.Error() != "path is empty" { + t.Errorf("empty userPath: got %q, want %q", err.Error(), "path is empty") + } +} + +func TestResolveInsideRoot_AbsolutePathRejected(t *testing.T) { + _, err := resolveInsideRoot("/safe/root", "/etc/passwd") + if err == nil { + t.Error("absolute userPath: expected error, got nil") + } + if err.Error() != "absolute paths are not allowed" { + t.Errorf("absolute userPath: got %q, want %q", err.Error(), "absolute paths are not allowed") + } +} + +func TestResolveInsideRoot_DotDotTraversal(t *testing.T) { + // ../../etc/passwd from /safe/root + got, err := resolveInsideRoot("/safe/root", "../../etc/passwd") + if err == nil { + t.Errorf("dotdot traversal: expected error, got %q", got) + } + if err.Error() != "path escapes root" { + t.Errorf("dotdot traversal: got %q, want %q", err.Error(), "path escapes root") + } +} + +func TestResolveInsideRoot_DotDotWithIntermediate(t *testing.T) { + // a/b/../../c should escape if a/b is not under root + got, err := resolveInsideRoot("/safe/root", "a/b/../../c") + if err == nil { + t.Errorf("dotdot with intermediate: expected error, got %q", got) + } + if err.Error() != "path escapes root" { + t.Errorf("dotdot with intermediate: got %q, want %q", err.Error(), "path escapes root") + } +} + +func 
TestResolveInsideRoot_ValidRelativePath(t *testing.T) { + // This test uses the real filesystem since resolveInsideRoot calls filepath.Abs. + // Use t.TempDir() so we have a real root to work with. + root := t.TempDir() + got, err := resolveInsideRoot(root, "subdir/file.txt") + if err != nil { + t.Fatalf("valid relative: unexpected error: %v", err) + } + // Must be inside root + if got[:len(root)] != root { + t.Errorf("result should start with root %q, got %q", root, got) + } +} + +func TestResolveInsideRoot_ExactRootMatch(t *testing.T) { + root := t.TempDir() + got, err := resolveInsideRoot(root, ".") + if err != nil { + t.Fatalf("exact root: unexpected error: %v", err) + } + if got != root { + t.Errorf("exact root match: got %q, want %q", got, root) + } +} + +func TestResolveInsideRoot_DotPathComponent(t *testing.T) { + root := t.TempDir() + // ./subdir/./file.txt should resolve to root/subdir/file.txt + got, err := resolveInsideRoot(root, "./subdir/./file.txt") + if err != nil { + t.Fatalf("dot path component: unexpected error: %v", err) + } + if !strings.HasSuffix(got, "/subdir/file.txt") { + t.Errorf("dot path component: got %q, want suffix /subdir/file.txt", got) + } +} + +func TestResolveInsideRoot_NestedDotDotEscapes(t *testing.T) { + root := t.TempDir() + // a/../../b from /tmp/dirsomething → /tmp/b (escapes temp dir) + got, err := resolveInsideRoot(root, "a/../../b") + if err == nil { + t.Errorf("nested dotdot: expected error, got %q", got) + } + if err.Error() != "path escapes root" { + t.Errorf("nested dotdot: got %q, want %q", err.Error(), "path escapes root") + } +} + +func TestResolveInsideRoot_DotdotAtStart(t *testing.T) { + root := t.TempDir() + got, err := resolveInsideRoot(root, "../sibling") + if err == nil { + t.Errorf("../sibling: expected error, got %q", got) + } + if err.Error() != "path escapes root" { + t.Errorf("../sibling: got %q, want %q", err.Error(), "path escapes root") + } +} + +func TestResolveInsideRoot_SiblingNotEscaped(t *testing.T) 
{ + // /foo/bar and /foo/baz are siblings — the prefix check with + // filepath.Separator guard must allow /foo/bar/child without matching /foo/baz + // (which would be wrong if the check were just strings.HasPrefix). + root := t.TempDir() + got, err := resolveInsideRoot(root, "valid-subdir/file.txt") + if err != nil { + t.Fatalf("sibling not escaped: unexpected error: %v", err) + } + // Must be inside root + if !strings.HasPrefix(got, root+string(filepath.Separator)) { + t.Errorf("result should be inside root %q, got %q", root, got) + } +} + +// ── isSafeRoleName ──────────────────────────────────────────────────────────── + +func TestIsSafeRoleName_Valid(t *testing.T) { + valid := []string{ + "backend", + "Frontend-Engineer", + "research_lead", + "devOps123", + "a", + "A", + "team_42-leads", + } + for _, name := range valid { + if !isSafeRoleName(name) { + t.Errorf("isSafeRoleName(%q): expected true, got false", name) + } + } +} + +func TestIsSafeRoleName_Empty(t *testing.T) { + if isSafeRoleName("") { + t.Error("isSafeRoleName(\"\"): expected false, got true") + } +} + +func TestIsSafeRoleName_Dot(t *testing.T) { + if isSafeRoleName(".") { + t.Error("isSafeRoleName(\".\"): expected false, got true") + } +} + +func TestIsSafeRoleName_DotDot(t *testing.T) { + if isSafeRoleName("..") { + t.Error("isSafeRoleName(\"..\"): expected false, got true") + } +} + +func TestIsSafeRoleName_PathTraversal(t *testing.T) { + unsafe := []string{ + "../etc", + "foo/../../../etc", + "foo/../../bar", + } + for _, name := range unsafe { + if isSafeRoleName(name) { + t.Errorf("isSafeRoleName(%q): expected false (path traversal), got true", name) + } + } +} + +func TestIsSafeRoleName_SpecialChars(t *testing.T) { + unsafe := []string{ + "foo:bar", + "foo bar", + "foo\tbar", + "foo\nbar", + "foo\x00bar", + "foo@bar", + "foo#bar", + "foo$bar", + } + for _, name := range unsafe { + if isSafeRoleName(name) { + t.Errorf("isSafeRoleName(%q): expected false (special char), got true", name) + } 
+ } +} + +// ── mergeCategoryRouting ────────────────────────────────────────────────────── + +func TestMergeCategoryRouting_BothNil(t *testing.T) { + got := mergeCategoryRouting(nil, nil) + if len(got) != 0 { + t.Errorf("both nil: got %v, want empty", got) + } +} + +func TestMergeCategoryRouting_DefaultOnly(t *testing.T) { + defaultRouting := map[string][]string{ + "security": {"Backend Engineer", "DevOps"}, + } + got := mergeCategoryRouting(defaultRouting, nil) + if len(got) != 1 { + t.Fatalf("default only: got %d entries, want 1", len(got)) + } + if len(got["security"]) != 2 { + t.Errorf("security roles: got %v, want [Backend Engineer, DevOps]", got["security"]) + } +} + +func TestMergeCategoryRouting_WorkspaceOnly(t *testing.T) { + wsRouting := map[string][]string{ + "ui": {"Frontend Engineer"}, + } + got := mergeCategoryRouting(nil, wsRouting) + if len(got) != 1 { + t.Fatalf("ws only: got %d entries, want 1", len(got)) + } + if got["ui"][0] != "Frontend Engineer" { + t.Errorf("ui roles: got %v, want [Frontend Engineer]", got["ui"]) + } +} + +func TestMergeCategoryRouting_MergeNoOverlap(t *testing.T) { + defaultRouting := map[string][]string{ + "security": {"Backend Engineer"}, + } + wsRouting := map[string][]string{ + "ui": {"Frontend Engineer"}, + } + got := mergeCategoryRouting(defaultRouting, wsRouting) + if len(got) != 2 { + t.Errorf("merge no overlap: got %d entries, want 2", len(got)) + } +} + +func TestMergeCategoryRouting_WsOverrideDropsDefault(t *testing.T) { + defaultRouting := map[string][]string{ + "security": {"Backend Engineer", "DevOps"}, + } + wsRouting := map[string][]string{ + "security": {"Security Engineer"}, + } + got := mergeCategoryRouting(defaultRouting, wsRouting) + if len(got["security"]) != 1 { + t.Errorf("ws override: got %v, want [Security Engineer]", got["security"]) + } + if got["security"][0] != "Security Engineer" { + t.Errorf("ws override: got %v, want [Security Engineer]", got["security"]) + } +} + +func 
TestMergeCategoryRouting_EmptyListDropsCategory(t *testing.T) { + defaultRouting := map[string][]string{ + "security": {"Backend Engineer"}, + "ui": {"Frontend Engineer"}, + } + wsRouting := map[string][]string{ + "security": {}, // empty list = opt out + } + got := mergeCategoryRouting(defaultRouting, wsRouting) + if _, exists := got["security"]; exists { + t.Error("empty ws list should delete the category from output") + } + if len(got["ui"]) != 1 { + t.Errorf("ui should still exist: got %v", got["ui"]) + } +} + +func TestMergeCategoryRouting_EmptyKeySkipped(t *testing.T) { + defaultRouting := map[string][]string{ + "": {"Backend Engineer"}, + } + got := mergeCategoryRouting(defaultRouting, nil) + if _, exists := got[""]; exists { + t.Error("empty key should be skipped") + } +} + +func TestMergeCategoryRouting_EmptyRolesInDefaultSkipped(t *testing.T) { + defaultRouting := map[string][]string{ + "security": {}, + } + got := mergeCategoryRouting(defaultRouting, nil) + if len(got) != 0 { + t.Errorf("empty roles in default should be skipped, got %v", got) + } +} + +func TestMergeCategoryRouting_OriginalMapsUnmodified(t *testing.T) { + defaultRouting := map[string][]string{ + "security": {"Backend Engineer"}, + } + wsRouting := map[string][]string{ + "ui": {"Frontend Engineer"}, + } + mergeCategoryRouting(defaultRouting, wsRouting) + if len(defaultRouting) != 1 || len(defaultRouting["security"]) != 1 { + t.Error("default routing should be unmodified after merge") + } + if len(wsRouting) != 1 { + t.Error("ws routing should be unmodified after merge") + } +}