molecule-core/platform/internal/handlers/hibernation_test.go
Molecule AI Backend Engineer 34f5a3cbe2 fix(platform): atomic hibernate via UPDATE WHERE active_tasks=0 (#819)
Replaces the racy SELECT-then-Stop two-step in HibernateWorkspace with a
three-step atomic pattern that eliminates the TOCTOU window (SAFE-819):

  1. Atomic claim: single UPDATE WHERE id=$1
                   AND status IN ('online','degraded')
                   AND active_tasks = 0
     — rowsAffected=0 means another caller already claimed it or tasks
       arrived; we abort immediately without calling Stop.

  2. provisioner.Stop: safe because status='hibernating' blocks new task
     routing between step 1 and step 2 (no new task can be dispatched).

  3. Final UPDATE to 'hibernated': records the completed hibernation.

Also adds stopFnOverride func(ctx, id) to WorkspaceHandler (always nil in
production) so tests can count Stop calls without a running Docker daemon.

Tests added/updated (13 total across 2 files):
  - TestHibernateWorkspace_ActiveTasksNotHibernated
  - TestHibernateWorkspace_AlreadyHibernatingNotHibernated
  - TestHibernateWorkspace_SuccessPath
  - TestHibernateWorkspace_ConcurrentOnlyOneStop
  - TestHibernateWorkspace_DBErrorOnClaim
  - Updated 3 existing HibernateWorkspace tests + 1 HTTP handler test

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-17 20:52:20 +00:00

289 lines
11 KiB
Go

package handlers
// Integration tests for the workspace hibernation feature (issue #711 / PR #724).
// Updated for the atomic TOCTOU fix (issue #819).
//
// Coverage:
// - HibernateWorkspace(): atomic claim, container stop, DB status update, Redis key clear, event broadcast
// - POST /workspaces/:id/hibernate HTTP handler: online→200, not-eligible→404, DB error→500
// - resolveAgentURL(): hibernated workspace → 503 + Retry-After: 15 + waking: true
//
// The A2A auto-wake path (resolveAgentURL) is tested via TestResolveAgentURL_HibernatedWorkspace_*
// added to a2a_proxy_test.go to keep related resolveAgentURL tests co-located.
import (
"context"
"database/sql"
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"strings"
"testing"
sqlmock "github.com/DATA-DOG/go-sqlmock"
"github.com/gin-gonic/gin"
)
// ──────────────────────────────────────────────────────────────────────────────
// HibernateWorkspace unit tests
// ──────────────────────────────────────────────────────────────────────────────
// TestHibernateWorkspace_OnlineWorkspace_Success verifies the happy-path with
// the 3-step atomic pattern (#819):
// - Atomic claim UPDATE returns rowsAffected=1 (workspace was online/degraded + active_tasks=0)
// - Name/tier SELECT runs after the claim
// - Final UPDATE sets status='hibernated', url=''
// - Redis keys ws:{id}, ws:{id}:url, ws:{id}:internal_url are deleted
// - WORKSPACE_HIBERNATED event is broadcast (INSERT INTO structure_events)
func TestHibernateWorkspace_OnlineWorkspace_Success(t *testing.T) {
mock := setupTestDB(t)
mr := setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
wsID := "ws-idle-online"
// Pre-populate Redis keys that ClearWorkspaceKeys should remove.
mr.Set(fmt.Sprintf("ws:%s", wsID), "some-value")
mr.Set(fmt.Sprintf("ws:%s:url", wsID), "http://agent.internal:8000")
mr.Set(fmt.Sprintf("ws:%s:internal_url", wsID), "http://172.17.0.5:8000")
// Step 1: atomic claim UPDATE succeeds.
mock.ExpectExec(`UPDATE workspaces`).
WithArgs(wsID).
WillReturnResult(sqlmock.NewResult(0, 1))
// Post-claim SELECT for name/tier.
mock.ExpectQuery(`SELECT name, tier FROM workspaces WHERE id`).
WithArgs(wsID).
WillReturnRows(sqlmock.NewRows([]string{"name", "tier"}).AddRow("Idle Agent", 1))
// Step 3: final UPDATE to 'hibernated'.
mock.ExpectExec(`UPDATE workspaces SET status = 'hibernated'`).
WithArgs(wsID).
WillReturnResult(sqlmock.NewResult(0, 1))
// Broadcaster inserts a structure_events row.
mock.ExpectExec(`INSERT INTO structure_events`).
WillReturnResult(sqlmock.NewResult(0, 1))
handler.HibernateWorkspace(context.Background(), wsID)
// All DB expectations were exercised.
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet DB expectations: %v", err)
}
// Redis keys must all be gone.
for _, suffix := range []string{"", ":url", ":internal_url"} {
key := fmt.Sprintf("ws:%s%s", wsID, suffix)
if _, err := mr.Get(key); err == nil {
t.Errorf("expected Redis key %q to be deleted, but it still exists", key)
}
}
}
// TestHibernateWorkspace_NotEligible_NoOp verifies that when the atomic claim
// UPDATE returns rowsAffected=0 (workspace not in online/degraded state, or
// active_tasks > 0), HibernateWorkspace returns immediately — no Stop, no
// final UPDATE, no Redis clear, no broadcast.
func TestHibernateWorkspace_NotEligible_NoOp(t *testing.T) {
mock := setupTestDB(t)
mr := setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
wsID := "ws-already-offline"
// Atomic claim finds nothing matching WHERE (workspace offline, paused, etc.).
mock.ExpectExec(`UPDATE workspaces`).
WithArgs(wsID).
WillReturnResult(sqlmock.NewResult(0, 0))
// Set a Redis key to confirm it is NOT cleared by early return.
mr.Set(fmt.Sprintf("ws:%s:url", wsID), "http://still-here:8000")
handler.HibernateWorkspace(context.Background(), wsID)
// Only the one ExecContext expectation; no further DB operations.
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet DB expectations: %v", err)
}
// Redis key must still exist — HibernateWorkspace returned early.
if _, err := mr.Get(fmt.Sprintf("ws:%s:url", wsID)); err != nil {
t.Errorf("expected Redis key to still exist after no-op, but it was deleted: %v", err)
}
}
// TestHibernateWorkspace_DBUpdateFails_NoCrash verifies that a DB error on the
// final status UPDATE does not panic — the function logs and returns silently.
func TestHibernateWorkspace_DBUpdateFails_NoCrash(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
wsID := "ws-update-fail"
// Step 1: atomic claim succeeds.
mock.ExpectExec(`UPDATE workspaces`).
WithArgs(wsID).
WillReturnResult(sqlmock.NewResult(0, 1))
// Post-claim SELECT.
mock.ExpectQuery(`SELECT name, tier FROM workspaces WHERE id`).
WithArgs(wsID).
WillReturnRows(sqlmock.NewRows([]string{"name", "tier"}).AddRow("Flaky Agent", 2))
// Step 3: final UPDATE fails.
mock.ExpectExec(`UPDATE workspaces SET status = 'hibernated'`).
WithArgs(wsID).
WillReturnError(fmt.Errorf("db: connection refused"))
// Must not panic — test will catch a panic via t.Fatal.
defer func() {
if r := recover(); r != nil {
t.Fatalf("HibernateWorkspace panicked on UPDATE error: %v", r)
}
}()
handler.HibernateWorkspace(context.Background(), wsID)
// Claim + SELECT + failing UPDATE; no INSERT INTO structure_events expected.
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet DB expectations: %v", err)
}
}
// ──────────────────────────────────────────────────────────────────────────────
// POST /workspaces/:id/hibernate HTTP handler tests
// ──────────────────────────────────────────────────────────────────────────────
// hibernateRequest fires POST /workspaces/{id}/hibernate against the handler
// and returns the response recorder.
func hibernateRequest(t *testing.T, handler *WorkspaceHandler, wsID string) *httptest.ResponseRecorder {
t.Helper()
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: wsID}}
c.Request = httptest.NewRequest(http.MethodPost, "/workspaces/"+wsID+"/hibernate", nil)
handler.Hibernate(c)
return w
}
// TestHibernateHandler_Online_Returns200 verifies that an online workspace
// that is eligible for hibernation returns 200 {"status":"hibernated"}.
// With the 3-step fix: handler SELECT → atomic claim UPDATE → name/tier SELECT
// → final UPDATE → broadcaster INSERT.
func TestHibernateHandler_Online_Returns200(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
wsID := "ws-handler-online"
// Hibernate() handler eligibility SELECT — checks status IN ('online','degraded').
mock.ExpectQuery(`SELECT name, tier FROM workspaces WHERE id = .* AND status IN`).
WithArgs(wsID).
WillReturnRows(sqlmock.NewRows([]string{"name", "tier"}).AddRow("Online Bot", 1))
// HibernateWorkspace() step 1: atomic claim.
mock.ExpectExec(`UPDATE workspaces`).
WithArgs(wsID).
WillReturnResult(sqlmock.NewResult(0, 1))
// Post-claim SELECT for name/tier.
mock.ExpectQuery(`SELECT name, tier FROM workspaces WHERE id`).
WithArgs(wsID).
WillReturnRows(sqlmock.NewRows([]string{"name", "tier"}).AddRow("Online Bot", 1))
// Step 3: final UPDATE.
mock.ExpectExec(`UPDATE workspaces SET status = 'hibernated'`).
WithArgs(wsID).
WillReturnResult(sqlmock.NewResult(0, 1))
// Broadcaster INSERT.
mock.ExpectExec(`INSERT INTO structure_events`).
WillReturnResult(sqlmock.NewResult(0, 1))
w := hibernateRequest(t, handler, wsID)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
}
var resp map[string]interface{}
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
t.Fatalf("failed to decode response: %v", err)
}
if resp["status"] != "hibernated" {
t.Errorf(`expected {"status":"hibernated"}, got %v`, resp)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet DB expectations: %v", err)
}
}
// TestHibernateHandler_NotActive_Returns404 verifies that a workspace not in
// online/degraded state (e.g. offline, paused, already hibernated) returns 404.
func TestHibernateHandler_NotActive_Returns404(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
wsID := "ws-handler-paused"
// Handler's eligibility SELECT returns no rows — workspace is not online/degraded.
mock.ExpectQuery(`SELECT name, tier FROM workspaces WHERE id = .* AND status IN`).
WithArgs(wsID).
WillReturnError(sql.ErrNoRows)
w := hibernateRequest(t, handler, wsID)
if w.Code != http.StatusNotFound {
t.Fatalf("expected 404, got %d: %s", w.Code, w.Body.String())
}
var resp map[string]interface{}
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
t.Fatalf("failed to decode response: %v", err)
}
if !strings.Contains(fmt.Sprint(resp["error"]), "not found") {
t.Errorf("expected error mentioning 'not found', got %v", resp)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet DB expectations: %v", err)
}
}
// TestHibernateHandler_DBError_Returns500 verifies that an unexpected DB error
// on the eligibility SELECT returns 500.
func TestHibernateHandler_DBError_Returns500(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
wsID := "ws-handler-dberror"
mock.ExpectQuery(`SELECT name, tier FROM workspaces WHERE id = .* AND status IN`).
WithArgs(wsID).
WillReturnError(fmt.Errorf("db: connection reset"))
w := hibernateRequest(t, handler, wsID)
if w.Code != http.StatusInternalServerError {
t.Fatalf("expected 500, got %d: %s", w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet DB expectations: %v", err)
}
}