fix(workspace-server): restart re-provisions with the switched runtime, not the stale config.yaml #3208

Merged
devops-engineer merged 1 commits from fix/restart-preserves-switched-runtime into main 2026-06-24 06:03:36 +00:00
3 changed files with 140 additions and 22 deletions
@@ -0,0 +1,106 @@
package handlers
import (
"context"
"testing"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provisioner"
)
// execReadStubProv is a LocalProvisionerAPI whose ExecRead returns a
// configurable config.yaml body, so we can simulate a container whose baked
// /configs/config.yaml runtime has drifted from the DB runtime column.
// Every other method panics — restartRuntimeFromConfig only calls ExecRead.
type execReadStubProv struct {
configYAML string
err error
}
func (s *execReadStubProv) Start(_ context.Context, _ provisioner.WorkspaceConfig) (string, error) {
panic("execReadStubProv.Start not implemented in test")
}
func (s *execReadStubProv) Stop(_ context.Context, _ string) error {
panic("execReadStubProv.Stop not implemented in test")
}
func (s *execReadStubProv) IsRunning(_ context.Context, _ string) (bool, error) {
panic("execReadStubProv.IsRunning not implemented in test")
}
func (s *execReadStubProv) ExecRead(_ context.Context, _, _ string) ([]byte, error) {
if s.err != nil {
return nil, s.err
}
return []byte(s.configYAML), nil
}
func (s *execReadStubProv) RemoveVolume(_ context.Context, _ string) error {
panic("execReadStubProv.RemoveVolume not implemented in test")
}
func (s *execReadStubProv) VolumeHasFile(_ context.Context, _, _ string) (bool, error) {
panic("execReadStubProv.VolumeHasFile not implemented in test")
}
func (s *execReadStubProv) WriteAuthTokenToVolume(_ context.Context, _, _ string) error {
panic("execReadStubProv.WriteAuthTokenToVolume not implemented in test")
}
var _ provisioner.LocalProvisionerAPI = (*execReadStubProv)(nil)
// TestRestartRuntimeFromConfig_DBRuntimeWinsOverStaleConfig is the regression
// test for the runtime-switch-then-restart bug: the runtime-switch PATCH writes
// only the workspaces.runtime DB column (not the running container's
// /configs/config.yaml), so on restart the DB column is the SSOT. Pre-fix, the
// container's stale template-default config.yaml ("claude-code") won over the
// switched DB runtime ("google-adk") AND stomped the DB column — so a switched
// runtime box was never re-provisioned.
//
// Here the DB runtime is "google-adk" while the container config.yaml still
// declares "claude-code"; the restart must re-provision with "google-adk".
func TestRestartRuntimeFromConfig_DBRuntimeWinsOverStaleConfig(t *testing.T) {
h := &WorkspaceHandler{
provisioner: &execReadStubProv{configYAML: "runtime: claude-code\nmodel: x\n"},
}
got := h.restartRuntimeFromConfig(context.Background(), "ws-1", "adk-test", "google-adk", false)
if got != "google-adk" {
t.Fatalf("restartRuntimeFromConfig returned %q; want the DB SSOT runtime %q (the stale config.yaml=claude-code must NOT win)", got, "google-adk")
}
}
// TestRestartRuntimeFromConfig_ApplyTemplateShortCircuits verifies that
// apply_template=true bypasses the container read entirely and returns the DB
// runtime (the existing fast path).
func TestRestartRuntimeFromConfig_ApplyTemplateShortCircuits(t *testing.T) {
// ExecRead would panic if reached; apply_template=true must short-circuit.
h := &WorkspaceHandler{provisioner: &execReadStubProv{}}
got := h.restartRuntimeFromConfig(context.Background(), "ws-1", "adk-test", "google-adk", true)
if got != "google-adk" {
t.Fatalf("restartRuntimeFromConfig (apply_template) returned %q; want %q", got, "google-adk")
}
}
// TestRestartRuntimeFromConfig_NilProvisionerReturnsDB verifies the SaaS path
// (no local Docker provisioner) trusts the DB runtime.
func TestRestartRuntimeFromConfig_NilProvisionerReturnsDB(t *testing.T) {
h := &WorkspaceHandler{provisioner: nil}
got := h.restartRuntimeFromConfig(context.Background(), "ws-1", "adk-test", "google-adk", false)
if got != "google-adk" {
t.Fatalf("restartRuntimeFromConfig (nil provisioner) returned %q; want %q", got, "google-adk")
}
}
// TestRestartRuntimeFromConfig_MatchingConfigReturnsDB verifies the no-drift
// case: when config.yaml agrees with the DB, the DB runtime is returned.
func TestRestartRuntimeFromConfig_MatchingConfigReturnsDB(t *testing.T) {
h := &WorkspaceHandler{
provisioner: &execReadStubProv{configYAML: "runtime: google-adk\n"},
}
got := h.restartRuntimeFromConfig(context.Background(), "ws-1", "adk-test", "google-adk", false)
if got != "google-adk" {
t.Fatalf("restartRuntimeFromConfig returned %q; want %q", got, "google-adk")
}
}
@@ -7,7 +7,6 @@ import (
"testing"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provisioner"
"github.com/DATA-DOG/go-sqlmock"
)
// Tests for resolveRestartTemplate — the pure helper that implements the
@@ -152,24 +151,27 @@ func TestRestartRuntimeFromConfig_ApplyTemplateTrustsDBRuntime(t *testing.T) {
}
}
func TestRestartRuntimeFromConfig_DefaultRestartPreservesContainerRuntime(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectExec(`UPDATE workspaces SET runtime = \$1 WHERE id = \$2`).
WithArgs("claude-code", "ws-runtime").
WillReturnResult(sqlmock.NewResult(0, 1))
// TestRestartRuntimeFromConfig_DefaultRestartTrustsDBRuntime is the regression
// test for the runtime-switch-then-restart bug. The runtime-switch PATCH
// (workspace_crud.go Update) writes ONLY the workspaces.runtime DB column — it
// does not write through to the running container's /configs/config.yaml — so
// on a plain restart (apply_template=false) the DB column is the SSOT.
//
// Pre-fix, this default path let the container's stale template-default
// config.yaml ("claude-code") win over the switched DB runtime ("hermes") AND
// overwrote the DB column back to the stale value, so a switched-runtime box
// was never re-provisioned. The DB value must now win, and the DB must NOT be
// stomped (setupTestDB asserts no unexpected queries).
func TestRestartRuntimeFromConfig_DefaultRestartTrustsDBRuntime(t *testing.T) {
// No sqlmock UPDATE expectation: the function must NOT write the DB anymore.
setupTestDB(t)
prov := &restartRuntimeProv{config: []byte("runtime: claude-code\n")}
h := &WorkspaceHandler{provisioner: prov}
got := h.restartRuntimeFromConfig(context.Background(), "ws-runtime", "Runtime Workspace", "hermes", false)
if got != "claude-code" {
t.Fatalf("runtime = %q, want container runtime claude-code", got)
}
if prov.execReadCalls != 1 {
t.Fatalf("ExecRead calls = %d, want 1", prov.execReadCalls)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("unmet sqlmock expectations: %v", err)
if got != "hermes" {
t.Fatalf("runtime = %q, want DB SSOT runtime hermes (stale config.yaml=claude-code must NOT win)", got)
}
}
@@ -488,28 +488,38 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
}
func (h *WorkspaceHandler) restartRuntimeFromConfig(ctx context.Context, id, wsName, dbRuntime string, applyTemplate bool) string {
// The workspaces.runtime DB column is the SSOT for the workspace's runtime
// — it is the column the runtime-switch PATCH (workspace_crud.go Update)
// writes, and ONLY that column; the PATCH does NOT write through to the
// running container's /configs/config.yaml. So on restart we must always
// re-provision with the DB runtime, never the runtime baked into the
// container's (possibly stale, template-default) config.yaml.
//
// Pre-fix this function let a config.yaml runtime that differed from the DB
// WIN, and even stomped the DB column back to the config value. That
// silently reverted a switched runtime (e.g. google-adk → claude-code) on
// every plain "Restart" click, so a switched-runtime box was never built.
if h.provisioner == nil || applyTemplate {
return dbRuntime
}
containerRuntime := dbRuntime
// Best-effort drift detection only: if the container's config.yaml disagrees
// with the DB, log it (the config volume will be re-rendered from the
// runtime-default template on re-provision), but the DB value remains
// authoritative and is never overwritten from the stale config.
containerName := provisioner.ContainerName(id) // ws-{id} (KI-013 full UUID)
if cfgBytes, readErr := h.provisioner.ExecRead(ctx, containerName, "/configs/config.yaml"); readErr == nil {
for _, line := range strings.Split(string(cfgBytes), "\n") {
line = strings.TrimSpace(line)
if strings.HasPrefix(line, "runtime:") {
parsed := strings.TrimSpace(strings.TrimPrefix(line, "runtime:"))
if parsed != "" && parsed != containerRuntime {
log.Printf("Restart: runtime changed in config.yaml %q→%q for %s", containerRuntime, parsed, wsName)
containerRuntime = parsed
if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET runtime = $1 WHERE id = $2`, containerRuntime, id); err != nil {
log.Printf("Restart: failed to persist runtime %q for %s: %v", containerRuntime, id, err)
}
if parsed != "" && parsed != dbRuntime {
log.Printf("Restart: config.yaml runtime %q differs from DB runtime %q for %s — using DB (SSOT); config volume will be re-rendered", parsed, dbRuntime, wsName)
}
break
}
}
}
return containerRuntime
return dbRuntime
}
// Hibernate handles POST /workspaces/:id/hibernate