From 80caba97ee4b1a9ba72dcb4d8ac2ab3f24a43ab1 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 19 Apr 2026 02:41:15 -0700 Subject: [PATCH] feat(ws-server): pull env from CP on startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Paired with molecule-controlplane PR #55 (GET /cp/tenants/config). Lets existing tenants heal themselves when we rotate or add a CP-side env var (e.g. MOLECULE_CP_SHARED_SECRET landing earlier today) without any ssh or re-provision. Flow: main() calls refreshEnvFromCP() before any other os.Getenv read. The helper reads MOLECULE_ORG_ID + ADMIN_TOKEN from the baked-in user-data env, GETs {MOLECULE_CP_URL}/cp/tenants/config with those credentials, and applies the returned string map via os.Setenv so downstream code (CPProvisioner, etc.) sees the fresh values. Best-effort semantics: - self-hosted / no MOLECULE_ORG_ID or ADMIN_TOKEN → no-op (return nil) - CP unreachable / non-200 → return error (main logs it and keeps booting) - oversized values (>4 KiB each) rejected to avoid env pollution - body read capped at 64 KiB Once this image hits GHCR, the 5-minute tenant auto-updater picks it up, the container restarts, refresh runs, and every tenant has MOLECULE_CP_SHARED_SECRET within ~5 minutes — no operator toil. Also fixes workspace-server/.gitignore so `server` no longer matches the cmd/server package dir — the old pattern was only meant to ignore the compiled binary but was too broad. Anchored to `/server`. 
// refreshEnvFromCP fetches the tenant's current env vars from the control
// plane (CP) and applies them with os.Setenv. main() calls it before any
// other code reads the environment, so later os.Getenv calls observe the
// refreshed values.
//
// Why:
//   - user-data on the tenant EC2 bakes env vars into `docker run` at
//     provision time. Those values are frozen. When a secret is rotated on
//     the CP (e.g. PROVISION_SHARED_SECRET) there is no way to push the new
//     value into already-provisioned tenants.
//   - the Docker image auto-updater already pulls the latest workspace-
//     server image every 5 min. If that image refreshes its own env from
//     the CP on startup, every tenant heals itself within the update
//     cycle — no ssh, no re-provision, no ops toil.
//
// Contract (paired with cp-side GET /cp/tenants/config):
//
//	Request:  GET {MOLECULE_CP_URL or https://api.moleculesai.app}/cp/tenants/config
//	          Authorization: Bearer <ADMIN_TOKEN>
//	          X-Molecule-Org-Id: <MOLECULE_ORG_ID>
//	Response: 200 {"MOLECULE_CP_SHARED_SECRET":"…","MOLECULE_CP_URL":"…", …}
//	          401 on bearer mismatch or unknown org
//
// Returns nil without touching the network when MOLECULE_ORG_ID or
// ADMIN_TOKEN is empty (self-hosted / not yet provisioned). Transport
// failures, non-200 responses, bodies larger than 64 KiB and undecodable
// JSON are returned as errors; this function does not log them — the caller
// is expected to log and keep booting. Individual entries that cannot be
// applied (empty key, non-string or null value, value over 4 KiB, Setenv
// failure) are logged by key and skipped without failing the refresh.
//
// NOTE(review): every key the CP returns is applied, including the
// credentials this function itself reads — confirm whether an allow-list is
// wanted.
func refreshEnvFromCP() error {
	const (
		configPath = "/cp/tenants/config"
		// The CP only returns small JSON blobs here; the cap keeps a
		// misbehaving upstream from making us buffer an arbitrarily
		// large body.
		maxBodyBytes = 64 << 10
		// workspace-server env vars are URLs, hex secrets and short
		// identifiers; anything bigger is treated as bogus.
		maxValueBytes = 4 << 10
	)

	orgID := os.Getenv("MOLECULE_ORG_ID")
	adminToken := os.Getenv("ADMIN_TOKEN")
	if orgID == "" || adminToken == "" {
		// Not a SaaS tenant (self-hosted dev or not yet provisioned).
		return nil
	}

	base := os.Getenv("MOLECULE_CP_URL")
	if base == "" {
		// Default to prod for any tenant that lost track of its CP URL
		// (e.g. older user-data that only set MOLECULE_ORG_ID).
		base = "https://api.moleculesai.app"
	}
	// Drop trailing slashes so "https://host/" does not turn into
	// "https://host//cp/tenants/config".
	for len(base) > 0 && base[len(base)-1] == '/' {
		base = base[:len(base)-1]
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, base+configPath, nil)
	if err != nil {
		return fmt.Errorf("build request: %w", err)
	}
	req.Header.Set("Authorization", "Bearer "+adminToken)
	req.Header.Set("X-Molecule-Org-Id", orgID)

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return fmt.Errorf("do request: %w", err)
	}
	defer resp.Body.Close()

	// Check the status before reading: an error body is never used.
	// 401 on first boot-after-restart is expected for tenants still running
	// under old user-data whose admin token has no corresponding row seeded
	// yet, so the caller treats this error as non-fatal.
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("cp returned %d", resp.StatusCode)
	}

	// Read one byte past the cap so an oversized body is reported as such
	// instead of surfacing as a confusing JSON syntax error on truncated
	// input.
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxBodyBytes+1))
	if err != nil {
		return fmt.Errorf("read body: %w", err)
	}
	if len(body) > maxBodyBytes {
		return fmt.Errorf("read body: response exceeds %d bytes", maxBodyBytes)
	}

	// Decode values as `any` so one non-string entry is skipped rather than
	// failing the whole refresh, and a JSON null cannot blank an existing
	// variable.
	var cfg map[string]any
	if err := json.Unmarshal(body, &cfg); err != nil {
		return fmt.Errorf("decode: %w", err)
	}

	applied := 0
	for k, raw := range cfg {
		v, isString := raw.(string)
		// Log keys and sizes only — values may be secrets.
		switch {
		case k == "":
			log.Printf("CP env refresh: skipping entry with empty key")
			continue
		case !isString:
			log.Printf("CP env refresh: skipping %q: value is not a string", k)
			continue
		case len(v) > maxValueBytes:
			log.Printf("CP env refresh: skipping %q: value is %d bytes (max %d)", k, len(v), maxValueBytes)
			continue
		}
		if err := os.Setenv(k, v); err != nil {
			log.Printf("CP env refresh: setenv %s: %v", k, err)
			continue
		}
		applied++
	}
	log.Printf("CP env refresh: applied %d values from %s/cp/tenants/config", applied, base)
	return nil
}

// TestRefreshEnvFromCP_NoopWhenNotSaaS checks the self-hosted path: with
// MOLECULE_ORG_ID and ADMIN_TOKEN empty the refresh must return nil instead
// of failing.
func TestRefreshEnvFromCP_NoopWhenNotSaaS(t *testing.T) {
	t.Setenv("MOLECULE_ORG_ID", "")
	t.Setenv("ADMIN_TOKEN", "")
	if err := refreshEnvFromCP(); err != nil {
		t.Errorf("expected nil on non-SaaS, got %v", err)
	}
}
+func TestRefreshEnvFromCP_AppliesCPResponse(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if got := r.Header.Get("Authorization"); got != "Bearer tenant-admin-token" { + t.Errorf("bearer: got %q", got) + } + if got := r.Header.Get("X-Molecule-Org-Id"); got != "org-abc" { + t.Errorf("org id header: got %q", got) + } + w.Header().Set("Content-Type", "application/json") + fmt.Fprint(w, `{"MOLECULE_CP_SHARED_SECRET":"new-secret","MOLECULE_CP_URL":"https://api.moleculesai.app"}`) + })) + defer srv.Close() + + t.Setenv("MOLECULE_ORG_ID", "org-abc") + t.Setenv("ADMIN_TOKEN", "tenant-admin-token") + t.Setenv("MOLECULE_CP_URL", srv.URL) + t.Setenv("MOLECULE_CP_SHARED_SECRET", "") // clear before refresh + + if err := refreshEnvFromCP(); err != nil { + t.Fatalf("refreshEnvFromCP: %v", err) + } + if got := os.Getenv("MOLECULE_CP_SHARED_SECRET"); got != "new-secret" { + t.Errorf("SHARED_SECRET: want new-secret, got %q", got) + } +} + +// TestRefreshEnvFromCP_CPUnreachableDoesNotFailBoot: network errors must +// return non-nil BUT main.go treats that as warn-and-continue. We assert +// the function returns an error (not a panic) so the caller can log. +func TestRefreshEnvFromCP_CPUnreachableDoesNotFailBoot(t *testing.T) { + t.Setenv("MOLECULE_ORG_ID", "org-abc") + t.Setenv("ADMIN_TOKEN", "t") + t.Setenv("MOLECULE_CP_URL", "http://127.0.0.1:1") // closed port + err := refreshEnvFromCP() + if err == nil { + t.Error("expected an error when CP is unreachable") + } +} + +// TestRefreshEnvFromCP_NonOKPropagates: CP returns 500 → error. 
+func TestRefreshEnvFromCP_NonOKPropagates(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "boom", http.StatusInternalServerError) + })) + defer srv.Close() + t.Setenv("MOLECULE_ORG_ID", "org-abc") + t.Setenv("ADMIN_TOKEN", "t") + t.Setenv("MOLECULE_CP_URL", srv.URL) + if err := refreshEnvFromCP(); err == nil { + t.Error("expected error on 500, got nil") + } +} + +// TestRefreshEnvFromCP_RejectsOversizedValue: a single-value-over-4KiB +// payload must NOT poison the environment. +func TestRefreshEnvFromCP_RejectsOversizedValue(t *testing.T) { + giant := make([]byte, 5<<10) + for i := range giant { + giant[i] = 'x' + } + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + fmt.Fprintf(w, `{"MOLECULE_CP_SHARED_SECRET":%q}`, string(giant)) + })) + defer srv.Close() + t.Setenv("MOLECULE_ORG_ID", "org-abc") + t.Setenv("ADMIN_TOKEN", "t") + t.Setenv("MOLECULE_CP_URL", srv.URL) + t.Setenv("MOLECULE_CP_SHARED_SECRET", "original") + if err := refreshEnvFromCP(); err != nil { + t.Fatalf("refreshEnvFromCP: %v", err) + } + if got := os.Getenv("MOLECULE_CP_SHARED_SECRET"); got != "original" { + t.Errorf("oversized value was applied — want %q, got %d bytes", + "original", len(got)) + } +} diff --git a/workspace-server/cmd/server/main.go b/workspace-server/cmd/server/main.go index 88ef581d..3855a859 100644 --- a/workspace-server/cmd/server/main.go +++ b/workspace-server/cmd/server/main.go @@ -30,6 +30,16 @@ import ( ) func main() { + // CP self-refresh: pull any operator-rotated config (e.g. a new + // MOLECULE_CP_SHARED_SECRET) before any other code reads env. + // Best-effort — if the CP is unreachable we keep booting with the + // env we were provisioned with. Older SaaS tenants predate PR #53 + // and can arrive here with MOLECULE_CP_SHARED_SECRET unset; this + // is how they heal without SSH. 
+ if err := refreshEnvFromCP(); err != nil { + log.Printf("CP env refresh: %v (continuing with baked-in env)", err) + } + // Secrets encryption. In MOLECULE_ENV=prod, boot refuses to start // without a valid SECRETS_ENCRYPTION_KEY (fail-secure — Top-5 #5). // In any other environment, missing keys just log a warning and