fix(platform): /github-installation-token returns 501 on missing config (closes #388)

When GITHUB_APP_ID/INSTALLATION_ID are unset (post org suspension, or Gitea-canonical deployments without a GitHub App), the fallback path in GetInstallationToken returned 500 Internal Server Error. This polluted platform logs with roughly 28 false-positive 500s per hour across all workspaces. Return 501 Not Implemented with {"error":"GitHub integration not configured","scm":"gitea"} when generateAppInstallationToken fails with its "required" (missing-config) error — callers can now distinguish "feature off" from "transient error" and stop polling. Update TestGitHubToken_NoTokenProvider (renamed to TestGitHubToken_NoTokenProvider_MissingConfigReturns501) to assert the new 501 shape. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
fix(docker-compose): remove duplicate service definitions across include:
2026-05-11 05:13:27 +00:00 · 2026-05-11 05:11:37 +00:00 · 2026-05-11 05:11:37 +00:00 · 2026-05-11 05:11:37 +00:00 · 2026-05-11 05:11:37 +00:00 · 2026-05-11 05:11:37 +00:00
11 changed files with 151 additions and 101 deletions
@@ -32,11 +32,9 @@ on:
      - '.gitea/workflows/publish-workspace-server-image.yml'
  workflow_dispatch:

-# Serialize per-branch so two rapid staging pushes don't race the same
-# :staging-latest tag retag. Allow staging and main to run in parallel
-# (different GITHUB_REF → different concurrency group) since they
-# produce different :staging-<sha> tags and last-write-wins on
-# :staging-latest is acceptable across branches.
+# Serialize per-branch so two rapid main pushes don't race the same
+# :staging-latest retag. Runs for different refs may overlap: they produce
+# different :staging-<sha> tags, and last-write-wins on :staging-latest is OK.
 #
 # cancel-in-progress: false → in-flight builds finish; the next push's
 # build queues. This avoids a partially-pushed image.
@@ -77,6 +77,13 @@ jobs:
          # works if we never check out PR HEAD. Same SHA the workflow
          # itself was loaded from.
          ref: ${{ github.event.pull_request.base.sha }}
+      - name: Install jq
+        # Gitea Actions runners (ubuntu-latest label) do not bundle jq.
+        # The script uses jq extensively for all JSON parsing; install it
+        # before the script runs. Using -qq for quiet output — diagnostic
+        # info is already captured via SOP_DEBUG=1 on failure.
+        run: apt-get update -qq && apt-get install -y -qq jq
+
      - name: Verify tier label + reviewer team membership
        env:
          # SOP_TIER_CHECK_TOKEN is the org-level secret for the
@@ -0,0 +1 @@
+staging trigger
@@ -11,6 +11,9 @@ services:
      - "5432:5432"
    volumes:
      - pgdata:/var/lib/postgresql/data
+    networks:
+      - molecule-core-net
+    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-dev}"]
      interval: 2s
@@ -25,6 +28,8 @@ services:
    environment:
      POSTGRES_USER: ${POSTGRES_USER:-dev}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-dev}
+    networks:
+      - molecule-core-net
    command:
      - /bin/sh
      - -c
@@ -45,6 +50,9 @@ services:
      - "6379:6379"
    volumes:
      - redisdata:/data
+    networks:
+      - molecule-core-net
+    restart: unless-stopped
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 2s
@@ -52,7 +60,9 @@ services:
      retries: 10

  # digest-pinned 2026-05-10 (sha256:5b296e0ba1da74efea3143c773ddd60245f249fb7c72eb1d866c2d6ebc759fbe, linux/amd64)
-  clickhouse:
+  # Named langfuse-clickhouse (not clickhouse) to match the service name used in
+  # docker-compose.yml's depends_on block for the main langfuse service.
+  langfuse-clickhouse:
    image: clickhouse/clickhouse-server@sha256:5b296e0ba1da74efea3143c773ddd60245f249fb7c72eb1d866c2d6ebc759fbe
    environment:
      CLICKHOUSE_DB: langfuse
@@ -60,6 +70,8 @@ services:
      CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD:-langfuse-dev}
    volumes:
      - clickhousedata:/var/lib/clickhouse
+    networks:
+      - molecule-core-net
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://127.0.0.1:8123/ping || exit 1"]
      interval: 5s
@@ -104,7 +116,7 @@ services:
  langfuse-web:
    image: langfuse/langfuse@sha256:e7aafd3ccf721821b40f8b2251220b4bb8af5e4877b5c5a8846af5b3318aaf1d
    depends_on:
-      clickhouse:
+      langfuse-clickhouse:
        condition: service_healthy
      langfuse-db-init:
        condition: service_completed_successfully
@@ -113,8 +125,8 @@ services:
      # Langfuse v2 expects the HTTP interface (port 8123). The previous
      # clickhouse://...:9000 native-protocol URL is rejected with
      # "ClickHouse URL protocol must be either http or https".
-      CLICKHOUSE_URL: http://clickhouse:8123
-      CLICKHOUSE_MIGRATION_URL: clickhouse://clickhouse:9000
+      CLICKHOUSE_URL: http://langfuse-clickhouse:8123
+      CLICKHOUSE_MIGRATION_URL: clickhouse://langfuse-clickhouse:9000
      CLICKHOUSE_USER: langfuse
      CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD:-langfuse-dev}
      NEXTAUTH_SECRET: ${LANGFUSE_SECRET:-changeme-langfuse-secret}
@@ -3,85 +3,7 @@ include:
  - docker-compose.infra.yml

 services:
-  # --- Infrastructure ---
-  # digest-pinned 2026-05-10 (sha256:4941ef97aaa2633ce9808f7766f8b8d746dd039ce8c51ca6da185c3dc63ab579, linux/amd64)
-  postgres:
-    image: postgres@sha256:4941ef97aaa2633ce9808f7766f8b8d746dd039ce8c51ca6da185c3dc63ab579
-    environment:
-      POSTGRES_USER: ${POSTGRES_USER:-dev}
-      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-dev}
-      POSTGRES_DB: ${POSTGRES_DB:-molecule}
-    command: ["postgres", "-c", "wal_level=logical"]
-    ports:
-      - "5432:5432"
-    volumes:
-      - pgdata:/var/lib/postgresql/data
-    networks:
-      - molecule-core-net
-    restart: unless-stopped
-    healthcheck:
-      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-dev}"]
-      interval: 2s
-      timeout: 5s
-      retries: 10
-
-  langfuse-db-init:
-    image: postgres@sha256:4941ef97aaa2633ce9808f7766f8b8d746dd039ce8c51ca6da185c3dc63ab579
-    depends_on:
-      postgres:
-        condition: service_healthy
-    environment:
-      POSTGRES_USER: ${POSTGRES_USER:-dev}
-      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-dev}
-    command:
-      - /bin/sh
-      - -c
-      - |
-        export PGPASSWORD="$${POSTGRES_PASSWORD}"
-        until pg_isready -h postgres -U "$${POSTGRES_USER}" -d postgres >/dev/null 2>&1; do
-          sleep 1
-        done
-        if ! psql -h postgres -U "$${POSTGRES_USER}" -d postgres -tAc "SELECT 1 FROM pg_database WHERE datname = 'langfuse'" | grep -q 1; then
-          psql -h postgres -U "$${POSTGRES_USER}" -d postgres -c "CREATE DATABASE langfuse"
-        fi
-    networks:
-      - molecule-core-net
-
-  # digest-pinned 2026-05-10 (sha256:b1addbe72465a718643cff9e60a58e6df1841e29d6d7d60c9a85d8d72f08d1a7, linux/amd64)
-  redis:
-    image: redis@sha256:b1addbe72465a718643cff9e60a58e6df1841e29d6d7d60c9a85d8d72f08d1a7
-    command: ["redis-server", "--notify-keyspace-events", "KEA"]
-    ports:
-      - "6379:6379"
-    volumes:
-      - redisdata:/data
-    networks:
-      - molecule-core-net
-    restart: unless-stopped
-    healthcheck:
-      test: ["CMD", "redis-cli", "ping"]
-      interval: 2s
-      timeout: 5s
-      retries: 10
-
  # --- Observability ---
-  # digest-pinned 2026-05-10 (sha256:5b296e0ba1da74efea3143c773ddd60245f249fb7c72eb1d866c2d6ebc759fbe, linux/amd64)
-  langfuse-clickhouse:
-    image: clickhouse/clickhouse-server@sha256:5b296e0ba1da74efea3143c773ddd60245f249fb7c72eb1d866c2d6ebc759fbe
-    environment:
-      CLICKHOUSE_DB: langfuse
-      CLICKHOUSE_USER: langfuse
-      CLICKHOUSE_PASSWORD: langfuse
-    volumes:
-      - clickhousedata:/var/lib/clickhouse
-    networks:
-      - molecule-core-net
-    healthcheck:
-      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://127.0.0.1:8123/ping || exit 1"]
-      interval: 5s
-      timeout: 5s
-      retries: 10
-
  # digest-pinned 2026-05-10 (sha256:e7aafd3ccf721821b40f8b2251220b4bb8af5e4877b5c5a8846af5b3318aaf1d, linux/amd64)
  langfuse:
    image: langfuse/langfuse@sha256:e7aafd3ccf721821b40f8b2251220b4bb8af5e4877b5c5a8846af5b3318aaf1d
@@ -295,7 +217,7 @@ services:
      - "4000:4000"
    volumes:
      - ./infra/litellm_config.yml:/app/config.yaml:ro
-    command: ["--config", "/app/config.yaml", "--port", "4000", "--num_workers", "4"]
+    command: ["--config", "/app/config.yaml", "--port", "4000", "--num_workers", "4"]  # argv entries must be strings
    environment:
      # Pass provider API keys through — only the ones you have are needed
      ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-}
@@ -44,3 +44,4 @@
    {"name": "mock-bigorg", "repo": "molecule-ai/molecule-ai-org-template-mock-bigorg", "ref": "main"}
  ]
 }
+// Triggered by Integration Tester at 2026-05-10T08:52Z
@@ -49,6 +49,7 @@ import (
 	"net/http"
 	"os"
 	"strconv"
+	"strings"
 	"time"

 	"github.com/Molecule-AI/molecule-monorepo/platform/pkg/provisionhook"
@@ -98,7 +99,19 @@ func (h *GitHubTokenHandler) GetInstallationToken(c *gin.Context) {
 		token, expiresAt, err := generateAppInstallationToken()
 		if err != nil {
 			log.Printf("[github] fallback token generation failed: %v", err)
-			c.JSON(http.StatusInternalServerError, gin.H{"error": "token refresh failed"})
+			// #388: when GITHUB_APP_ID/INSTALLATION_ID are unset (e.g. post
+			// org suspension or Gitea-canonical deployments), this is a
+			// configuration gap, not an internal server error. Return 501 so
+			// callers (workspace polling loop) can distinguish "feature off"
+			// from "transient error" and stop polling.
+			if strings.Contains(err.Error(), "required") {
+				c.JSON(http.StatusNotImplemented, gin.H{
+					"error": "GitHub integration not configured",
+					"scm":   "gitea",
+				})
+			} else {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": "token refresh failed"})
+			}
 			return
 		}
 		c.JSON(http.StatusOK, gin.H{"token": token, "expires_at": expiresAt})
@@ -76,14 +76,16 @@ func TestGitHubToken_NilRegistry(t *testing.T) {
 // implement TokenProvider (e.g. a non-GitHub mutator in the chain).
 //
 // Post-#960/#1101 the handler now falls back to direct env-based App
-// token generation (GITHUB_APP_ID / INSTALLATION_ID / PRIVATE_KEY_FILE)
-// when no registered provider matches. In the test environment those
-// env vars are unset, so the fallback fails with 500 "token refresh
-// failed" — a clean retryable signal for the workspace credential
-// helper. Previously this path returned 404; the new 500 matches the
-// ProviderError shape so callers don't have to branch on "missing
-// provider" vs "provider failed".
-func TestGitHubToken_NoTokenProvider(t *testing.T) {
+// token generation (GITHUB_APP_ID / INSTALLATION_ID / PRIVATE_KEY_FILE).
+//
+// When GITHUB_APP_ID or INSTALLATION_ID is unset (e.g. post org suspension
+// or Gitea-canonical deployments without GitHub App), generateAppInstallationToken
+// returns an error with "required" in the message. The handler now returns
+// 501 Not Implemented with {"error":"GitHub integration not configured","scm":"gitea"}
+// so callers can distinguish "feature off" from "transient error" and stop
+// polling (#388). Other errors (e.g. a failure reading the private key file)
+// still return 500.
+func TestGitHubToken_NoTokenProvider_MissingConfigReturns501(t *testing.T) {
 	reg := provisionhook.NewRegistry()
 	reg.Register(&mockMutatorOnly{name: "other-plugin"})
 	h := NewGitHubTokenHandler(reg)
@@ -91,12 +93,20 @@ func TestGitHubToken_NoTokenProvider(t *testing.T) {

 	h.GetInstallationToken(c)

-	if w.Code != http.StatusInternalServerError {
-		t.Fatalf("expected 500 (env-based fallback fails with unset GITHUB_APP_* vars), got %d: %s",
+	// GITHUB_APP_ID/INSTALLATION_ID are unset in test env → "required" error → 501
+	if w.Code != http.StatusNotImplemented {
+		t.Fatalf("expected 501 for missing GITHUB_APP_ID/INSTALLATION_ID, got %d: %s",
 			w.Code, w.Body.String())
 	}
-	if !strings.Contains(w.Body.String(), "token refresh failed") {
-		t.Errorf("expected body to contain 'token refresh failed', got: %s", w.Body.String())
+	var body map[string]string
+	if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
+		t.Fatalf("response is not valid JSON: %v", err)
+	}
+	if body["error"] == "" {
+		t.Error("expected non-empty error field in 501 response")
+	}
+	if body["scm"] != "gitea" {
+		t.Errorf("expected scm=gitea, got %q", body["scm"])
 	}
 }

@@ -77,6 +77,16 @@ async def delegate_task(workspace_id: str, task: str) -> str:
                return str(result) if isinstance(result, str) else "(no text)"
            elif "error" in data:
                err = data["error"]
+                # Handle both string-form errors ("error": "some string")
+                # and object-form errors ("error": {"message": "...", "code": ...}).
+                msg = ""
+                if isinstance(err, dict):
+                    msg = err.get("message", "")
+                elif isinstance(err, str):
+                    msg = err
+                else:
+                    msg = str(err)
+                return f"Error: {msg}"
                msg = ""
                if isinstance(err, dict):
                    msg = err.get("message", "")
@@ -51,6 +51,22 @@ class AdaptorSource:

 def _load_module_from_path(module_name: str, path: Path):
    """Import a Python file by absolute path. Returns the module or None on failure."""
+    # Ensure the plugins_registry package and its submodules are importable in the
+    # fresh module namespace created by module_from_spec().  Plugin adapters
+    # (molecule-skill-*/adapters/*.py) use "from plugins_registry.builtins import ..."
+    # which requires plugins_registry and its submodules to already be in sys.modules.
+    # We import and register them before exec_module so the plugin's own
+    # from ... import statements resolve correctly.
+    import sys
+    import plugins_registry
+    sys.modules.setdefault("plugins_registry", plugins_registry)
+    for _sub in ("builtins", "protocol", "raw_drop"):
+        try:
+            sub = importlib.import_module(f"plugins_registry.{_sub}")
+            sys.modules.setdefault(f"plugins_registry.{_sub}", sub)
+        except Exception:
+            # Submodule may not exist in all versions; skip if absent.
+            pass
    spec = importlib.util.spec_from_file_location(module_name, path)
    if spec is None or spec.loader is None:
        return None
@@ -0,0 +1,60 @@
+"""Tests for _load_module_from_path sys.modules injection fix (issue #296).
+
+Verifies that plugin adapters using "from plugins_registry.builtins import ..."
+can be loaded via _load_module_from_path() without ModuleNotFoundError.
+"""
+import sys
+import tempfile
+import os
+from pathlib import Path
+
+# Ensure the plugins_registry package is importable
+import plugins_registry
+
+from plugins_registry import _load_module_from_path
+
+
+def test_load_adapter_with_plugins_registry_import():
+    """Plugin adapter using 'from plugins_registry.builtins import ...' loads cleanly."""
+    # Write a temp adapter file that does the exact import from the bug report.
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".py", delete=False, dir=tempfile.gettempdir()
+    ) as f:
+        f.write("from plugins_registry.builtins import AgentskillsAdaptor as Adaptor\n")
+        f.write("assert Adaptor is not None\n")
+        adapter_path = Path(f.name)
+
+    try:
+        module = _load_module_from_path("test_adapter", adapter_path)
+        assert module is not None, "module should load without error"
+        assert hasattr(module, "Adaptor"), "module should expose Adaptor"
+    finally:
+        os.unlink(adapter_path)
+
+
+def test_load_adapter_with_full_plugins_registry_import():
+    """Plugin adapter using 'from plugins_registry import ...' loads cleanly."""
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".py", delete=False, dir=tempfile.gettempdir()
+    ) as f:
+        f.write("from plugins_registry import InstallContext, resolve\n")
+        f.write("from plugins_registry.protocol import PluginAdaptor\n")
+        f.write("assert InstallContext is not None\n")
+        f.write("assert resolve is not None\n")
+        f.write("assert PluginAdaptor is not None\n")
+        adapter_path = Path(f.name)
+
+    try:
+        module = _load_module_from_path("test_adapter_full", adapter_path)
+        assert module is not None, "module should load without error"
+        assert hasattr(module, "InstallContext"), "module should expose InstallContext"
+        assert hasattr(module, "resolve"), "module should expose resolve"
+        assert hasattr(module, "PluginAdaptor"), "module should expose PluginAdaptor"
+    finally:
+        os.unlink(adapter_path)
+
+
+if __name__ == "__main__":
+    test_load_adapter_with_plugins_registry_import()
+    test_load_adapter_with_full_plugins_registry_import()
+    print("ALL TESTS PASS")