From 2e8892ebc414673d9219c2f60934d36b07155237 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Fri, 1 May 2026 11:47:53 -0700 Subject: [PATCH] fix(workspace): surface errno + path on chat-upload mkdir failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Production incident on hongming.moleculesai.app 2026-05-01T18:30Z — fresh-tenant signup chat upload returned 500 with the body {"error":"failed to prepare uploads dir"}. Diagnosis required SSM access to the workspace stderr to recover errno + actual path. The root-cause fix lives in claude-code template entrypoint (molecule-ai-workspace-template-claude-code#23 — pre-create the .molecule subtree as root before gosu drops to agent). This change is the diagnostic improvement: when mkdir fails for any reason in the future (EACCES, ENOSPC, EROFS, etc.), the response carries the errno + offending path so the operator inspecting browser devtools sees the real cause without needing SSM. Backwards compatible — top-level "error" key is unchanged so existing canvas / external alert rules continue to match. New fields are additive: path, errno, detail. Test pins the diagnostic shape so a future struct refactor can't silently drop these fields. Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace/internal_chat_uploads.py | 19 ++++++++- workspace/tests/test_internal_chat_uploads.py | 42 +++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/workspace/internal_chat_uploads.py b/workspace/internal_chat_uploads.py index 65a389de..396c1ac8 100644 --- a/workspace/internal_chat_uploads.py +++ b/workspace/internal_chat_uploads.py @@ -170,8 +170,25 @@ async def ingest_handler(request: Request) -> JSONResponse: try: Path(CHAT_UPLOAD_DIR).mkdir(parents=True, exist_ok=True) except OSError as exc: + # Surface errno + path in the response so a fresh-tenant + # "failed to prepare uploads dir" 500 self-diagnoses without + # requiring SSM access to the workspace stderr. Prior incident + # 2026-05-01: hongming.moleculesai.app hit EACCES on the + # /workspace volume's `.molecule` subtree (root-owned race + # window between Docker volume create and entrypoint's chown, + # fixed via molecule-ai-workspace-template-claude-code#23). + # The errno + path are not security-sensitive — both are + # well-known to anyone with workspace access. logger.error("internal_chat_uploads: mkdir %s failed: %s", CHAT_UPLOAD_DIR, exc) - return JSONResponse({"error": "failed to prepare uploads dir"}, status_code=500) + return JSONResponse( + { + "error": "failed to prepare uploads dir", + "path": CHAT_UPLOAD_DIR, + "errno": exc.errno, + "detail": str(exc), + }, + status_code=500, + ) response_files: list[dict] = [] total_bytes = 0 diff --git a/workspace/tests/test_internal_chat_uploads.py b/workspace/tests/test_internal_chat_uploads.py index c3de859c..d386de65 100644 --- a/workspace/tests/test_internal_chat_uploads.py +++ b/workspace/tests/test_internal_chat_uploads.py @@ -222,6 +222,48 @@ def test_per_file_oversize_returns_413(client: TestClient, monkeypatch: pytest.M assert "exceeds per-file limit" in r.json()["error"] +# Pins the diagnostic shape of the 500 returned when the upload +# directory cannot be created. Prior to this fix, the response was +# {"error": "failed to prepare uploads dir"} only — opaque to the +# operator inspecting browser devtools, requiring SSM access to the +# workspace stderr to recover errno + actual path. Surfacing both in +# the response body makes the failure self-diagnosing the next time +# this class of bug recurs (e.g. EACCES on a root-owned `.molecule` +# subtree, ENOSPC on a full disk, EROFS on a read-only mount). +# +# Reproduces the failure by pointing CHAT_UPLOAD_DIR at a path whose +# parent the agent user can't write to. The exact errno in the test +# is 13 (EACCES) on a chmod-0 dir; values are not asserted exactly +# because they vary by OS / errno mapping. The PRESENCE of errno + +# path is what's pinned — drift on those keys breaks the operator +# diagnostic loop. +def test_mkdir_failure_returns_errno_and_path(client: TestClient, chat_uploads_dir: Path, monkeypatch: pytest.MonkeyPatch): + # Plant a regular FILE where mkdir's parent should be — mkdir + # raises FileExistsError / NotADirectoryError reliably across + # platforms, exercising the OSError catch path. + blocker = chat_uploads_dir.parent / "chat-uploads-blocker" + blocker.write_text("not a dir") + # Repoint CHAT_UPLOAD_DIR to a child path under the regular file + # so mkdir(parents=True, exist_ok=True) raises NotADirectoryError. + monkeypatch.setattr(internal_chat_uploads, "CHAT_UPLOAD_DIR", str(blocker / "child")) + + r = client.post( + "/internal/chat/uploads/ingest", + files={"files": ("a.txt", b"x")}, + headers={"Authorization": "Bearer test-secret"}, + ) + assert r.status_code == 500, r.text + body = r.json() + # Backwards-compatible top-level error keeps existing canvas / + # external alert rules matching. + assert body.get("error") == "failed to prepare uploads dir" + # New diagnostic fields — operator can now see WHAT path failed + # and WHY without SSM access. + assert body.get("path") == str(blocker / "child") + assert isinstance(body.get("errno"), int) and body["errno"] != 0 + assert "detail" in body and isinstance(body["detail"], str) and body["detail"] + + def test_total_request_body_oversize_returns_413(client: TestClient, monkeypatch: pytest.MonkeyPatch): """Header-side total cap. Set the limit BELOW the actual body and confirm we reject before parsing multipart."""