molecule-core/workspace/tests/test_inbox_uploads.py
Hongming Wang 30fb507165 feat(poll-upload): phase 5b — concurrent BatchFetcher + httpx client reuse
Resolves the two remaining findings from the Phase 1-4 retrospective
review (the Python-side counterparts to phase 5a):

1. Important — inbox_uploads.fetch_and_stage blocked the inbox poll
   loop synchronously per row. A user dragging 4 files into chat at
   once would stall the poller for 4× per-fetch latency before the
   chat message reached the agent. Add BatchFetcher: a thread-pool
   wrapper (default 4 workers) that submits fetches concurrently and
   exposes wait_all() as the barrier the inbox loop calls before
   processing the chat-message row that references the uploads.

   The drain barrier is the correctness invariant: rewrite_request_body
   must observe a populated URI cache when it walks the chat-message
   row's parts. _poll_once now drains the BatchFetcher inline before
   the first non-upload row, AND at end-of-batch (case: batch contains
   only upload rows; the corresponding chat message arrives in a later
   poll, but the future-poll-races-current-fetch race is closed).

2. Nit — fetch_and_stage created two httpx.Client instances per row
   (one for GET /content, one for POST /ack). Refactor so a single
   client serves both calls. When called from BatchFetcher, the
   batch-shared client serves every row's GET + ack — so the second
   fetch reuses the TCP+TLS handshake from the first.

Comprehensive tests:

- 13 new inbox_uploads tests:
  - fetch_and_stage with supplied client: zero httpx.Client
    constructions, GET+POST through the same client, caller's client
    not closed (lifecycle owned by caller).
  - fetch_and_stage without supplied client: exactly one
    httpx.Client constructed (was 2 pre-fix), closed on the way out.
  - BatchFetcher: 3 rows × 120ms = parallel completion < 250ms
    (vs. ~360ms serial), URI cache hot when wait_all returns,
    per-row failure isolation, single-client reuse across all
    submits, idempotent close, submit-after-close raises,
    owned-vs-supplied client lifecycle, no-op wait_all on empty
    batch, graceful httpx-missing degradation.

- 3 new inbox tests:
  - poll_once drains uploads before processing the chat-message row
    (in-place mutation of row['request_body'] proves the URI was
    rewritten BEFORE message_from_activity returned).
  - poll_once with only upload rows still drains at end-of-batch.
  - poll_once with no upload rows never constructs a BatchFetcher
    (zero overhead on the no-upload happy path).

133 total inbox + inbox_uploads tests pass; 0 regressions.

Closes the chat-upload poll-mode-perf gap end-to-end.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 11:26:55 -07:00

1037 lines
37 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Tests for workspace/inbox_uploads.py — poll-mode chat-upload fetcher.
Covers the full activity-row → fetch → stage-on-disk → ack flow plus
the URI cache and the rewrite that swaps platform-pending: URIs to
local workspace: URIs in subsequent chat messages.
"""
from __future__ import annotations
import os
from typing import Any
from unittest.mock import MagicMock, patch
import pytest
import inbox_uploads
@pytest.fixture(autouse=True)
def _reset_cache_and_dir(tmp_path, monkeypatch):
    """Each test starts with an empty URI cache and a temp upload dir
    so on-disk artifacts from one test don't leak into the next."""
    inbox_uploads.get_cache().clear()
    # Redirect staging to a per-test directory; monkeypatch restores
    # the original CHAT_UPLOAD_DIR on teardown.
    monkeypatch.setattr(inbox_uploads, "CHAT_UPLOAD_DIR", str(tmp_path / "chat-uploads"))
    yield
    # Clear again on teardown so entries cached during the test don't
    # leak into the next one either.
    inbox_uploads.get_cache().clear()
# ---------------------------------------------------------------------------
# sanitize_filename — parity with internal_chat_uploads + Go SanitizeFilename
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
    "raw,want",
    [
        ("../../etc/passwd", "passwd"),
        ("/etc/passwd", "passwd"),
        ("hello world.pdf", "hello_world.pdf"),
        ("weird;chars!?.txt", "weird_chars__.txt"),
        ("中文.docx", "__.docx"),
        ("file (1).pdf", "file__1_.pdf"),
        ("report-2026.05.04_v2.pdf", "report-2026.05.04_v2.pdf"),
        ("", "file"),
        (".", "file"),
        ("..", "file"),
    ],
)
def test_sanitize_filename_parity_with_python_internal(raw, want):
    # Table of (input, expected) pairs — keeping the two sanitizers
    # (this one and the internal/Go counterparts named above) from
    # drifting apart silently.
    assert inbox_uploads.sanitize_filename(raw) == want


def test_sanitize_filename_caps_at_100_preserves_short_extension():
    # 200 chars + ".pdf": result is truncated to exactly 100 chars while
    # the short extension survives.
    long = "a" * 200 + ".pdf"
    got = inbox_uploads.sanitize_filename(long)
    assert len(got) == 100
    assert got.endswith(".pdf")


def test_sanitize_filename_drops_long_extension():
    # An overlong extension is not preserved — the 100-char cap wins.
    long = "c" * 90 + ".thisisaverylongextensionnotpreserved"
    got = inbox_uploads.sanitize_filename(long)
    assert len(got) == 100
    assert ".thisisaverylongextensionnotpreserved" not in got
# ---------------------------------------------------------------------------
# _URICache — LRU semantics
# ---------------------------------------------------------------------------
def test_uricache_set_get_roundtrip():
    cache = inbox_uploads._URICache(max_entries=10)
    cache.set("platform-pending:ws/1", "workspace:/local/1")
    assert cache.get("platform-pending:ws/1") == "workspace:/local/1"


def test_uricache_get_missing_returns_none():
    cache = inbox_uploads._URICache(max_entries=10)
    assert cache.get("platform-pending:ws/missing") is None


def test_uricache_evicts_oldest_at_capacity():
    cache = inbox_uploads._URICache(max_entries=2)
    for key, value in (("a", "A"), ("b", "B"), ("c", "C")):
        cache.set(key, value)  # the third insert pushes out "a"
    assert len(cache) == 2
    assert cache.get("a") is None
    assert cache.get("b") == "B"
    assert cache.get("c") == "C"


def test_uricache_get_promotes_recently_used():
    cache = inbox_uploads._URICache(max_entries=2)
    cache.set("a", "A")
    cache.set("b", "B")
    # A read counts as a use: after touching "a", inserting "c" must
    # evict "b" rather than "a".
    assert cache.get("a") == "A"
    cache.set("c", "C")
    assert cache.get("b") is None
    assert cache.get("a") == "A"
    assert cache.get("c") == "C"


def test_uricache_overwrite_updates_value():
    cache = inbox_uploads._URICache(max_entries=10)
    cache.set("k", "v1")
    cache.set("k", "v2")
    # Re-setting an existing key replaces the value without growing.
    assert len(cache) == 1
    assert cache.get("k") == "v2"


def test_uricache_clear():
    cache = inbox_uploads._URICache(max_entries=10)
    cache.set("a", "A")
    cache.set("b", "B")
    cache.clear()
    assert len(cache) == 0
    assert cache.get("a") is None


def test_resolve_pending_uri_uses_module_cache():
    # resolve_pending_uri reads the shared module-level cache.
    inbox_uploads.get_cache().set("platform-pending:ws/x", "workspace:/local/x")
    assert inbox_uploads.resolve_pending_uri("platform-pending:ws/x") == "workspace:/local/x"
    assert inbox_uploads.resolve_pending_uri("platform-pending:ws/missing") is None
# ---------------------------------------------------------------------------
# stage_to_disk
# ---------------------------------------------------------------------------
def test_stage_to_disk_writes_file_and_returns_workspace_uri(tmp_path):
    ws_uri = inbox_uploads.stage_to_disk(b"hello", "report.pdf")
    assert ws_uri.startswith("workspace:")
    staged = ws_uri[len("workspace:"):]
    assert os.path.isfile(staged)
    with open(staged, "rb") as fh:
        assert fh.read() == b"hello"
    assert staged.endswith("-report.pdf")
    # On-disk layout: 32 hex chars, "-", then the sanitized name.
    leaf = os.path.basename(staged)
    hex_prefix = leaf.partition("-")[0]
    assert len(hex_prefix) == 32


def test_stage_to_disk_sanitizes_filename():
    ws_uri = inbox_uploads.stage_to_disk(b"x", "../../evil.txt")
    leaf = os.path.basename(ws_uri)
    assert "/" not in leaf
    assert leaf.endswith("-evil.txt")


def test_stage_to_disk_rejects_oversize():
    oversized = b"x" * (inbox_uploads.MAX_FILE_BYTES + 1)
    with pytest.raises(ValueError):
        inbox_uploads.stage_to_disk(oversized, "big.bin")


def test_stage_to_disk_creates_directory_if_missing():
    # The autouse fixture points CHAT_UPLOAD_DIR at a path that does not
    # exist yet; the first write must mkdir -p it.
    assert not os.path.exists(inbox_uploads.CHAT_UPLOAD_DIR)
    inbox_uploads.stage_to_disk(b"x", "a.txt")
    assert os.path.isdir(inbox_uploads.CHAT_UPLOAD_DIR)
def test_stage_to_disk_write_failure_cleans_partial_file(tmp_path, monkeypatch):
    # open() succeeds but write() fails — the partial file must be
    # removed so a retry can claim a fresh prefix without colliding.
    real_fdopen = os.fdopen
    written_paths: list[str] = []

    def boom_fdopen(fd, mode):
        # Wrap the real file with one whose write() raises.
        f = real_fdopen(fd, mode)
        # Track which path's fd we opened by inspecting the chat-upload dir.
        # (By the time fdopen runs, the destination file already exists there.)
        for entry in os.listdir(inbox_uploads.CHAT_UPLOAD_DIR):
            written_paths.append(os.path.join(inbox_uploads.CHAT_UPLOAD_DIR, entry))
        original_write = f.write

        def bad_write(b):
            original_write(b"")  # ensure file exists
            raise OSError(28, "no space")

        f.write = bad_write
        return f

    monkeypatch.setattr(os, "fdopen", boom_fdopen)
    with pytest.raises(OSError):
        inbox_uploads.stage_to_disk(b"data", "x.txt")
    # All staged files cleaned up.
    for p in written_paths:
        assert not os.path.exists(p)


def test_stage_to_disk_write_failure_unlink_failure_swallowed(monkeypatch):
    # open() succeeds, write() fails, unlink() ALSO fails — the unlink
    # error is swallowed and the original write error propagates.
    real_fdopen = os.fdopen

    def boom_fdopen(fd, mode):
        f = real_fdopen(fd, mode)

        def bad_write(_):
            raise OSError(28, "no space")

        f.write = bad_write
        return f

    def bad_unlink(_):
        raise OSError(13, "permission denied")

    monkeypatch.setattr(os, "fdopen", boom_fdopen)
    monkeypatch.setattr(os, "unlink", bad_unlink)
    with pytest.raises(OSError) as ei:
        inbox_uploads.stage_to_disk(b"data", "x.txt")
    # Original write error, not the unlink error.
    assert ei.value.errno == 28


def test_stage_to_disk_propagates_oserror_and_cleans_partial(tmp_path, monkeypatch):
    # Make the dir read-only AFTER mkdir succeeds, so open() fails. Skip
    # this on platforms where the dir's permissions don't restrict the
    # process owner (root in Docker, etc.).
    inbox_uploads.stage_to_disk(b"first", "a.txt")
    if os.geteuid() == 0:
        pytest.skip("root bypasses permission bits")
    os.chmod(inbox_uploads.CHAT_UPLOAD_DIR, 0o500)
    try:
        with pytest.raises(OSError):
            inbox_uploads.stage_to_disk(b"second", "b.txt")
    finally:
        # Restore write permission so pytest can clean up tmp_path.
        os.chmod(inbox_uploads.CHAT_UPLOAD_DIR, 0o755)
# ---------------------------------------------------------------------------
# is_chat_upload_row + _request_body_dict
# ---------------------------------------------------------------------------
def test_is_chat_upload_row_true_on_method_match():
    assert inbox_uploads.is_chat_upload_row({"method": "chat_upload_receive"})


def test_is_chat_upload_row_false_on_other_methods():
    for row in ({"method": "message/send"}, {"method": None}, {}):
        assert not inbox_uploads.is_chat_upload_row(row)


def test_request_body_dict_passthrough():
    # An already-dict body comes back as the same object, not a copy.
    body = {"file_id": "x"}
    assert inbox_uploads._request_body_dict({"request_body": body}) is body


def test_request_body_dict_string_decoded():
    assert inbox_uploads._request_body_dict({"request_body": '{"a": 1}'}) == {"a": 1}


def test_request_body_dict_invalid_string_returns_none():
    assert inbox_uploads._request_body_dict({"request_body": "not json"}) is None


def test_request_body_dict_non_dict_after_decode_returns_none():
    # Valid JSON that is not an object is rejected.
    assert inbox_uploads._request_body_dict({"request_body": "[1, 2]"}) is None


def test_request_body_dict_other_type_returns_none():
    assert inbox_uploads._request_body_dict({"request_body": 123}) is None
# ---------------------------------------------------------------------------
# fetch_and_stage — the full GET / write / ack flow
# ---------------------------------------------------------------------------
def _make_resp(status_code: int, content: bytes = b"", content_type: str = "", text: str = "") -> MagicMock:
resp = MagicMock()
resp.status_code = status_code
resp.content = content
headers: dict[str, str] = {}
if content_type:
headers["content-type"] = content_type
resp.headers = headers
resp.text = text
return resp
def _patch_httpx_for_fetch(get_resp: MagicMock, ack_resp: MagicMock | None = None):
    """Return (patcher, client): a patch() over httpx.Client whose
    constructed instance works as a context manager, answers .get()
    with get_resp and .post() with ack_resp (default: a 200 response).
    """
    fake = MagicMock()
    fake.__enter__ = MagicMock(return_value=fake)
    fake.__exit__ = MagicMock(return_value=False)
    fake.get = MagicMock(return_value=get_resp)
    fake.post = MagicMock(return_value=ack_resp or _make_resp(200))
    return patch("httpx.Client", return_value=fake), fake
def _row(file_id: str = "file-1", uri: str | None = None, name: str = "report.pdf", body_extra: dict | None = None) -> dict:
body: dict[str, Any] = {
"file_id": file_id,
"name": name,
"mimeType": "application/pdf",
"size": 9,
}
if uri is not None:
body["uri"] = uri
if body_extra:
body.update(body_extra)
return {
"id": "act-100",
"source_id": None,
"method": "chat_upload_receive",
"summary": "chat_upload_receive: report.pdf",
"request_body": body,
"created_at": "2026-05-04T10:00:00Z",
}
def test_fetch_and_stage_happy_path_writes_file_acks_and_caches():
    # End-to-end happy path: GET content → stage on disk → cache the
    # pending→local URI mapping → POST ack.
    pending_uri = "platform-pending:ws-1/file-1"
    row = _row(uri=pending_uri)
    get_resp = _make_resp(200, content=b"PDF-bytes", content_type="application/pdf")
    p, client = _patch_httpx_for_fetch(get_resp)
    with p:
        local_uri = inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={"Authorization": "Bearer t"}
        )
    assert local_uri is not None
    assert local_uri.startswith("workspace:")
    # On-disk file content matches.
    path = local_uri[len("workspace:"):]
    with open(path, "rb") as f:
        assert f.read() == b"PDF-bytes"
    # Cache populated.
    assert inbox_uploads.get_cache().get(pending_uri) == local_uri
    # Ack POSTed to the right URL.
    client.post.assert_called_once()
    args, kwargs = client.post.call_args
    assert "/pending-uploads/file-1/ack" in args[0]
    assert kwargs["headers"]["Authorization"] == "Bearer t"
def test_fetch_and_stage_reconstructs_uri_when_missing_in_body():
    row = _row(uri=None)  # request_body has no 'uri'
    patcher, _ = _patch_httpx_for_fetch(_make_resp(200, content=b"x", content_type="text/plain"))
    with patcher:
        inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={}
        )
    # The cache key is rebuilt from workspace_id + file_id.
    assert inbox_uploads.get_cache().get("platform-pending:ws-1/file-1") is not None


def test_fetch_and_stage_returns_none_on_missing_request_body():
    row = {"id": "act-100", "method": "chat_upload_receive"}
    # No HTTP traffic should happen at all; patch defensively anyway.
    patcher, client = _patch_httpx_for_fetch(_make_resp(200))
    with patcher:
        got = inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={}
        )
    assert got is None
    client.get.assert_not_called()


def test_fetch_and_stage_returns_none_on_missing_file_id():
    row = {"id": "act-100", "method": "chat_upload_receive", "request_body": {"name": "x.pdf"}}
    patcher, client = _patch_httpx_for_fetch(_make_resp(200))
    with patcher:
        got = inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={}
        )
    assert got is None
    client.get.assert_not_called()


def test_fetch_and_stage_handles_nonstring_file_id():
    row = {"id": "act-100", "method": "chat_upload_receive", "request_body": {"file_id": 123}}
    patcher, client = _patch_httpx_for_fetch(_make_resp(200))
    with patcher:
        got = inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={}
        )
    assert got is None
    client.get.assert_not_called()
def test_fetch_and_stage_404_returns_none_no_ack():
    row = _row()
    patcher, client = _patch_httpx_for_fetch(_make_resp(404, text="gone"), _make_resp(200))
    with patcher:
        got = inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={}
        )
    assert got is None
    # The upload is already gone upstream — no ack is sent.
    client.post.assert_not_called()


def test_fetch_and_stage_500_returns_none_no_ack():
    row = _row()
    patcher, client = _patch_httpx_for_fetch(_make_resp(500, text="boom"))
    with patcher:
        got = inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={}
        )
    assert got is None
    client.post.assert_not_called()


def test_fetch_and_stage_network_error_returns_none():
    row = _row()
    failing = MagicMock()
    failing.__enter__ = MagicMock(return_value=failing)
    failing.__exit__ = MagicMock(return_value=False)
    failing.get = MagicMock(side_effect=RuntimeError("connection refused"))
    with patch("httpx.Client", return_value=failing):
        got = inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={}
        )
    assert got is None


def test_fetch_and_stage_oversize_response_refused():
    row = _row()
    too_big = b"x" * (inbox_uploads.MAX_FILE_BYTES + 1)
    patcher, client = _patch_httpx_for_fetch(
        _make_resp(200, content=too_big, content_type="application/octet-stream")
    )
    with patcher:
        got = inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={}
        )
    assert got is None
    client.post.assert_not_called()
def test_fetch_and_stage_ack_failure_does_not_invalidate_local_uri():
    # Ack is best-effort: a 500 on the ack POST must not undo the
    # already-successful staging.
    row = _row(uri="platform-pending:ws-1/file-1")
    get_resp = _make_resp(200, content=b"data", content_type="text/plain")
    ack_resp = _make_resp(500, text="ack failed")
    p, _ = _patch_httpx_for_fetch(get_resp, ack_resp)
    with p:
        local_uri = inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={}
        )
    # On-disk staging succeeded; ack failure is logged but doesn't
    # roll back the cache.
    assert local_uri is not None
    assert inbox_uploads.get_cache().get("platform-pending:ws-1/file-1") == local_uri


def test_fetch_and_stage_ack_network_error_swallowed():
    # Same contract when the ack POST raises instead of returning 5xx.
    row = _row(uri="platform-pending:ws-1/file-1")
    client = MagicMock()
    client.__enter__ = MagicMock(return_value=client)
    client.__exit__ = MagicMock(return_value=False)
    client.get = MagicMock(return_value=_make_resp(200, content=b"data", content_type="text/plain"))
    client.post = MagicMock(side_effect=RuntimeError("ack network error"))
    with patch("httpx.Client", return_value=client):
        result = inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={}
        )
    assert result is not None  # GET succeeded → URI returned even if ack blew up
def test_fetch_and_stage_uses_response_content_type_when_present():
    row = _row(name="thing.bin", body_extra={"mimeType": "application/x-bogus"})
    # The response's image/png should win over the body's bogus mimeType.
    patcher, _ = _patch_httpx_for_fetch(
        _make_resp(200, content=b"PNG", content_type="image/png; charset=binary")
    )
    with patcher:
        # The returned mime is not part of the contract; this only
        # checks the happy path survives the trailing ";" parameter.
        got = inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={}
        )
    assert got is not None


def test_fetch_and_stage_nonstring_filename_falls_back_to_file():
    # A non-string body['name'] (None, a number, ...) must degrade to
    # the default "file" so sanitize_filename always has input.
    row = _row(body_extra={"name": 12345})
    patcher, _ = _patch_httpx_for_fetch(_make_resp(200, content=b"x", content_type="text/plain"))
    with patcher:
        staged_uri = inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={}
        )
    assert staged_uri is not None
    assert staged_uri.endswith("-file")


def test_fetch_and_stage_default_filename_when_missing():
    row = {
        "id": "act",
        "method": "chat_upload_receive",
        "request_body": {"file_id": "file-1"},
    }
    patcher, _ = _patch_httpx_for_fetch(_make_resp(200, content=b"data", content_type="text/plain"))
    with patcher:
        staged_uri = inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={}
        )
    assert staged_uri is not None
    assert staged_uri.endswith("-file")  # default filename
def test_fetch_and_stage_disk_write_failure_returns_none(monkeypatch):
    row = _row()
    patcher, client = _patch_httpx_for_fetch(_make_resp(200, content=b"x", content_type="text/plain"))

    def failing_stage(*_args, **_kwargs):
        raise OSError(28, "no space left")

    monkeypatch.setattr(inbox_uploads, "stage_to_disk", failing_stage)
    with patcher:
        got = inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={}
        )
    # Staging failed → no URI, and no ack for an unstaged upload.
    assert got is None
    client.post.assert_not_called()


def test_fetch_and_stage_disk_value_error_returns_none(monkeypatch):
    row = _row()
    patcher, client = _patch_httpx_for_fetch(_make_resp(200, content=b"x", content_type="text/plain"))

    def failing_stage(*_args, **_kwargs):
        raise ValueError("oversize after sanity check")

    monkeypatch.setattr(inbox_uploads, "stage_to_disk", failing_stage)
    with patcher:
        got = inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={}
        )
    assert got is None
    client.post.assert_not_called()
def test_fetch_and_stage_httpx_missing_returns_none(monkeypatch):
    """fetch_and_stage degrades to returning None (no crash) when httpx
    is not installed.

    Setting ``sys.modules["httpx"] = None`` makes any ``import httpx``
    inside fetch_and_stage raise ImportError, and monkeypatch records
    the previous entry and restores it on teardown. The old version
    popped the real module first and restored it by hand in a
    ``finally`` — which fought monkeypatch's own teardown: setitem had
    recorded the key as absent (it was popped), so teardown deleted the
    manually restored module from sys.modules again.
    """
    import sys

    row = _row()
    monkeypatch.setitem(sys.modules, "httpx", None)
    result = inbox_uploads.fetch_and_stage(
        row, platform_url="http://plat", workspace_id="ws-1", headers={}
    )
    assert result is None
def test_fetch_and_stage_falls_back_to_extension_mime(monkeypatch):
    # Neither the body nor the response carries a usable content type,
    # so resolution falls through to guessing from the ".png" extension.
    row = _row(name="snap.png", body_extra={"mimeType": ""})
    patcher, _ = _patch_httpx_for_fetch(_make_resp(200, content=b"PNG", content_type=""))
    with patcher:
        got = inbox_uploads.fetch_and_stage(
            row, platform_url="http://plat", workspace_id="ws-1", headers={}
        )
    assert got is not None
# ---------------------------------------------------------------------------
# rewrite_request_body — URI swap in chat-message bodies
# ---------------------------------------------------------------------------
def test_rewrite_request_body_swaps_pending_uri_in_message_parts():
    inbox_uploads.get_cache().set("platform-pending:ws/1", "workspace:/local/1")
    payload = {
        "method": "message/send",
        "params": {
            "message": {
                "parts": [
                    {"kind": "text", "text": "see this"},
                    {"kind": "file", "file": {"uri": "platform-pending:ws/1", "name": "a.pdf"}},
                ]
            }
        },
    }
    inbox_uploads.rewrite_request_body(payload)
    file_part = payload["params"]["message"]["parts"][1]
    assert file_part["file"]["uri"] == "workspace:/local/1"


def test_rewrite_request_body_swaps_in_params_parts():
    # Parts directly under params (no message wrapper) are rewritten too.
    inbox_uploads.get_cache().set("platform-pending:ws/2", "workspace:/local/2")
    payload = {
        "params": {
            "parts": [
                {"kind": "file", "file": {"uri": "platform-pending:ws/2"}},
            ]
        }
    }
    inbox_uploads.rewrite_request_body(payload)
    assert payload["params"]["parts"][0]["file"]["uri"] == "workspace:/local/2"


def test_rewrite_request_body_swaps_in_top_level_parts():
    inbox_uploads.get_cache().set("platform-pending:ws/3", "workspace:/local/3")
    payload = {"parts": [{"kind": "file", "file": {"uri": "platform-pending:ws/3"}}]}
    inbox_uploads.rewrite_request_body(payload)
    assert payload["parts"][0]["file"]["uri"] == "workspace:/local/3"


def test_rewrite_request_body_leaves_unmatched_uri_unchanged():
    # With no cache entry the URI stays as-is: the agent surfaces the
    # unresolvable URI instead of the inbox silently dropping the part.
    payload = {"parts": [{"kind": "file", "file": {"uri": "platform-pending:ws/missing"}}]}
    inbox_uploads.rewrite_request_body(payload)
    assert payload["parts"][0]["file"]["uri"] == "platform-pending:ws/missing"


def test_rewrite_request_body_leaves_non_pending_uri_unchanged():
    inbox_uploads.get_cache().set("platform-pending:ws/3", "workspace:/local/3")
    payload = {
        "parts": [
            {"kind": "file", "file": {"uri": "workspace:/already-local.pdf"}},
            {"kind": "file", "file": {"uri": "https://example.com/x.pdf"}},
        ]
    }
    inbox_uploads.rewrite_request_body(payload)
    assert payload["parts"][0]["file"]["uri"] == "workspace:/already-local.pdf"
    assert payload["parts"][1]["file"]["uri"] == "https://example.com/x.pdf"
def test_rewrite_request_body_skips_non_dict_parts():
    payload = {"parts": ["not a dict", 42, None]}
    inbox_uploads.rewrite_request_body(payload)  # must not raise
    assert payload["parts"] == ["not a dict", 42, None]


def test_rewrite_request_body_skips_text_parts():
    payload = {"parts": [{"kind": "text", "text": "platform-pending:ws/should-not-rewrite"}]}
    inbox_uploads.rewrite_request_body(payload)
    # Only file.uri fields hold URIs — text content is never rewritten.
    assert payload["parts"][0]["text"] == "platform-pending:ws/should-not-rewrite"


def test_rewrite_request_body_skips_part_without_file_dict():
    payload = {"parts": [{"kind": "file"}]}  # no file key
    inbox_uploads.rewrite_request_body(payload)
    assert payload["parts"] == [{"kind": "file"}]


def test_rewrite_request_body_skips_file_without_uri():
    payload = {"parts": [{"kind": "file", "file": {"name": "x.pdf"}}]}
    inbox_uploads.rewrite_request_body(payload)
    assert payload["parts"][0]["file"] == {"name": "x.pdf"}


def test_rewrite_request_body_skips_nonstring_uri():
    inbox_uploads.rewrite_request_body({"parts": [{"kind": "file", "file": {"uri": None}}]})  # must not raise


def test_rewrite_request_body_handles_non_dict_body():
    # Any non-dict top level is a silent no-op.
    for bogus in (None, "string body", [1, 2, 3]):
        inbox_uploads.rewrite_request_body(bogus)  # no-op


def test_rewrite_request_body_handles_non_dict_params():
    inbox_uploads.rewrite_request_body({"params": "not a dict", "parts": []})  # must not raise


def test_rewrite_request_body_handles_non_dict_message():
    inbox_uploads.rewrite_request_body({"params": {"message": "not a dict"}})  # must not raise


def test_rewrite_request_body_handles_non_list_parts():
    inbox_uploads.rewrite_request_body({"parts": "not a list"})  # must not raise


def test_rewrite_request_body_handles_non_dict_file():
    inbox_uploads.rewrite_request_body({"parts": [{"kind": "file", "file": "not a dict"}]})  # must not raise
# ---------------------------------------------------------------------------
# fetch_and_stage with shared client — Phase 5b client-reuse contract
# ---------------------------------------------------------------------------
#
# When a caller passes ``client=`` to fetch_and_stage, that client must be
# used for BOTH the GET /content and the POST /ack — no fresh
# ``httpx.Client(...)`` constructions should happen. The pre-Phase-5b
# implementation made one new client for GET and another for ack; the new
# shape lets BatchFetcher share one connection pool across an entire batch.
def test_fetch_and_stage_with_supplied_client_does_not_construct_new_client(monkeypatch):
    row = _row(uri="platform-pending:ws-1/file-1")
    get_resp = _make_resp(200, content=b"PDF", content_type="application/pdf")
    ack_resp = _make_resp(200)
    supplied = MagicMock()
    supplied.get = MagicMock(return_value=get_resp)
    supplied.post = MagicMock(return_value=ack_resp)
    # Sentinel: any code path that constructs httpx.Client when one was
    # already supplied is a regression — count constructions.
    constructed: list[Any] = []

    class _ShouldNotBeCalled:
        def __init__(self, *a, **kw):
            constructed.append((a, kw))

    monkeypatch.setattr("httpx.Client", _ShouldNotBeCalled)
    local_uri = inbox_uploads.fetch_and_stage(
        row,
        platform_url="http://plat",
        workspace_id="ws-1",
        headers={"Authorization": "Bearer t"},
        client=supplied,
    )
    assert local_uri is not None
    assert constructed == [], "supplied client must be reused; no new Client should be constructed"
    # GET + POST ack both went through the supplied client.
    supplied.get.assert_called_once()
    supplied.post.assert_called_once()
    # Caller-owned client must NOT be closed by fetch_and_stage; the
    # batch fetcher (or test) closes it once the whole batch is done.
    supplied.close.assert_not_called()
def test_fetch_and_stage_without_supplied_client_constructs_and_closes_one(monkeypatch):
    row = _row(uri="platform-pending:ws-1/file-1")
    get_resp = _make_resp(200, content=b"PDF", content_type="application/pdf")
    ack_resp = _make_resp(200)
    constructed: list[MagicMock] = []

    def _factory(*args, **kwargs):
        stub = MagicMock()
        stub.get = MagicMock(return_value=get_resp)
        stub.post = MagicMock(return_value=ack_resp)
        constructed.append(stub)
        return stub

    monkeypatch.setattr("httpx.Client", _factory)
    local_uri = inbox_uploads.fetch_and_stage(
        row, platform_url="http://plat", workspace_id="ws-1", headers={}
    )
    assert local_uri is not None
    # Pre-Phase-5b built TWO clients (one for GET, one for ack); now exactly one.
    assert len(constructed) == 1, f"expected 1 httpx.Client construction, got {len(constructed)}"
    only = constructed[0]
    # That single client serves both the GET and the ack POST ...
    only.get.assert_called_once()
    only.post.assert_called_once()
    # ... and, being owned by fetch_and_stage, is closed on the way out.
    only.close.assert_called_once()
def test_fetch_and_stage_with_supplied_client_does_not_close_caller_client():
    # Even when the GET blows up, the supplied client stays open — its
    # lifecycle belongs to the BatchFetcher that owns the whole batch.
    row = _row(uri="platform-pending:ws-1/file-1")
    supplied = MagicMock()
    supplied.get = MagicMock(side_effect=RuntimeError("network down"))
    supplied.post = MagicMock()  # unreachable once the GET fails
    inbox_uploads.fetch_and_stage(
        row,
        platform_url="http://plat",
        workspace_id="ws-1",
        headers={},
        client=supplied,
    )
    supplied.close.assert_not_called()
    supplied.post.assert_not_called()
# ---------------------------------------------------------------------------
# BatchFetcher — concurrent fetch + URI cache barrier
# ---------------------------------------------------------------------------
def _row_with_id(act_id: str, file_id: str) -> dict:
"""Helper: an upload-receive row with a distinct activity id + file id."""
return {
"id": act_id,
"method": "chat_upload_receive",
"request_body": {
"file_id": file_id,
"name": f"{file_id}.pdf",
"uri": f"platform-pending:ws-1/{file_id}",
"mimeType": "application/pdf",
"size": 1,
},
}
def _stub_client_for_batch(get_responses: dict[str, MagicMock]) -> MagicMock:
    """Build one MagicMock client whose .get() routes to a per-file_id
    canned response by matching the file_id segment of the URL (404 for
    unknown ids); .post() always acks with 200.
    """
    def _route_get(url: str, headers: dict[str, str] | None = None) -> MagicMock:
        for file_id, canned in get_responses.items():
            if f"/pending-uploads/{file_id}/content" in url:
                return canned
        return _make_resp(404)

    def _ack_post(url: str, headers: dict[str, str] | None = None) -> MagicMock:
        return _make_resp(200)

    stub = MagicMock()
    stub.get = MagicMock(side_effect=_route_get)
    stub.post = MagicMock(side_effect=_ack_post)
    return stub
def test_batch_fetcher_runs_submitted_rows_concurrently():
    # Three rows whose .get() blocks for ~120ms each. With 4 workers the
    # batch should complete in ~120ms (parallel), not ~360ms (serial).
    # The 250ms ceiling accommodates CI scheduler jitter while still
    # discriminating concurrent (~120ms) from serial (~360ms).
    import time
    barrier_start = [0.0]

    def _slow_get(url: str, headers: dict[str, str] | None = None) -> MagicMock:
        time.sleep(0.12)
        for fid in ("a", "b", "c"):
            if f"/pending-uploads/{fid}/content" in url:
                return _make_resp(200, content=b"X", content_type="text/plain")
        return _make_resp(404)

    client = MagicMock()
    client.get = MagicMock(side_effect=_slow_get)
    client.post = MagicMock(return_value=_make_resp(200))
    bf = inbox_uploads.BatchFetcher(
        platform_url="http://plat",
        workspace_id="ws-1",
        headers={},
        client=client,
        max_workers=4,
    )
    # Time only the submit → wait_all window, not fetcher construction.
    barrier_start[0] = time.time()
    for fid in ("a", "b", "c"):
        bf.submit(_row_with_id(f"act-{fid}", fid))
    bf.wait_all()
    elapsed = time.time() - barrier_start[0]
    bf.close()
    assert elapsed < 0.25, (
        f"3 rows × 120ms with 4 workers should finish in <250ms; got {elapsed:.3f}s "
        "(suggests serial execution — Phase 5b regression)"
    )
    # Every row was fetched and acked exactly once.
    assert client.get.call_count == 3
    assert client.post.call_count == 3
def test_batch_fetcher_wait_all_blocks_until_uri_cache_populated():
    """Pin the correctness invariant: when wait_all returns, the URI
    cache is hot for every submitted row. Without this barrier the
    inbox loop would process the chat-message row before its uploads
    were staged, and rewrite_request_body would surface the un-rewritten
    platform-pending: URI to the agent.
    """
    import time

    def _slow_get(url: str, headers: dict[str, str] | None = None) -> MagicMock:
        # Small delay so an unsynchronized implementation would still be
        # mid-fetch when wait_all returns.
        time.sleep(0.05)
        return _make_resp(200, content=b"data", content_type="text/plain")

    client = MagicMock()
    client.get = MagicMock(side_effect=_slow_get)
    client.post = MagicMock(return_value=_make_resp(200))
    inbox_uploads.get_cache().clear()
    with inbox_uploads.BatchFetcher(
        platform_url="http://plat", workspace_id="ws-1", headers={}, client=client
    ) as bf:
        bf.submit(_row_with_id("act-a", "a"))
        bf.submit(_row_with_id("act-b", "b"))
        bf.wait_all()
        # Cache must be hot for BOTH rows by the time wait_all returns.
        assert inbox_uploads.get_cache().get("platform-pending:ws-1/a") is not None
        assert inbox_uploads.get_cache().get("platform-pending:ws-1/b") is not None
def test_batch_fetcher_isolates_per_row_failure():
    """One failing fetch must not abort siblings. Sibling rows complete,
    URI cache populates for them; the bad row's cache entry stays absent.
    """
    def _fake_get(url: str, headers: dict[str, str] | None = None) -> MagicMock:
        if "/pending-uploads/bad/content" in url:
            return _make_resp(500, text="upstream broken")
        return _make_resp(200, content=b"ok", content_type="text/plain")

    client = MagicMock()
    client.get = MagicMock(side_effect=_fake_get)
    client.post = MagicMock(return_value=_make_resp(200))
    inbox_uploads.get_cache().clear()
    rows = (("act-1", "good1"), ("act-2", "bad"), ("act-3", "good2"))
    with inbox_uploads.BatchFetcher(
        platform_url="http://plat", workspace_id="ws-1", headers={}, client=client
    ) as bf:
        for act_id, file_id in rows:
            bf.submit(_row_with_id(act_id, file_id))
        bf.wait_all()
    cache = inbox_uploads.get_cache()
    # Good siblings staged; the 500-row never made it into the cache.
    assert cache.get("platform-pending:ws-1/good1") is not None
    assert cache.get("platform-pending:ws-1/good2") is not None
    assert cache.get("platform-pending:ws-1/bad") is None
def test_batch_fetcher_reuses_one_client_across_all_submits():
    """Every row in the batch must share the same client instance. This
    is the connection-pool-reuse leg of the perf win: a second fetch
    to the same host reuses the TCP+TLS handshake from the first.
    """
    client = MagicMock()
    client.get = MagicMock(
        return_value=_make_resp(200, content=b"x", content_type="text/plain")
    )
    client.post = MagicMock(return_value=_make_resp(200))
    file_ids = ("a", "b", "c")
    with inbox_uploads.BatchFetcher(
        platform_url="http://plat", workspace_id="ws-1", headers={}, client=client
    ) as bf:
        for fid in file_ids:
            bf.submit(_row_with_id(f"act-{fid}", fid))
        bf.wait_all()
    # 3 GETs + 3 POST acks all on the same client — no per-row Client
    # construction.
    assert client.get.call_count == len(file_ids)
    assert client.post.call_count == len(file_ids)
def test_batch_fetcher_close_idempotent():
    """close() may be called repeatedly; extra calls are silent no-ops."""
    bf = inbox_uploads.BatchFetcher(
        platform_url="http://plat", workspace_id="ws-1", headers={}, client=MagicMock()
    )
    # Neither the first nor any subsequent close may raise.
    for _ in range(2):
        bf.close()
def test_batch_fetcher_submit_after_close_raises():
    """A closed BatchFetcher rejects new rows with a RuntimeError."""
    bf = inbox_uploads.BatchFetcher(
        platform_url="http://plat", workspace_id="ws-1", headers={}, client=MagicMock()
    )
    bf.close()
    with pytest.raises(RuntimeError, match="submit after close"):
        bf.submit(_row_with_id("act-x", "x"))
def test_batch_fetcher_owns_client_when_not_supplied(monkeypatch):
    """With no client supplied, BatchFetcher constructs exactly one
    httpx.Client for the whole batch and closes it on close() — the
    fetcher owns that client's lifecycle.
    """
    constructed: list[MagicMock] = []

    def _fake_client_cls(*args, **kwargs):
        fake = MagicMock()
        fake.get = MagicMock(
            return_value=_make_resp(200, content=b"x", content_type="text/plain")
        )
        fake.post = MagicMock(return_value=_make_resp(200))
        constructed.append(fake)
        return fake

    monkeypatch.setattr("httpx.Client", _fake_client_cls)
    bf = inbox_uploads.BatchFetcher(
        platform_url="http://plat", workspace_id="ws-1", headers={}
    )
    bf.submit(_row_with_id("act-a", "a"))
    bf.wait_all()
    bf.close()
    assert len(constructed) == 1, "expected one owned client per BatchFetcher"
    constructed[0].close.assert_called_once()
def test_batch_fetcher_does_not_close_supplied_client():
    """A caller-supplied client survives the BatchFetcher's close —
    its lifecycle belongs to the caller, not the fetcher.
    """
    client = MagicMock()
    client.get = MagicMock(
        return_value=_make_resp(200, content=b"x", content_type="text/plain")
    )
    client.post = MagicMock(return_value=_make_resp(200))
    with inbox_uploads.BatchFetcher(
        platform_url="http://plat", workspace_id="ws-1", headers={}, client=client
    ) as bf:
        bf.submit(_row_with_id("act-a", "a"))
        bf.wait_all()
    # Exiting the context closed the fetcher, not the caller's client.
    client.close.assert_not_called()
def test_batch_fetcher_wait_all_no_op_on_empty_batch():
    """wait_all on a batch with zero submissions neither blocks nor raises,
    and the client is never touched.
    """
    client = MagicMock()
    with inbox_uploads.BatchFetcher(
        platform_url="http://plat", workspace_id="ws-1", headers={}, client=client
    ) as bf:
        bf.wait_all()  # nothing submitted; must not block, must not raise
    client.get.assert_not_called()
    client.post.assert_not_called()
def test_batch_fetcher_httpx_missing_makes_submit_a_noop(monkeypatch):
    """No client supplied + httpx import fails → BatchFetcher degrades
    gracefully: submit() returns None and the row is silently skipped.

    A None entry in sys.modules makes any ``import httpx`` raise
    ImportError, and monkeypatch.setitem restores the real module at
    teardown. The previous version additionally popped and hand-restored
    the module around the monkeypatch — but because the key was already
    absent when setitem ran, monkeypatch's undo *deleted* the entry again
    after the finally block had put it back, leaving httpx evicted from
    sys.modules once the test finished.
    """
    import sys
    monkeypatch.setitem(sys.modules, "httpx", None)
    bf = inbox_uploads.BatchFetcher(
        platform_url="http://plat", workspace_id="ws-1", headers={}
    )
    result = bf.submit(_row_with_id("act-a", "a"))
    bf.wait_all()
    bf.close()
    assert result is None