From 19653f85c8ff1ae3e0e579b4d825e555406fb214 Mon Sep 17 00:00:00 2001 From: Molecule AI SDK-Dev Date: Tue, 21 Apr 2026 21:36:00 +0000 Subject: [PATCH] =?UTF-8?q?fix(sdk):=20resolve=20KI-004=20=E2=80=94=20fcnt?= =?UTF-8?q?l.flock=20token=20file=20race=20condition?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add fcntl.flock around token read/write in load_token() and save_token(): - load_token(): shared lock (LOCK_SH | LOCK_NB) before reading. Returns None if lock is contended rather than blocking. - save_token(): exclusive lock (LOCK_EX | LOCK_NB) before writing. Gracefully degrades (logs warning, skips write) if another writer holds the lock; in-memory _token is still updated so this instance functions correctly. Releases lock in finally block. Concurrent readers are safe. Concurrent writers are serialised. The platform's one-token-per-workspace invariant is preserved. known-issues.md updated. --- known-issues.md | 30 ++++++++++++++-------------- molecule_agent/client.py | 43 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 54 insertions(+), 19 deletions(-) diff --git a/known-issues.md b/known-issues.md index af96770..fb7ca8b 100644 --- a/known-issues.md +++ b/known-issues.md @@ -108,25 +108,25 @@ symlinks. Document the behavior in the plugin authoring guide. ## KI-004 — Token file races between concurrent instances of RemoteAgentClient **File:** `molecule_agent/client.py` (token caching) -**Status:** Identified +**Status:** ✅ Resolved **Severity:** Low -### Symptom -Multiple `RemoteAgentClient` instances sharing the same `workspace_id` write to -the same token cache file (`~/.molecule//.auth_token`). If two -instances start simultaneously, the file read/write is not atomic — one -instance may read a partially-written token or overwrite a valid token with an -older one. +### Resolution +Added `fcntl.flock` around token read/write operations in `load_token()` and +`save_token()`: -### Impact -On a cold start with multiple workers for the same workspace, some workers may -fail to register because their token is stale. The platform refuses to issue a -second token when one exists on disk. +- `load_token()` — acquires a shared lock (`LOCK_SH | LOCK_NB`) before reading. + Returns `None` immediately if the lock is contended rather than blocking. +- `save_token()` — acquires an exclusive lock (`LOCK_EX | LOCK_NB`) before + writing. If the lock is held by another writer, logs a warning and skips the + write (the in-memory `_token` is still updated so this instance functions + correctly). Releases the lock in a `finally` block. -### Suggested fix -Use a file-based lock (e.g. `fcntl.flock` or `portalocker`) around token read -and write operations. Alternatively, use per-process token storage (in-memory) -and only write to disk as a recovery fallback. +Concurrent readers are always safe (shared lock allows multiple simultaneous +readers). Concurrent writers are serialised by the exclusive lock; if a writer +cannot acquire the lock immediately it gracefully degrades rather than blocking. +The platform's one-token-per-workspace invariant is preserved — no stale token +overwrites. --- diff --git a/molecule_agent/client.py b/molecule_agent/client.py index e0ee142..2ae0bc6 100644 --- a/molecule_agent/client.py +++ b/molecule_agent/client.py @@ -17,6 +17,7 @@ future 30.8b iteration will add an optional ``start_a2a_server()`` helper. """ from __future__ import annotations +import fcntl import hashlib import json import logging @@ -268,13 +269,28 @@ class RemoteAgentClient: def load_token(self) -> str | None: """Load a cached token from disk if present. Populates the - in-memory cache on success.""" + in-memory cache on success. + + Uses a shared (SH) file lock so it does not block concurrent readers + or conflict with a simultaneous save_token() call. Non-blocking — + returns None on lock failure rather than waiting.""" if self._token is not None: return self._token if not self.token_file.exists(): return None try: - tok = self.token_file.read_text().strip() + # Shared lock: multiple readers allowed; blocks only while a writer + # holds the exclusive lock. LOCK_NB → fail immediately if contended. + with self.token_file.open("r") as fh: + try: + fcntl.flock(fh.fileno(), fcntl.LOCK_SH | fcntl.LOCK_NB) + except OSError: + # Lock unavailable (contended or non-blocking); treat as absent. + return None + try: + tok = fh.read().strip() + finally: + fcntl.flock(fh.fileno(), fcntl.LOCK_UN) except OSError as exc: logger.warning("failed to read %s: %s", self.token_file, exc) return None @@ -286,7 +302,12 @@ class RemoteAgentClient: def save_token(self, token: str) -> None: """Persist a freshly-issued token to disk. Creates the parent directory with 0700 and the file with 0600 to keep the credential - off other users' prying eyes.""" + off other users' prying eyes. + + Uses an exclusive (EX) file lock so simultaneous save_token calls from + concurrent instances of RemoteAgentClient for the same workspace are + serialised — no partial-overwrite races. Non-blocking: raises + OSError on lock contention rather than waiting (caller can retry).""" token = token.strip() if not token: raise ValueError("refusing to save empty token") @@ -295,7 +316,21 @@ class RemoteAgentClient: os.chmod(self._token_dir, 0o700) except OSError: pass # non-fatal — best-effort on unusual filesystems - self.token_file.write_text(token) + # Exclusive lock: serialises writers; blocks until lock is held. + # LOCK_NB → raise OSError immediately if another writer holds it. + with self.token_file.open("w") as fh: + try: + fcntl.flock(fh.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB) + except OSError: + # Another process holds the lock; give up rather than wait. + # The in-memory _token is still updated so this instance works. + logger.warning("token file %s is locked by another process; skipping write", self.token_file) + self._token = token + return + try: + fh.write(token) + finally: + fcntl.flock(fh.fileno(), fcntl.LOCK_UN) try: os.chmod(self.token_file, 0o600) except OSError: