From 94eecaa446aa04c0d4e8cf7c12a40f8441711191 Mon Sep 17 00:00:00 2001 From: Winston Date: Mon, 16 Feb 2026 23:42:25 +0600 Subject: [PATCH] fix: atomic session store writes to prevent context loss on Windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Windows, fs.promises.writeFile truncates the target file to 0 bytes before writing. Since loadSessionStore reads the file synchronously without holding the write lock, a concurrent read can observe the empty file, fail to parse it, and fall through to an empty store — causing the agent to lose its session context. Changes: - saveSessionStoreUnlocked (Windows path): write to a temp file first, then rename it onto the target. If rename fails due to file locking, retry 3 times with backoff, then fall back to copyFile (which overwrites in-place without truncating to 0 bytes). - loadSessionStore: on Windows, retry up to 3 times with 50ms synchronous backoff (via Atomics.wait) when the file is empty or unparseable, giving the writer time to finish. SharedArrayBuffer is allocated once and reused across retry attempts. --- src/config/sessions/store.ts | 76 ++++++++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 12 deletions(-) diff --git a/src/config/sessions/store.ts b/src/config/sessions/store.ts index 9890297db..9433c3a54 100644 --- a/src/config/sessions/store.ts +++ b/src/config/sessions/store.ts @@ -162,18 +162,41 @@ export function loadSessionStore( } } - // Cache miss or disabled - load from disk + // Cache miss or disabled - load from disk. + // Retry up to 3 times when the file is empty or unparseable. On Windows the + // temp-file + rename write is not fully atomic: a concurrent reader can briefly + // observe a 0-byte file (between truncate and write) or a stale/locked state. + // A short synchronous backoff (50 ms via `Atomics.wait`) is enough for the + // writer to finish. let store: Record = {}; let mtimeMs = getFileMtimeMs(storePath); - try { - const raw = fs.readFileSync(storePath, "utf-8"); - const parsed = JSON.parse(raw); - if (isSessionStoreRecord(parsed)) { - store = parsed; + const maxReadAttempts = process.platform === "win32" ? 3 : 1; + const retryBuf = + maxReadAttempts > 1 + ? new Int32Array(new SharedArrayBuffer(4)) + : undefined; + for (let attempt = 0; attempt < maxReadAttempts; attempt++) { + try { + const raw = fs.readFileSync(storePath, "utf-8"); + if (raw.length === 0 && attempt < maxReadAttempts - 1) { + // File is empty — likely caught mid-write; retry after a brief pause. + Atomics.wait(retryBuf!, 0, 0, 50); + continue; + } + const parsed = JSON.parse(raw); + if (isSessionStoreRecord(parsed)) { + store = parsed; + } + mtimeMs = getFileMtimeMs(storePath) ?? mtimeMs; + break; + } catch { + // File missing, locked, or transiently corrupt — retry on Windows. + if (attempt < maxReadAttempts - 1) { + Atomics.wait(retryBuf!, 0, 0, 50); + continue; + } + // Final attempt failed; proceed with an empty store. } - mtimeMs = getFileMtimeMs(storePath) ?? mtimeMs; - } catch { - // ignore missing/invalid store; we'll recreate it } // Best-effort migration: message provider → channel naming. @@ -538,11 +561,38 @@ async function saveSessionStoreUnlocked( await fs.promises.mkdir(path.dirname(storePath), { recursive: true }); const json = JSON.stringify(store, null, 2); - // Windows: avoid atomic rename swaps (can be flaky under concurrent access). - // We serialize writers via the session-store lock instead. + // Windows: use temp-file + rename for atomic writes, same as other platforms. + // Direct `writeFile` truncates the target to 0 bytes before writing, which + // allows concurrent `readFileSync` calls (from unlocked `loadSessionStore`) + // to observe an empty file and lose the session store contents. if (process.platform === "win32") { + const tmp = `${storePath}.${process.pid}.${crypto.randomUUID()}.tmp`; try { - await fs.promises.writeFile(storePath, json, "utf-8"); + await fs.promises.writeFile(tmp, json, "utf-8"); + // Retry rename up to 5 times with increasing backoff — rename can fail + // on Windows when the target is locked by a concurrent reader. We do + // NOT fall back to writeFile or copyFile because both use CREATE_ALWAYS + // on Windows, which truncates the target to 0 bytes before writing — + // reintroducing the exact race this fix addresses. If all attempts + // fail, the temp file is cleaned up and the next save cycle (which is + // serialized by the write lock) will succeed. + for (let i = 0; i < 5; i++) { + try { + await fs.promises.rename(tmp, storePath); + break; + } catch { + if (i < 4) { + await new Promise((r) => setTimeout(r, 50 * (i + 1))); + } + // Final attempt failed — skip this save. The write lock ensures + // the next save will retry with fresh data. Log for diagnostics. + if (i === 4) { + console.warn( + `[session-store] rename failed after 5 attempts: ${storePath}`, + ); + } + } + } } catch (err) { const code = err && typeof err === "object" && "code" in err @@ -552,6 +602,8 @@ async function saveSessionStoreUnlocked( return; } throw err; + } finally { + await fs.promises.rm(tmp, { force: true }).catch(() => undefined); } return; }