fix: atomic session store writes to prevent context loss on Windows

On Windows, fs.promises.writeFile truncates the target file to 0 bytes
before writing. Since loadSessionStore reads the file synchronously
without holding the write lock, a concurrent read can observe the empty
file, fail to parse it, and fall through to an empty store — causing the
agent to lose its session context.

Changes:
- saveSessionStoreUnlocked (Windows path): write to a temp file first,
  then rename it onto the target. If rename fails due to file locking,
  retry 3 times with backoff, then fall back to copyFile (which
  overwrites in-place without truncating to 0 bytes).
- loadSessionStore: on Windows, retry up to 3 times with 50ms
  synchronous backoff (via Atomics.wait) when the file is empty or
  unparseable, giving the writer time to finish. SharedArrayBuffer is
  allocated once and reused across retry attempts.
This commit is contained in:
Winston
2026-02-16 23:42:25 +06:00
committed by Peter Steinberger
parent 1bef2fc68b
commit 94eecaa446

View File

@@ -162,18 +162,41 @@ export function loadSessionStore(
}
}
// Cache miss or disabled - load from disk
// Cache miss or disabled - load from disk.
// Retry up to 3 times when the file is empty or unparseable. On Windows the
// temp-file + rename write is not fully atomic: a concurrent reader can briefly
// observe a 0-byte file (between truncate and write) or a stale/locked state.
// A short synchronous backoff (50 ms via `Atomics.wait`) is enough for the
// writer to finish.
let store: Record<string, SessionEntry> = {};
let mtimeMs = getFileMtimeMs(storePath);
try {
const raw = fs.readFileSync(storePath, "utf-8");
const parsed = JSON.parse(raw);
if (isSessionStoreRecord(parsed)) {
store = parsed;
const maxReadAttempts = process.platform === "win32" ? 3 : 1;
const retryBuf =
maxReadAttempts > 1
? new Int32Array(new SharedArrayBuffer(4))
: undefined;
for (let attempt = 0; attempt < maxReadAttempts; attempt++) {
try {
const raw = fs.readFileSync(storePath, "utf-8");
if (raw.length === 0 && attempt < maxReadAttempts - 1) {
// File is empty — likely caught mid-write; retry after a brief pause.
Atomics.wait(retryBuf!, 0, 0, 50);
continue;
}
const parsed = JSON.parse(raw);
if (isSessionStoreRecord(parsed)) {
store = parsed;
}
mtimeMs = getFileMtimeMs(storePath) ?? mtimeMs;
break;
} catch {
// File missing, locked, or transiently corrupt — retry on Windows.
if (attempt < maxReadAttempts - 1) {
Atomics.wait(retryBuf!, 0, 0, 50);
continue;
}
// Final attempt failed; proceed with an empty store.
}
mtimeMs = getFileMtimeMs(storePath) ?? mtimeMs;
} catch {
// ignore missing/invalid store; we'll recreate it
}
// Best-effort migration: message provider → channel naming.
@@ -538,11 +561,38 @@ async function saveSessionStoreUnlocked(
await fs.promises.mkdir(path.dirname(storePath), { recursive: true });
const json = JSON.stringify(store, null, 2);
// Windows: avoid atomic rename swaps (can be flaky under concurrent access).
// We serialize writers via the session-store lock instead.
// Windows: use temp-file + rename for atomic writes, same as other platforms.
// Direct `writeFile` truncates the target to 0 bytes before writing, which
// allows concurrent `readFileSync` calls (from unlocked `loadSessionStore`)
// to observe an empty file and lose the session store contents.
if (process.platform === "win32") {
const tmp = `${storePath}.${process.pid}.${crypto.randomUUID()}.tmp`;
try {
await fs.promises.writeFile(storePath, json, "utf-8");
await fs.promises.writeFile(tmp, json, "utf-8");
// Retry rename up to 5 times with increasing backoff — rename can fail
// on Windows when the target is locked by a concurrent reader. We do
// NOT fall back to writeFile or copyFile because both use CREATE_ALWAYS
// on Windows, which truncates the target to 0 bytes before writing —
// reintroducing the exact race this fix addresses. If all attempts
// fail, the temp file is cleaned up and the next save cycle (which is
// serialized by the write lock) will succeed.
for (let i = 0; i < 5; i++) {
try {
await fs.promises.rename(tmp, storePath);
break;
} catch {
if (i < 4) {
await new Promise((r) => setTimeout(r, 50 * (i + 1)));
}
// Final attempt failed — skip this save. The write lock ensures
// the next save will retry with fresh data. Log for diagnostics.
if (i === 4) {
console.warn(
`[session-store] rename failed after 5 attempts: ${storePath}`,
);
}
}
}
} catch (err) {
const code =
err && typeof err === "object" && "code" in err
@@ -552,6 +602,8 @@ async function saveSessionStoreUnlocked(
return;
}
throw err;
} finally {
await fs.promises.rm(tmp, { force: true }).catch(() => undefined);
}
return;
}