fix(gateway): drain active turns before restart to prevent message loss (#13931)
* fix(gateway): drain active turns before restart to prevent message loss On SIGUSR1 restart, the gateway now waits up to 30s for in-flight agent turns to complete before tearing down the server. This prevents buffered messages from being dropped when config.patch or update triggers a restart while agents are mid-turn. Changes: - command-queue.ts: add getActiveTaskCount() and waitForActiveTasks() helpers to track and wait on active lane tasks - run-loop.ts: on restart signal, drain active tasks before server.close() with a 30s timeout; extend force-exit timer accordingly - command-queue.test.ts: update imports for new exports Fixes #13883 * fix(queue): snapshot active tasks for restart drain --------- Co-authored-by: Elonito <0xRaini@users.noreply.github.com> Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
@@ -6,6 +6,7 @@ import {
|
||||
isGatewaySigusr1RestartExternallyAllowed,
|
||||
} from "../../infra/restart.js";
|
||||
import { createSubsystemLogger } from "../../logging/subsystem.js";
|
||||
import { getActiveTaskCount, waitForActiveTasks } from "../../process/command-queue.js";
|
||||
|
||||
const gatewayLog = createSubsystemLogger("gateway");
|
||||
|
||||
@@ -26,6 +27,9 @@ export async function runGatewayLoop(params: {
|
||||
process.removeListener("SIGUSR1", onSigusr1);
|
||||
};
|
||||
|
||||
const DRAIN_TIMEOUT_MS = 30_000;
|
||||
const SHUTDOWN_TIMEOUT_MS = 5_000;
|
||||
|
||||
const request = (action: GatewayRunSignalAction, signal: string) => {
|
||||
if (shuttingDown) {
|
||||
gatewayLog.info(`received ${signal} during shutdown; ignoring`);
|
||||
@@ -35,14 +39,33 @@ export async function runGatewayLoop(params: {
|
||||
const isRestart = action === "restart";
|
||||
gatewayLog.info(`received ${signal}; ${isRestart ? "restarting" : "shutting down"}`);
|
||||
|
||||
// Allow extra time for draining active turns on restart.
|
||||
const forceExitMs = isRestart ? DRAIN_TIMEOUT_MS + SHUTDOWN_TIMEOUT_MS : SHUTDOWN_TIMEOUT_MS;
|
||||
const forceExitTimer = setTimeout(() => {
|
||||
gatewayLog.error("shutdown timed out; exiting without full cleanup");
|
||||
cleanupSignals();
|
||||
params.runtime.exit(0);
|
||||
}, 5000);
|
||||
}, forceExitMs);
|
||||
|
||||
void (async () => {
|
||||
try {
|
||||
// On restart, wait for in-flight agent turns to finish before
|
||||
// tearing down the server so buffered messages are delivered.
|
||||
if (isRestart) {
|
||||
const activeTasks = getActiveTaskCount();
|
||||
if (activeTasks > 0) {
|
||||
gatewayLog.info(
|
||||
`draining ${activeTasks} active task(s) before restart (timeout ${DRAIN_TIMEOUT_MS}ms)`,
|
||||
);
|
||||
const { drained } = await waitForActiveTasks(DRAIN_TIMEOUT_MS);
|
||||
if (drained) {
|
||||
gatewayLog.info("all active tasks drained");
|
||||
} else {
|
||||
gatewayLog.warn("drain timeout reached; proceeding with restart");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
await server?.close({
|
||||
reason: isRestart ? "gateway restarting" : "gateway stopping",
|
||||
restartExpectedMs: isRestart ? 1500 : null,
|
||||
|
||||
Reference in New Issue
Block a user