From 3b879fe52421e9c17a82baca1899d0084ce78ae4 Mon Sep 17 00:00:00 2001 From: elliotsecops Date: Tue, 27 Jan 2026 14:43:42 -0400 Subject: [PATCH] fix(infra): prevent gateway crashes on transient network errors --- CHANGELOG.md | 1 + ...handled-rejections.fatal-detection.test.ts | 159 ++++++++++++++++++ src/infra/unhandled-rejections.ts | 105 ++++++++---- 3 files changed, 232 insertions(+), 33 deletions(-) create mode 100644 src/infra/unhandled-rejections.fatal-detection.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e39702f6..e37ed38be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -68,12 +68,13 @@ Status: unreleased. ### Breaking - **BREAKING:** Gateway auth mode "none" is removed; gateway now requires token/password (Tailscale Serve identity still allowed). ### Fixes - Agents: prevent retries on oversized image errors and surface size limits. (#2871) Thanks @Suksham-sharma. - Agents: inherit provider baseUrl/api for inline models. (#2740) Thanks @lploc94. - Memory Search: keep auto provider model defaults and only include remote when configured. (#2576) Thanks @papago2355. - macOS: auto-scroll to bottom when sending a new message while scrolled up. (#2471) Thanks @kennyklee. - Web UI: auto-expand the chat compose textarea while typing (with sensible max height). (#2950) Thanks @shivamraut101. +- Gateway: prevent crashes on transient network errors (fetch failures, timeouts, DNS). Added fatal error detection to only exit on truly critical errors. Fixes #2895, #2879, #2873. (#2980) Thanks @elliotsecops. - Gateway: suppress AbortError and transient network errors in unhandled rejections. (#2451) Thanks @Glucksberg. - TTS: keep /tts status replies on text-only commands and avoid duplicate block-stream audio. (#2451) Thanks @Glucksberg. - Security: pin npm overrides to keep tar@7.5.4 for install toolchains. 
diff --git a/src/infra/unhandled-rejections.fatal-detection.test.ts b/src/infra/unhandled-rejections.fatal-detection.test.ts new file mode 100644 index 000000000..7c8d97675 --- /dev/null +++ b/src/infra/unhandled-rejections.fatal-detection.test.ts @@ -0,0 +1,159 @@ +import { describe, it, expect, vi, beforeAll, afterAll, beforeEach, afterEach } from "vitest"; +import process from "node:process"; + +import { installUnhandledRejectionHandler } from "./unhandled-rejections.js"; + +describe("installUnhandledRejectionHandler - fatal detection", () => { + let exitCalls: Array<string | number> = []; + let consoleErrorSpy: ReturnType<typeof vi.spyOn>; + let consoleWarnSpy: ReturnType<typeof vi.spyOn>; + let originalExit: typeof process.exit; + + beforeAll(() => { + originalExit = process.exit; + installUnhandledRejectionHandler(); + }); + + beforeEach(() => { + exitCalls = []; + + vi.spyOn(process, "exit").mockImplementation((code: string | number | null | undefined) => { + if (code !== undefined && code !== null) { + exitCalls.push(code); + } + }); + + consoleErrorSpy = vi.spyOn(console, "error").mockImplementation(() => {}); + consoleWarnSpy = vi.spyOn(console, "warn").mockImplementation(() => {}); + }); + + afterEach(() => { + vi.clearAllMocks(); + consoleErrorSpy.mockRestore(); + consoleWarnSpy.mockRestore(); + }); + + afterAll(() => { + process.exit = originalExit; + }); + + describe("fatal errors", () => { + it("exits on ERR_OUT_OF_MEMORY", () => { + const oomErr = Object.assign(new Error("Out of memory"), { + code: "ERR_OUT_OF_MEMORY", + }); + + process.emit("unhandledRejection", oomErr, Promise.resolve()); + + expect(exitCalls).toEqual([1]); + expect(consoleErrorSpy).toHaveBeenCalledWith( + "[moltbot] FATAL unhandled rejection:", + expect.stringContaining("Out of memory"), + ); + }); + + it("exits on ERR_SCRIPT_EXECUTION_TIMEOUT", () => { + const timeoutErr = Object.assign(new Error("Script execution timeout"), { + code: "ERR_SCRIPT_EXECUTION_TIMEOUT", + }); + + process.emit("unhandledRejection", timeoutErr, 
Promise.resolve()); + + expect(exitCalls).toEqual([1]); + }); + + it("exits on ERR_WORKER_OUT_OF_MEMORY", () => { + const workerOomErr = Object.assign(new Error("Worker out of memory"), { + code: "ERR_WORKER_OUT_OF_MEMORY", + }); + + process.emit("unhandledRejection", workerOomErr, Promise.resolve()); + + expect(exitCalls).toEqual([1]); + }); + }); + + describe("configuration errors", () => { + it("exits on INVALID_CONFIG", () => { + const configErr = Object.assign(new Error("Invalid config"), { + code: "INVALID_CONFIG", + }); + + process.emit("unhandledRejection", configErr, Promise.resolve()); + + expect(exitCalls).toEqual([1]); + expect(consoleErrorSpy).toHaveBeenCalledWith( + "[moltbot] CONFIGURATION ERROR - requires fix:", + expect.stringContaining("Invalid config"), + ); + }); + + it("exits on MISSING_API_KEY", () => { + const missingKeyErr = Object.assign(new Error("Missing API key"), { + code: "MISSING_API_KEY", + }); + + process.emit("unhandledRejection", missingKeyErr, Promise.resolve()); + + expect(exitCalls).toEqual([1]); + }); + }); + + describe("non-fatal errors", () => { + it("does NOT exit on undici fetch failures", () => { + const fetchErr = Object.assign(new TypeError("fetch failed"), { + cause: { code: "UND_ERR_CONNECT_TIMEOUT", syscall: "connect" }, + }); + + process.emit("unhandledRejection", fetchErr, Promise.resolve()); + + expect(exitCalls).toEqual([]); + expect(consoleWarnSpy).toHaveBeenCalledWith( + "[moltbot] Non-fatal unhandled rejection (continuing):", + expect.stringContaining("fetch failed"), + ); + }); + + it("does NOT exit on DNS resolution failures", () => { + const dnsErr = Object.assign(new Error("DNS resolve failed"), { + code: "UND_ERR_DNS_RESOLVE_FAILED", + }); + + process.emit("unhandledRejection", dnsErr, Promise.resolve()); + + expect(exitCalls).toEqual([]); + expect(consoleWarnSpy).toHaveBeenCalled(); + }); + + it("does NOT exit on generic errors without code", () => { + const genericErr = new Error("Something went 
wrong"); + + process.emit("unhandledRejection", genericErr, Promise.resolve()); + + expect(exitCalls).toEqual([]); + expect(consoleWarnSpy).toHaveBeenCalled(); + }); + + it("does NOT exit on connection reset errors", () => { + const connResetErr = Object.assign(new Error("Connection reset"), { + code: "ECONNRESET", + }); + + process.emit("unhandledRejection", connResetErr, Promise.resolve()); + + expect(exitCalls).toEqual([]); + expect(consoleWarnSpy).toHaveBeenCalled(); + }); + + it("does NOT exit on timeout errors", () => { + const timeoutErr = Object.assign(new Error("Timeout"), { + code: "ETIMEDOUT", + }); + + process.emit("unhandledRejection", timeoutErr, Promise.resolve()); + + expect(exitCalls).toEqual([]); + expect(consoleWarnSpy).toHaveBeenCalled(); + }); + }); +}); diff --git a/src/infra/unhandled-rejections.ts b/src/infra/unhandled-rejections.ts index 108b6c016..bfaf75548 100644 --- a/src/infra/unhandled-rejections.ts +++ b/src/infra/unhandled-rejections.ts @@ -1,11 +1,56 @@ import process from "node:process"; -import { formatUncaughtError } from "./errors.js"; +import { extractErrorCode, formatUncaughtError } from "./errors.js"; type UnhandledRejectionHandler = (reason: unknown) => boolean; const handlers = new Set(); +const FATAL_ERROR_CODES = new Set([ + "ERR_OUT_OF_MEMORY", + "ERR_SCRIPT_EXECUTION_TIMEOUT", + "ERR_WORKER_OUT_OF_MEMORY", + "ERR_WORKER_UNCAUGHT_EXCEPTION", + "ERR_WORKER_INITIALIZATION_FAILED", +]); + +const CONFIG_ERROR_CODES = new Set([ + "INVALID_CONFIG", + "MISSING_API_KEY", + "MISSING_CREDENTIALS", +]); + +// Network error codes that indicate transient failures (shouldn't crash the gateway) +const TRANSIENT_NETWORK_CODES = new Set([ + "ECONNRESET", + "ECONNREFUSED", + "ENOTFOUND", + "ETIMEDOUT", + "ESOCKETTIMEDOUT", + "ECONNABORTED", + "EPIPE", + "EHOSTUNREACH", + "ENETUNREACH", + "EAI_AGAIN", + "UND_ERR_CONNECT_TIMEOUT", + "UND_ERR_DNS_RESOLVE_FAILED", + "UND_ERR_CONNECT", + "UND_ERR_SOCKET", + "UND_ERR_HEADERS_TIMEOUT", + 
"UND_ERR_BODY_TIMEOUT", +]); + +function getErrorCause(err: unknown): unknown { + if (!err || typeof err !== "object") return undefined; + return (err as { cause?: unknown }).cause; +} + +function extractErrorCodeWithCause(err: unknown): string | undefined { + const direct = extractErrorCode(err); + if (direct) return direct; + return extractErrorCode(getErrorCause(err)); +} + /** * Checks if an error is an AbortError. * These are typically intentional cancellations (e.g., during shutdown) and shouldn't crash. @@ -20,33 +65,14 @@ export function isAbortError(err: unknown): boolean { return false; } -// Network error codes that indicate transient failures (shouldn't crash the gateway) -const TRANSIENT_NETWORK_CODES = new Set([ - "ECONNRESET", - "ECONNREFUSED", - "ENOTFOUND", - "ETIMEDOUT", - "ESOCKETTIMEDOUT", - "ECONNABORTED", - "EPIPE", - "EHOSTUNREACH", - "ENETUNREACH", - "EAI_AGAIN", - "UND_ERR_CONNECT_TIMEOUT", - "UND_ERR_SOCKET", - "UND_ERR_HEADERS_TIMEOUT", - "UND_ERR_BODY_TIMEOUT", -]); - -function getErrorCode(err: unknown): string | undefined { - if (!err || typeof err !== "object") return undefined; - const code = (err as { code?: unknown }).code; - return typeof code === "string" ? 
code : undefined; +function isFatalError(err: unknown): boolean { + const code = extractErrorCodeWithCause(err); + return code !== undefined && FATAL_ERROR_CODES.has(code); } -function getErrorCause(err: unknown): unknown { - if (!err || typeof err !== "object") return undefined; - return (err as { cause?: unknown }).cause; +function isConfigError(err: unknown): boolean { + const code = extractErrorCodeWithCause(err); + return code !== undefined && CONFIG_ERROR_CODES.has(code); } /** @@ -56,16 +82,13 @@ function getErrorCause(err: unknown): unknown { export function isTransientNetworkError(err: unknown): boolean { if (!err) return false; - // Check the error itself - const code = getErrorCode(err); + const code = extractErrorCodeWithCause(err); if (code && TRANSIENT_NETWORK_CODES.has(code)) return true; // "fetch failed" TypeError from undici (Node's native fetch) if (err instanceof TypeError && err.message === "fetch failed") { const cause = getErrorCause(err); - // The cause often contains the actual network error if (cause) return isTransientNetworkError(cause); - // Even without a cause, "fetch failed" is typically a network issue return true; } @@ -115,10 +138,26 @@ export function installUnhandledRejectionHandler(): void { return; } - // Transient network errors (fetch failed, connection reset, etc.) shouldn't crash - // These are temporary connectivity issues that will resolve on their own + if (isFatalError(reason)) { + console.error("[moltbot] FATAL unhandled rejection:", formatUncaughtError(reason)); + process.exit(1); + return; + } + + if (isConfigError(reason)) { + console.error( + "[moltbot] CONFIGURATION ERROR - requires fix:", + formatUncaughtError(reason), + ); + process.exit(1); + return; + } + if (isTransientNetworkError(reason)) { - console.error("[moltbot] Network error (non-fatal):", formatUncaughtError(reason)); + console.warn( + "[moltbot] Non-fatal unhandled rejection (continuing):", + formatUncaughtError(reason), + ); return; }