From 80f430c2be2e91c6c30dbb76ff142a72501bd627 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 23 Feb 2026 01:49:54 +0100 Subject: [PATCH] fix(daemon): extend restart health timeout and improve restart errors --- src/cli/daemon-cli/lifecycle.test.ts | 4 +++- src/cli/daemon-cli/lifecycle.ts | 24 ++++++++++++++++++++---- src/cli/daemon-cli/restart-health.ts | 7 +++++-- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/src/cli/daemon-cli/lifecycle.test.ts b/src/cli/daemon-cli/lifecycle.test.ts index 022bf2db7..741473f69 100644 --- a/src/cli/daemon-cli/lifecycle.test.ts +++ b/src/cli/daemon-cli/lifecycle.test.ts @@ -41,6 +41,8 @@ vi.mock("../../daemon/service.js", () => ({ })); vi.mock("./restart-health.js", () => ({ + DEFAULT_RESTART_HEALTH_ATTEMPTS: 120, + DEFAULT_RESTART_HEALTH_DELAY_MS: 500, waitForGatewayHealthyRestart, terminateStaleGatewayPids, renderRestartDiagnostics, @@ -123,7 +125,7 @@ describe("runDaemonRestart health checks", () => { const { runDaemonRestart } = await import("./lifecycle.js"); await expect(runDaemonRestart({ json: true })).rejects.toMatchObject({ - message: "Gateway restart failed health checks.", + message: "Gateway restart timed out after 60s waiting for health checks.", }); expect(terminateStaleGatewayPids).not.toHaveBeenCalled(); expect(renderRestartDiagnostics).toHaveBeenCalledTimes(1); diff --git a/src/cli/daemon-cli/lifecycle.ts b/src/cli/daemon-cli/lifecycle.ts index e7749e9b2..413320289 100644 --- a/src/cli/daemon-cli/lifecycle.ts +++ b/src/cli/daemon-cli/lifecycle.ts @@ -10,6 +10,8 @@ import { runServiceUninstall, } from "./lifecycle-core.js"; import { + DEFAULT_RESTART_HEALTH_ATTEMPTS, + DEFAULT_RESTART_HEALTH_DELAY_MS, renderRestartDiagnostics, terminateStaleGatewayPids, waitForGatewayHealthyRestart, @@ -17,8 +19,8 @@ import { import { parsePortFromArgs, renderGatewayServiceStartHints } from "./shared.js"; import type { DaemonLifecycleOptions } from "./types.js"; -const POST_RESTART_HEALTH_ATTEMPTS = 8; -const POST_RESTART_HEALTH_DELAY_MS = 450; +const POST_RESTART_HEALTH_ATTEMPTS = DEFAULT_RESTART_HEALTH_ATTEMPTS; +const POST_RESTART_HEALTH_DELAY_MS = DEFAULT_RESTART_HEALTH_DELAY_MS; async function resolveGatewayRestartPort() { const service = resolveGatewayService(); @@ -71,6 +73,8 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi const restartPort = await resolveGatewayRestartPort().catch(() => resolveGatewayPort(loadConfig(), process.env), ); + const restartWaitMs = POST_RESTART_HEALTH_ATTEMPTS * POST_RESTART_HEALTH_DELAY_MS; + const restartWaitSeconds = Math.round(restartWaitMs / 1000); return await runServiceRestart({ serviceNoun: "Gateway", @@ -109,16 +113,28 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi } const diagnostics = renderRestartDiagnostics(health); + const timeoutLine = `Timed out after ${restartWaitSeconds}s waiting for gateway port ${restartPort} to become healthy.`; + const runningNoPortLine = + health.runtime.status === "running" && health.portUsage.status === "free" + ? `Gateway process is running but port ${restartPort} is still free (startup hang/crash loop or very slow VM startup).` + : null; if (!json) { - defaultRuntime.log(theme.warn("Gateway did not become healthy after restart.")); + defaultRuntime.log(theme.warn(timeoutLine)); + if (runningNoPortLine) { + defaultRuntime.log(theme.warn(runningNoPortLine)); + } for (const line of diagnostics) { defaultRuntime.log(theme.muted(line)); } } else { + warnings.push(timeoutLine); + if (runningNoPortLine) { + warnings.push(runningNoPortLine); + } warnings.push(...diagnostics); } - fail("Gateway restart failed health checks.", [ + fail(`Gateway restart timed out after ${restartWaitSeconds}s waiting for health checks.`, [ formatCliCommand("openclaw gateway status --probe --deep"), formatCliCommand("openclaw doctor"), ]); diff --git a/src/cli/daemon-cli/restart-health.ts b/src/cli/daemon-cli/restart-health.ts index b87e58646..4a0d5bcf4 100644 --- a/src/cli/daemon-cli/restart-health.ts +++ b/src/cli/daemon-cli/restart-health.ts @@ -8,8 +8,11 @@ import { } from "../../infra/ports.js"; import { sleep } from "../../utils.js"; -export const DEFAULT_RESTART_HEALTH_ATTEMPTS = 8; -export const DEFAULT_RESTART_HEALTH_DELAY_MS = 450; +export const DEFAULT_RESTART_HEALTH_TIMEOUT_MS = 60_000; +export const DEFAULT_RESTART_HEALTH_DELAY_MS = 500; +export const DEFAULT_RESTART_HEALTH_ATTEMPTS = Math.ceil( + DEFAULT_RESTART_HEALTH_TIMEOUT_MS / DEFAULT_RESTART_HEALTH_DELAY_MS, +); export type GatewayRestartSnapshot = { runtime: GatewayServiceRuntime;