fix(daemon): extend restart health timeout and improve restart errors

This commit is contained in:
Peter Steinberger
2026-02-23 01:49:54 +01:00
parent 278331c49c
commit 80f430c2be
3 changed files with 28 additions and 7 deletions

View File

@@ -41,6 +41,8 @@ vi.mock("../../daemon/service.js", () => ({
}));
vi.mock("./restart-health.js", () => ({
DEFAULT_RESTART_HEALTH_ATTEMPTS: 120,
DEFAULT_RESTART_HEALTH_DELAY_MS: 500,
waitForGatewayHealthyRestart,
terminateStaleGatewayPids,
renderRestartDiagnostics,
@@ -123,7 +125,7 @@ describe("runDaemonRestart health checks", () => {
const { runDaemonRestart } = await import("./lifecycle.js");
await expect(runDaemonRestart({ json: true })).rejects.toMatchObject({
message: "Gateway restart failed health checks.",
message: "Gateway restart timed out after 60s waiting for health checks.",
});
expect(terminateStaleGatewayPids).not.toHaveBeenCalled();
expect(renderRestartDiagnostics).toHaveBeenCalledTimes(1);

View File

@@ -10,6 +10,8 @@ import {
runServiceUninstall,
} from "./lifecycle-core.js";
import {
DEFAULT_RESTART_HEALTH_ATTEMPTS,
DEFAULT_RESTART_HEALTH_DELAY_MS,
renderRestartDiagnostics,
terminateStaleGatewayPids,
waitForGatewayHealthyRestart,
@@ -17,8 +19,8 @@ import {
import { parsePortFromArgs, renderGatewayServiceStartHints } from "./shared.js";
import type { DaemonLifecycleOptions } from "./types.js";
const POST_RESTART_HEALTH_ATTEMPTS = 8;
const POST_RESTART_HEALTH_DELAY_MS = 450;
const POST_RESTART_HEALTH_ATTEMPTS = DEFAULT_RESTART_HEALTH_ATTEMPTS;
const POST_RESTART_HEALTH_DELAY_MS = DEFAULT_RESTART_HEALTH_DELAY_MS;
async function resolveGatewayRestartPort() {
const service = resolveGatewayService();
@@ -71,6 +73,8 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
const restartPort = await resolveGatewayRestartPort().catch(() =>
resolveGatewayPort(loadConfig(), process.env),
);
const restartWaitMs = POST_RESTART_HEALTH_ATTEMPTS * POST_RESTART_HEALTH_DELAY_MS;
const restartWaitSeconds = Math.round(restartWaitMs / 1000);
return await runServiceRestart({
serviceNoun: "Gateway",
@@ -109,16 +113,28 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
}
const diagnostics = renderRestartDiagnostics(health);
const timeoutLine = `Timed out after ${restartWaitSeconds}s waiting for gateway port ${restartPort} to become healthy.`;
const runningNoPortLine =
health.runtime.status === "running" && health.portUsage.status === "free"
? `Gateway process is running but port ${restartPort} is still free (startup hang/crash loop or very slow VM startup).`
: null;
if (!json) {
defaultRuntime.log(theme.warn("Gateway did not become healthy after restart."));
defaultRuntime.log(theme.warn(timeoutLine));
if (runningNoPortLine) {
defaultRuntime.log(theme.warn(runningNoPortLine));
}
for (const line of diagnostics) {
defaultRuntime.log(theme.muted(line));
}
} else {
warnings.push(timeoutLine);
if (runningNoPortLine) {
warnings.push(runningNoPortLine);
}
warnings.push(...diagnostics);
}
fail("Gateway restart failed health checks.", [
fail(`Gateway restart timed out after ${restartWaitSeconds}s waiting for health checks.`, [
formatCliCommand("openclaw gateway status --probe --deep"),
formatCliCommand("openclaw doctor"),
]);

View File

@@ -8,8 +8,11 @@ import {
} from "../../infra/ports.js";
import { sleep } from "../../utils.js";
export const DEFAULT_RESTART_HEALTH_ATTEMPTS = 8;
export const DEFAULT_RESTART_HEALTH_DELAY_MS = 450;
export const DEFAULT_RESTART_HEALTH_TIMEOUT_MS = 60_000;
export const DEFAULT_RESTART_HEALTH_DELAY_MS = 500;
export const DEFAULT_RESTART_HEALTH_ATTEMPTS = Math.ceil(
DEFAULT_RESTART_HEALTH_TIMEOUT_MS / DEFAULT_RESTART_HEALTH_DELAY_MS,
);
export type GatewayRestartSnapshot = {
runtime: GatewayServiceRuntime;