fix: verify gateway restart health after daemon restart

This commit is contained in:
Peter Steinberger
2026-02-21 18:02:05 +01:00
parent 5e34eb98fb
commit 905e355f65
5 changed files with 408 additions and 141 deletions

View File

@@ -1,13 +1,38 @@
import { loadConfig, resolveGatewayPort } from "../../config/config.js";
import { resolveGatewayService } from "../../daemon/service.js";
import { defaultRuntime } from "../../runtime.js";
import { theme } from "../../terminal/theme.js";
import { formatCliCommand } from "../command-format.js";
import {
runServiceRestart,
runServiceStart,
runServiceStop,
runServiceUninstall,
} from "./lifecycle-core.js";
import { renderGatewayServiceStartHints } from "./shared.js";
import {
renderRestartDiagnostics,
terminateStaleGatewayPids,
waitForGatewayHealthyRestart,
} from "./restart-health.js";
import { parsePortFromArgs, renderGatewayServiceStartHints } from "./shared.js";
import type { DaemonLifecycleOptions } from "./types.js";
// Passed as `attempts` to waitForGatewayHealthyRestart when verifying the gateway after a restart.
const POST_RESTART_HEALTH_ATTEMPTS = 8;
// Passed as `delayMs` to waitForGatewayHealthyRestart.
// NOTE(review): presumably the pause between attempts in milliseconds — confirm in restart-health.js.
const POST_RESTART_HEALTH_DELAY_MS = 450;
/**
 * Works out which port the gateway service should be probed on after a restart.
 *
 * A port parsed from the installed service's program arguments wins. Otherwise
 * the port is resolved from the loaded config, using the current process
 * environment overlaid with the environment recorded in the service command.
 * A rejected `readCommand` is treated as "no installed command".
 */
async function resolveGatewayRestartPort() {
  const installedCommand = await resolveGatewayService()
    .readCommand(process.env)
    .catch(() => null);

  const explicitPort = parsePortFromArgs(installedCommand?.programArguments);
  if (explicitPort != null) {
    return explicitPort;
  }

  // Service-level variables take precedence over the caller's environment.
  const effectiveEnv = {
    ...(process.env as Record<string, string | undefined>),
    ...installedCommand?.environment,
  } as NodeJS.ProcessEnv;
  return resolveGatewayPort(loadConfig(), effectiveEnv);
}
export async function runDaemonUninstall(opts: DaemonLifecycleOptions = {}) {
return await runServiceUninstall({
serviceNoun: "Gateway",
@@ -41,11 +66,62 @@ export async function runDaemonStop(opts: DaemonLifecycleOptions = {}) {
* Throws/exits on check or restart failures.
*/
export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promise<boolean> {
  const json = Boolean(opts.json);
  // Resolve the service once so the restart and every health probe share the same handle.
  const service = resolveGatewayService();
  // Prefer the port derived from the installed service command; if that lookup
  // rejects, fall back to the config resolved against the current process env.
  const restartPort = await resolveGatewayRestartPort().catch(() =>
    resolveGatewayPort(loadConfig(), process.env),
  );

  // Single definition of the post-restart probe so the first check and the
  // retry after stale-process cleanup cannot drift apart.
  const waitForHealthy = () =>
    waitForGatewayHealthyRestart({
      service,
      port: restartPort,
      attempts: POST_RESTART_HEALTH_ATTEMPTS,
      delayMs: POST_RESTART_HEALTH_DELAY_MS,
    });

  return await runServiceRestart({
    serviceNoun: "Gateway",
    service,
    renderStartHints: renderGatewayServiceStartHints,
    opts,
    checkTokenDrift: true,
    postRestartCheck: async ({ warnings, fail, stdout }) => {
      let health = await waitForHealthy();

      // One recovery attempt: if the probe reported stale gateway PIDs,
      // terminate them, restart the service again, and probe once more.
      if (!health.healthy && health.staleGatewayPids.length > 0) {
        const staleMsg = `Found stale gateway process(es): ${health.staleGatewayPids.join(", ")}.`;
        warnings.push(staleMsg);
        if (!json) {
          defaultRuntime.log(theme.warn(staleMsg));
          defaultRuntime.log(theme.muted("Stopping stale process(es) and retrying restart..."));
        }
        await terminateStaleGatewayPids(health.staleGatewayPids);
        await service.restart({ env: process.env, stdout });
        health = await waitForHealthy();
      }

      if (health.healthy) {
        return;
      }

      // Still unhealthy: print diagnostics in human mode, or attach them to
      // the warnings list in JSON mode, then report the failure with hints.
      const diagnostics = renderRestartDiagnostics(health);
      if (!json) {
        defaultRuntime.log(theme.warn("Gateway did not become healthy after restart."));
        for (const line of diagnostics) {
          defaultRuntime.log(theme.muted(line));
        }
      } else {
        warnings.push(...diagnostics);
      }
      fail("Gateway restart failed health checks.", [
        formatCliCommand("openclaw gateway status --probe --deep"),
        formatCliCommand("openclaw doctor"),
      ]);
    },
  });
}