fix: verify gateway restart health after daemon restart

This commit is contained in:
Peter Steinberger
2026-02-21 18:02:05 +01:00
parent 5e34eb98fb
commit 905e355f65
5 changed files with 408 additions and 141 deletions

View File

@@ -1,3 +1,4 @@
import type { Writable } from "node:stream";
import { loadConfig } from "../../config/config.js";
import { resolveIsNixMode } from "../../config/paths.js";
import { checkTokenDrift } from "../../daemon/service-audit.js";
@@ -18,6 +19,13 @@ type DaemonLifecycleOptions = {
json?: boolean;
};
/**
 * Context object handed to the optional `postRestartCheck` callback of
 * `runServiceRestart` after `service.restart` has returned.
 */
type RestartPostCheckContext = {
/** Whether JSON output was requested (derived from `opts.json`). */
json: boolean;
/** Output stream of the restart action; the same stream is passed to `service.restart`. */
stdout: Writable;
/** Mutable list of warning messages; a post-check may append to it. */
warnings: string[];
/**
 * Reports a failure with a message and optional hint lines.
 * NOTE(review): whether this throws or exits is decided by `createActionIO`, which is not visible here.
 */
fail: (message: string, hints?: string[]) => void;
};
async function maybeAugmentSystemdHints(hints: string[]): Promise<string[]> {
if (process.platform !== "linux") {
return hints;
@@ -240,6 +248,7 @@ export async function runServiceRestart(params: {
renderStartHints: () => string[];
opts?: DaemonLifecycleOptions;
checkTokenDrift?: boolean;
postRestartCheck?: (ctx: RestartPostCheckContext) => Promise<void>;
}): Promise<boolean> {
const json = Boolean(params.opts?.json);
const { stdout, emit, fail } = createActionIO({ action: "restart", json });
@@ -295,6 +304,9 @@ export async function runServiceRestart(params: {
try {
await params.service.restart({ env: process.env, stdout });
if (params.postRestartCheck) {
await params.postRestartCheck({ json, stdout, warnings, fail });
}
let restarted = true;
try {
restarted = await params.service.isLoaded({ env: process.env });

View File

@@ -0,0 +1,131 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
/** Minimal local shape of the health snapshot that the mocked health helpers resolve with. */
type RestartHealthSnapshot = {
healthy: boolean;
staleGatewayPids: number[];
runtime: { status?: string };
// `listeners` and `hints` are typed as empty tuples: the fixtures in this file only ever use [].
portUsage: { port: number; status: string; listeners: []; hints: []; errors?: string[] };
};
/** Local copy of the context shape passed to `postRestartCheck` by the mocked `runServiceRestart`. */
type RestartPostCheckContext = {
json: boolean;
stdout: NodeJS.WritableStream;
warnings: string[];
fail: (message: string, hints?: string[]) => void;
};
/** The subset of `runServiceRestart` parameters that the mock implementation reads. */
type RestartParams = {
opts?: { json?: boolean };
postRestartCheck?: (ctx: RestartPostCheckContext) => Promise<void>;
};
// Stand-in for the gateway service object; returned by the mocked `resolveGatewayService` below.
const service = {
readCommand: vi.fn(),
restart: vi.fn(),
};
// Shared mock handles. The `vi.mock` factories below expose these so each test can
// program return values and inspect calls.
const runServiceRestart = vi.fn();
const waitForGatewayHealthyRestart = vi.fn();
const terminateStaleGatewayPids = vi.fn();
const renderRestartDiagnostics = vi.fn(() => ["diag: unhealthy runtime"]);
const resolveGatewayPort = vi.fn(() => 18789);
const loadConfig = vi.fn(() => ({}));
// Config module: `loadConfig` is wrapped in an arrow function so the mock is looked up at call time.
vi.mock("../../config/config.js", () => ({
loadConfig: () => loadConfig(),
resolveGatewayPort,
}));
// Service module: always resolves to the `service` stand-in defined above.
vi.mock("../../daemon/service.js", () => ({
resolveGatewayService: () => service,
}));
// Health helpers are fully mocked; the tests drive the health outcome through them.
vi.mock("./restart-health.js", () => ({
waitForGatewayHealthyRestart,
terminateStaleGatewayPids,
renderRestartDiagnostics,
}));
// lifecycle-core is mocked so the tests control how `postRestartCheck` gets invoked.
vi.mock("./lifecycle-core.js", () => ({
runServiceRestart,
runServiceStart: vi.fn(),
runServiceStop: vi.fn(),
runServiceUninstall: vi.fn(),
}));
describe("runDaemonRestart health checks", () => {
  /** Builds a health snapshot fixture for the gateway port used throughout this suite. */
  const makeSnapshot = (
    healthy: boolean,
    staleGatewayPids: number[],
    runtimeStatus: string,
    portStatus: string,
  ): RestartHealthSnapshot => ({
    healthy,
    staleGatewayPids,
    runtime: { status: runtimeStatus },
    portUsage: { port: 18789, status: portStatus, listeners: [], hints: [] },
  });

  beforeEach(() => {
    // Fresh module graph per test so the dynamic import of ./lifecycle.js re-binds the mocks.
    vi.resetModules();

    // Mocks whose behaviour is programmed per test are fully reset...
    service.readCommand.mockReset();
    service.restart.mockReset();
    runServiceRestart.mockReset();
    waitForGatewayHealthyRestart.mockReset();
    terminateStaleGatewayPids.mockReset();
    // ...while mocks with a fixed default implementation only drop their call history.
    renderRestartDiagnostics.mockClear();
    resolveGatewayPort.mockClear();
    loadConfig.mockClear();

    service.readCommand.mockResolvedValue({
      programArguments: ["openclaw", "gateway", "--port", "18789"],
      environment: {},
    });

    // Simulate runServiceRestart: run the post-restart check with a `fail` that throws,
    // then report success.
    runServiceRestart.mockImplementation(async (params: RestartParams) => {
      const ctx: RestartPostCheckContext = {
        json: Boolean(params.opts?.json),
        stdout: process.stdout,
        warnings: [],
        fail: (message, hints) => {
          throw Object.assign(new Error(message), { hints });
        },
      };
      await params.postRestartCheck?.(ctx);
      return true;
    });
  });

  it("kills stale gateway pids and retries restart", async () => {
    // First poll: stopped service with a stale listener; second poll: healthy.
    waitForGatewayHealthyRestart
      .mockResolvedValueOnce(makeSnapshot(false, [1993], "stopped", "busy"))
      .mockResolvedValueOnce(makeSnapshot(true, [], "running", "busy"));
    terminateStaleGatewayPids.mockResolvedValue([1993]);

    const lifecycle = await import("./lifecycle.js");
    const outcome = await lifecycle.runDaemonRestart({ json: true });

    expect(outcome).toBe(true);
    expect(terminateStaleGatewayPids).toHaveBeenCalledWith([1993]);
    expect(service.restart).toHaveBeenCalledTimes(1);
    expect(waitForGatewayHealthyRestart).toHaveBeenCalledTimes(2);
  });

  it("fails restart when gateway remains unhealthy", async () => {
    // No stale pids to clean up, so the check goes straight to diagnostics and failure.
    waitForGatewayHealthyRestart.mockResolvedValue(makeSnapshot(false, [], "stopped", "free"));

    const lifecycle = await import("./lifecycle.js");
    const pending = lifecycle.runDaemonRestart({ json: true });

    await expect(pending).rejects.toMatchObject({
      message: "Gateway restart failed health checks.",
    });
    expect(terminateStaleGatewayPids).not.toHaveBeenCalled();
    expect(renderRestartDiagnostics).toHaveBeenCalledTimes(1);
  });
});

View File

@@ -1,13 +1,38 @@
import { loadConfig, resolveGatewayPort } from "../../config/config.js";
import { resolveGatewayService } from "../../daemon/service.js";
import { defaultRuntime } from "../../runtime.js";
import { theme } from "../../terminal/theme.js";
import { formatCliCommand } from "../command-format.js";
import {
runServiceRestart,
runServiceStart,
runServiceStop,
runServiceUninstall,
} from "./lifecycle-core.js";
import { renderGatewayServiceStartHints } from "./shared.js";
import {
renderRestartDiagnostics,
terminateStaleGatewayPids,
waitForGatewayHealthyRestart,
} from "./restart-health.js";
import { parsePortFromArgs, renderGatewayServiceStartHints } from "./shared.js";
import type { DaemonLifecycleOptions } from "./types.js";
// Passed as `attempts` to waitForGatewayHealthyRestart for the post-restart health check.
const POST_RESTART_HEALTH_ATTEMPTS = 8;
// Passed as `delayMs` (milliseconds) to waitForGatewayHealthyRestart for the post-restart health check.
const POST_RESTART_HEALTH_DELAY_MS = 450;
/**
 * Works out which port the gateway should be health-checked on after a restart.
 *
 * The port parsed from the service command's program arguments takes precedence.
 * Otherwise the port is resolved from the loaded config, using the current process
 * environment overlaid with the environment recorded on the service command.
 * A failure to read the service command is tolerated and treated as "no command".
 */
async function resolveGatewayRestartPort() {
  const installed = await resolveGatewayService()
    .readCommand(process.env)
    .catch(() => null);

  const explicitPort = parsePortFromArgs(installed?.programArguments);
  if (explicitPort != null) {
    return explicitPort;
  }

  // Service-level variables override the ones inherited from this process.
  const effectiveEnv = {
    ...(process.env as Record<string, string | undefined>),
    ...(installed?.environment ?? undefined),
  } as NodeJS.ProcessEnv;
  return resolveGatewayPort(loadConfig(), effectiveEnv);
}
export async function runDaemonUninstall(opts: DaemonLifecycleOptions = {}) {
return await runServiceUninstall({
serviceNoun: "Gateway",
@@ -41,11 +66,62 @@ export async function runDaemonStop(opts: DaemonLifecycleOptions = {}) {
* Throws/exits on check or restart failures.
*/
export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promise<boolean> {
const json = Boolean(opts.json);
const service = resolveGatewayService();
// Resolve the port to health-check before restarting; if that lookup rejects, fall back
// to the port resolved from config and the current process environment only.
const restartPort = await resolveGatewayRestartPort().catch(() =>
resolveGatewayPort(loadConfig(), process.env),
);
return await runServiceRestart({
serviceNoun: "Gateway",
// NOTE(review): `service` is given twice (the next two lines). This looks like the
// pre-change line shown next to its replacement in a diff rendering — confirm that only
// the shorthand `service,` exists in the real file.
service: resolveGatewayService(),
service,
renderStartHints: renderGatewayServiceStartHints,
opts,
checkTokenDrift: true,
// Invoked by runServiceRestart after the service restart call has returned.
postRestartCheck: async ({ warnings, fail, stdout }) => {
// First pass: poll until the gateway is healthy, polling gives up, or stale pids are reported.
let health = await waitForGatewayHealthyRestart({
service,
port: restartPort,
attempts: POST_RESTART_HEALTH_ATTEMPTS,
delayMs: POST_RESTART_HEALTH_DELAY_MS,
});
// Recovery path: stale gateway processes were reported, so terminate them, restart the
// service once more and poll again.
if (!health.healthy && health.staleGatewayPids.length > 0) {
const staleMsg = `Found stale gateway process(es): ${health.staleGatewayPids.join(", ")}.`;
warnings.push(staleMsg);
// Human-readable progress is only logged outside JSON mode.
if (!json) {
defaultRuntime.log(theme.warn(staleMsg));
defaultRuntime.log(theme.muted("Stopping stale process(es) and retrying restart..."));
}
await terminateStaleGatewayPids(health.staleGatewayPids);
await service.restart({ env: process.env, stdout });
health = await waitForGatewayHealthyRestart({
service,
port: restartPort,
attempts: POST_RESTART_HEALTH_ATTEMPTS,
delayMs: POST_RESTART_HEALTH_DELAY_MS,
});
}
if (health.healthy) {
return;
}
// Still unhealthy: surface diagnostics (logged in text mode, appended to warnings in
// JSON mode) and then report the failure with follow-up command hints.
const diagnostics = renderRestartDiagnostics(health);
if (!json) {
defaultRuntime.log(theme.warn("Gateway did not become healthy after restart."));
for (const line of diagnostics) {
defaultRuntime.log(theme.muted(line));
}
} else {
warnings.push(...diagnostics);
}
fail("Gateway restart failed health checks.", [
formatCliCommand("openclaw gateway status --probe --deep"),
formatCliCommand("openclaw doctor"),
]);
},
});
}

View File

@@ -0,0 +1,172 @@
import type { GatewayServiceRuntime } from "../../daemon/service-runtime.js";
import type { GatewayService } from "../../daemon/service.js";
import {
classifyPortListener,
formatPortDiagnostics,
inspectPortUsage,
type PortUsage,
} from "../../infra/ports.js";
import { sleep } from "../../utils.js";
/** Number of polling attempts waitForGatewayHealthyRestart uses when `attempts` is omitted. */
export const DEFAULT_RESTART_HEALTH_ATTEMPTS = 8;
/** Delay in milliseconds between polls in waitForGatewayHealthyRestart when `delayMs` is omitted. */
export const DEFAULT_RESTART_HEALTH_DELAY_MS = 450;
/** Result of one post-restart inspection, as produced by inspectGatewayRestart. */
export type GatewayRestartSnapshot = {
/** Runtime reported by `service.readRuntime`, or a `status: "unknown"` placeholder if that call threw. */
runtime: GatewayServiceRuntime;
/** Usage reported by `inspectPortUsage`, or a `status: "unknown"` placeholder carrying the error text. */
portUsage: PortUsage;
/** True when the runtime status is "running" and the port is judged to belong to the service. */
healthy: boolean;
/** De-duplicated pids of gateway-classified port listeners that are not the running service's own pid. */
staleGatewayPids: number[];
};
/**
 * Takes a single snapshot of the gateway's state after a restart.
 *
 * Reads the service runtime and the usage of the gateway port (both best-effort: a
 * failure is recorded as status "unknown" instead of propagating), then derives:
 * - `healthy`: the service reports "running" and appears to own the port;
 * - `staleGatewayPids`: gateway-classified listeners on the port that are not the
 *   running service process.
 *
 * @param params.service Service whose runtime is inspected.
 * @param params.port Gateway port to inspect.
 * @param params.env Environment passed to `readRuntime`; defaults to `process.env`.
 * @returns The snapshot of runtime, port usage and derived health information.
 */
export async function inspectGatewayRestart(params: {
  service: GatewayService;
  port: number;
  env?: NodeJS.ProcessEnv;
}): Promise<GatewayRestartSnapshot> {
  const { service, port } = params;
  const env = params.env ?? process.env;

  const readRuntimeSafely = async (): Promise<GatewayServiceRuntime> => {
    try {
      return await service.readRuntime(env);
    } catch (err) {
      return { status: "unknown", detail: String(err) };
    }
  };
  const readPortUsageSafely = async (): Promise<PortUsage> => {
    try {
      return await inspectPortUsage(port);
    } catch (err) {
      return { port, status: "unknown", listeners: [], hints: [], errors: [String(err)] };
    }
  };

  // Runtime first, then the port — kept sequential on purpose.
  const runtime = await readRuntimeSafely();
  const portUsage = await readPortUsageSafely();

  const isRunning = runtime.status === "running";
  const runtimePid = runtime.pid;
  const portBusy = portUsage.status === "busy";
  const gatewayListeners = portBusy
    ? portUsage.listeners.filter((entry) => classifyPortListener(entry, port) === "gateway")
    : [];

  let ownsPort: boolean;
  if (runtimePid != null) {
    // Known service pid: it must itself appear among the listeners.
    ownsPort = portUsage.listeners.some((entry) => entry.pid === runtimePid);
  } else {
    // Unknown pid: accept a gateway-looking listener, or a busy port with no listener details.
    ownsPort = gatewayListeners.length > 0 || (portBusy && portUsage.listeners.length === 0);
  }

  // A Set de-duplicates while keeping first-seen order.
  const stalePids = new Set<number>();
  for (const { pid } of gatewayListeners) {
    if (typeof pid !== "number" || !Number.isFinite(pid)) {
      continue;
    }
    // The running service's own listener is the only one that is not stale.
    const isLiveServiceProcess = isRunning && runtimePid != null && pid === runtimePid;
    if (!isLiveServiceProcess) {
      stalePids.add(pid);
    }
  }

  return {
    runtime,
    portUsage,
    healthy: isRunning && ownsPort,
    staleGatewayPids: Array.from(stalePids),
  };
}
/**
 * Polls the gateway until it is healthy, until waiting is pointless, or until the
 * attempt budget is used up.
 *
 * Polling stops early when the snapshot is healthy, or when stale gateway pids are
 * present while the service itself is not running. One inspection is made up front
 * and one more after each delay, so at most `attempts + 1` inspections happen.
 *
 * @param params.attempts Maximum number of delay-and-reinspect rounds
 *   (default `DEFAULT_RESTART_HEALTH_ATTEMPTS`).
 * @param params.delayMs Pause between inspections in milliseconds
 *   (default `DEFAULT_RESTART_HEALTH_DELAY_MS`).
 * @returns The last snapshot taken, healthy or not.
 */
export async function waitForGatewayHealthyRestart(params: {
  service: GatewayService;
  port: number;
  attempts?: number;
  delayMs?: number;
  env?: NodeJS.ProcessEnv;
}): Promise<GatewayRestartSnapshot> {
  const maxRounds = params.attempts ?? DEFAULT_RESTART_HEALTH_ATTEMPTS;
  const pauseMs = params.delayMs ?? DEFAULT_RESTART_HEALTH_DELAY_MS;
  const probe = () =>
    inspectGatewayRestart({
      service: params.service,
      port: params.port,
      env: params.env,
    });

  let latest = await probe();
  for (let round = 0; round < maxRounds; round += 1) {
    // Stale listeners with a non-running service will not resolve by waiting.
    const blockedByStaleProcess =
      latest.staleGatewayPids.length > 0 && latest.runtime.status !== "running";
    if (latest.healthy || blockedByStaleProcess) {
      break;
    }
    await sleep(pauseMs);
    latest = await probe();
  }
  return latest;
}
/**
 * Turns a restart snapshot into human-readable diagnostic lines.
 *
 * Output order: an optional "Service runtime" summary, then either the detailed port
 * diagnostics (busy port) or a one-line port status, then any port inspection errors.
 *
 * @param snapshot Snapshot to describe.
 * @returns The diagnostic lines; at least the port line is always present.
 */
export function renderRestartDiagnostics(snapshot: GatewayRestartSnapshot): string[] {
  const { runtime, portUsage } = snapshot;

  // Collect only the runtime facts that are actually known.
  const runtimeFacts: string[] = [];
  if (runtime.status) {
    runtimeFacts.push(`status=${runtime.status}`);
  }
  if (runtime.state) {
    runtimeFacts.push(`state=${runtime.state}`);
  }
  if (runtime.pid != null) {
    runtimeFacts.push(`pid=${runtime.pid}`);
  }
  if (runtime.lastExitStatus != null) {
    runtimeFacts.push(`lastExit=${runtime.lastExitStatus}`);
  }

  const output: string[] = [];
  if (runtimeFacts.length > 0) {
    output.push(`Service runtime: ${runtimeFacts.join(", ")}`);
  }

  if (portUsage.status === "busy") {
    output.push(...formatPortDiagnostics(portUsage));
  } else {
    output.push(`Gateway port ${portUsage.port} status: ${portUsage.status}.`);
  }

  if (portUsage.errors?.length) {
    output.push(`Port diagnostics errors: ${portUsage.errors.join("; ")}`);
  }
  return output;
}
/**
 * Stops stale gateway processes: sends SIGTERM to each pid, waits briefly, then sends
 * SIGKILL to any process that is still alive.
 *
 * Only de-duplicated, positive integer pids other than this process's own pid are
 * signalled. Pid 0 and negative pids address whole process groups rather than a single
 * process, and signalling `process.pid` would terminate the caller, so such values are
 * skipped instead of being passed to `process.kill`.
 *
 * @param pids Candidate process ids to terminate.
 * @returns The pids that were successfully sent SIGTERM (processes that had already
 *   exited are not included).
 * @throws Any `process.kill` error other than ESRCH (for example EPERM).
 */
export async function terminateStaleGatewayPids(pids: number[]): Promise<number[]> {
  const targets = [...new Set(pids)].filter(
    (pid) => Number.isInteger(pid) && pid > 0 && pid !== process.pid,
  );

  const killed: number[] = [];
  for (const pid of targets) {
    try {
      process.kill(pid, "SIGTERM");
      killed.push(pid);
    } catch (err) {
      // ESRCH: the process is already gone, which is the desired end state.
      const code = (err as NodeJS.ErrnoException)?.code;
      if (code !== "ESRCH") {
        throw err;
      }
    }
  }
  if (killed.length === 0) {
    return killed;
  }

  // Give the processes a moment to shut down gracefully before escalating.
  await sleep(400);

  for (const pid of killed) {
    try {
      // Signal 0 only probes for existence; it throws ESRCH when the process has exited,
      // which skips the SIGKILL below.
      process.kill(pid, 0);
      process.kill(pid, "SIGKILL");
    } catch (err) {
      const code = (err as NodeJS.ErrnoException)?.code;
      if (code !== "ESRCH") {
        throw err;
      }
    }
  }
  return killed;
}