From e93ba6ce2af3c7cecf36e3fe347a394b21bafcb1 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 21 Feb 2026 17:40:17 +0100 Subject: [PATCH] fix: harden update restart service convergence --- src/cli/update-cli.test.ts | 60 +++++++ src/cli/update-cli/update-command.ts | 232 +++++++++++++++++++++++++-- 2 files changed, 283 insertions(+), 9 deletions(-) diff --git a/src/cli/update-cli.test.ts b/src/cli/update-cli.test.ts index 330d5d292..85a3dac2d 100644 --- a/src/cli/update-cli.test.ts +++ b/src/cli/update-cli.test.ts @@ -1,3 +1,4 @@ +import fs from "node:fs/promises"; import path from "node:path"; import { beforeEach, describe, expect, it, vi } from "vitest"; import type { OpenClawConfig, ConfigFileSnapshot } from "../config/types.openclaw.js"; @@ -16,6 +17,10 @@ const serviceLoaded = vi.fn(); const prepareRestartScript = vi.fn(); const runRestartScript = vi.fn(); const mockedRunDaemonInstall = vi.fn(); +const serviceReadRuntime = vi.fn(); +const inspectPortUsage = vi.fn(); +const classifyPortListener = vi.fn(); +const formatPortDiagnostics = vi.fn(); vi.mock("@clack/prompts", () => ({ confirm, @@ -35,6 +40,7 @@ vi.mock("../infra/openclaw-root.js", () => ({ vi.mock("../config/config.js", () => ({ readConfigFileSnapshot: vi.fn(), + resolveGatewayPort: vi.fn(() => 18789), writeConfigFile: vi.fn(), })); @@ -80,9 +86,16 @@ vi.mock("./update-cli/shared.js", async (importOriginal) => { vi.mock("../daemon/service.js", () => ({ resolveGatewayService: vi.fn(() => ({ isLoaded: (...args: unknown[]) => serviceLoaded(...args), + readRuntime: (...args: unknown[]) => serviceReadRuntime(...args), })), })); +vi.mock("../infra/ports.js", () => ({ + inspectPortUsage: (...args: unknown[]) => inspectPortUsage(...args), + classifyPortListener: (...args: unknown[]) => classifyPortListener(...args), + formatPortDiagnostics: (...args: unknown[]) => formatPortDiagnostics(...args), +})); + vi.mock("./update-cli/restart-helper.js", () => ({ prepareRestartScript: (...args: unknown[]) => prepareRestartScript(...args), runRestartScript: (...args: unknown[]) => runRestartScript(...args), @@ -230,8 +243,12 @@ describe("update-cli", () => { readPackageVersion.mockReset(); resolveGlobalManager.mockReset(); serviceLoaded.mockReset(); + serviceReadRuntime.mockReset(); prepareRestartScript.mockReset(); runRestartScript.mockReset(); + inspectPortUsage.mockReset(); + classifyPortListener.mockReset(); + formatPortDiagnostics.mockReset(); vi.mocked(resolveOpenClawPackageRoot).mockResolvedValue(process.cwd()); vi.mocked(readConfigFileSnapshot).mockResolvedValue(baseSnapshot); vi.mocked(fetchNpmTagVersion).mockResolvedValue({ @@ -279,8 +296,21 @@ describe("update-cli", () => { readPackageVersion.mockResolvedValue("1.0.0"); resolveGlobalManager.mockResolvedValue("npm"); serviceLoaded.mockResolvedValue(false); + serviceReadRuntime.mockResolvedValue({ + status: "running", + pid: 4242, + state: "running", + }); prepareRestartScript.mockResolvedValue("/tmp/openclaw-restart-test.sh"); runRestartScript.mockResolvedValue(undefined); + inspectPortUsage.mockResolvedValue({ + port: 18789, + status: "busy", + listeners: [{ pid: 4242, command: "openclaw-gateway" }], + hints: [], + }); + classifyPortListener.mockReturnValue("gateway"); + formatPortDiagnostics.mockReturnValue(["Port 18789 is already in use."]); vi.mocked(runDaemonInstall).mockResolvedValue(undefined); setTty(false); setStdoutTty(false); @@ -486,6 +516,36 @@ describe("update-cli", () => { expect(runDaemonRestart).not.toHaveBeenCalled(); }); + it("updateCommand refreshes service env from updated install root when available", async () => { + const root = createCaseDir("openclaw-updated-root"); + await fs.mkdir(path.join(root, "dist"), { recursive: true }); + await fs.writeFile(path.join(root, "dist", "entry.js"), "console.log('ok');\n", "utf8"); + + vi.mocked(runGatewayUpdate).mockResolvedValue({ + status: "ok", + mode: "npm", + root, + steps: [], + durationMs: 100, + }); + serviceLoaded.mockResolvedValue(true); + + await updateCommand({}); + + expect(runCommandWithTimeout).toHaveBeenCalledWith( + [ + expect.stringMatching(/node/), + path.join(root, "dist", "entry.js"), + "gateway", + "install", + "--force", + ], + expect.objectContaining({ timeoutMs: 60_000 }), + ); + expect(runDaemonInstall).not.toHaveBeenCalled(); + expect(runRestartScript).toHaveBeenCalled(); + }); + it("updateCommand falls back to restart when env refresh install fails", async () => { const mockResult: UpdateRunResult = { status: "ok", diff --git a/src/cli/update-cli/update-command.ts b/src/cli/update-cli/update-command.ts index 469b32b45..4a20a7c75 100644 --- a/src/cli/update-cli/update-command.ts +++ b/src/cli/update-cli/update-command.ts @@ -5,8 +5,19 @@ import { ensureCompletionCacheExists, } from "../../commands/doctor-completion.js"; import { doctorCommand } from "../../commands/doctor.js"; -import { readConfigFileSnapshot, writeConfigFile } from "../../config/config.js"; +import { + readConfigFileSnapshot, + resolveGatewayPort, + writeConfigFile, +} from "../../config/config.js"; +import type { GatewayServiceRuntime } from "../../daemon/service-runtime.js"; import { resolveGatewayService } from "../../daemon/service.js"; +import { + classifyPortListener, + formatPortDiagnostics, + inspectPortUsage, + type PortUsage, +} from "../../infra/ports.js"; import { channelToNpmTag, DEFAULT_GIT_CHANNEL, @@ -29,7 +40,7 @@ import { runCommandWithTimeout } from "../../process/exec.js"; import { defaultRuntime } from "../../runtime.js"; import { stylePromptMessage } from "../../terminal/prompt-style.js"; import { theme } from "../../terminal/theme.js"; -import { pathExists } from "../../utils.js"; +import { pathExists, sleep } from "../../utils.js"; import { replaceCliName, resolveCliName } from "../cli-name.js"; import { formatCliCommand } from "../command-format.js"; import { installCompletion } from "../completion-cli.js"; @@ -55,6 +66,9 @@ import { import { suppressDeprecations } from "./suppress-deprecations.js"; const CLI_NAME = resolveCliName(); +const SERVICE_REFRESH_TIMEOUT_MS = 60_000; +const POST_RESTART_HEALTH_ATTEMPTS = 8; +const POST_RESTART_HEALTH_DELAY_MS = 450; const UPDATE_QUIPS = [ "Leveled up! New skills unlocked. You're welcome.", @@ -83,6 +97,180 @@ function pickUpdateQuip(): string { return UPDATE_QUIPS[Math.floor(Math.random() * UPDATE_QUIPS.length)] ?? "Update complete."; } +type GatewayRestartSnapshot = { + runtime: GatewayServiceRuntime; + portUsage: PortUsage; + healthy: boolean; + staleGatewayPids: number[]; +}; + +function resolveGatewayInstallEntrypointCandidates(root?: string): string[] { + if (!root) { + return []; + } + return [ + path.join(root, "dist", "entry.js"), + path.join(root, "dist", "entry.mjs"), + path.join(root, "dist", "index.js"), + path.join(root, "dist", "index.mjs"), + ]; +} + +function formatCommandFailure(stdout: string, stderr: string): string { + const detail = (stderr || stdout).trim(); + if (!detail) { + return "command returned a non-zero exit code"; + } + return detail.split("\n").slice(-3).join("\n"); +} + +async function refreshGatewayServiceEnv(params: { + result: UpdateRunResult; + jsonMode: boolean; +}): Promise { + const args = ["gateway", "install", "--force"]; + if (params.jsonMode) { + args.push("--json"); + } + + for (const candidate of resolveGatewayInstallEntrypointCandidates(params.result.root)) { + if (!(await pathExists(candidate))) { + continue; + } + const res = await runCommandWithTimeout([resolveNodeRunner(), candidate, ...args], { + timeoutMs: SERVICE_REFRESH_TIMEOUT_MS, + }); + if (res.code === 0) { + return; + } + throw new Error( + `updated install refresh failed (${candidate}): ${formatCommandFailure(res.stdout, res.stderr)}`, + ); + } + + await runDaemonInstall({ force: true, json: params.jsonMode || undefined }); +} + +async function inspectGatewayRestart(port: number): Promise { + const service = resolveGatewayService(); + let runtime: GatewayServiceRuntime = { status: "unknown" }; + try { + runtime = await service.readRuntime(process.env); + } catch (err) { + runtime = { status: "unknown", detail: String(err) }; + } + + let portUsage: PortUsage; + try { + portUsage = await inspectPortUsage(port); + } catch (err) { + portUsage = { + port, + status: "unknown", + listeners: [], + hints: [], + errors: [String(err)], + }; + } + + const gatewayListeners = + portUsage.status === "busy" + ? portUsage.listeners.filter((listener) => classifyPortListener(listener, port) === "gateway") + : []; + const running = runtime.status === "running"; + const ownsPort = + runtime.pid != null + ? portUsage.listeners.some((listener) => listener.pid === runtime.pid) + : gatewayListeners.length > 0 || + (portUsage.status === "busy" && portUsage.listeners.length === 0); + const healthy = running && ownsPort; + const staleGatewayPids = Array.from( + new Set( + gatewayListeners + .map((listener) => listener.pid) + .filter((pid): pid is number => Number.isFinite(pid)) + .filter((pid) => runtime.pid == null || pid !== runtime.pid || !running), + ), + ); + + return { + runtime, + portUsage, + healthy, + staleGatewayPids, + }; +} + +async function waitForGatewayHealthyRestart(port: number): Promise { + let snapshot = await inspectGatewayRestart(port); + for (let attempt = 0; attempt < POST_RESTART_HEALTH_ATTEMPTS; attempt += 1) { + if (snapshot.healthy) { + return snapshot; + } + if (snapshot.staleGatewayPids.length > 0 && snapshot.runtime.status !== "running") { + return snapshot; + } + await sleep(POST_RESTART_HEALTH_DELAY_MS); + snapshot = await inspectGatewayRestart(port); + } + return snapshot; +} + +function renderRestartDiagnostics(snapshot: GatewayRestartSnapshot): string[] { + const lines: string[] = []; + const runtimeSummary = [ + snapshot.runtime.status ? `status=${snapshot.runtime.status}` : null, + snapshot.runtime.state ? `state=${snapshot.runtime.state}` : null, + snapshot.runtime.pid != null ? `pid=${snapshot.runtime.pid}` : null, + snapshot.runtime.lastExitStatus != null ? `lastExit=${snapshot.runtime.lastExitStatus}` : null, + ] + .filter(Boolean) + .join(", "); + if (runtimeSummary) { + lines.push(`Service runtime: ${runtimeSummary}`); + } + if (snapshot.portUsage.status === "busy") { + lines.push(...formatPortDiagnostics(snapshot.portUsage)); + } else { + lines.push(`Gateway port ${snapshot.portUsage.port} status: ${snapshot.portUsage.status}.`); + } + if (snapshot.portUsage.errors?.length) { + lines.push(`Port diagnostics errors: ${snapshot.portUsage.errors.join("; ")}`); + } + return lines; +} + +async function terminateStaleGatewayPids(pids: number[]): Promise { + const killed: number[] = []; + for (const pid of pids) { + try { + process.kill(pid, "SIGTERM"); + killed.push(pid); + } catch (err) { + const code = (err as NodeJS.ErrnoException)?.code; + if (code !== "ESRCH") { + throw err; + } + } + } + if (killed.length === 0) { + return killed; + } + await sleep(400); + for (const pid of killed) { + try { + process.kill(pid, 0); + process.kill(pid, "SIGKILL"); + } catch (err) { + const code = (err as NodeJS.ErrnoException)?.code; + if (code !== "ESRCH") { + throw err; + } + } + } + return killed; +} + async function tryInstallShellCompletion(opts: { jsonMode: boolean; skipPrompt: boolean; @@ -392,6 +580,7 @@ async function maybeRestartService(params: { result: UpdateRunResult; opts: UpdateCommandOptions; refreshServiceEnv: boolean; + gatewayPort: number; restartScriptPath?: string | null; }): Promise { if (params.shouldRestart) { @@ -405,7 +594,10 @@ async function maybeRestartService(params: { let restartInitiated = false; if (params.refreshServiceEnv) { try { - await runDaemonInstall({ force: true, json: params.opts.json }); + await refreshGatewayServiceEnv({ + result: params.result, + jsonMode: Boolean(params.opts.json), + }); } catch (err) { if (!params.opts.json) { defaultRuntime.log( @@ -441,12 +633,33 @@ async function maybeRestartService(params: { } if (!params.opts.json && restartInitiated) { - defaultRuntime.log(theme.success("Daemon restart initiated.")); - defaultRuntime.log( - theme.muted( - `Verify with \`${replaceCliName(formatCliCommand("openclaw gateway status"), CLI_NAME)}\` once the gateway is back.`, - ), - ); + let health = await waitForGatewayHealthyRestart(params.gatewayPort); + if (!health.healthy && health.staleGatewayPids.length > 0) { + if (!params.opts.json) { + defaultRuntime.log( + theme.warn( + `Found stale gateway process(es) after restart: ${health.staleGatewayPids.join(", ")}. Cleaning up...`, + ), + ); + } + await terminateStaleGatewayPids(health.staleGatewayPids); + await runDaemonRestart(); + health = await waitForGatewayHealthyRestart(params.gatewayPort); + } + + if (health.healthy) { + defaultRuntime.log(theme.success("Daemon restart completed.")); + } else { + defaultRuntime.log(theme.warn("Gateway did not become healthy after restart.")); + for (const line of renderRestartDiagnostics(health)) { + defaultRuntime.log(theme.muted(line)); + } + defaultRuntime.log( + theme.muted( + `Run \`${replaceCliName(formatCliCommand("openclaw gateway status --probe --deep"), CLI_NAME)}\` for details.`, + ), + ); + } defaultRuntime.log(""); } } catch (err) { @@ -686,6 +899,7 @@ export async function updateCommand(opts: UpdateCommandOptions): Promise { result, opts, refreshServiceEnv: refreshGatewayServiceEnv, + gatewayPort: resolveGatewayPort(configSnapshot.valid ? configSnapshot.config : undefined), restartScriptPath, });