From bf9c362129e293fdab75dc440d7d12d81745e3a1 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 7 Mar 2026 21:20:29 -0500 Subject: [PATCH] Gateway: stop and restart unmanaged listeners (#39355) * Daemon: allow unmanaged gateway lifecycle fallback * Status: fix service summary formatting * Changelog: note unmanaged gateway lifecycle fallback * Tests: cover unmanaged gateway lifecycle fallback * Daemon: split unmanaged restart health checks * Daemon: harden unmanaged gateway signaling * Daemon: reject unmanaged restarts when disabled --- CHANGELOG.md | 1 + src/cli/daemon-cli/lifecycle-core.test.ts | 48 ++++- src/cli/daemon-cli/lifecycle-core.ts | 87 ++++++++-- src/cli/daemon-cli/lifecycle.test.ts | 203 ++++++++++++++++++++-- src/cli/daemon-cli/lifecycle.ts | 196 ++++++++++++++++++++- src/cli/daemon-cli/restart-health.ts | 68 ++++++++ 6 files changed, 569 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bcc9d6a9c..705ca9d6b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -260,6 +260,7 @@ Docs: https://docs.openclaw.ai - Sessions/bootstrap cache rollover invalidation: clear cached workspace bootstrap snapshots whenever an existing `sessionKey` rolls to a new `sessionId` across auto-reply, command, and isolated cron session resolvers, so `AGENTS.md`/`MEMORY.md`/`USER.md` updates are reloaded after daily, idle, or forced session resets instead of staying stale until gateway restart. (#38494) Thanks @LivingInDrm. - Gateway/Telegram polling health monitor: skip stale-socket restarts for Telegram long-polling channels and thread channel identity through shared health evaluation so polling connections are not restarted on the WebSocket stale-socket heuristic. (#38395) Thanks @ql-wade and @Takhoffman. - Daemon/systemd fresh-install probe: check for OpenClaw's managed user unit before running `systemctl --user is-enabled`, so first-time Linux installs no longer fail on generic missing-unit probe errors. (#38819) Thanks @adaHubble. +- Gateway/container lifecycle: allow `openclaw gateway stop` to SIGTERM unmanaged gateway listeners and `openclaw gateway restart` to SIGUSR1 a single unmanaged listener when no service manager is installed, so container and supervisor-based deployments are no longer blocked by `service disabled` no-op responses. Fixes #36137. Thanks @vincentkoc. - Gateway/Windows restart supervision: relaunch task-managed gateways through Scheduled Task with quoted helper-script command paths, distinguish restart-capable supervisors per platform, and stop orphaned Windows gateway children during self-restart. (#38825) Thanks @obviyus. - Telegram/native topic command routing: resolve forum-topic native commands through the same conversation route as inbound messages so topic `agentId` overrides and bound topic sessions target the active session instead of the default topic-parent session. (#38871) Thanks @obviyus. - Markdown/assistant image hardening: flatten remote markdown images to plain text across the Control UI, exported HTML, and shared Swift chat while keeping inline `data:image/...` markdown renderable, so model output no longer triggers automatic remote image fetches. (#38895) Thanks @obviyus. diff --git a/src/cli/daemon-cli/lifecycle-core.test.ts b/src/cli/daemon-cli/lifecycle-core.test.ts index 989a4e772..8fa7ded1b 100644 --- a/src/cli/daemon-cli/lifecycle-core.test.ts +++ b/src/cli/daemon-cli/lifecycle-core.test.ts @@ -40,10 +40,11 @@ vi.mock("../../runtime.js", () => ({ })); let runServiceRestart: typeof import("./lifecycle-core.js").runServiceRestart; +let runServiceStop: typeof import("./lifecycle-core.js").runServiceStop; describe("runServiceRestart token drift", () => { beforeAll(async () => { - ({ runServiceRestart } = await import("./lifecycle-core.js")); + ({ runServiceRestart, runServiceStop } = await import("./lifecycle-core.js")); }); beforeEach(() => { @@ -130,4 +131,49 @@ describe("runServiceRestart token drift", () => { const payload = JSON.parse(jsonLine ?? "{}") as { warnings?: string[] }; expect(payload.warnings).toBeUndefined(); }); + + it("emits stopped when an unmanaged process handles stop", async () => { + service.isLoaded.mockResolvedValue(false); + + await runServiceStop({ + serviceNoun: "Gateway", + service, + opts: { json: true }, + onNotLoaded: async () => ({ + result: "stopped", + message: "Gateway stop signal sent to unmanaged process on port 18789: 4200.", + }), + }); + + const jsonLine = runtimeLogs.find((line) => line.trim().startsWith("{")); + const payload = JSON.parse(jsonLine ?? "{}") as { result?: string; message?: string }; + expect(payload.result).toBe("stopped"); + expect(payload.message).toContain("unmanaged process"); + expect(service.stop).not.toHaveBeenCalled(); + }); + + it("runs restart health checks after an unmanaged restart signal", async () => { + const postRestartCheck = vi.fn(async () => {}); + service.isLoaded.mockResolvedValue(false); + + await runServiceRestart({ + serviceNoun: "Gateway", + service, + renderStartHints: () => [], + opts: { json: true }, + onNotLoaded: async () => ({ + result: "restarted", + message: "Gateway restart signal sent to unmanaged process on port 18789: 4200.", + }), + postRestartCheck, + }); + + expect(postRestartCheck).toHaveBeenCalledTimes(1); + expect(service.restart).not.toHaveBeenCalled(); + expect(service.readCommand).not.toHaveBeenCalled(); + const jsonLine = runtimeLogs.find((line) => line.trim().startsWith("{")); + const payload = JSON.parse(jsonLine ?? "{}") as { result?: string; message?: string }; + expect(payload.result).toBe("restarted"); + expect(payload.message).toContain("unmanaged process"); + }); }); diff --git a/src/cli/daemon-cli/lifecycle-core.ts b/src/cli/daemon-cli/lifecycle-core.ts index 143a5f9c4..00d70f24a 100644 --- a/src/cli/daemon-cli/lifecycle-core.ts +++ b/src/cli/daemon-cli/lifecycle-core.ts @@ -28,6 +28,18 @@ type RestartPostCheckContext = { fail: (message: string, hints?: string[]) => void; }; +type NotLoadedActionResult = { + result: "stopped" | "restarted"; + message?: string; + warnings?: string[]; +}; + +type NotLoadedActionContext = { + json: boolean; + stdout: Writable; + fail: (message: string, hints?: string[]) => void; +}; + async function maybeAugmentSystemdHints(hints: string[]): Promise { if (process.platform !== "linux") { return hints; @@ -200,6 +212,7 @@ export async function runServiceStop(params: { serviceNoun: string; service: GatewayService; opts?: DaemonLifecycleOptions; + onNotLoaded?: (ctx: NotLoadedActionContext) => Promise; }) { const json = Boolean(params.opts?.json); const { stdout, emit, fail } = createActionIO({ action: "stop", json }); @@ -213,6 +226,25 @@ export async function runServiceStop(params: { return; } if (!loaded) { + try { + const handled = await params.onNotLoaded?.({ json, stdout, fail }); + if (handled) { + emit({ + ok: true, + result: handled.result, + message: handled.message, + warnings: handled.warnings, + service: buildDaemonServiceSnapshot(params.service, false), + }); + if (!json && handled.message) { + defaultRuntime.log(handled.message); + } + return; + } + } catch (err) { + fail(`${params.serviceNoun} stop failed: ${String(err)}`); + return; + } emit({ ok: true, result: "not-loaded", @@ -251,9 +283,12 @@ export async function runServiceRestart(params: { opts?: DaemonLifecycleOptions; checkTokenDrift?: boolean; postRestartCheck?: (ctx: RestartPostCheckContext) => Promise; + onNotLoaded?: (ctx: NotLoadedActionContext) => Promise; }): Promise { const json = Boolean(params.opts?.json); const { stdout, emit, fail } = createActionIO({ action: "restart", json }); + const warnings: string[] = []; + let handledNotLoaded: NotLoadedActionResult | null = null; const loaded = await resolveServiceLoadedOrFail({ serviceNoun: params.serviceNoun, @@ -264,19 +299,29 @@ export async function runServiceRestart(params: { return false; } if (!loaded) { - await handleServiceNotLoaded({ - serviceNoun: params.serviceNoun, - service: params.service, - loaded, - renderStartHints: params.renderStartHints, - json, - emit, - }); - return false; + try { + handledNotLoaded = (await params.onNotLoaded?.({ json, stdout, fail })) ?? null; + } catch (err) { + fail(`${params.serviceNoun} restart failed: ${String(err)}`); + return false; + } + if (!handledNotLoaded) { + await handleServiceNotLoaded({ + serviceNoun: params.serviceNoun, + service: params.service, + loaded, + renderStartHints: params.renderStartHints, + json, + emit, + }); + return false; + } + if (handledNotLoaded.warnings?.length) { + warnings.push(...handledNotLoaded.warnings); + } } - const warnings: string[] = []; - if (params.checkTokenDrift) { + if (loaded && params.checkTokenDrift) { // Check for token drift before restart (service token vs config token) try { const command = await params.service.readCommand(process.env); @@ -309,22 +354,30 @@ export async function runServiceRestart(params: { } try { - await params.service.restart({ env: process.env, stdout }); + if (loaded) { + await params.service.restart({ env: process.env, stdout }); + } if (params.postRestartCheck) { await params.postRestartCheck({ json, stdout, warnings, fail }); } - let restarted = true; - try { - restarted = await params.service.isLoaded({ env: process.env }); - } catch { - restarted = true; + let restarted = loaded; + if (loaded) { + try { + restarted = await params.service.isLoaded({ env: process.env }); + } catch { + restarted = true; + } } emit({ ok: true, result: "restarted", + message: handledNotLoaded?.message, service: buildDaemonServiceSnapshot(params.service, restarted), warnings: warnings.length ? warnings : undefined, }); + if (!json && handledNotLoaded?.message) { + defaultRuntime.log(handledNotLoaded.message); + } return true; } catch (err) { const hints = params.renderStartHints(); diff --git a/src/cli/daemon-cli/lifecycle.test.ts b/src/cli/daemon-cli/lifecycle.test.ts index 70d94c063..853a8b7d9 100644 --- a/src/cli/daemon-cli/lifecycle.test.ts +++ b/src/cli/daemon-cli/lifecycle.test.ts @@ -1,4 +1,7 @@ -import { beforeAll, beforeEach, describe, expect, it, vi } from "vitest"; +import { afterEach, beforeAll, beforeEach, describe, expect, it, vi } from "vitest"; + +const mockReadFileSync = vi.hoisted(() => vi.fn()); +const mockSpawnSync = vi.hoisted(() => vi.fn()); type RestartHealthSnapshot = { healthy: boolean; @@ -25,18 +28,59 @@ const service = { }; const runServiceRestart = vi.fn(); +const runServiceStop = vi.fn(); +const waitForGatewayHealthyListener = vi.fn(); const waitForGatewayHealthyRestart = vi.fn(); const terminateStaleGatewayPids = vi.fn(); +const renderGatewayPortHealthDiagnostics = vi.fn(() => ["diag: unhealthy port"]); const renderRestartDiagnostics = vi.fn(() => ["diag: unhealthy runtime"]); const resolveGatewayPort = vi.fn(() => 18789); +const findGatewayPidsOnPortSync = vi.fn<(port: number) => number[]>(() => []); +const probeGateway = vi.fn< + (opts: { + url: string; + auth?: { token?: string; password?: string }; + timeoutMs: number; + }) => Promise<{ + ok: boolean; + configSnapshot: unknown; + }> +>(); +const isRestartEnabled = vi.fn<(config?: { commands?: unknown }) => boolean>(() => true); const loadConfig = vi.fn(() => ({})); +vi.mock("node:fs", () => ({ + default: { + readFileSync: (...args: unknown[]) => mockReadFileSync(...args), + }, +})); + +vi.mock("node:child_process", () => ({ + spawnSync: (...args: unknown[]) => mockSpawnSync(...args), +})); + vi.mock("../../config/config.js", () => ({ loadConfig: () => loadConfig(), readBestEffortConfig: async () => loadConfig(), resolveGatewayPort, })); +vi.mock("../../infra/restart.js", () => ({ + findGatewayPidsOnPortSync: (port: number) => findGatewayPidsOnPortSync(port), +})); + +vi.mock("../../gateway/probe.js", () => ({ + probeGateway: (opts: { + url: string; + auth?: { token?: string; password?: string }; + timeoutMs: number; + }) => probeGateway(opts), +})); + +vi.mock("../../config/commands.js", () => ({ + isRestartEnabled: (config?: { commands?: unknown }) => isRestartEnabled(config), +})); + vi.mock("../../daemon/service.js", () => ({ resolveGatewayService: () => service, })); @@ -44,7 +88,9 @@ vi.mock("../../daemon/service.js", () => ({ vi.mock("./restart-health.js", () => ({ DEFAULT_RESTART_HEALTH_ATTEMPTS: 120, DEFAULT_RESTART_HEALTH_DELAY_MS: 500, + waitForGatewayHealthyListener, waitForGatewayHealthyRestart, + renderGatewayPortHealthDiagnostics, terminateStaleGatewayPids, renderRestartDiagnostics, })); @@ -52,26 +98,35 @@ vi.mock("./restart-health.js", () => ({ vi.mock("./lifecycle-core.js", () => ({ runServiceRestart, runServiceStart: vi.fn(), - runServiceStop: vi.fn(), + runServiceStop, runServiceUninstall: vi.fn(), })); describe("runDaemonRestart health checks", () => { let runDaemonRestart: (opts?: { json?: boolean }) => Promise; + let runDaemonStop: (opts?: { json?: boolean }) => Promise; beforeAll(async () => { - ({ runDaemonRestart } = await import("./lifecycle.js")); + ({ runDaemonRestart, runDaemonStop } = await import("./lifecycle.js")); }); beforeEach(() => { - service.readCommand.mockClear(); - service.restart.mockClear(); - runServiceRestart.mockClear(); - waitForGatewayHealthyRestart.mockClear(); - terminateStaleGatewayPids.mockClear(); - renderRestartDiagnostics.mockClear(); - resolveGatewayPort.mockClear(); - loadConfig.mockClear(); + service.readCommand.mockReset(); + service.restart.mockReset(); + runServiceRestart.mockReset(); + runServiceStop.mockReset(); + waitForGatewayHealthyListener.mockReset(); + waitForGatewayHealthyRestart.mockReset(); + terminateStaleGatewayPids.mockReset(); + renderGatewayPortHealthDiagnostics.mockReset(); + renderRestartDiagnostics.mockReset(); + resolveGatewayPort.mockReset(); + findGatewayPidsOnPortSync.mockReset(); + probeGateway.mockReset(); + isRestartEnabled.mockReset(); + loadConfig.mockReset(); + mockReadFileSync.mockReset(); + mockSpawnSync.mockReset(); service.readCommand.mockResolvedValue({ programArguments: ["openclaw", "gateway", "--port", "18789"], @@ -92,6 +147,37 @@ describe("runDaemonRestart health checks", () => { }); return true; }); + runServiceStop.mockResolvedValue(undefined); + waitForGatewayHealthyListener.mockResolvedValue({ + healthy: true, + portUsage: { port: 18789, status: "busy", listeners: [], hints: [] }, + }); + probeGateway.mockResolvedValue({ + ok: true, + configSnapshot: { commands: { restart: true } }, + }); + isRestartEnabled.mockReturnValue(true); + mockReadFileSync.mockImplementation((path: string) => { + const match = path.match(/\/proc\/(\d+)\/cmdline$/); + if (!match) { + throw new Error(`unexpected path ${path}`); + } + const pid = Number.parseInt(match[1] ?? "", 10); + if ([4200, 4300].includes(pid)) { + return ["openclaw", "gateway", "--port", "18789", ""].join("\0"); + } + throw new Error(`unknown pid ${pid}`); + }); + mockSpawnSync.mockReturnValue({ + error: null, + status: 0, + stdout: "openclaw gateway --port 18789", + stderr: "", + }); + }); + + afterEach(() => { + vi.restoreAllMocks(); }); it("kills stale gateway pids and retries restart", async () => { @@ -134,4 +220,99 @@ describe("runDaemonRestart health checks", () => { expect(terminateStaleGatewayPids).not.toHaveBeenCalled(); expect(renderRestartDiagnostics).toHaveBeenCalledTimes(1); }); + + it("signals an unmanaged gateway process on stop", async () => { + const killSpy = vi.spyOn(process, "kill").mockImplementation(() => true); + findGatewayPidsOnPortSync.mockReturnValue([4200, 4200, 4300]); + runServiceStop.mockImplementation(async (params: { onNotLoaded?: () => Promise }) => { + await params.onNotLoaded?.(); + }); + + await runDaemonStop({ json: true }); + + expect(findGatewayPidsOnPortSync).toHaveBeenCalledWith(18789); + expect(killSpy).toHaveBeenCalledWith(4200, "SIGTERM"); + expect(killSpy).toHaveBeenCalledWith(4300, "SIGTERM"); + }); + + it("signals a single unmanaged gateway process on restart", async () => { + const killSpy = vi.spyOn(process, "kill").mockImplementation(() => true); + findGatewayPidsOnPortSync.mockReturnValue([4200]); + runServiceRestart.mockImplementation( + async (params: RestartParams & { onNotLoaded?: () => Promise }) => { + await params.onNotLoaded?.(); + await params.postRestartCheck?.({ + json: Boolean(params.opts?.json), + stdout: process.stdout, + warnings: [], + fail: (message: string) => { + throw new Error(message); + }, + }); + return true; + }, + ); + + await runDaemonRestart({ json: true }); + + expect(findGatewayPidsOnPortSync).toHaveBeenCalledWith(18789); + expect(killSpy).toHaveBeenCalledWith(4200, "SIGUSR1"); + expect(probeGateway).toHaveBeenCalledTimes(1); + expect(waitForGatewayHealthyListener).toHaveBeenCalledTimes(1); + expect(waitForGatewayHealthyRestart).not.toHaveBeenCalled(); + expect(terminateStaleGatewayPids).not.toHaveBeenCalled(); + expect(service.restart).not.toHaveBeenCalled(); + }); + + it("fails unmanaged restart when multiple gateway listeners are present", async () => { + findGatewayPidsOnPortSync.mockReturnValue([4200, 4300]); + runServiceRestart.mockImplementation( + async (params: RestartParams & { onNotLoaded?: () => Promise }) => { + await params.onNotLoaded?.(); + return true; + }, + ); + + await expect(runDaemonRestart({ json: true })).rejects.toThrow( + "multiple gateway processes are listening on port 18789", + ); + }); + + it("fails unmanaged restart when the running gateway has commands.restart disabled", async () => { + findGatewayPidsOnPortSync.mockReturnValue([4200]); + probeGateway.mockResolvedValue({ + ok: true, + configSnapshot: { commands: { restart: false } }, + }); + isRestartEnabled.mockReturnValue(false); + runServiceRestart.mockImplementation( + async (params: RestartParams & { onNotLoaded?: () => Promise }) => { + await params.onNotLoaded?.(); + return true; + }, + ); + + await expect(runDaemonRestart({ json: true })).rejects.toThrow( + "Gateway restart is disabled in the running gateway config", + ); + }); + + it("skips unmanaged signaling for pids that are not live gateway processes", async () => { + const killSpy = vi.spyOn(process, "kill").mockImplementation(() => true); + findGatewayPidsOnPortSync.mockReturnValue([4200]); + mockReadFileSync.mockReturnValue(["python", "-m", "http.server", ""].join("\0")); + mockSpawnSync.mockReturnValue({ + error: null, + status: 0, + stdout: "python -m http.server", + stderr: "", + }); + runServiceStop.mockImplementation(async (params: { onNotLoaded?: () => Promise }) => { + await params.onNotLoaded?.(); + }); + + await runDaemonStop({ json: true }); + + expect(killSpy).not.toHaveBeenCalled(); + }); }); diff --git a/src/cli/daemon-cli/lifecycle.ts b/src/cli/daemon-cli/lifecycle.ts index 5088316a0..1ca321b0d 100644 --- a/src/cli/daemon-cli/lifecycle.ts +++ b/src/cli/daemon-cli/lifecycle.ts @@ -1,5 +1,10 @@ +import { spawnSync } from "node:child_process"; +import fsSync from "node:fs"; +import { isRestartEnabled } from "../../config/commands.js"; import { readBestEffortConfig, resolveGatewayPort } from "../../config/config.js"; import { resolveGatewayService } from "../../daemon/service.js"; +import { probeGateway } from "../../gateway/probe.js"; +import { findGatewayPidsOnPortSync } from "../../infra/restart.js"; import { defaultRuntime } from "../../runtime.js"; import { theme } from "../../terminal/theme.js"; import { formatCliCommand } from "../command-format.js"; @@ -12,8 +17,10 @@ import { import { DEFAULT_RESTART_HEALTH_ATTEMPTS, DEFAULT_RESTART_HEALTH_DELAY_MS, + renderGatewayPortHealthDiagnostics, renderRestartDiagnostics, terminateStaleGatewayPids, + waitForGatewayHealthyListener, waitForGatewayHealthyRestart, } from "./restart-health.js"; import { parsePortFromArgs, renderGatewayServiceStartHints } from "./shared.js"; @@ -22,8 +29,7 @@ import type { DaemonLifecycleOptions } from "./types.js"; const POST_RESTART_HEALTH_ATTEMPTS = DEFAULT_RESTART_HEALTH_ATTEMPTS; const POST_RESTART_HEALTH_DELAY_MS = DEFAULT_RESTART_HEALTH_DELAY_MS; -async function resolveGatewayRestartPort() { - const service = resolveGatewayService(); +async function resolveGatewayLifecyclePort(service = resolveGatewayService()) { const command = await service.readCommand(process.env).catch(() => null); const serviceEnv = command?.environment ?? undefined; const mergedEnv = { @@ -35,6 +41,145 @@ async function resolveGatewayRestartPort() { return portFromArgs ?? resolveGatewayPort(await readBestEffortConfig(), mergedEnv); } +function normalizeProcArg(arg: string): string { + return arg.replaceAll("\\", "/").toLowerCase(); +} + +function parseProcCmdline(raw: string): string[] { + return raw + .split("\0") + .map((entry) => entry.trim()) + .filter(Boolean); +} + +function isGatewayArgv(args: string[]): boolean { + const normalized = args.map(normalizeProcArg); + if (!normalized.includes("gateway")) { + return false; + } + + const entryCandidates = [ + "dist/index.js", + "dist/entry.js", + "openclaw.mjs", + "scripts/run-node.mjs", + "src/index.ts", + ]; + if (normalized.some((arg) => entryCandidates.some((entry) => arg.endsWith(entry)))) { + return true; + } + + const exe = normalized[0] ?? ""; + return exe.endsWith("/openclaw") || exe === "openclaw" || exe.endsWith("/openclaw-gateway"); +} + +function readGatewayProcessArgsSync(pid: number): string[] | null { + if (process.platform === "linux") { + try { + return parseProcCmdline(fsSync.readFileSync(`/proc/${pid}/cmdline`, "utf8")); + } catch { + return null; + } + } + if (process.platform === "darwin") { + const ps = spawnSync("ps", ["-o", "command=", "-p", String(pid)], { + encoding: "utf8", + timeout: 1000, + }); + if (ps.error || ps.status !== 0) { + return null; + } + const command = ps.stdout.trim(); + return command ? command.split(/\s+/) : null; + } + return null; +} + +function resolveGatewayListenerPids(port: number): number[] { + return Array.from(new Set(findGatewayPidsOnPortSync(port))) + .filter((pid): pid is number => Number.isFinite(pid) && pid > 0) + .filter((pid) => { + const args = readGatewayProcessArgsSync(pid); + return args != null && isGatewayArgv(args); + }); +} + +function resolveGatewayPortFallback(): Promise { + return readBestEffortConfig() + .then((cfg) => resolveGatewayPort(cfg, process.env)) + .catch(() => resolveGatewayPort(undefined, process.env)); +} + +function signalGatewayPid(pid: number, signal: "SIGTERM" | "SIGUSR1") { + const args = readGatewayProcessArgsSync(pid); + if (!args || !isGatewayArgv(args)) { + throw new Error(`refusing to signal non-gateway process pid ${pid}`); + } + process.kill(pid, signal); +} + +function formatGatewayPidList(pids: number[]): string { + return pids.join(", "); +} + +async function assertUnmanagedGatewayRestartEnabled(port: number): Promise { + const probe = await probeGateway({ + url: `ws://127.0.0.1:${port}`, + auth: { + token: process.env.OPENCLAW_GATEWAY_TOKEN?.trim() || undefined, + password: process.env.OPENCLAW_GATEWAY_PASSWORD?.trim() || undefined, + }, + timeoutMs: 1_000, + }).catch(() => null); + + if (!probe?.ok) { + return; + } + if (!isRestartEnabled(probe.configSnapshot as { commands?: unknown } | undefined)) { + throw new Error( + "Gateway restart is disabled in the running gateway config (commands.restart=false); unmanaged SIGUSR1 restart would be ignored", + ); + } +} + +function resolveVerifiedGatewayListenerPids(port: number): number[] { + return resolveGatewayListenerPids(port).filter( + (pid): pid is number => Number.isFinite(pid) && pid > 0, + ); +} + +async function stopGatewayWithoutServiceManager(port: number) { + const pids = resolveVerifiedGatewayListenerPids(port); + if (pids.length === 0) { + return null; + } + for (const pid of pids) { + signalGatewayPid(pid, "SIGTERM"); + } + return { + result: "stopped" as const, + message: `Gateway stop signal sent to unmanaged process${pids.length === 1 ? "" : "es"} on port ${port}: ${formatGatewayPidList(pids)}.`, + }; +} + +async function restartGatewayWithoutServiceManager(port: number) { + await assertUnmanagedGatewayRestartEnabled(port); + const pids = resolveVerifiedGatewayListenerPids(port); + if (pids.length === 0) { + return null; + } + if (pids.length > 1) { + throw new Error( + `multiple gateway processes are listening on port ${port}: ${formatGatewayPidList(pids)}; use "openclaw gateway status --deep" before retrying restart`, + ); + } + signalGatewayPid(pids[0], "SIGUSR1"); + return { + result: "restarted" as const, + message: `Gateway restart signal sent to unmanaged process on port ${port}: ${pids[0]}.`, + }; +} + export async function runDaemonUninstall(opts: DaemonLifecycleOptions = {}) { return await runServiceUninstall({ serviceNoun: "Gateway", @@ -55,10 +200,15 @@ export async function runDaemonStart(opts: DaemonLifecycleOptions = {}) { } export async function runDaemonStop(opts: DaemonLifecycleOptions = {}) { + const service = resolveGatewayService(); + const gatewayPort = await resolveGatewayLifecyclePort(service).catch(() => + resolveGatewayPortFallback(), + ); return await runServiceStop({ serviceNoun: "Gateway", - service: resolveGatewayService(), + service, opts, + onNotLoaded: async () => stopGatewayWithoutServiceManager(gatewayPort), }); } @@ -70,8 +220,9 @@ export async function runDaemonStop(opts: DaemonLifecycleOptions = {}) { export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promise { const json = Boolean(opts.json); const service = resolveGatewayService(); - const restartPort = await resolveGatewayRestartPort().catch(async () => - resolveGatewayPort(await readBestEffortConfig(), process.env), + let restartedWithoutServiceManager = false; + const restartPort = await resolveGatewayLifecyclePort(service).catch(() => + resolveGatewayPortFallback(), ); const restartWaitMs = POST_RESTART_HEALTH_ATTEMPTS * POST_RESTART_HEALTH_DELAY_MS; const restartWaitSeconds = Math.round(restartWaitMs / 1000); @@ -82,7 +233,42 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi renderStartHints: renderGatewayServiceStartHints, opts, checkTokenDrift: true, + onNotLoaded: async () => { + const handled = await restartGatewayWithoutServiceManager(restartPort); + if (handled) { + restartedWithoutServiceManager = true; + } + return handled; + }, postRestartCheck: async ({ warnings, fail, stdout }) => { + if (restartedWithoutServiceManager) { + const health = await waitForGatewayHealthyListener({ + port: restartPort, + attempts: POST_RESTART_HEALTH_ATTEMPTS, + delayMs: POST_RESTART_HEALTH_DELAY_MS, + }); + if (health.healthy) { + return; + } + + const diagnostics = renderGatewayPortHealthDiagnostics(health); + const timeoutLine = `Timed out after ${restartWaitSeconds}s waiting for gateway port ${restartPort} to become healthy.`; + if (!json) { + defaultRuntime.log(theme.warn(timeoutLine)); + for (const line of diagnostics) { + defaultRuntime.log(theme.muted(line)); + } + } else { + warnings.push(timeoutLine); + warnings.push(...diagnostics); + } + + fail(`Gateway restart timed out after ${restartWaitSeconds}s waiting for health checks.`, [ + formatCliCommand("openclaw gateway status --deep"), + formatCliCommand("openclaw doctor"), + ]); + } + let health = await waitForGatewayHealthyRestart({ service, port: restartPort, diff --git a/src/cli/daemon-cli/restart-health.ts b/src/cli/daemon-cli/restart-health.ts index daa838988..fb0ecf0c1 100644 --- a/src/cli/daemon-cli/restart-health.ts +++ b/src/cli/daemon-cli/restart-health.ts @@ -23,6 +23,11 @@ export type GatewayRestartSnapshot = { staleGatewayPids: number[]; }; +export type GatewayPortHealthSnapshot = { + portUsage: PortUsage; + healthy: boolean; +}; + function listenerOwnedByRuntimePid(params: { listener: PortUsage["listeners"][number]; runtimePid: number; @@ -55,6 +60,32 @@ async function confirmGatewayReachable(port: number): Promise { return probe.ok || looksLikeAuthClose(probe.close?.code, probe.close?.reason); } +async function inspectGatewayPortHealth(port: number): Promise { + let portUsage: PortUsage; + try { + portUsage = await inspectPortUsage(port); + } catch (err) { + portUsage = { + port, + status: "unknown", + listeners: [], + hints: [], + errors: [String(err)], + }; + } + + let healthy = false; + if (portUsage.status === "busy") { + try { + healthy = await confirmGatewayReachable(port); + } catch { + // best-effort probe + } + } + + return { portUsage, healthy }; +} + export async function inspectGatewayRestart(params: { service: GatewayService; port: number; @@ -178,6 +209,27 @@ export async function waitForGatewayHealthyRestart(params: { return snapshot; } +export async function waitForGatewayHealthyListener(params: { + port: number; + attempts?: number; + delayMs?: number; +}): Promise { + const attempts = params.attempts ?? DEFAULT_RESTART_HEALTH_ATTEMPTS; + const delayMs = params.delayMs ?? DEFAULT_RESTART_HEALTH_DELAY_MS; + + let snapshot = await inspectGatewayPortHealth(params.port); + + for (let attempt = 0; attempt < attempts; attempt += 1) { + if (snapshot.healthy) { + return snapshot; + } + await sleep(delayMs); + snapshot = await inspectGatewayPortHealth(params.port); + } + + return snapshot; +} + export function renderRestartDiagnostics(snapshot: GatewayRestartSnapshot): string[] { const lines: string[] = []; const runtimeSummary = [ @@ -206,6 +258,22 @@ export function renderRestartDiagnostics(snapshot: GatewayRestartSnapshot): stri return lines; } +export function renderGatewayPortHealthDiagnostics(snapshot: GatewayPortHealthSnapshot): string[] { + const lines: string[] = []; + + if (snapshot.portUsage.status === "busy") { + lines.push(...formatPortDiagnostics(snapshot.portUsage)); + } else { + lines.push(`Gateway port ${snapshot.portUsage.port} status: ${snapshot.portUsage.status}.`); + } + + if (snapshot.portUsage.errors?.length) { + lines.push(`Port diagnostics errors: ${snapshot.portUsage.errors.join("; ")}`); + } + + return lines; +} + export async function terminateStaleGatewayPids(pids: number[]): Promise { const targets = Array.from( new Set(pids.filter((pid): pid is number => Number.isFinite(pid) && pid > 0)),