fix: clean stale gateway PIDs before triggerOpenClawRestart calls launchctl/systemctl
When the /restart command runs inside an embedded agent process (no SIGUSR1 listener), it falls through to triggerOpenClawRestart() which calls launchctl kickstart -k directly — bypassing the pre-restart port cleanup added in #27013. If the gateway was started via TUI/CLI, the orphaned process still holds the port and the new launchd instance crash-loops. Add synchronous stale-PID detection (lsof) and termination (SIGTERM→SIGKILL) inside triggerOpenClawRestart() itself, so every caller — including the embedded agent /restart path — gets port cleanup before the service manager restart command fires. Closes #26736 Made-with: Cursor
This commit is contained in:
committed by
Peter Steinberger
parent
792ce7b5b4
commit
63c6080d50
19
src/infra/restart.test.ts
Normal file
19
src/infra/restart.test.ts
Normal file
@@ -0,0 +1,19 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { findGatewayPidsOnPortSync } from "./restart.js";
|
||||
|
||||
// Smoke tests for findGatewayPidsOnPortSync. They query the real host, so each
// case only asserts a property that should hold whatever happens to be running.
describe("findGatewayPidsOnPortSync", () => {
  it("returns an empty array for a port with no listeners", () => {
    // NOTE(review): assumes nothing on the test host listens on 19999 — confirm for CI.
    expect(findGatewayPidsOnPortSync(19999)).toEqual([]);
  });

  it("never includes the current process PID", () => {
    const ownPid = process.pid;
    const found = findGatewayPidsOnPortSync(18789);
    expect(found).not.toContain(ownPid);
  });

  it("returns an array (not undefined or null) on any port", () => {
    const result = findGatewayPidsOnPortSync(0);
    expect(Array.isArray(result)).toBe(true);
  });
});
|
||||
@@ -1,9 +1,11 @@
|
||||
import { spawnSync } from "node:child_process";
|
||||
import { resolveGatewayPort } from "../config/paths.js";
|
||||
import {
|
||||
resolveGatewayLaunchAgentLabel,
|
||||
resolveGatewaySystemdServiceName,
|
||||
} from "../daemon/constants.js";
|
||||
import { createSubsystemLogger } from "../logging/subsystem.js";
|
||||
import { resolveLsofCommandSync } from "./ports-lsof.js";
|
||||
|
||||
export type RestartAttempt = {
|
||||
ok: boolean;
|
||||
@@ -283,10 +285,106 @@ function normalizeSystemdUnit(raw?: string, profile?: string): string {
|
||||
return unit.endsWith(".service") ? unit : `${unit}.service`;
|
||||
}
|
||||
|
||||
/**
 * Parse `lsof -F pc` field output and collect the PIDs whose command name
 * contains "openclaw" (case-insensitive).
 *
 * In field output every process set starts with a `p<pid>` line followed by
 * its `c<command>` line; any other field lines are ignored. Non-numeric or
 * non-positive PIDs are dropped, and each PID is reported at most once.
 */
function parseOpenClawPidsFromLsofFields(output: string): number[] {
  const matches = new Set<number>();
  let currentPid: number | undefined;
  for (const line of output.split(/\r?\n/)) {
    if (line.startsWith("p")) {
      const parsed = Number.parseInt(line.slice(1), 10);
      currentPid = Number.isFinite(parsed) && parsed > 0 ? parsed : undefined;
    } else if (line.startsWith("c")) {
      if (currentPid !== undefined && line.slice(1).toLowerCase().includes("openclaw")) {
        matches.add(currentPid);
      }
    }
  }
  return [...matches];
}

/**
 * Find PIDs of gateway processes listening on the given TCP port using
 * synchronous lsof.
 *
 * Only processes whose lsof command name contains "openclaw" are returned, and
 * the current process is always excluded.
 *
 * @param port TCP port to inspect. Values outside 1–65535 (including 0, NaN
 *   and non-integers) cannot have a listener and yield an empty array without
 *   spawning lsof.
 * @returns Matching PIDs; an empty array on Windows, when lsof fails or times
 *   out, or when it exits non-zero (which is also how lsof reports "nothing
 *   found").
 */
export function findGatewayPidsOnPortSync(port: number): number[] {
  if (process.platform === "win32") {
    return [];
  }
  if (!Number.isInteger(port) || port < 1 || port > 65535) {
    return [];
  }
  const lsof = resolveLsofCommandSync();
  const res = spawnSync(lsof, ["-nP", `-iTCP:${port}`, "-sTCP:LISTEN", "-Fpc"], {
    encoding: "utf8",
    timeout: SPAWN_TIMEOUT_MS,
  });
  if (res.error || res.status !== 0) {
    return [];
  }
  return parseOpenClawPidsFromLsofFields(res.stdout).filter((pid) => pid !== process.pid);
}
|
||||
|
||||
/** Maximum time to let processes exit on their own after SIGTERM. */
const STALE_SIGTERM_WAIT_MS = 300;
/** Maximum time to wait for processes to disappear after SIGKILL. */
const STALE_SIGKILL_WAIT_MS = 200;
/** Interval between liveness probes while waiting for processes to exit. */
const STALE_EXIT_POLL_MS = 25;

/**
 * Block the calling thread for `ms` milliseconds.
 * Uses Atomics.wait on a private buffer so no external `sleep` binary is needed.
 */
function blockThreadSync(ms: number): void {
  Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms);
}

/** Probe a PID with signal 0; any failure to signal is treated as "gone". */
function isStalePidAlive(pid: number): boolean {
  try {
    process.kill(pid, 0);
    return true;
  } catch {
    return false;
  }
}

/**
 * Poll until none of `pids` is alive or `timeoutMs` elapses.
 * Returns the PIDs that are still alive when polling stops.
 */
function waitForStaleExitSync(pids: readonly number[], timeoutMs: number): number[] {
  const deadline = Date.now() + timeoutMs;
  let alive = pids.filter(isStalePidAlive);
  while (alive.length > 0) {
    const remaining = deadline - Date.now();
    if (remaining <= 0) {
      break;
    }
    blockThreadSync(Math.min(STALE_EXIT_POLL_MS, remaining));
    alive = alive.filter(isStalePidAlive);
  }
  return alive;
}

/**
 * Synchronously terminate stale gateway processes.
 *
 * Sends SIGTERM to every target, waits up to STALE_SIGTERM_WAIT_MS for them to
 * exit, then sends SIGKILL to any survivors and waits up to
 * STALE_SIGKILL_WAIT_MS more. Both waits end early once the processes are gone.
 *
 * @param pids Candidate PIDs. Duplicates, non-positive or non-integer values
 *   and the current process's own PID are ignored, because `process.kill` with
 *   a PID of 0 or below addresses process groups rather than one process.
 * @returns The PIDs that SIGTERM was successfully delivered to.
 */
function terminateStaleProcessesSync(pids: number[]): number[] {
  const targets = [...new Set(pids)].filter(
    (pid) => Number.isInteger(pid) && pid > 0 && pid !== process.pid,
  );
  const signalled: number[] = [];
  for (const pid of targets) {
    try {
      process.kill(pid, "SIGTERM");
      signalled.push(pid);
    } catch {
      // Could not signal (already gone, or not permitted) — nothing to wait for.
    }
  }
  if (signalled.length === 0) {
    return signalled;
  }

  const survivors = waitForStaleExitSync(signalled, STALE_SIGTERM_WAIT_MS);
  let forced = false;
  for (const pid of survivors) {
    try {
      process.kill(pid, "SIGKILL");
      forced = true;
    } catch {
      // Exited between the liveness probe and the SIGKILL.
    }
  }
  if (forced) {
    // Give the kernel a moment to tear the killed processes down.
    waitForStaleExitSync(survivors, STALE_SIGKILL_WAIT_MS);
  }
  return signalled;
}
|
||||
|
||||
/**
 * Inspect the gateway port and kill any stale gateway processes holding it.
 * Called before service restart commands to prevent port conflicts.
 *
 * The port comes from `resolveGatewayPort(undefined, process.env)`. This is
 * best effort: any failure is logged and reported as "nothing cleaned" rather
 * than thrown.
 *
 * @returns The PIDs reported by `terminateStaleProcessesSync`, or an empty
 *   array when no stale process was found or the cleanup failed.
 */
function cleanStaleGatewayProcessesSync(): number[] {
  try {
    const port = resolveGatewayPort(undefined, process.env);
    const stalePids = findGatewayPidsOnPortSync(port);
    if (stalePids.length === 0) {
      return [];
    }
    restartLog.warn(
      `killing ${stalePids.length} stale gateway process(es) before restart: ${stalePids.join(", ")}`,
    );
    return terminateStaleProcessesSync(stalePids);
  } catch (err: unknown) {
    // Leave a trace instead of swallowing the failure silently; the caller
    // still proceeds with the restart.
    const reason = err instanceof Error ? err.message : String(err);
    restartLog.warn(`stale gateway cleanup failed, continuing with restart: ${reason}`);
    return [];
  }
}
|
||||
|
||||
export function triggerOpenClawRestart(): RestartAttempt {
|
||||
if (process.env.VITEST || process.env.NODE_ENV === "test") {
|
||||
return { ok: true, method: "supervisor", detail: "test mode" };
|
||||
}
|
||||
|
||||
cleanStaleGatewayProcessesSync();
|
||||
|
||||
const tried: string[] = [];
|
||||
if (process.platform !== "darwin") {
|
||||
if (process.platform === "linux") {
|
||||
|
||||
Reference in New Issue
Block a user