fix(cron): re-arm timer when onTimer fires during active job execution (#14233)

* fix(cron): re-arm timer when onTimer fires during active job execution

When a cron job takes longer than MAX_TIMER_DELAY_MS (60s), the clamped
timer fires while state.running is still true.  The early return in
onTimer() previously exited without re-arming the timer, leaving no
setTimeout scheduled.  This silently kills the cron scheduler until the
next gateway restart.

The fix calls armTimer(state) before the early return so the scheduler
continues ticking even when a job is in progress.

This is the likely root cause of recurring cron jobs silently skipping,
as reported in #12025.  One-shot (kind: 'at') jobs were unaffected
because they typically complete within a single timer cycle.

Includes a regression test that simulates a slow job exceeding the
timer clamp period and verifies the next occurrence still fires.

* fix: update tests for timer re-arm behavior

- Update existing regression test to expect timer re-arm with non-zero
  delay instead of no timer at all
- Simplify new test to directly verify state.timer is set after onTimer
  returns early due to running guard

* fix: use fixed 60s delay for re-arm to prevent zero-delay hot-loop

When the running guard re-arms the timer, use MAX_TIMER_DELAY_MS
directly instead of calling armTimer() which can compute a zero delay
for past-due jobs.  This prevents a tight spin while still keeping the
scheduler alive.

* style: add curly braces to satisfy eslint(curly) rule
This commit is contained in:
Tom Ron
2026-02-12 06:13:27 +02:00
committed by GitHub
parent dd6047d998
commit ace5e33cee
3 changed files with 137 additions and 3 deletions

View File

@@ -212,7 +212,7 @@ describe("Cron issue regressions", () => {
await store.cleanup();
});
it("does not hot-loop zero-delay timers while a run is already in progress", async () => {
it("re-arms timer without hot-looping when a run is already in progress", async () => {
const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
const store = await makeStorePath();
const now = Date.parse("2026-02-06T10:05:00.000Z");
@@ -233,8 +233,15 @@ describe("Cron issue regressions", () => {
await onTimer(state);
expect(timeoutSpy).not.toHaveBeenCalled();
expect(state.timer).toBeNull();
// The timer should be re-armed (not null) so the scheduler stays alive,
// with a fixed MAX_TIMER_DELAY_MS (60s) delay to avoid a hot-loop when
// past-due jobs are waiting. See #12025.
expect(timeoutSpy).toHaveBeenCalled();
expect(state.timer).not.toBeNull();
const delays = timeoutSpy.mock.calls
.map(([, delay]) => delay)
.filter((d): d is number => typeof d === "number");
expect(delays).toContain(60_000);
timeoutSpy.mockRestore();
await store.cleanup();
});

View File

@@ -0,0 +1,107 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import type { CronJob } from "./types.js";
import { createCronServiceState } from "./service/state.js";
import { onTimer } from "./service/timer.js";
const noopLogger = {
debug: vi.fn(),
info: vi.fn(),
warn: vi.fn(),
error: vi.fn(),
};
async function makeStorePath() {
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-cron-"));
return {
storePath: path.join(dir, "cron", "jobs.json"),
cleanup: async () => {
await fs.rm(dir, { recursive: true, force: true });
},
};
}
function createDueRecurringJob(params: {
id: string;
nowMs: number;
nextRunAtMs: number;
}): CronJob {
return {
id: params.id,
name: params.id,
enabled: true,
deleteAfterRun: false,
createdAtMs: params.nowMs,
updatedAtMs: params.nowMs,
schedule: { kind: "every", everyMs: 5 * 60_000 },
sessionTarget: "isolated",
wakeMode: "next-heartbeat",
payload: { kind: "agentTurn", message: "test" },
delivery: { mode: "none" },
state: { nextRunAtMs: params.nextRunAtMs },
};
}
describe("CronService - timer re-arm when running (#12025)", () => {
beforeEach(() => {
noopLogger.debug.mockClear();
noopLogger.info.mockClear();
noopLogger.warn.mockClear();
noopLogger.error.mockClear();
});
afterEach(() => {
vi.clearAllMocks();
});
it("re-arms the timer when onTimer is called while state.running is true", async () => {
const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
const store = await makeStorePath();
const now = Date.parse("2026-02-06T10:05:00.000Z");
const state = createCronServiceState({
cronEnabled: true,
storePath: store.storePath,
log: noopLogger,
nowMs: () => now,
enqueueSystemEvent: vi.fn(),
requestHeartbeatNow: vi.fn(),
runIsolatedAgentJob: vi.fn().mockResolvedValue({ status: "ok", summary: "ok" }),
});
// Simulate a job that is currently running.
state.running = true;
state.store = {
version: 1,
jobs: [
createDueRecurringJob({
id: "recurring-job",
nowMs: now,
nextRunAtMs: now + 5 * 60_000,
}),
],
};
// Before the fix in #12025, this would return without re-arming,
// silently killing the scheduler.
await onTimer(state);
// The timer must be re-armed so the scheduler continues ticking,
// with a fixed 60s delay to avoid hot-looping.
expect(state.timer).not.toBeNull();
expect(timeoutSpy).toHaveBeenCalled();
const delays = timeoutSpy.mock.calls
.map(([, delay]) => delay)
.filter((d): d is number => typeof d === "number");
expect(delays).toContain(60_000);
// state.running should still be true (onTimer bailed out, didn't
// touch it — the original caller's finally block handles that).
expect(state.running).toBe(true);
timeoutSpy.mockRestore();
await store.cleanup();
});
});

View File

@@ -158,6 +158,26 @@ export function armTimer(state: CronServiceState) {
export async function onTimer(state: CronServiceState) {
if (state.running) {
// Re-arm the timer so the scheduler keeps ticking even when a job is
// still executing. Without this, a long-running job (e.g. an agentTurn
// exceeding MAX_TIMER_DELAY_MS) causes the clamped 60 s timer to fire
// while `running` is true. The early return then leaves no timer set,
// silently killing the scheduler until the next gateway restart.
//
// We use MAX_TIMER_DELAY_MS as a fixed re-check interval to avoid a
// zero-delay hot-loop when past-due jobs are waiting for the current
// execution to finish.
// See: https://github.com/openclaw/openclaw/issues/12025
if (state.timer) {
clearTimeout(state.timer);
}
state.timer = setTimeout(async () => {
try {
await onTimer(state);
} catch (err) {
state.deps.log.error({ err: String(err) }, "cron: timer tick failed");
}
}, MAX_TIMER_DELAY_MS);
return;
}
state.running = true;