From ace5e33ceedfba70a30dd0ba69b909f87a1bd530 Mon Sep 17 00:00:00 2001 From: Tom Ron <126325152+tomron87@users.noreply.github.com> Date: Thu, 12 Feb 2026 06:13:27 +0200 Subject: [PATCH] fix(cron): re-arm timer when onTimer fires during active job execution (#14233) * fix(cron): re-arm timer when onTimer fires during active job execution When a cron job takes longer than MAX_TIMER_DELAY_MS (60s), the clamped timer fires while state.running is still true. The early return in onTimer() previously exited without re-arming the timer, leaving no setTimeout scheduled. This silently kills the cron scheduler until the next gateway restart. The fix calls armTimer(state) before the early return so the scheduler continues ticking even when a job is in progress. This is the likely root cause of recurring cron jobs silently skipping, as reported in #12025. One-shot (kind: 'at') jobs were unaffected because they typically complete within a single timer cycle. Includes a regression test that simulates a slow job exceeding the timer clamp period and verifies the next occurrence still fires. * fix: update tests for timer re-arm behavior - Update existing regression test to expect timer re-arm with non-zero delay instead of no timer at all - Simplify new test to directly verify state.timer is set after onTimer returns early due to running guard * fix: use fixed 60s delay for re-arm to prevent zero-delay hot-loop When the running guard re-arms the timer, use MAX_TIMER_DELAY_MS directly instead of calling armTimer() which can compute a zero delay for past-due jobs. This prevents a tight spin while still keeping the scheduler alive. * style: add curly braces to satisfy eslint(curly) rule --- src/cron/service.issue-regressions.test.ts | 13 ++- .../service.rearm-timer-when-running.test.ts | 107 ++++++++++++++++++ src/cron/service/timer.ts | 20 ++++ 3 files changed, 137 insertions(+), 3 deletions(-) create mode 100644 src/cron/service.rearm-timer-when-running.test.ts diff --git a/src/cron/service.issue-regressions.test.ts b/src/cron/service.issue-regressions.test.ts index c793979c1..cac5b0bab 100644 --- a/src/cron/service.issue-regressions.test.ts +++ b/src/cron/service.issue-regressions.test.ts @@ -212,7 +212,7 @@ describe("Cron issue regressions", () => { await store.cleanup(); }); - it("does not hot-loop zero-delay timers while a run is already in progress", async () => { + it("re-arms timer without hot-looping when a run is already in progress", async () => { const timeoutSpy = vi.spyOn(globalThis, "setTimeout"); const store = await makeStorePath(); const now = Date.parse("2026-02-06T10:05:00.000Z"); @@ -233,8 +233,15 @@ describe("Cron issue regressions", () => { await onTimer(state); - expect(timeoutSpy).not.toHaveBeenCalled(); - expect(state.timer).toBeNull(); + // The timer should be re-armed (not null) so the scheduler stays alive, + // with a fixed MAX_TIMER_DELAY_MS (60s) delay to avoid a hot-loop when + // past-due jobs are waiting. See #12025. + expect(timeoutSpy).toHaveBeenCalled(); + expect(state.timer).not.toBeNull(); + const delays = timeoutSpy.mock.calls + .map(([, delay]) => delay) + .filter((d): d is number => typeof d === "number"); + expect(delays).toContain(60_000); timeoutSpy.mockRestore(); await store.cleanup(); }); diff --git a/src/cron/service.rearm-timer-when-running.test.ts b/src/cron/service.rearm-timer-when-running.test.ts new file mode 100644 index 000000000..21b8f2b95 --- /dev/null +++ b/src/cron/service.rearm-timer-when-running.test.ts @@ -0,0 +1,107 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import type { CronJob } from "./types.js"; +import { createCronServiceState } from "./service/state.js"; +import { onTimer } from "./service/timer.js"; + +const noopLogger = { + debug: vi.fn(), + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), +}; + +async function makeStorePath() { + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-cron-")); + return { + storePath: path.join(dir, "cron", "jobs.json"), + cleanup: async () => { + await fs.rm(dir, { recursive: true, force: true }); + }, + }; +} + +function createDueRecurringJob(params: { + id: string; + nowMs: number; + nextRunAtMs: number; +}): CronJob { + return { + id: params.id, + name: params.id, + enabled: true, + deleteAfterRun: false, + createdAtMs: params.nowMs, + updatedAtMs: params.nowMs, + schedule: { kind: "every", everyMs: 5 * 60_000 }, + sessionTarget: "isolated", + wakeMode: "next-heartbeat", + payload: { kind: "agentTurn", message: "test" }, + delivery: { mode: "none" }, + state: { nextRunAtMs: params.nextRunAtMs }, + }; +} + +describe("CronService - timer re-arm when running (#12025)", () => { + beforeEach(() => { + noopLogger.debug.mockClear(); + noopLogger.info.mockClear(); + noopLogger.warn.mockClear(); + noopLogger.error.mockClear(); + }); + + afterEach(() => { + vi.clearAllMocks(); + }); + + it("re-arms the timer when onTimer is called while state.running is true", async () => { + const timeoutSpy = vi.spyOn(globalThis, "setTimeout"); + const store = await makeStorePath(); + const now = Date.parse("2026-02-06T10:05:00.000Z"); + + const state = createCronServiceState({ + cronEnabled: true, + storePath: store.storePath, + log: noopLogger, + nowMs: () => now, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob: vi.fn().mockResolvedValue({ status: "ok", summary: "ok" }), + }); + + // Simulate a job that is currently running. + state.running = true; + state.store = { + version: 1, + jobs: [ + createDueRecurringJob({ + id: "recurring-job", + nowMs: now, + nextRunAtMs: now + 5 * 60_000, + }), + ], + }; + + // Before the fix in #12025, this would return without re-arming, + // silently killing the scheduler. + await onTimer(state); + + // The timer must be re-armed so the scheduler continues ticking, + // with a fixed 60s delay to avoid hot-looping. + expect(state.timer).not.toBeNull(); + expect(timeoutSpy).toHaveBeenCalled(); + const delays = timeoutSpy.mock.calls + .map(([, delay]) => delay) + .filter((d): d is number => typeof d === "number"); + expect(delays).toContain(60_000); + + // state.running should still be true (onTimer bailed out, didn't + // touch it — the original caller's finally block handles that). + expect(state.running).toBe(true); + + timeoutSpy.mockRestore(); + await store.cleanup(); + }); +}); diff --git a/src/cron/service/timer.ts b/src/cron/service/timer.ts index cda67eb2a..3b446848a 100644 --- a/src/cron/service/timer.ts +++ b/src/cron/service/timer.ts @@ -158,6 +158,26 @@ export function armTimer(state: CronServiceState) { export async function onTimer(state: CronServiceState) { if (state.running) { + // Re-arm the timer so the scheduler keeps ticking even when a job is + // still executing. Without this, a long-running job (e.g. an agentTurn + // exceeding MAX_TIMER_DELAY_MS) causes the clamped 60 s timer to fire + // while `running` is true. The early return then leaves no timer set, + // silently killing the scheduler until the next gateway restart. + // + // We use MAX_TIMER_DELAY_MS as a fixed re-check interval to avoid a + // zero-delay hot-loop when past-due jobs are waiting for the current + // execution to finish. + // See: https://github.com/openclaw/openclaw/issues/12025 + if (state.timer) { + clearTimeout(state.timer); + } + state.timer = setTimeout(async () => { + try { + await onTimer(state); + } catch (err) { + state.deps.log.error({ err: String(err) }, "cron: timer tick failed"); + } + }, MAX_TIMER_DELAY_MS); return; } state.running = true;