fix(cron): wrap computeJobNextRunAtMs in try-catch inside applyJobResult (#30905)
* fix(cron): wrap computeJobNextRunAtMs in try-catch inside applyJobResult Without this guard, if the croner library throws during schedule computation (timezone/expression edge cases), the exception propagates out of applyJobResult and the entire state update is lost — runningAtMs never clears, lastRunAtMs never advances, nextRunAtMs never recomputes. After STUCK_RUN_MS (2h), stuck detection clears runningAtMs and the job re-fires, creating a ~2h repeat cycle instead of the intended schedule. The sibling function recomputeJobNextRunAtMs in jobs.ts already wraps computeJobNextRunAtMs in try-catch; this was an oversight in the applyJobResult call sites. Changes: - Error-backoff path: catch and fall back to backoff-only schedule - Success path: catch and fall through to the MIN_REFIRE_GAP_MS safety net - applyOutcomeToStoredJob: log a warning when job not found after forceReload * fix(cron): use recordScheduleComputeError in applyJobResult catch blocks Address review feedback: the original catch blocks only logged a warning, which meant a persistent computeJobNextRunAtMs throw would cause a MIN_REFIRE_GAP_MS (2s) hot loop on cron-kind jobs. Now both catch blocks call recordScheduleComputeError (exported from jobs.ts), which tracks consecutive schedule errors and auto-disables the job after 3 failures — matching the existing behavior in recomputeJobNextRunAtMs. * test(cron): cover applyJobResult schedule-throw fallback paths --------- Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
@@ -9,7 +9,13 @@ import { CronService } from "./service.js";
|
||||
import { createDeferred, createRunningCronServiceState } from "./service.test-harness.js";
|
||||
import { computeJobNextRunAtMs } from "./service/jobs.js";
|
||||
import { createCronServiceState, type CronEvent } from "./service/state.js";
|
||||
import { DEFAULT_JOB_TIMEOUT_MS, executeJobCore, onTimer, runMissedJobs } from "./service/timer.js";
|
||||
import {
|
||||
DEFAULT_JOB_TIMEOUT_MS,
|
||||
applyJobResult,
|
||||
executeJobCore,
|
||||
onTimer,
|
||||
runMissedJobs,
|
||||
} from "./service/timer.js";
|
||||
import type { CronJob, CronJobState } from "./types.js";
|
||||
|
||||
const noopLogger = {
|
||||
@@ -1586,4 +1592,81 @@ describe("Cron issue regressions", () => {
|
||||
expect(job?.state.lastStatus).toBe("error");
|
||||
expect(job?.state.lastError).toContain("timed out");
|
||||
});
|
||||
|
||||
it("keeps state updates when cron next-run computation throws after a successful run (#30905)", () => {
|
||||
const startedAt = Date.parse("2026-03-02T12:00:00.000Z");
|
||||
const endedAt = startedAt + 50;
|
||||
const state = createCronServiceState({
|
||||
cronEnabled: true,
|
||||
storePath: "/tmp/cron-30905-success.json",
|
||||
log: noopLogger,
|
||||
nowMs: () => endedAt,
|
||||
enqueueSystemEvent: vi.fn(),
|
||||
requestHeartbeatNow: vi.fn(),
|
||||
runIsolatedAgentJob: createDefaultIsolatedRunner(),
|
||||
});
|
||||
const job = createIsolatedRegressionJob({
|
||||
id: "apply-result-success-30905",
|
||||
name: "apply-result-success-30905",
|
||||
scheduledAt: startedAt,
|
||||
schedule: { kind: "cron", expr: "0 7 * * *", tz: "Invalid/Timezone" },
|
||||
payload: { kind: "agentTurn", message: "ping" },
|
||||
state: { nextRunAtMs: startedAt - 1_000, runningAtMs: startedAt - 500 },
|
||||
});
|
||||
|
||||
const shouldDelete = applyJobResult(state, job, {
|
||||
status: "ok",
|
||||
delivered: true,
|
||||
startedAt,
|
||||
endedAt,
|
||||
});
|
||||
|
||||
expect(shouldDelete).toBe(false);
|
||||
expect(job.state.runningAtMs).toBeUndefined();
|
||||
expect(job.state.lastRunAtMs).toBe(startedAt);
|
||||
expect(job.state.lastStatus).toBe("ok");
|
||||
expect(job.state.scheduleErrorCount).toBe(1);
|
||||
expect(job.state.lastError).toMatch(/^schedule error:/);
|
||||
expect(job.state.nextRunAtMs).toBe(endedAt + 2_000);
|
||||
expect(job.enabled).toBe(true);
|
||||
});
|
||||
|
||||
it("falls back to backoff schedule when cron next-run computation throws on error path (#30905)", () => {
|
||||
const startedAt = Date.parse("2026-03-02T12:05:00.000Z");
|
||||
const endedAt = startedAt + 25;
|
||||
const state = createCronServiceState({
|
||||
cronEnabled: true,
|
||||
storePath: "/tmp/cron-30905-error.json",
|
||||
log: noopLogger,
|
||||
nowMs: () => endedAt,
|
||||
enqueueSystemEvent: vi.fn(),
|
||||
requestHeartbeatNow: vi.fn(),
|
||||
runIsolatedAgentJob: createDefaultIsolatedRunner(),
|
||||
});
|
||||
const job = createIsolatedRegressionJob({
|
||||
id: "apply-result-error-30905",
|
||||
name: "apply-result-error-30905",
|
||||
scheduledAt: startedAt,
|
||||
schedule: { kind: "cron", expr: "0 7 * * *", tz: "Invalid/Timezone" },
|
||||
payload: { kind: "agentTurn", message: "ping" },
|
||||
state: { nextRunAtMs: startedAt - 1_000, runningAtMs: startedAt - 500 },
|
||||
});
|
||||
|
||||
const shouldDelete = applyJobResult(state, job, {
|
||||
status: "error",
|
||||
error: "synthetic failure",
|
||||
startedAt,
|
||||
endedAt,
|
||||
});
|
||||
|
||||
expect(shouldDelete).toBe(false);
|
||||
expect(job.state.runningAtMs).toBeUndefined();
|
||||
expect(job.state.lastRunAtMs).toBe(startedAt);
|
||||
expect(job.state.lastStatus).toBe("error");
|
||||
expect(job.state.consecutiveErrors).toBe(1);
|
||||
expect(job.state.scheduleErrorCount).toBe(1);
|
||||
expect(job.state.lastError).toMatch(/^schedule error:/);
|
||||
expect(job.state.nextRunAtMs).toBe(endedAt + 30_000);
|
||||
expect(job.enabled).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -214,7 +214,7 @@ export function computeJobNextRunAtMs(job: CronJob, nowMs: number): number | und
|
||||
/** Maximum consecutive schedule errors before auto-disabling a job. */
|
||||
const MAX_SCHEDULE_ERRORS = 3;
|
||||
|
||||
function recordScheduleComputeError(params: {
|
||||
export function recordScheduleComputeError(params: {
|
||||
state: CronServiceState;
|
||||
job: CronJob;
|
||||
err: unknown;
|
||||
|
||||
@@ -15,6 +15,7 @@ import {
|
||||
computeJobNextRunAtMs,
|
||||
nextWakeAtMs,
|
||||
recomputeNextRunsForMaintenance,
|
||||
recordScheduleComputeError,
|
||||
resolveJobPayloadTextForMain,
|
||||
} from "./jobs.js";
|
||||
import { locked } from "./locked.js";
|
||||
@@ -356,7 +357,15 @@ export function applyJobResult(
|
||||
} else if (result.status === "error" && job.enabled) {
|
||||
// Apply exponential backoff for errored jobs to prevent retry storms.
|
||||
const backoff = errorBackoffMs(job.state.consecutiveErrors ?? 1);
|
||||
const normalNext = computeJobNextRunAtMs(job, result.endedAt);
|
||||
let normalNext: number | undefined;
|
||||
try {
|
||||
normalNext = computeJobNextRunAtMs(job, result.endedAt);
|
||||
} catch (err) {
|
||||
// If the schedule expression/timezone throws (croner edge cases),
|
||||
// record the schedule error (auto-disables after repeated failures)
|
||||
// and fall back to backoff-only schedule so the state update is not lost.
|
||||
recordScheduleComputeError({ state, job, err });
|
||||
}
|
||||
const backoffNext = result.endedAt + backoff;
|
||||
// Use whichever is later: the natural next run or the backoff delay.
|
||||
job.state.nextRunAtMs =
|
||||
@@ -371,7 +380,15 @@ export function applyJobResult(
|
||||
"cron: applying error backoff",
|
||||
);
|
||||
} else if (job.enabled) {
|
||||
const naturalNext = computeJobNextRunAtMs(job, result.endedAt);
|
||||
let naturalNext: number | undefined;
|
||||
try {
|
||||
naturalNext = computeJobNextRunAtMs(job, result.endedAt);
|
||||
} catch (err) {
|
||||
// If the schedule expression/timezone throws (croner edge cases),
|
||||
// record the schedule error (auto-disables after repeated failures)
|
||||
// so a persistent throw doesn't cause a MIN_REFIRE_GAP_MS hot loop.
|
||||
recordScheduleComputeError({ state, job, err });
|
||||
}
|
||||
if (job.schedule.kind === "cron") {
|
||||
// Safety net: ensure the next fire is at least MIN_REFIRE_GAP_MS
|
||||
// after the current run ended. Prevents spin-loops when the
|
||||
@@ -399,6 +416,10 @@ function applyOutcomeToStoredJob(state: CronServiceState, result: TimedCronRunOu
|
||||
const jobs = store.jobs;
|
||||
const job = jobs.find((entry) => entry.id === result.jobId);
|
||||
if (!job) {
|
||||
state.deps.log.warn(
|
||||
{ jobId: result.jobId },
|
||||
"cron: applyOutcomeToStoredJob — job not found after forceReload, result discarded",
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user