fix(cron): wrap computeJobNextRunAtMs in try-catch inside applyJobResult (#30905)

* fix(cron): wrap computeJobNextRunAtMs in try-catch inside applyJobResult

Without this guard, if the croner library throws during schedule
computation (timezone/expression edge cases), the exception propagates
out of applyJobResult and the entire state update is lost — runningAtMs
never clears, lastRunAtMs never advances, nextRunAtMs never recomputes.
After STUCK_RUN_MS (2h), stuck detection clears runningAtMs and the job
re-fires, creating a ~2h repeat cycle instead of the intended schedule.

The sibling function recomputeJobNextRunAtMs in jobs.ts already wraps
computeJobNextRunAtMs in try-catch; this was an oversight in the
applyJobResult call sites.

Changes:
- Error-backoff path: catch and fall back to backoff-only schedule
- Success path: catch and fall through to the MIN_REFIRE_GAP_MS safety net
- applyOutcomeToStoredJob: log a warning when job not found after forceReload

* fix(cron): use recordScheduleComputeError in applyJobResult catch blocks

Address review feedback: the original catch blocks only logged a warning,
which meant a persistent computeJobNextRunAtMs throw would cause a
MIN_REFIRE_GAP_MS (2s) hot loop on cron-kind jobs.

Now both catch blocks call recordScheduleComputeError (exported from
jobs.ts), which tracks consecutive schedule errors and auto-disables the
job after 3 failures — matching the existing behavior in
recomputeJobNextRunAtMs.

* test(cron): cover applyJobResult schedule-throw fallback paths

---------

Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
Kate Chapman
2026-03-02 05:38:33 -08:00
committed by GitHub
parent 21e19e42a3
commit 6df8bd9741
3 changed files with 108 additions and 4 deletions

View File

@@ -9,7 +9,13 @@ import { CronService } from "./service.js";
import { createDeferred, createRunningCronServiceState } from "./service.test-harness.js";
import { computeJobNextRunAtMs } from "./service/jobs.js";
import { createCronServiceState, type CronEvent } from "./service/state.js";
import { DEFAULT_JOB_TIMEOUT_MS, executeJobCore, onTimer, runMissedJobs } from "./service/timer.js";
import {
DEFAULT_JOB_TIMEOUT_MS,
applyJobResult,
executeJobCore,
onTimer,
runMissedJobs,
} from "./service/timer.js";
import type { CronJob, CronJobState } from "./types.js";
const noopLogger = {
@@ -1586,4 +1592,81 @@ describe("Cron issue regressions", () => {
expect(job?.state.lastStatus).toBe("error");
expect(job?.state.lastError).toContain("timed out");
});
it("keeps state updates when cron next-run computation throws after a successful run (#30905)", () => {
const startedAt = Date.parse("2026-03-02T12:00:00.000Z");
const endedAt = startedAt + 50;
const state = createCronServiceState({
cronEnabled: true,
storePath: "/tmp/cron-30905-success.json",
log: noopLogger,
nowMs: () => endedAt,
enqueueSystemEvent: vi.fn(),
requestHeartbeatNow: vi.fn(),
runIsolatedAgentJob: createDefaultIsolatedRunner(),
});
const job = createIsolatedRegressionJob({
id: "apply-result-success-30905",
name: "apply-result-success-30905",
scheduledAt: startedAt,
schedule: { kind: "cron", expr: "0 7 * * *", tz: "Invalid/Timezone" },
payload: { kind: "agentTurn", message: "ping" },
state: { nextRunAtMs: startedAt - 1_000, runningAtMs: startedAt - 500 },
});
const shouldDelete = applyJobResult(state, job, {
status: "ok",
delivered: true,
startedAt,
endedAt,
});
expect(shouldDelete).toBe(false);
expect(job.state.runningAtMs).toBeUndefined();
expect(job.state.lastRunAtMs).toBe(startedAt);
expect(job.state.lastStatus).toBe("ok");
expect(job.state.scheduleErrorCount).toBe(1);
expect(job.state.lastError).toMatch(/^schedule error:/);
expect(job.state.nextRunAtMs).toBe(endedAt + 2_000);
expect(job.enabled).toBe(true);
});
it("falls back to backoff schedule when cron next-run computation throws on error path (#30905)", () => {
const startedAt = Date.parse("2026-03-02T12:05:00.000Z");
const endedAt = startedAt + 25;
const state = createCronServiceState({
cronEnabled: true,
storePath: "/tmp/cron-30905-error.json",
log: noopLogger,
nowMs: () => endedAt,
enqueueSystemEvent: vi.fn(),
requestHeartbeatNow: vi.fn(),
runIsolatedAgentJob: createDefaultIsolatedRunner(),
});
const job = createIsolatedRegressionJob({
id: "apply-result-error-30905",
name: "apply-result-error-30905",
scheduledAt: startedAt,
schedule: { kind: "cron", expr: "0 7 * * *", tz: "Invalid/Timezone" },
payload: { kind: "agentTurn", message: "ping" },
state: { nextRunAtMs: startedAt - 1_000, runningAtMs: startedAt - 500 },
});
const shouldDelete = applyJobResult(state, job, {
status: "error",
error: "synthetic failure",
startedAt,
endedAt,
});
expect(shouldDelete).toBe(false);
expect(job.state.runningAtMs).toBeUndefined();
expect(job.state.lastRunAtMs).toBe(startedAt);
expect(job.state.lastStatus).toBe("error");
expect(job.state.consecutiveErrors).toBe(1);
expect(job.state.scheduleErrorCount).toBe(1);
expect(job.state.lastError).toMatch(/^schedule error:/);
expect(job.state.nextRunAtMs).toBe(endedAt + 30_000);
expect(job.enabled).toBe(true);
});
});

View File

@@ -214,7 +214,7 @@ export function computeJobNextRunAtMs(job: CronJob, nowMs: number): number | und
/** Maximum consecutive schedule errors before auto-disabling a job. */
const MAX_SCHEDULE_ERRORS = 3;
function recordScheduleComputeError(params: {
export function recordScheduleComputeError(params: {
state: CronServiceState;
job: CronJob;
err: unknown;

View File

@@ -15,6 +15,7 @@ import {
computeJobNextRunAtMs,
nextWakeAtMs,
recomputeNextRunsForMaintenance,
recordScheduleComputeError,
resolveJobPayloadTextForMain,
} from "./jobs.js";
import { locked } from "./locked.js";
@@ -356,7 +357,15 @@ export function applyJobResult(
} else if (result.status === "error" && job.enabled) {
// Apply exponential backoff for errored jobs to prevent retry storms.
const backoff = errorBackoffMs(job.state.consecutiveErrors ?? 1);
const normalNext = computeJobNextRunAtMs(job, result.endedAt);
let normalNext: number | undefined;
try {
normalNext = computeJobNextRunAtMs(job, result.endedAt);
} catch (err) {
// If the schedule expression/timezone throws (croner edge cases),
// record the schedule error (auto-disables after repeated failures)
// and fall back to backoff-only schedule so the state update is not lost.
recordScheduleComputeError({ state, job, err });
}
const backoffNext = result.endedAt + backoff;
// Use whichever is later: the natural next run or the backoff delay.
job.state.nextRunAtMs =
@@ -371,7 +380,15 @@ export function applyJobResult(
"cron: applying error backoff",
);
} else if (job.enabled) {
const naturalNext = computeJobNextRunAtMs(job, result.endedAt);
let naturalNext: number | undefined;
try {
naturalNext = computeJobNextRunAtMs(job, result.endedAt);
} catch (err) {
// If the schedule expression/timezone throws (croner edge cases),
// record the schedule error (auto-disables after repeated failures)
// so a persistent throw doesn't cause a MIN_REFIRE_GAP_MS hot loop.
recordScheduleComputeError({ state, job, err });
}
if (job.schedule.kind === "cron") {
// Safety net: ensure the next fire is at least MIN_REFIRE_GAP_MS
// after the current run ended. Prevents spin-loops when the
@@ -399,6 +416,10 @@ function applyOutcomeToStoredJob(state: CronServiceState, result: TimedCronRunOu
const jobs = store.jobs;
const job = jobs.find((entry) => entry.id === result.jobId);
if (!job) {
state.deps.log.warn(
{ jobId: result.jobId },
"cron: applyOutcomeToStoredJob — job not found after forceReload, result discarded",
);
return;
}