fix: clear stale runningAtMs in cron.run() before already-running check (#17949)

Add recomputeNextRunsForMaintenance() call in run() so that stale
runningAtMs markers (from a crashed Phase-1 persist) are cleared by the
existing normalizeJobTickState logic before the already-running guard.

Without this, a manual cron.run() could be blocked for up to
STUCK_RUN_MS (2 hours) even though no job was actually running.

Fixes #17554

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Yasunori Morishima(盛島康徳)
2026-03-02 22:47:36 +09:00
committed by GitHub
parent 60b8d645de
commit be8930d6f9
2 changed files with 60 additions and 0 deletions

View File

@@ -8,6 +8,7 @@ import * as schedule from "./schedule.js";
import { CronService } from "./service.js";
import { createDeferred, createRunningCronServiceState } from "./service.test-harness.js";
import { computeJobNextRunAtMs } from "./service/jobs.js";
import { run } from "./service/ops.js";
import { createCronServiceState, type CronEvent } from "./service/state.js";
import {
DEFAULT_JOB_TIMEOUT_MS,
@@ -1450,6 +1451,61 @@ describe("Cron issue regressions", () => {
expect(startedAtEvents).toEqual([dueAt, dueAt + 50]);
});
it("#17554: run() clears stale runningAtMs and executes the job", async () => {
  // Regression for #17554: a persisted job carries a `runningAtMs` marker that
  // is just over two hours old. A forced manual run is expected to execute the
  // job rather than report "already-running".
  const ONE_MINUTE_MS = 60_000;
  const ONE_HOUR_MS = 3_600_000;
  const TWO_HOURS_MS = 2 * 60 * 60 * 1000;

  const now = Date.parse("2026-02-06T10:05:00.000Z");
  const anHourAgo = now - ONE_HOUR_MS;
  const aMinuteAgo = now - ONE_MINUTE_MS;
  // One millisecond older than two hours before the frozen clock.
  const staleRunningAtMs = now - TWO_HOURS_MS - 1;

  // One-shot ("at") job that became due a minute ago and still has the
  // leftover running marker in its persisted state.
  const staleJob = {
    id: "stale-running",
    name: "stale-running",
    enabled: true,
    createdAtMs: anHourAgo,
    updatedAtMs: anHourAgo,
    schedule: { kind: "at", at: new Date(aMinuteAgo).toISOString() },
    sessionTarget: "main",
    wakeMode: "now",
    payload: { kind: "systemEvent", text: "stale-running" },
    state: {
      runningAtMs: staleRunningAtMs,
      lastRunAtMs: anHourAgo,
      lastStatus: "ok",
      nextRunAtMs: aMinuteAgo,
    },
  };

  // Seed the on-disk store before any service state is created.
  const { storePath } = await makeStorePath();
  const serializedStore = JSON.stringify({ version: 1, jobs: [staleJob] }, null, 2);
  await fs.writeFile(storePath, serializedStore, "utf-8");

  const enqueueSystemEvent = vi.fn();
  const requestHeartbeatNow = vi.fn();
  const runIsolatedAgentJob = vi.fn().mockResolvedValue({ status: "ok", summary: "ok" });
  const state = createCronServiceState({
    cronEnabled: true,
    storePath,
    log: noopLogger,
    nowMs: () => now, // frozen clock so the marker's age is deterministic
    enqueueSystemEvent,
    requestHeartbeatNow,
    runIsolatedAgentJob,
  });

  const outcome = await run(state, "stale-running", "force");

  // The job must actually run and deliver its system-event payload.
  expect(outcome).toEqual({ ok: true, ran: true });
  expect(enqueueSystemEvent).toHaveBeenCalledWith(
    "stale-running",
    expect.objectContaining({ agentId: undefined }),
  );
});
it("honors cron maxConcurrentRuns for due jobs", async () => {
vi.useRealTimers();
const store = await makeStorePath();

View File

@@ -341,6 +341,10 @@ export async function run(state: CronServiceState, id: string, mode?: "due" | "f
const prepared = await locked(state, async () => {
warnIfDisabled(state, "run");
await ensureLoaded(state, { skipRecompute: true });
// Normalize job tick state (clears stale runningAtMs markers) before
// checking if already running, so a stale marker from a crashed Phase-1
// persist does not block manual triggers for up to STUCK_RUN_MS (#17554).
recomputeNextRunsForMaintenance(state);
const job = findJobOrThrow(state, id);
if (typeof job.state.runningAtMs === "number") {
return { ok: true, ran: false, reason: "already-running" as const };