diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e62014f6..ca6346e61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -77,6 +77,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Cron/One-shot reliability: retry transient one-shot failures with bounded backoff and configurable retry policy before disabling. (#24435) Thanks . - Gateway/Cron auditability: add gateway info logs for successful cron create, update, and remove operations. (#25090) Thanks . - Cron/Schedule errors: notify users when a job is auto-disabled after repeated schedule computation failures. (#29098) Thanks . - Cron/Schedule errors: notify users when a job is auto-disabled after repeated schedule computation failures. (#29098) Thanks . diff --git a/docs/automation/cron-jobs.md b/docs/automation/cron-jobs.md index 8d1401926..bb12570bd 100644 --- a/docs/automation/cron-jobs.md +++ b/docs/automation/cron-jobs.md @@ -353,6 +353,38 @@ Notes: - Isolated cron run sessions in `sessions.json` are pruned by `cron.sessionRetention` (default `24h`; set `false` to disable). - Override store path: `cron.store` in config. +## Retry policy + +When a job fails, OpenClaw classifies errors as **transient** (retryable) or **permanent** (disable immediately). + +### Transient errors (retried) + +- Rate limit (429, too many requests, resource exhausted) +- Network errors (timeout, ECONNRESET, fetch failed, socket) +- Server errors (5xx) +- Cloudflare-related errors + +### Permanent errors (no retry) + +- Auth failures (invalid API key, unauthorized) +- Config or validation errors +- Other non-transient errors + +### Default behavior (no config) + +**One-shot jobs (`schedule.kind: "at"`):** + +- On transient error: retry up to 3 times with exponential backoff (30s → 1m → 5m). +- On permanent error: disable immediately. +- On success or skip: disable (on success, delete instead if `deleteAfterRun: true`). 
+ +**Recurring jobs (`cron` / `every`):** + +- On any error: apply exponential backoff (30s → 1m → 5m → 15m → 60m) before the next scheduled run. +- Job stays enabled; backoff resets after the next successful run. + +Configure `cron.retry` to override the one-shot retry defaults (see [Configuration](/automation/cron-jobs#configuration)). + ## Configuration ```json5 @@ -361,6 +393,12 @@ Notes: enabled: true, // default true store: "~/.openclaw/cron/jobs.json", maxConcurrentRuns: 1, // default 1 + // Optional: override retry policy for one-shot jobs + retry: { + maxAttempts: 3, + backoffMs: [60000, 120000, 300000], + retryOn: ["rate_limit", "network", "server_error"], + }, webhook: "https://example.invalid/legacy", // deprecated fallback for stored notify:true jobs webhookToken: "replace-with-dedicated-webhook-token", // optional bearer token for webhook mode sessionRetention: "24h", // duration string or false @@ -617,7 +655,7 @@ openclaw system event --mode now --text "Next heartbeat: check battery." - OpenClaw applies exponential retry backoff for recurring jobs after consecutive errors: 30s, 1m, 5m, 15m, then 60m between retries. - Backoff resets automatically after the next successful run. -- One-shot (`at`) jobs disable after a terminal run (`ok`, `error`, or `skipped`) and do not retry. +- One-shot (`at`) jobs retry transient errors (rate limit, network, timeout, server error) up to 3 times with backoff; permanent errors disable immediately. See [Retry policy](/automation/cron-jobs#retry-policy). 
### Telegram delivers to the wrong place diff --git a/src/config/config-misc.test.ts b/src/config/config-misc.test.ts index 71a82e426..ee083efad 100644 --- a/src/config/config-misc.test.ts +++ b/src/config/config-misc.test.ts @@ -193,6 +193,19 @@ describe("cron webhook schema", () => { expect(res.success).toBe(false); }); + + it("accepts cron.retry config", () => { + const res = OpenClawSchema.safeParse({ + cron: { + retry: { + maxAttempts: 5, + backoffMs: [60000, 120000, 300000], + retryOn: ["rate_limit", "network"], + }, + }, + }); + expect(res.success).toBe(true); + }); }); describe("broadcast", () => { diff --git a/src/config/schema.help.quality.test.ts b/src/config/schema.help.quality.test.ts index 603be7ed7..d10992935 100644 --- a/src/config/schema.help.quality.test.ts +++ b/src/config/schema.help.quality.test.ts @@ -108,6 +108,10 @@ const TARGET_KEYS = [ "cron.enabled", "cron.store", "cron.maxConcurrentRuns", + "cron.retry", + "cron.retry.maxAttempts", + "cron.retry.backoffMs", + "cron.retry.retryOn", "cron.webhook", "cron.webhookToken", "cron.sessionRetention", diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index ef2e06cbe..fbcb86286 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -1064,6 +1064,14 @@ export const FIELD_HELP: Record = { "Path to the cron job store file used to persist scheduled jobs across restarts. Set an explicit path only when you need custom storage layout, backups, or mounted volumes.", "cron.maxConcurrentRuns": "Limits how many cron jobs can execute at the same time when multiple schedules fire together. Use lower values to protect CPU/memory under heavy automation load, or raise carefully for higher throughput.", + "cron.retry": + "Overrides the default retry policy for one-shot jobs when they fail with transient errors (rate limit, network, server_error). 
Omit to use defaults: maxAttempts 3, backoffMs [30000, 60000, 300000], retry all transient types.", + "cron.retry.maxAttempts": + "Max retries for one-shot jobs on transient errors before permanent disable (default: 3).", + "cron.retry.backoffMs": + "Backoff delays in ms for each retry attempt (default: [30000, 60000, 300000]). Use shorter values for faster retries.", + "cron.retry.retryOn": + "Error types to retry: rate_limit, network, timeout, server_error. Use to restrict which errors trigger retries; omit to retry all transient types.", "cron.webhook": 'Deprecated legacy fallback webhook URL used only for old jobs with `notify=true`. Migrate to per-job delivery using `delivery.mode="webhook"` plus `delivery.to`, and avoid relying on this global field.', "cron.webhookToken": diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts index 7005613b6..82ed7d052 100644 --- a/src/config/schema.labels.ts +++ b/src/config/schema.labels.ts @@ -504,6 +504,10 @@ export const FIELD_LABELS: Record = { "cron.enabled": "Cron Enabled", "cron.store": "Cron Store Path", "cron.maxConcurrentRuns": "Cron Max Concurrent Runs", + "cron.retry": "Cron Retry Policy", + "cron.retry.maxAttempts": "Cron Retry Max Attempts", + "cron.retry.backoffMs": "Cron Retry Backoff (ms)", + "cron.retry.retryOn": "Cron Retry Error Types", "cron.webhook": "Cron Legacy Webhook (Deprecated)", "cron.webhookToken": "Cron Webhook Bearer Token", "cron.sessionRetention": "Cron Session Retention", diff --git a/src/config/types.cron.ts b/src/config/types.cron.ts index 300e0c2ce..6568f4ad7 100644 --- a/src/config/types.cron.ts +++ b/src/config/types.cron.ts @@ -1,7 +1,21 @@ +/** Error types that can trigger retries for one-shot jobs. */ +export type CronRetryOn = "rate_limit" | "network" | "timeout" | "server_error"; + +export type CronRetryConfig = { + /** Max retries for transient errors before permanent disable (default: 3). 
*/ + maxAttempts?: number; + /** Backoff delays in ms for each retry attempt (default: [30000, 60000, 300000]). */ + backoffMs?: number[]; + /** Error types to retry; omit to retry all transient types. */ + retryOn?: CronRetryOn[]; +}; + export type CronConfig = { enabled?: boolean; store?: string; maxConcurrentRuns?: number; + /** Override default retry policy for one-shot jobs on transient errors. */ + retry?: CronRetryConfig; /** * Deprecated legacy fallback webhook URL used only for stored jobs with notify=true. * Prefer per-job delivery.mode="webhook" with delivery.to. diff --git a/src/config/zod-schema.ts b/src/config/zod-schema.ts index e072c1fd9..ca9362dbc 100644 --- a/src/config/zod-schema.ts +++ b/src/config/zod-schema.ts @@ -374,6 +374,17 @@ export const OpenClawSchema = z enabled: z.boolean().optional(), store: z.string().optional(), maxConcurrentRuns: z.number().int().positive().optional(), + retry: z + .object({ + maxAttempts: z.number().int().min(0).max(10).optional(), + backoffMs: z.array(z.number().int().nonnegative()).min(1).max(10).optional(), + retryOn: z + .array(z.enum(["rate_limit", "network", "timeout", "server_error"])) + .min(1) + .optional(), + }) + .strict() + .optional(), webhook: HttpUrlSchema.optional(), webhookToken: z.string().optional().register(sensitive), sessionRetention: z.union([z.string(), z.literal(false)]).optional(), diff --git a/src/cron/service.issue-regressions.test.ts b/src/cron/service.issue-regressions.test.ts index 88eef2c9b..09f5cf0b1 100644 --- a/src/cron/service.issue-regressions.test.ts +++ b/src/cron/service.issue-regressions.test.ts @@ -752,6 +752,224 @@ describe("Cron issue regressions", () => { } }); + it("#24355: one-shot job retries on transient error, then succeeds", async () => { + const store = await makeStorePath(); + const scheduledAt = Date.parse("2026-02-06T10:00:00.000Z"); + + const cronJob = createIsolatedRegressionJob({ + id: "oneshot-retry", + name: "reminder", + scheduledAt, + schedule: { kind: 
"at", at: new Date(scheduledAt).toISOString() }, + payload: { kind: "agentTurn", message: "remind me" }, + state: { nextRunAtMs: scheduledAt }, + }); + cronJob.deleteAfterRun = false; + await writeCronJobs(store.storePath, [cronJob]); + + let now = scheduledAt; + const runIsolatedAgentJob = vi + .fn() + .mockResolvedValueOnce({ status: "error", error: "429 rate limit exceeded" }) + .mockResolvedValueOnce({ status: "ok", summary: "done" }); + const state = createCronServiceState({ + cronEnabled: true, + storePath: store.storePath, + log: noopLogger, + nowMs: () => now, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob, + }); + + await onTimer(state); + let job = state.store?.jobs.find((j) => j.id === "oneshot-retry"); + expect(job).toBeDefined(); + expect(job!.enabled).toBe(true); + expect(job!.state.lastStatus).toBe("error"); + expect(job!.state.nextRunAtMs).toBeDefined(); + expect(job!.state.nextRunAtMs).toBeGreaterThan(scheduledAt); + + now = (job!.state.nextRunAtMs ?? 
0) + 1; + await onTimer(state); + job = state.store?.jobs.find((j) => j.id === "oneshot-retry"); + expect(job).toBeDefined(); + expect(job!.state.lastStatus).toBe("ok"); + expect(runIsolatedAgentJob).toHaveBeenCalledTimes(2); + }); + + it("#24355: one-shot job disabled after max transient retries", async () => { + const store = await makeStorePath(); + const scheduledAt = Date.parse("2026-02-06T10:00:00.000Z"); + + const cronJob = createIsolatedRegressionJob({ + id: "oneshot-max-retries", + name: "reminder", + scheduledAt, + schedule: { kind: "at", at: new Date(scheduledAt).toISOString() }, + payload: { kind: "agentTurn", message: "remind me" }, + state: { nextRunAtMs: scheduledAt }, + }); + await writeCronJobs(store.storePath, [cronJob]); + + let now = scheduledAt; + const runIsolatedAgentJob = vi.fn().mockResolvedValue({ + status: "error", + error: "429 rate limit exceeded", + }); + const state = createCronServiceState({ + cronEnabled: true, + storePath: store.storePath, + log: noopLogger, + nowMs: () => now, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob, + }); + + for (let i = 0; i < 4; i++) { + await onTimer(state); + const job = state.store?.jobs.find((j) => j.id === "oneshot-max-retries"); + expect(job).toBeDefined(); + if (i < 3) { + expect(job!.enabled).toBe(true); + now = (job!.state.nextRunAtMs ?? 
now) + 1; + } else { + expect(job!.enabled).toBe(false); + } + } + expect(runIsolatedAgentJob).toHaveBeenCalledTimes(4); + }); + + it("#24355: one-shot job respects cron.retry config", async () => { + const store = await makeStorePath(); + const scheduledAt = Date.parse("2026-02-06T10:00:00.000Z"); + + const cronJob = createIsolatedRegressionJob({ + id: "oneshot-custom-retry", + name: "reminder", + scheduledAt, + schedule: { kind: "at", at: new Date(scheduledAt).toISOString() }, + payload: { kind: "agentTurn", message: "remind me" }, + state: { nextRunAtMs: scheduledAt }, + }); + await writeCronJobs(store.storePath, [cronJob]); + + let now = scheduledAt; + const runIsolatedAgentJob = vi.fn().mockResolvedValue({ + status: "error", + error: "429 rate limit exceeded", + }); + const state = createCronServiceState({ + cronEnabled: true, + storePath: store.storePath, + log: noopLogger, + nowMs: () => now, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob, + cronConfig: { + retry: { maxAttempts: 2, backoffMs: [1000, 2000] }, + }, + }); + + for (let i = 0; i < 4; i++) { + await onTimer(state); + const job = state.store?.jobs.find((j) => j.id === "oneshot-custom-retry"); + expect(job).toBeDefined(); + if (i < 2) { + expect(job!.enabled).toBe(true); + now = (job!.state.nextRunAtMs ?? 
now) + 1; + } else { + expect(job!.enabled).toBe(false); + } + } + expect(runIsolatedAgentJob).toHaveBeenCalledTimes(3); + }); + + it("#24355: one-shot job disabled immediately on permanent error", async () => { + const store = await makeStorePath(); + const scheduledAt = Date.parse("2026-02-06T10:00:00.000Z"); + + const cronJob = createIsolatedRegressionJob({ + id: "oneshot-permanent-error", + name: "reminder", + scheduledAt, + schedule: { kind: "at", at: new Date(scheduledAt).toISOString() }, + payload: { kind: "agentTurn", message: "remind me" }, + state: { nextRunAtMs: scheduledAt }, + }); + await writeCronJobs(store.storePath, [cronJob]); + + let now = scheduledAt; + const state = createCronServiceState({ + cronEnabled: true, + storePath: store.storePath, + log: noopLogger, + nowMs: () => now, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob: vi.fn().mockResolvedValue({ + status: "error", + error: "invalid API key", + }), + }); + + await onTimer(state); + + const job = state.store?.jobs.find((j) => j.id === "oneshot-permanent-error"); + expect(job).toBeDefined(); + expect(job!.enabled).toBe(false); + expect(job!.state.lastStatus).toBe("error"); + expect(job!.state.nextRunAtMs).toBeUndefined(); + }); + + it("#24355: deleteAfterRun:true one-shot job is deleted after successful retry", async () => { + const store = await makeStorePath(); + const scheduledAt = Date.parse("2026-02-06T10:00:00.000Z"); + + const cronJob = createIsolatedRegressionJob({ + id: "oneshot-deleteAfterRun-retry", + name: "reminder", + scheduledAt, + schedule: { kind: "at", at: new Date(scheduledAt).toISOString() }, + payload: { kind: "agentTurn", message: "remind me" }, + state: { nextRunAtMs: scheduledAt }, + }); + cronJob.deleteAfterRun = true; + await writeCronJobs(store.storePath, [cronJob]); + + let now = scheduledAt; + const runIsolatedAgentJob = vi + .fn() + .mockResolvedValueOnce({ status: "error", error: "429 rate limit exceeded" }) + 
.mockResolvedValueOnce({ status: "ok", summary: "done" }); + const state = createCronServiceState({ + cronEnabled: true, + storePath: store.storePath, + log: noopLogger, + nowMs: () => now, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob, + }); + + // First run: transient error → retry scheduled, job still in store. + await onTimer(state); + let job = state.store?.jobs.find((j) => j.id === "oneshot-deleteAfterRun-retry"); + expect(job).toBeDefined(); + expect(job!.enabled).toBe(true); + expect(job!.state.lastStatus).toBe("error"); + expect(job!.state.nextRunAtMs).toBeGreaterThan(scheduledAt); + + // Second run: success → deleteAfterRun removes the job from the store. + now = (job!.state.nextRunAtMs ?? 0) + 1; + await onTimer(state); + const deleted = state.store?.jobs.find((j) => j.id === "oneshot-deleteAfterRun-retry"); + expect(deleted).toBeUndefined(); + expect(runIsolatedAgentJob).toHaveBeenCalledTimes(2); + }); + it("prevents spin loop when cron job completes within the scheduled second (#17821)", async () => { const store = await makeStorePath(); // Simulate a cron job "0 13 * * *" (daily 13:00 UTC) that fires exactly diff --git a/src/cron/service/timer.ts b/src/cron/service/timer.ts index 5d12e96ee..68bcf52cd 100644 --- a/src/cron/service/timer.ts +++ b/src/cron/service/timer.ts @@ -1,3 +1,4 @@ +import type { CronConfig, CronRetryOn } from "../../config/types.cron.js"; import type { HeartbeatRunResult } from "../../infra/heartbeat-wake.js"; import { DEFAULT_AGENT_ID } from "../../routing/session-key.js"; import { resolveCronDeliveryPlan } from "../delivery.js"; @@ -91,7 +92,7 @@ function isAbortError(err: unknown): boolean { * Exponential backoff delays (in ms) indexed by consecutive error count. * After the last entry the delay stays constant. 
*/ -const ERROR_BACKOFF_SCHEDULE_MS = [ +const DEFAULT_BACKOFF_SCHEDULE_MS = [ 30_000, // 1st error → 30 s 60_000, // 2nd error → 1 min 5 * 60_000, // 3rd error → 5 min @@ -99,9 +100,43 @@ const ERROR_BACKOFF_SCHEDULE_MS = [ 60 * 60_000, // 5th+ error → 60 min ]; -function errorBackoffMs(consecutiveErrors: number): number { - const idx = Math.min(consecutiveErrors - 1, ERROR_BACKOFF_SCHEDULE_MS.length - 1); - return ERROR_BACKOFF_SCHEDULE_MS[Math.max(0, idx)]; +function errorBackoffMs( + consecutiveErrors: number, + scheduleMs = DEFAULT_BACKOFF_SCHEDULE_MS, +): number { + const idx = Math.min(consecutiveErrors - 1, scheduleMs.length - 1); + return scheduleMs[Math.max(0, idx)]; +} + +/** Default max retries for one-shot jobs on transient errors (#24355). */ +const DEFAULT_MAX_TRANSIENT_RETRIES = 3; + +const TRANSIENT_PATTERNS: Record = { + rate_limit: /(rate[_ ]limit|too many requests|429|resource has been exhausted|cloudflare)/i, + network: /(network|econnreset|econnrefused|fetch failed|socket)/i, + timeout: /(timeout|etimedout)/i, + server_error: /\b5\d{2}\b/, +}; + +function isTransientCronError(error: string | undefined, retryOn?: CronRetryOn[]): boolean { + if (!error || typeof error !== "string") { + return false; + } + const keys = retryOn?.length ? retryOn : (Object.keys(TRANSIENT_PATTERNS) as CronRetryOn[]); + return keys.some((k) => TRANSIENT_PATTERNS[k]?.test(error)); +} + +function resolveRetryConfig(cronConfig?: CronConfig) { + const retry = cronConfig?.retry; + return { + maxAttempts: + typeof retry?.maxAttempts === "number" ? retry.maxAttempts : DEFAULT_MAX_TRANSIENT_RETRIES, + backoffMs: + Array.isArray(retry?.backoffMs) && retry.backoffMs.length > 0 + ? retry.backoffMs + : DEFAULT_BACKOFF_SCHEDULE_MS.slice(0, 3), + retryOn: Array.isArray(retry?.retryOn) && retry.retryOn.length > 0 ? 
retry.retryOn : undefined, + }; } function resolveDeliveryStatus(params: { job: CronJob; delivered?: boolean }): CronDeliveryStatus { @@ -155,21 +190,47 @@ export function applyJobResult( if (!shouldDelete) { if (job.schedule.kind === "at") { - // One-shot jobs are always disabled after ANY terminal status - // (ok, error, or skipped). This prevents tight-loop rescheduling - // when computeJobNextRunAtMs returns the past atMs value (#11452). - job.enabled = false; - job.state.nextRunAtMs = undefined; - if (result.status === "error") { - state.deps.log.warn( - { - jobId: job.id, - jobName: job.name, - consecutiveErrors: job.state.consecutiveErrors, - error: result.error, - }, - "cron: disabling one-shot job after error", - ); + if (result.status === "ok" || result.status === "skipped") { + // One-shot done or skipped: disable to prevent tight-loop (#11452). + job.enabled = false; + job.state.nextRunAtMs = undefined; + } else if (result.status === "error") { + const retryConfig = resolveRetryConfig(state.deps.cronConfig); + const transient = isTransientCronError(result.error, retryConfig.retryOn); + // consecutiveErrors is always set to ≥1 by the increment block above. + const consecutive = job.state.consecutiveErrors; + if (transient && consecutive <= retryConfig.maxAttempts) { + // Schedule retry with backoff (#24355). + const backoff = errorBackoffMs(consecutive, retryConfig.backoffMs); + job.state.nextRunAtMs = result.endedAt + backoff; + state.deps.log.info( + { + jobId: job.id, + jobName: job.name, + consecutiveErrors: consecutive, + backoffMs: backoff, + nextRunAtMs: job.state.nextRunAtMs, + }, + "cron: scheduling one-shot retry after transient error", + ); + } else { + // Permanent error or max retries exhausted: disable. + // Note: deleteAfterRun:true only triggers on ok (see shouldDelete above), + // so exhausted-retry jobs are disabled but intentionally kept in the store + // to preserve the error state for inspection. 
+ job.enabled = false; + job.state.nextRunAtMs = undefined; + state.deps.log.warn( + { + jobId: job.id, + jobName: job.name, + consecutiveErrors: consecutive, + error: result.error, + reason: transient ? "max retries exhausted" : "permanent error", + }, + "cron: disabling one-shot job after error", + ); + } } } else if (result.status === "error" && job.enabled) { // Apply exponential backoff for errored jobs to prevent retry storms. @@ -474,9 +535,20 @@ function isRunnableJob(params: { return false; } if (params.skipAtIfAlreadyRan && job.schedule.kind === "at" && job.state.lastStatus) { - // Any terminal status (ok, error, skipped) means the job already ran at least once. - // Don't re-fire it on restart — applyJobResult disables one-shot jobs, but guard - // here defensively (#13845). + // One-shot with terminal status: skip unless it's a transient-error retry. + // Retries have nextRunAtMs > lastRunAtMs (scheduled after the failed run) (#24355). + // ok/skipped or error-without-retry always skip (#13845). + const lastRun = job.state.lastRunAtMs; + const nextRun = job.state.nextRunAtMs; + if ( + job.state.lastStatus === "error" && + job.enabled && + typeof nextRun === "number" && + typeof lastRun === "number" && + nextRun > lastRun + ) { + return nowMs >= nextRun; + } return false; } const next = job.state.nextRunAtMs;