From 7c3c406a35137ca7bc03de0ccd3607f22f089b82 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 22 Feb 2026 13:13:33 +0100 Subject: [PATCH] fix: keep auth-profile cooldown windows immutable in-window (#23536) (thanks @arosstale) --- CHANGELOG.md | 2 +- src/agents/auth-profiles/usage.test.ts | 54 +++++++++++++++++++------- src/agents/auth-profiles/usage.ts | 36 +++++++++-------- 3 files changed, 60 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d1449580f..f37f68357 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ Docs: https://docs.openclaw.ai ### Fixes -- Auth/Profiles: prevent cooldown deadline from being reset on every retry when the backoff is already saturated. Previously each failed request overwrote `cooldownUntil` with `now + backoffMs`, so a 60-minute cooldown was perpetually extended by cron or inbound retries, trapping the gateway in an unrecoverable loop that required manual `usageStats` deletion to resolve. (#23516) +- Auth/Profiles: keep active `cooldownUntil`/`disabledUntil` windows immutable across retries so mid-window failures cannot extend recovery indefinitely; only recompute a backoff window after the previous deadline has expired. This resolves cron/inbound retry loops that could trap gateways until manual `usageStats` cleanup. (#23516, #23536) Thanks @arosstale. - Channels/Security: fail closed on missing provider group policy config by defaulting runtime group policy to `allowlist` (instead of inheriting `channels.defaults.groupPolicy`) when `channels.<provider>` is absent across message channels, and align runtime + security warnings/docs to the same fallback behavior (Slack, Discord, iMessage, Telegram, WhatsApp, Signal, LINE, Matrix, Mattermost, Google Chat, IRC, Nextcloud Talk, Feishu, and Zalo user flows; plus Discord message/native-command paths). (#23367) Thanks @bmendonca3. 
- Gateway/Onboarding: harden remote gateway onboarding defaults and guidance by defaulting discovered direct URLs to `wss://`, rejecting insecure non-loopback `ws://` targets in onboarding validation, and expanding remote-security remediation messaging across gateway client/call/doctor flows. (#23476) Thanks @bmendonca3. - CLI/Sessions: pass the configured sessions directory when resolving transcript paths in `agentCommand`, so custom `session.store` locations resume sessions reliably. Thanks @davidrudduck. diff --git a/src/agents/auth-profiles/usage.test.ts b/src/agents/auth-profiles/usage.test.ts index 597cb2d7a..6ae71d954 100644 --- a/src/agents/auth-profiles/usage.test.ts +++ b/src/agents/auth-profiles/usage.test.ts @@ -349,12 +349,12 @@ describe("clearAuthProfileCooldown", () => { }); }); -describe("markAuthProfileFailure — cooldown is never reset to an earlier deadline", () => { +describe("markAuthProfileFailure — active windows do not extend on retry", () => { // Regression for https://github.com/openclaw/openclaw/issues/23516 // When all providers are at saturation backoff (60 min) and retries fire every 30 min, // each retry was resetting cooldownUntil to now+60m, preventing recovery. - it("does not shorten an existing cooldown when a retry fires mid-window", async () => { + it("keeps an active cooldownUntil unchanged on a mid-window retry", async () => { const now = 1_000_000; // Profile already has 50 min remaining on its cooldown const existingCooldownUntil = now + 50 * 60 * 1000; @@ -379,19 +379,16 @@ describe("markAuthProfileFailure — cooldown is never reset to an earlier deadl } const stats = store.usageStats?.["anthropic:default"]; - // cooldownUntil must NOT have been reset to now+60m (= now+3_600_000 < existingCooldownUntil) - // It should remain at the original deadline or be extended, never shortened. 
- expect(stats?.cooldownUntil).toBeGreaterThanOrEqual(existingCooldownUntil); + expect(stats?.cooldownUntil).toBe(existingCooldownUntil); }); - it("does extend cooldownUntil when the new backoff would end later", async () => { + it("recomputes cooldownUntil when the previous window already expired", async () => { const now = 1_000_000; - // Profile has only 5 min remaining but the next backoff level gives 60 min - const existingCooldownUntil = now + 5 * 60 * 1000; + const expiredCooldownUntil = now - 60_000; const store = makeStore({ "anthropic:default": { - cooldownUntil: existingCooldownUntil, - errorCount: 2, // next step: 60-min backoff + cooldownUntil: expiredCooldownUntil, + errorCount: 3, // saturated 60-min backoff lastFailureAt: now - 60_000, }, }); @@ -409,11 +406,10 @@ describe("markAuthProfileFailure — cooldown is never reset to an earlier deadl } const stats = store.usageStats?.["anthropic:default"]; - // now+60min > existingCooldownUntil (now+5min), so it should be extended - expect(stats?.cooldownUntil).toBeGreaterThan(existingCooldownUntil); + expect(stats?.cooldownUntil).toBe(now + 60 * 60 * 1000); }); - it("does not shorten an existing disabledUntil on a billing retry", async () => { + it("keeps an active disabledUntil unchanged on a billing retry", async () => { const now = 1_000_000; // Profile already has 20 hours remaining on a billing disable const existingDisabledUntil = now + 20 * 60 * 60 * 1000; @@ -440,7 +436,35 @@ describe("markAuthProfileFailure — cooldown is never reset to an earlier deadl } const stats = store.usageStats?.["anthropic:default"]; - // disabledUntil must not have been shortened - expect(stats?.disabledUntil).toBeGreaterThanOrEqual(existingDisabledUntil); + expect(stats?.disabledUntil).toBe(existingDisabledUntil); + }); + + it("recomputes disabledUntil when the previous billing window already expired", async () => { + const now = 1_000_000; + const expiredDisabledUntil = now - 60_000; + const store = makeStore({ + 
"anthropic:default": { + disabledUntil: expiredDisabledUntil, + disabledReason: "billing", + errorCount: 5, + failureCounts: { billing: 2 }, // next billing backoff: 20h + lastFailureAt: now - 60_000, + }, + }); + + vi.useFakeTimers(); + vi.setSystemTime(now); + try { + await markAuthProfileFailure({ + store, + profileId: "anthropic:default", + reason: "billing", + }); + } finally { + vi.useRealTimers(); + } + + const stats = store.usageStats?.["anthropic:default"]; + expect(stats?.disabledUntil).toBe(now + 20 * 60 * 60 * 1000); }); }); diff --git a/src/agents/auth-profiles/usage.ts b/src/agents/auth-profiles/usage.ts index 509710f4f..202780690 100644 --- a/src/agents/auth-profiles/usage.ts +++ b/src/agents/auth-profiles/usage.ts @@ -287,25 +287,29 @@ function computeNextProfileUsageStats(params: { baseMs: params.cfgResolved.billingBackoffMs, maxMs: params.cfgResolved.billingMaxMs, }); - const newDisabledUntil = params.now + backoffMs; - // Only advance disabledUntil — never shorten an existing window. - // A retry that fires while the profile is already disabled must not reset - // the deadline to an earlier time; it may extend it if the new backoff is longer. - if (!params.existing.disabledUntil || newDisabledUntil > params.existing.disabledUntil) { - updatedStats.disabledUntil = newDisabledUntil; - } + const existingDisabledUntil = params.existing.disabledUntil; + const hasActiveDisabledWindow = + typeof existingDisabledUntil === "number" && + Number.isFinite(existingDisabledUntil) && + existingDisabledUntil > params.now; + // Keep active disable windows immutable so retries within the window cannot + // extend recovery time indefinitely. + updatedStats.disabledUntil = hasActiveDisabledWindow + ? 
existingDisabledUntil + : params.now + backoffMs; updatedStats.disabledReason = "billing"; } else { const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount); - const newCooldownUntil = params.now + backoffMs; - // Only advance cooldownUntil — never shorten an existing window. - // When the backoff saturates (60 min) and retries fire every 30 min, each - // retry was resetting cooldownUntil to now+60m, preventing the profile from - // ever recovering. We only write a new deadline when it is strictly later - // than the one already in the store. - if (!params.existing.cooldownUntil || newCooldownUntil > params.existing.cooldownUntil) { - updatedStats.cooldownUntil = newCooldownUntil; - } + const existingCooldownUntil = params.existing.cooldownUntil; + const hasActiveCooldownWindow = + typeof existingCooldownUntil === "number" && + Number.isFinite(existingCooldownUntil) && + existingCooldownUntil > params.now; + // Keep active cooldown windows immutable so retries within the window + // cannot push recovery further out. + updatedStats.cooldownUntil = hasActiveCooldownWindow + ? existingCooldownUntil + : params.now + backoffMs; } return updatedStats;