fix: keep auth-profile cooldown windows immutable in-window (#23536) (thanks @arosstale)

This commit is contained in:
Peter Steinberger
2026-02-22 13:13:33 +01:00
parent dc69610d51
commit 7c3c406a35
3 changed files with 60 additions and 32 deletions

View File

@@ -19,7 +19,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Auth/Profiles: prevent cooldown deadline from being reset on every retry when the backoff is already saturated. Previously each failed request overwrote `cooldownUntil` with `now + backoffMs`, so a 60-minute cooldown was perpetually extended by cron or inbound retries, trapping the gateway in an unrecoverable loop that required manual `usageStats` deletion to resolve. (#23516)
- Auth/Profiles: keep active `cooldownUntil`/`disabledUntil` windows immutable across retries so mid-window failures cannot extend recovery indefinitely; only recompute a backoff window after the previous deadline has expired. This resolves cron/inbound retry loops that could trap gateways until manual `usageStats` cleanup. (#23516, #23536) Thanks @arosstale.
- Channels/Security: fail closed on missing provider group policy config by defaulting runtime group policy to `allowlist` (instead of inheriting `channels.defaults.groupPolicy`) when `channels.<provider>` is absent across message channels, and align runtime + security warnings/docs to the same fallback behavior (Slack, Discord, iMessage, Telegram, WhatsApp, Signal, LINE, Matrix, Mattermost, Google Chat, IRC, Nextcloud Talk, Feishu, and Zalo user flows; plus Discord message/native-command paths). (#23367) Thanks @bmendonca3.
- Gateway/Onboarding: harden remote gateway onboarding defaults and guidance by defaulting discovered direct URLs to `wss://`, rejecting insecure non-loopback `ws://` targets in onboarding validation, and expanding remote-security remediation messaging across gateway client/call/doctor flows. (#23476) Thanks @bmendonca3.
- CLI/Sessions: pass the configured sessions directory when resolving transcript paths in `agentCommand`, so custom `session.store` locations resume sessions reliably. Thanks @davidrudduck.

View File

@@ -349,12 +349,12 @@ describe("clearAuthProfileCooldown", () => {
});
});
describe("markAuthProfileFailure — cooldown is never reset to an earlier deadline", () => {
describe("markAuthProfileFailure — active windows do not extend on retry", () => {
// Regression for https://github.com/openclaw/openclaw/issues/23516
// When all providers are at saturation backoff (60 min) and retries fire every 30 min,
// each retry was resetting cooldownUntil to now+60m, preventing recovery.
it("does not shorten an existing cooldown when a retry fires mid-window", async () => {
it("keeps an active cooldownUntil unchanged on a mid-window retry", async () => {
const now = 1_000_000;
// Profile already has 50 min remaining on its cooldown
const existingCooldownUntil = now + 50 * 60 * 1000;
@@ -379,19 +379,16 @@ describe("markAuthProfileFailure — cooldown is never reset to an earlier deadl
}
const stats = store.usageStats?.["anthropic:default"];
// cooldownUntil must NOT have been reset to now+60m (= now+3_600_000 < existingCooldownUntil)
// It should remain at the original deadline or be extended, never shortened.
expect(stats?.cooldownUntil).toBeGreaterThanOrEqual(existingCooldownUntil);
expect(stats?.cooldownUntil).toBe(existingCooldownUntil);
});
it("does extend cooldownUntil when the new backoff would end later", async () => {
it("recomputes cooldownUntil when the previous window already expired", async () => {
const now = 1_000_000;
// Profile has only 5 min remaining but the next backoff level gives 60 min
const existingCooldownUntil = now + 5 * 60 * 1000;
const expiredCooldownUntil = now - 60_000;
const store = makeStore({
"anthropic:default": {
cooldownUntil: existingCooldownUntil,
errorCount: 2, // next step: 60-min backoff
cooldownUntil: expiredCooldownUntil,
errorCount: 3, // saturated 60-min backoff
lastFailureAt: now - 60_000,
},
});
@@ -409,11 +406,10 @@ describe("markAuthProfileFailure — cooldown is never reset to an earlier deadl
}
const stats = store.usageStats?.["anthropic:default"];
// now+60min > existingCooldownUntil (now+5min), so it should be extended
expect(stats?.cooldownUntil).toBeGreaterThan(existingCooldownUntil);
expect(stats?.cooldownUntil).toBe(now + 60 * 60 * 1000);
});
it("does not shorten an existing disabledUntil on a billing retry", async () => {
it("keeps an active disabledUntil unchanged on a billing retry", async () => {
const now = 1_000_000;
// Profile already has 20 hours remaining on a billing disable
const existingDisabledUntil = now + 20 * 60 * 60 * 1000;
@@ -440,7 +436,35 @@ describe("markAuthProfileFailure — cooldown is never reset to an earlier deadl
}
const stats = store.usageStats?.["anthropic:default"];
// disabledUntil must not have been shortened
expect(stats?.disabledUntil).toBeGreaterThanOrEqual(existingDisabledUntil);
expect(stats?.disabledUntil).toBe(existingDisabledUntil);
});
it("recomputes disabledUntil when the previous billing window already expired", async () => {
const now = 1_000_000;
const expiredDisabledUntil = now - 60_000;
const store = makeStore({
"anthropic:default": {
disabledUntil: expiredDisabledUntil,
disabledReason: "billing",
errorCount: 5,
failureCounts: { billing: 2 }, // next billing backoff: 20h
lastFailureAt: now - 60_000,
},
});
vi.useFakeTimers();
vi.setSystemTime(now);
try {
await markAuthProfileFailure({
store,
profileId: "anthropic:default",
reason: "billing",
});
} finally {
vi.useRealTimers();
}
const stats = store.usageStats?.["anthropic:default"];
expect(stats?.disabledUntil).toBe(now + 20 * 60 * 60 * 1000);
});
});

View File

@@ -287,25 +287,29 @@ function computeNextProfileUsageStats(params: {
baseMs: params.cfgResolved.billingBackoffMs,
maxMs: params.cfgResolved.billingMaxMs,
});
const newDisabledUntil = params.now + backoffMs;
// Only advance disabledUntil — never shorten an existing window.
// A retry that fires while the profile is already disabled must not reset
// the deadline to an earlier time; it may extend it if the new backoff is longer.
if (!params.existing.disabledUntil || newDisabledUntil > params.existing.disabledUntil) {
updatedStats.disabledUntil = newDisabledUntil;
}
const existingDisabledUntil = params.existing.disabledUntil;
const hasActiveDisabledWindow =
typeof existingDisabledUntil === "number" &&
Number.isFinite(existingDisabledUntil) &&
existingDisabledUntil > params.now;
// Keep active disable windows immutable so retries within the window cannot
// extend recovery time indefinitely.
updatedStats.disabledUntil = hasActiveDisabledWindow
? existingDisabledUntil
: params.now + backoffMs;
updatedStats.disabledReason = "billing";
} else {
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
const newCooldownUntil = params.now + backoffMs;
// Only advance cooldownUntil — never shorten an existing window.
// When the backoff saturates (60 min) and retries fire every 30 min, each
// retry was resetting cooldownUntil to now+60m, preventing the profile from
// ever recovering. We only write a new deadline when it is strictly later
// than the one already in the store.
if (!params.existing.cooldownUntil || newCooldownUntil > params.existing.cooldownUntil) {
updatedStats.cooldownUntil = newCooldownUntil;
}
const existingCooldownUntil = params.existing.cooldownUntil;
const hasActiveCooldownWindow =
typeof existingCooldownUntil === "number" &&
Number.isFinite(existingCooldownUntil) &&
existingCooldownUntil > params.now;
// Keep active cooldown windows immutable so retries within the window
// cannot push recovery further out.
updatedStats.cooldownUntil = hasActiveCooldownWindow
? existingCooldownUntil
: params.now + backoffMs;
}
return updatedStats;