fix: keep auth-profile cooldown windows immutable in-window (#23536) (thanks @arosstale)
This commit is contained in:
@@ -19,7 +19,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Fixes
|
||||
|
||||
- Auth/Profiles: prevent cooldown deadline from being reset on every retry when the backoff is already saturated. Previously each failed request overwrote `cooldownUntil` with `now + backoffMs`, so a 60-minute cooldown was perpetually extended by cron or inbound retries, trapping the gateway in an unrecoverable loop that required manual `usageStats` deletion to resolve. (#23516)
|
||||
- Auth/Profiles: keep active `cooldownUntil`/`disabledUntil` windows immutable across retries so mid-window failures cannot extend recovery indefinitely; only recompute a backoff window after the previous deadline has expired. This resolves cron/inbound retry loops that could trap gateways until manual `usageStats` cleanup. (#23516, #23536) Thanks @arosstale.
|
||||
- Channels/Security: fail closed on missing provider group policy config by defaulting runtime group policy to `allowlist` (instead of inheriting `channels.defaults.groupPolicy`) when `channels.<provider>` is absent across message channels, and align runtime + security warnings/docs to the same fallback behavior (Slack, Discord, iMessage, Telegram, WhatsApp, Signal, LINE, Matrix, Mattermost, Google Chat, IRC, Nextcloud Talk, Feishu, and Zalo user flows; plus Discord message/native-command paths). (#23367) Thanks @bmendonca3.
|
||||
- Gateway/Onboarding: harden remote gateway onboarding defaults and guidance by defaulting discovered direct URLs to `wss://`, rejecting insecure non-loopback `ws://` targets in onboarding validation, and expanding remote-security remediation messaging across gateway client/call/doctor flows. (#23476) Thanks @bmendonca3.
|
||||
- CLI/Sessions: pass the configured sessions directory when resolving transcript paths in `agentCommand`, so custom `session.store` locations resume sessions reliably. Thanks @davidrudduck.
|
||||
|
||||
@@ -349,12 +349,12 @@ describe("clearAuthProfileCooldown", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("markAuthProfileFailure — cooldown is never reset to an earlier deadline", () => {
|
||||
describe("markAuthProfileFailure — active windows do not extend on retry", () => {
|
||||
// Regression for https://github.com/openclaw/openclaw/issues/23516
|
||||
// When all providers are at saturation backoff (60 min) and retries fire every 30 min,
|
||||
// each retry was resetting cooldownUntil to now+60m, preventing recovery.
|
||||
|
||||
it("does not shorten an existing cooldown when a retry fires mid-window", async () => {
|
||||
it("keeps an active cooldownUntil unchanged on a mid-window retry", async () => {
|
||||
const now = 1_000_000;
|
||||
// Profile already has 50 min remaining on its cooldown
|
||||
const existingCooldownUntil = now + 50 * 60 * 1000;
|
||||
@@ -379,19 +379,16 @@ describe("markAuthProfileFailure — cooldown is never reset to an earlier deadl
|
||||
}
|
||||
|
||||
const stats = store.usageStats?.["anthropic:default"];
|
||||
// cooldownUntil must NOT have been reset to now+60m (= now+3_600_000 < existingCooldownUntil)
|
||||
// It should remain at the original deadline or be extended, never shortened.
|
||||
expect(stats?.cooldownUntil).toBeGreaterThanOrEqual(existingCooldownUntil);
|
||||
expect(stats?.cooldownUntil).toBe(existingCooldownUntil);
|
||||
});
|
||||
|
||||
it("does extend cooldownUntil when the new backoff would end later", async () => {
|
||||
it("recomputes cooldownUntil when the previous window already expired", async () => {
|
||||
const now = 1_000_000;
|
||||
// Profile has only 5 min remaining but the next backoff level gives 60 min
|
||||
const existingCooldownUntil = now + 5 * 60 * 1000;
|
||||
const expiredCooldownUntil = now - 60_000;
|
||||
const store = makeStore({
|
||||
"anthropic:default": {
|
||||
cooldownUntil: existingCooldownUntil,
|
||||
errorCount: 2, // next step: 60-min backoff
|
||||
cooldownUntil: expiredCooldownUntil,
|
||||
errorCount: 3, // saturated 60-min backoff
|
||||
lastFailureAt: now - 60_000,
|
||||
},
|
||||
});
|
||||
@@ -409,11 +406,10 @@ describe("markAuthProfileFailure — cooldown is never reset to an earlier deadl
|
||||
}
|
||||
|
||||
const stats = store.usageStats?.["anthropic:default"];
|
||||
// now+60min > existingCooldownUntil (now+5min), so it should be extended
|
||||
expect(stats?.cooldownUntil).toBeGreaterThan(existingCooldownUntil);
|
||||
expect(stats?.cooldownUntil).toBe(now + 60 * 60 * 1000);
|
||||
});
|
||||
|
||||
it("does not shorten an existing disabledUntil on a billing retry", async () => {
|
||||
it("keeps an active disabledUntil unchanged on a billing retry", async () => {
|
||||
const now = 1_000_000;
|
||||
// Profile already has 20 hours remaining on a billing disable
|
||||
const existingDisabledUntil = now + 20 * 60 * 60 * 1000;
|
||||
@@ -440,7 +436,35 @@ describe("markAuthProfileFailure — cooldown is never reset to an earlier deadl
|
||||
}
|
||||
|
||||
const stats = store.usageStats?.["anthropic:default"];
|
||||
// disabledUntil must not have been shortened
|
||||
expect(stats?.disabledUntil).toBeGreaterThanOrEqual(existingDisabledUntil);
|
||||
expect(stats?.disabledUntil).toBe(existingDisabledUntil);
|
||||
});
|
||||
|
||||
it("recomputes disabledUntil when the previous billing window already expired", async () => {
|
||||
const now = 1_000_000;
|
||||
const expiredDisabledUntil = now - 60_000;
|
||||
const store = makeStore({
|
||||
"anthropic:default": {
|
||||
disabledUntil: expiredDisabledUntil,
|
||||
disabledReason: "billing",
|
||||
errorCount: 5,
|
||||
failureCounts: { billing: 2 }, // next billing backoff: 20h
|
||||
lastFailureAt: now - 60_000,
|
||||
},
|
||||
});
|
||||
|
||||
vi.useFakeTimers();
|
||||
vi.setSystemTime(now);
|
||||
try {
|
||||
await markAuthProfileFailure({
|
||||
store,
|
||||
profileId: "anthropic:default",
|
||||
reason: "billing",
|
||||
});
|
||||
} finally {
|
||||
vi.useRealTimers();
|
||||
}
|
||||
|
||||
const stats = store.usageStats?.["anthropic:default"];
|
||||
expect(stats?.disabledUntil).toBe(now + 20 * 60 * 60 * 1000);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -287,25 +287,29 @@ function computeNextProfileUsageStats(params: {
|
||||
baseMs: params.cfgResolved.billingBackoffMs,
|
||||
maxMs: params.cfgResolved.billingMaxMs,
|
||||
});
|
||||
const newDisabledUntil = params.now + backoffMs;
|
||||
// Only advance disabledUntil — never shorten an existing window.
|
||||
// A retry that fires while the profile is already disabled must not reset
|
||||
// the deadline to an earlier time; it may extend it if the new backoff is longer.
|
||||
if (!params.existing.disabledUntil || newDisabledUntil > params.existing.disabledUntil) {
|
||||
updatedStats.disabledUntil = newDisabledUntil;
|
||||
}
|
||||
const existingDisabledUntil = params.existing.disabledUntil;
|
||||
const hasActiveDisabledWindow =
|
||||
typeof existingDisabledUntil === "number" &&
|
||||
Number.isFinite(existingDisabledUntil) &&
|
||||
existingDisabledUntil > params.now;
|
||||
// Keep active disable windows immutable so retries within the window cannot
|
||||
// extend recovery time indefinitely.
|
||||
updatedStats.disabledUntil = hasActiveDisabledWindow
|
||||
? existingDisabledUntil
|
||||
: params.now + backoffMs;
|
||||
updatedStats.disabledReason = "billing";
|
||||
} else {
|
||||
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
|
||||
const newCooldownUntil = params.now + backoffMs;
|
||||
// Only advance cooldownUntil — never shorten an existing window.
|
||||
// When the backoff saturates (60 min) and retries fire every 30 min, each
|
||||
// retry was resetting cooldownUntil to now+60m, preventing the profile from
|
||||
// ever recovering. We only write a new deadline when it is strictly later
|
||||
// than the one already in the store.
|
||||
if (!params.existing.cooldownUntil || newCooldownUntil > params.existing.cooldownUntil) {
|
||||
updatedStats.cooldownUntil = newCooldownUntil;
|
||||
}
|
||||
const existingCooldownUntil = params.existing.cooldownUntil;
|
||||
const hasActiveCooldownWindow =
|
||||
typeof existingCooldownUntil === "number" &&
|
||||
Number.isFinite(existingCooldownUntil) &&
|
||||
existingCooldownUntil > params.now;
|
||||
// Keep active cooldown windows immutable so retries within the window
|
||||
// cannot push recovery further out.
|
||||
updatedStats.cooldownUntil = hasActiveCooldownWindow
|
||||
? existingCooldownUntil
|
||||
: params.now + backoffMs;
|
||||
}
|
||||
|
||||
return updatedStats;
|
||||
|
||||
Reference in New Issue
Block a user