fix: treat HTTP 502/503/504 as failover-eligible (timeout reason) (#21017)
* fix: treat HTTP 502/503/504 as failover-eligible (timeout reason) When a model API returns 502 Bad Gateway, 503 Service Unavailable, or 504 Gateway Timeout, the error object carries the status code directly. resolveFailoverReasonFromError() only checked 402/429/401/403/408/400, so 5xx server errors fell through to message-based classification which requires the status code to appear at the start of the error message. Many API SDKs (Google, Anthropic) set err.status = 503 without prefixing the message with '503', so the message classifier never matched and failover never triggered — the run retried the same broken model. Add 502/503/504 to the status-code branch, returning 'timeout' (matching the existing behavior of isTransientHttpError in the message classifier). Fixes #20999 * Changelog: add failover 502/503/504 note with credits * Failover: classify HTTP 504 as transient in message parser * Changelog: credit taw0002 and vincentkoc for failover fix --------- Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
This commit is contained in:
@@ -224,6 +224,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Agents/Diagnostics: include resolved lifecycle error text in `embedded run agent end` warnings so UI/TUI “Connection error” runs expose actionable provider failure reasons in gateway logs. (#23054) Thanks @Raize.
|
||||
- Agents/Auth profiles: resolve `agentCommand` session scope before choosing `agentDir`/workspace so resumed runs no longer read auth from `agents/main/agent` when the resolved session belongs to a different/default agent (for example `agent:exec:*` sessions). (#24016) Thanks @abersonFAC.
|
||||
- Agents/Auth profiles: skip auth-profile cooldown writes for timeout failures in embedded runner rotation so model/network timeouts do not poison same-provider fallback model selection while still allowing in-turn account rotation. (#22622) Thanks @vageeshkumar.
|
||||
- Agents/Failover: treat HTTP 502/503/504 errors as failover-eligible transient timeouts so fallback chains can switch providers/models during upstream outages instead of retrying the same failing target. (#20999) Thanks @taw0002 and @vincentkoc.
|
||||
- Plugins/Hooks: run legacy `before_agent_start` once per agent turn and reuse that result across model-resolve and prompt-build compatibility paths, preventing duplicate hook side effects (for example duplicate external API calls). (#23289) Thanks @ksato8710.
|
||||
- Models/Config: default missing Anthropic provider/model `api` fields to `anthropic-messages` during config validation so custom relay model entries are preserved instead of being dropped by runtime model registry validation. (#23332) Thanks @bigbigmonkey123.
|
||||
- Gateway/Pairing: preserve existing approved token scopes when processing repair pairings that omit `scopes`, preventing empty-scope token regressions on reconnecting clients. (#21906) Thanks @paki81.
|
||||
|
||||
@@ -13,7 +13,10 @@ describe("failover-error", () => {
|
||||
expect(resolveFailoverReasonFromError({ status: 403 })).toBe("auth");
|
||||
expect(resolveFailoverReasonFromError({ status: 408 })).toBe("timeout");
|
||||
expect(resolveFailoverReasonFromError({ status: 400 })).toBe("format");
|
||||
// Transient server errors (502/503/504) should trigger failover as timeout.
|
||||
expect(resolveFailoverReasonFromError({ status: 502 })).toBe("timeout");
|
||||
expect(resolveFailoverReasonFromError({ status: 503 })).toBe("timeout");
|
||||
expect(resolveFailoverReasonFromError({ status: 504 })).toBe("timeout");
|
||||
});
|
||||
|
||||
it("infers format errors from error messages", () => {
|
||||
|
||||
@@ -163,7 +163,7 @@ export function resolveFailoverReasonFromError(err: unknown): FailoverReason | n
|
||||
if (status === 408) {
|
||||
return "timeout";
|
||||
}
|
||||
if (status === 503) {
|
||||
if (status === 502 || status === 503 || status === 504) {
|
||||
return "timeout";
|
||||
}
|
||||
if (status === 400) {
|
||||
|
||||
@@ -270,12 +270,12 @@ describe("isTransientHttpError", () => {
|
||||
expect(isTransientHttpError("500 Internal Server Error")).toBe(true);
|
||||
expect(isTransientHttpError("502 Bad Gateway")).toBe(true);
|
||||
expect(isTransientHttpError("503 Service Unavailable")).toBe(true);
|
||||
expect(isTransientHttpError("504 Gateway Timeout")).toBe(true);
|
||||
expect(isTransientHttpError("521 <!DOCTYPE html><html></html>")).toBe(true);
|
||||
expect(isTransientHttpError("529 Overloaded")).toBe(true);
|
||||
});
|
||||
|
||||
it("returns false for non-retryable or non-http text", () => {
|
||||
expect(isTransientHttpError("504 Gateway Timeout")).toBe(false);
|
||||
expect(isTransientHttpError("429 Too Many Requests")).toBe(false);
|
||||
expect(isTransientHttpError("network timeout")).toBe(false);
|
||||
});
|
||||
|
||||
@@ -120,7 +120,7 @@ const HTTP_STATUS_PREFIX_RE = /^(?:http\s*)?(\d{3})\s+(.+)$/i;
|
||||
const HTTP_STATUS_CODE_PREFIX_RE = /^(?:http\s*)?(\d{3})(?:\s+([\s\S]+))?$/i;
|
||||
const HTML_ERROR_PREFIX_RE = /^\s*(?:<!doctype\s+html\b|<html\b)/i;
|
||||
const CLOUDFLARE_HTML_ERROR_CODES = new Set([521, 522, 523, 524, 525, 526, 530]);
|
||||
const TRANSIENT_HTTP_ERROR_CODES = new Set([500, 502, 503, 521, 522, 523, 524, 529]);
|
||||
const TRANSIENT_HTTP_ERROR_CODES = new Set([500, 502, 503, 504, 521, 522, 523, 524, 529]);
|
||||
const HTTP_ERROR_HINTS = [
|
||||
"error",
|
||||
"bad request",
|
||||
|
||||
Reference in New Issue
Block a user