Files
Moltbot/src/agents/model-fallback.ts
2026-02-25 03:46:34 +00:00

486 lines
15 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import type { OpenClawConfig } from "../config/config.js";
import {
resolveAgentModelFallbackValues,
resolveAgentModelPrimaryValue,
} from "../config/model-input.js";
import {
ensureAuthProfileStore,
getSoonestCooldownExpiry,
isProfileInCooldown,
resolveProfilesUnavailableReason,
resolveAuthProfileOrder,
} from "./auth-profiles.js";
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
import {
coerceToFailoverError,
describeFailoverError,
isFailoverError,
isTimeoutError,
} from "./failover-error.js";
import {
buildConfiguredAllowlistKeys,
buildModelAliasIndex,
modelKey,
normalizeModelRef,
resolveConfiguredModelRef,
resolveModelRefFromString,
} from "./model-selection.js";
import type { FailoverReason } from "./pi-embedded-helpers.js";
import { isLikelyContextOverflowError } from "./pi-embedded-helpers.js";
type ModelCandidate = {
provider: string;
model: string;
};
type FallbackAttempt = {
provider: string;
model: string;
error: string;
reason?: FailoverReason;
status?: number;
code?: string;
};
/**
* Fallback abort check. Only treats explicit AbortError names as user aborts.
* Message-based checks (e.g., "aborted") can mask timeouts and skip fallback.
*/
function isFallbackAbortError(err: unknown): boolean {
if (!err || typeof err !== "object") {
return false;
}
if (isFailoverError(err)) {
return false;
}
const name = "name" in err ? String(err.name) : "";
return name === "AbortError";
}
function shouldRethrowAbort(err: unknown): boolean {
return isFallbackAbortError(err) && !isTimeoutError(err);
}
function createModelCandidateCollector(allowlist: Set<string> | null | undefined): {
candidates: ModelCandidate[];
addCandidate: (candidate: ModelCandidate, enforceAllowlist: boolean) => void;
} {
const seen = new Set<string>();
const candidates: ModelCandidate[] = [];
const addCandidate = (candidate: ModelCandidate, enforceAllowlist: boolean) => {
if (!candidate.provider || !candidate.model) {
return;
}
const key = modelKey(candidate.provider, candidate.model);
if (seen.has(key)) {
return;
}
if (enforceAllowlist && allowlist && !allowlist.has(key)) {
return;
}
seen.add(key);
candidates.push(candidate);
};
return { candidates, addCandidate };
}
type ModelFallbackErrorHandler = (attempt: {
provider: string;
model: string;
error: unknown;
attempt: number;
total: number;
}) => void | Promise<void>;
type ModelFallbackRunResult<T> = {
result: T;
provider: string;
model: string;
attempts: FallbackAttempt[];
};
function sameModelCandidate(a: ModelCandidate, b: ModelCandidate): boolean {
return a.provider === b.provider && a.model === b.model;
}
function throwFallbackFailureSummary(params: {
attempts: FallbackAttempt[];
candidates: ModelCandidate[];
lastError: unknown;
label: string;
formatAttempt: (attempt: FallbackAttempt) => string;
}): never {
if (params.attempts.length <= 1 && params.lastError) {
throw params.lastError;
}
const summary =
params.attempts.length > 0 ? params.attempts.map(params.formatAttempt).join(" | ") : "unknown";
throw new Error(
`All ${params.label} failed (${params.attempts.length || params.candidates.length}): ${summary}`,
{
cause: params.lastError instanceof Error ? params.lastError : undefined,
},
);
}
function resolveImageFallbackCandidates(params: {
cfg: OpenClawConfig | undefined;
defaultProvider: string;
modelOverride?: string;
}): ModelCandidate[] {
const aliasIndex = buildModelAliasIndex({
cfg: params.cfg ?? {},
defaultProvider: params.defaultProvider,
});
const allowlist = buildConfiguredAllowlistKeys({
cfg: params.cfg,
defaultProvider: params.defaultProvider,
});
const { candidates, addCandidate } = createModelCandidateCollector(allowlist);
const addRaw = (raw: string, enforceAllowlist: boolean) => {
const resolved = resolveModelRefFromString({
raw: String(raw ?? ""),
defaultProvider: params.defaultProvider,
aliasIndex,
});
if (!resolved) {
return;
}
addCandidate(resolved.ref, enforceAllowlist);
};
if (params.modelOverride?.trim()) {
addRaw(params.modelOverride, false);
} else {
const primary = resolveAgentModelPrimaryValue(params.cfg?.agents?.defaults?.imageModel);
if (primary?.trim()) {
addRaw(primary, false);
}
}
const imageFallbacks = resolveAgentModelFallbackValues(params.cfg?.agents?.defaults?.imageModel);
for (const raw of imageFallbacks) {
// Explicitly configured image fallbacks should remain reachable even when a
// model allowlist is present.
addRaw(raw, false);
}
return candidates;
}
function resolveFallbackCandidates(params: {
cfg: OpenClawConfig | undefined;
provider: string;
model: string;
/** Optional explicit fallbacks list; when provided (even empty), replaces agents.defaults.model.fallbacks. */
fallbacksOverride?: string[];
}): ModelCandidate[] {
const primary = params.cfg
? resolveConfiguredModelRef({
cfg: params.cfg,
defaultProvider: DEFAULT_PROVIDER,
defaultModel: DEFAULT_MODEL,
})
: null;
const defaultProvider = primary?.provider ?? DEFAULT_PROVIDER;
const defaultModel = primary?.model ?? DEFAULT_MODEL;
const providerRaw = String(params.provider ?? "").trim() || defaultProvider;
const modelRaw = String(params.model ?? "").trim() || defaultModel;
const normalizedPrimary = normalizeModelRef(providerRaw, modelRaw);
const configuredPrimary = normalizeModelRef(defaultProvider, defaultModel);
const aliasIndex = buildModelAliasIndex({
cfg: params.cfg ?? {},
defaultProvider,
});
const allowlist = buildConfiguredAllowlistKeys({
cfg: params.cfg,
defaultProvider,
});
const { candidates, addCandidate } = createModelCandidateCollector(allowlist);
addCandidate(normalizedPrimary, false);
const modelFallbacks = (() => {
if (params.fallbacksOverride !== undefined) {
return params.fallbacksOverride;
}
const configuredFallbacks = resolveAgentModelFallbackValues(
params.cfg?.agents?.defaults?.model,
);
if (sameModelCandidate(normalizedPrimary, configuredPrimary)) {
return configuredFallbacks;
}
// Preserve resilience after failover: when current model is one of the
// configured fallback refs, keep traversing the configured fallback chain.
const isConfiguredFallback = configuredFallbacks.some((raw) => {
const resolved = resolveModelRefFromString({
raw: String(raw ?? ""),
defaultProvider,
aliasIndex,
});
return resolved ? sameModelCandidate(resolved.ref, normalizedPrimary) : false;
});
// Keep legacy override behavior for ad-hoc models outside configured chain.
return isConfiguredFallback ? configuredFallbacks : [];
})();
for (const raw of modelFallbacks) {
const resolved = resolveModelRefFromString({
raw: String(raw ?? ""),
defaultProvider,
aliasIndex,
});
if (!resolved) {
continue;
}
// Fallbacks are explicit user intent; do not silently filter them by the
// model allowlist.
addCandidate(resolved.ref, false);
}
if (params.fallbacksOverride === undefined && primary?.provider && primary.model) {
addCandidate({ provider: primary.provider, model: primary.model }, false);
}
return candidates;
}
const lastProbeAttempt = new Map<string, number>();
const MIN_PROBE_INTERVAL_MS = 30_000; // 30 seconds between probes per key
const PROBE_MARGIN_MS = 2 * 60 * 1000;
const PROBE_SCOPE_DELIMITER = "::";
function resolveProbeThrottleKey(provider: string, agentDir?: string): string {
const scope = String(agentDir ?? "").trim();
return scope ? `${scope}${PROBE_SCOPE_DELIMITER}${provider}` : provider;
}
function shouldProbePrimaryDuringCooldown(params: {
isPrimary: boolean;
hasFallbackCandidates: boolean;
now: number;
throttleKey: string;
authStore: ReturnType<typeof ensureAuthProfileStore>;
profileIds: string[];
}): boolean {
if (!params.isPrimary || !params.hasFallbackCandidates) {
return false;
}
const lastProbe = lastProbeAttempt.get(params.throttleKey) ?? 0;
if (params.now - lastProbe < MIN_PROBE_INTERVAL_MS) {
return false;
}
const soonest = getSoonestCooldownExpiry(params.authStore, params.profileIds);
if (soonest === null || !Number.isFinite(soonest)) {
return true;
}
// Probe when cooldown already expired or within the configured margin.
return params.now >= soonest - PROBE_MARGIN_MS;
}
/** @internal exposed for unit tests only */
export const _probeThrottleInternals = {
lastProbeAttempt,
MIN_PROBE_INTERVAL_MS,
PROBE_MARGIN_MS,
resolveProbeThrottleKey,
} as const;
export async function runWithModelFallback<T>(params: {
cfg: OpenClawConfig | undefined;
provider: string;
model: string;
agentDir?: string;
/** Optional explicit fallbacks list; when provided (even empty), replaces agents.defaults.model.fallbacks. */
fallbacksOverride?: string[];
run: (provider: string, model: string) => Promise<T>;
onError?: ModelFallbackErrorHandler;
}): Promise<ModelFallbackRunResult<T>> {
const candidates = resolveFallbackCandidates({
cfg: params.cfg,
provider: params.provider,
model: params.model,
fallbacksOverride: params.fallbacksOverride,
});
const authStore = params.cfg
? ensureAuthProfileStore(params.agentDir, { allowKeychainPrompt: false })
: null;
const attempts: FallbackAttempt[] = [];
let lastError: unknown;
const hasFallbackCandidates = candidates.length > 1;
for (let i = 0; i < candidates.length; i += 1) {
const candidate = candidates[i];
if (authStore) {
const profileIds = resolveAuthProfileOrder({
cfg: params.cfg,
store: authStore,
provider: candidate.provider,
});
const isAnyProfileAvailable = profileIds.some((id) => !isProfileInCooldown(authStore, id));
if (profileIds.length > 0 && !isAnyProfileAvailable) {
// All profiles for this provider are in cooldown.
// For the primary model (i === 0), probe it if the soonest cooldown
// expiry is close or already past. This avoids staying on a fallback
// model long after the real rate-limit window clears.
const now = Date.now();
const probeThrottleKey = resolveProbeThrottleKey(candidate.provider, params.agentDir);
const shouldProbe = shouldProbePrimaryDuringCooldown({
isPrimary: i === 0,
hasFallbackCandidates,
now,
throttleKey: probeThrottleKey,
authStore,
profileIds,
});
if (!shouldProbe) {
const inferredReason =
resolveProfilesUnavailableReason({
store: authStore,
profileIds,
now,
}) ?? "rate_limit";
// Skip without attempting
attempts.push({
provider: candidate.provider,
model: candidate.model,
error: `Provider ${candidate.provider} is in cooldown (all profiles unavailable)`,
reason: inferredReason,
});
continue;
}
// Primary model probe: attempt it despite cooldown to detect recovery.
// If it fails, the error is caught below and we fall through to the
// next candidate as usual.
lastProbeAttempt.set(probeThrottleKey, now);
}
}
try {
const result = await params.run(candidate.provider, candidate.model);
return {
result,
provider: candidate.provider,
model: candidate.model,
attempts,
};
} catch (err) {
if (shouldRethrowAbort(err)) {
throw err;
}
// Context overflow errors should be handled by the inner runner's
// compaction/retry logic, not by model fallback. If one escapes as a
// throw, rethrow it immediately rather than trying a different model
// that may have a smaller context window and fail worse.
const errMessage = err instanceof Error ? err.message : String(err);
if (isLikelyContextOverflowError(errMessage)) {
throw err;
}
const normalized =
coerceToFailoverError(err, {
provider: candidate.provider,
model: candidate.model,
}) ?? err;
if (!isFailoverError(normalized)) {
throw err;
}
lastError = normalized;
const described = describeFailoverError(normalized);
attempts.push({
provider: candidate.provider,
model: candidate.model,
error: described.message,
reason: described.reason,
status: described.status,
code: described.code,
});
await params.onError?.({
provider: candidate.provider,
model: candidate.model,
error: normalized,
attempt: i + 1,
total: candidates.length,
});
}
}
throwFallbackFailureSummary({
attempts,
candidates,
lastError,
label: "models",
formatAttempt: (attempt) =>
`${attempt.provider}/${attempt.model}: ${attempt.error}${
attempt.reason ? ` (${attempt.reason})` : ""
}`,
});
}
export async function runWithImageModelFallback<T>(params: {
cfg: OpenClawConfig | undefined;
modelOverride?: string;
run: (provider: string, model: string) => Promise<T>;
onError?: ModelFallbackErrorHandler;
}): Promise<ModelFallbackRunResult<T>> {
const candidates = resolveImageFallbackCandidates({
cfg: params.cfg,
defaultProvider: DEFAULT_PROVIDER,
modelOverride: params.modelOverride,
});
if (candidates.length === 0) {
throw new Error(
"No image model configured. Set agents.defaults.imageModel.primary or agents.defaults.imageModel.fallbacks.",
);
}
const attempts: FallbackAttempt[] = [];
let lastError: unknown;
for (let i = 0; i < candidates.length; i += 1) {
const candidate = candidates[i];
try {
const result = await params.run(candidate.provider, candidate.model);
return {
result,
provider: candidate.provider,
model: candidate.model,
attempts,
};
} catch (err) {
if (shouldRethrowAbort(err)) {
throw err;
}
lastError = err;
attempts.push({
provider: candidate.provider,
model: candidate.model,
error: err instanceof Error ? err.message : String(err),
});
await params.onError?.({
provider: candidate.provider,
model: candidate.model,
error: err,
attempt: i + 1,
total: candidates.length,
});
}
}
throwFallbackFailureSummary({
attempts,
candidates,
lastError,
label: "image models",
formatAttempt: (attempt) => `${attempt.provider}/${attempt.model}: ${attempt.error}`,
});
}