486 lines
15 KiB
TypeScript
486 lines
15 KiB
TypeScript
import type { OpenClawConfig } from "../config/config.js";
|
||
import {
|
||
resolveAgentModelFallbackValues,
|
||
resolveAgentModelPrimaryValue,
|
||
} from "../config/model-input.js";
|
||
import {
|
||
ensureAuthProfileStore,
|
||
getSoonestCooldownExpiry,
|
||
isProfileInCooldown,
|
||
resolveProfilesUnavailableReason,
|
||
resolveAuthProfileOrder,
|
||
} from "./auth-profiles.js";
|
||
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
|
||
import {
|
||
coerceToFailoverError,
|
||
describeFailoverError,
|
||
isFailoverError,
|
||
isTimeoutError,
|
||
} from "./failover-error.js";
|
||
import {
|
||
buildConfiguredAllowlistKeys,
|
||
buildModelAliasIndex,
|
||
modelKey,
|
||
normalizeModelRef,
|
||
resolveConfiguredModelRef,
|
||
resolveModelRefFromString,
|
||
} from "./model-selection.js";
|
||
import type { FailoverReason } from "./pi-embedded-helpers.js";
|
||
import { isLikelyContextOverflowError } from "./pi-embedded-helpers.js";
|
||
|
||
type ModelCandidate = {
|
||
provider: string;
|
||
model: string;
|
||
};
|
||
|
||
type FallbackAttempt = {
|
||
provider: string;
|
||
model: string;
|
||
error: string;
|
||
reason?: FailoverReason;
|
||
status?: number;
|
||
code?: string;
|
||
};
|
||
|
||
/**
|
||
* Fallback abort check. Only treats explicit AbortError names as user aborts.
|
||
* Message-based checks (e.g., "aborted") can mask timeouts and skip fallback.
|
||
*/
|
||
function isFallbackAbortError(err: unknown): boolean {
|
||
if (!err || typeof err !== "object") {
|
||
return false;
|
||
}
|
||
if (isFailoverError(err)) {
|
||
return false;
|
||
}
|
||
const name = "name" in err ? String(err.name) : "";
|
||
return name === "AbortError";
|
||
}
|
||
|
||
function shouldRethrowAbort(err: unknown): boolean {
|
||
return isFallbackAbortError(err) && !isTimeoutError(err);
|
||
}
|
||
|
||
function createModelCandidateCollector(allowlist: Set<string> | null | undefined): {
|
||
candidates: ModelCandidate[];
|
||
addCandidate: (candidate: ModelCandidate, enforceAllowlist: boolean) => void;
|
||
} {
|
||
const seen = new Set<string>();
|
||
const candidates: ModelCandidate[] = [];
|
||
|
||
const addCandidate = (candidate: ModelCandidate, enforceAllowlist: boolean) => {
|
||
if (!candidate.provider || !candidate.model) {
|
||
return;
|
||
}
|
||
const key = modelKey(candidate.provider, candidate.model);
|
||
if (seen.has(key)) {
|
||
return;
|
||
}
|
||
if (enforceAllowlist && allowlist && !allowlist.has(key)) {
|
||
return;
|
||
}
|
||
seen.add(key);
|
||
candidates.push(candidate);
|
||
};
|
||
|
||
return { candidates, addCandidate };
|
||
}
|
||
|
||
type ModelFallbackErrorHandler = (attempt: {
|
||
provider: string;
|
||
model: string;
|
||
error: unknown;
|
||
attempt: number;
|
||
total: number;
|
||
}) => void | Promise<void>;
|
||
|
||
type ModelFallbackRunResult<T> = {
|
||
result: T;
|
||
provider: string;
|
||
model: string;
|
||
attempts: FallbackAttempt[];
|
||
};
|
||
|
||
function sameModelCandidate(a: ModelCandidate, b: ModelCandidate): boolean {
|
||
return a.provider === b.provider && a.model === b.model;
|
||
}
|
||
|
||
function throwFallbackFailureSummary(params: {
|
||
attempts: FallbackAttempt[];
|
||
candidates: ModelCandidate[];
|
||
lastError: unknown;
|
||
label: string;
|
||
formatAttempt: (attempt: FallbackAttempt) => string;
|
||
}): never {
|
||
if (params.attempts.length <= 1 && params.lastError) {
|
||
throw params.lastError;
|
||
}
|
||
const summary =
|
||
params.attempts.length > 0 ? params.attempts.map(params.formatAttempt).join(" | ") : "unknown";
|
||
throw new Error(
|
||
`All ${params.label} failed (${params.attempts.length || params.candidates.length}): ${summary}`,
|
||
{
|
||
cause: params.lastError instanceof Error ? params.lastError : undefined,
|
||
},
|
||
);
|
||
}
|
||
|
||
function resolveImageFallbackCandidates(params: {
|
||
cfg: OpenClawConfig | undefined;
|
||
defaultProvider: string;
|
||
modelOverride?: string;
|
||
}): ModelCandidate[] {
|
||
const aliasIndex = buildModelAliasIndex({
|
||
cfg: params.cfg ?? {},
|
||
defaultProvider: params.defaultProvider,
|
||
});
|
||
const allowlist = buildConfiguredAllowlistKeys({
|
||
cfg: params.cfg,
|
||
defaultProvider: params.defaultProvider,
|
||
});
|
||
const { candidates, addCandidate } = createModelCandidateCollector(allowlist);
|
||
|
||
const addRaw = (raw: string, enforceAllowlist: boolean) => {
|
||
const resolved = resolveModelRefFromString({
|
||
raw: String(raw ?? ""),
|
||
defaultProvider: params.defaultProvider,
|
||
aliasIndex,
|
||
});
|
||
if (!resolved) {
|
||
return;
|
||
}
|
||
addCandidate(resolved.ref, enforceAllowlist);
|
||
};
|
||
|
||
if (params.modelOverride?.trim()) {
|
||
addRaw(params.modelOverride, false);
|
||
} else {
|
||
const primary = resolveAgentModelPrimaryValue(params.cfg?.agents?.defaults?.imageModel);
|
||
if (primary?.trim()) {
|
||
addRaw(primary, false);
|
||
}
|
||
}
|
||
|
||
const imageFallbacks = resolveAgentModelFallbackValues(params.cfg?.agents?.defaults?.imageModel);
|
||
|
||
for (const raw of imageFallbacks) {
|
||
// Explicitly configured image fallbacks should remain reachable even when a
|
||
// model allowlist is present.
|
||
addRaw(raw, false);
|
||
}
|
||
|
||
return candidates;
|
||
}
|
||
|
||
function resolveFallbackCandidates(params: {
|
||
cfg: OpenClawConfig | undefined;
|
||
provider: string;
|
||
model: string;
|
||
/** Optional explicit fallbacks list; when provided (even empty), replaces agents.defaults.model.fallbacks. */
|
||
fallbacksOverride?: string[];
|
||
}): ModelCandidate[] {
|
||
const primary = params.cfg
|
||
? resolveConfiguredModelRef({
|
||
cfg: params.cfg,
|
||
defaultProvider: DEFAULT_PROVIDER,
|
||
defaultModel: DEFAULT_MODEL,
|
||
})
|
||
: null;
|
||
const defaultProvider = primary?.provider ?? DEFAULT_PROVIDER;
|
||
const defaultModel = primary?.model ?? DEFAULT_MODEL;
|
||
const providerRaw = String(params.provider ?? "").trim() || defaultProvider;
|
||
const modelRaw = String(params.model ?? "").trim() || defaultModel;
|
||
const normalizedPrimary = normalizeModelRef(providerRaw, modelRaw);
|
||
const configuredPrimary = normalizeModelRef(defaultProvider, defaultModel);
|
||
const aliasIndex = buildModelAliasIndex({
|
||
cfg: params.cfg ?? {},
|
||
defaultProvider,
|
||
});
|
||
const allowlist = buildConfiguredAllowlistKeys({
|
||
cfg: params.cfg,
|
||
defaultProvider,
|
||
});
|
||
const { candidates, addCandidate } = createModelCandidateCollector(allowlist);
|
||
|
||
addCandidate(normalizedPrimary, false);
|
||
|
||
const modelFallbacks = (() => {
|
||
if (params.fallbacksOverride !== undefined) {
|
||
return params.fallbacksOverride;
|
||
}
|
||
const configuredFallbacks = resolveAgentModelFallbackValues(
|
||
params.cfg?.agents?.defaults?.model,
|
||
);
|
||
if (sameModelCandidate(normalizedPrimary, configuredPrimary)) {
|
||
return configuredFallbacks;
|
||
}
|
||
// Preserve resilience after failover: when current model is one of the
|
||
// configured fallback refs, keep traversing the configured fallback chain.
|
||
const isConfiguredFallback = configuredFallbacks.some((raw) => {
|
||
const resolved = resolveModelRefFromString({
|
||
raw: String(raw ?? ""),
|
||
defaultProvider,
|
||
aliasIndex,
|
||
});
|
||
return resolved ? sameModelCandidate(resolved.ref, normalizedPrimary) : false;
|
||
});
|
||
// Keep legacy override behavior for ad-hoc models outside configured chain.
|
||
return isConfiguredFallback ? configuredFallbacks : [];
|
||
})();
|
||
|
||
for (const raw of modelFallbacks) {
|
||
const resolved = resolveModelRefFromString({
|
||
raw: String(raw ?? ""),
|
||
defaultProvider,
|
||
aliasIndex,
|
||
});
|
||
if (!resolved) {
|
||
continue;
|
||
}
|
||
// Fallbacks are explicit user intent; do not silently filter them by the
|
||
// model allowlist.
|
||
addCandidate(resolved.ref, false);
|
||
}
|
||
|
||
if (params.fallbacksOverride === undefined && primary?.provider && primary.model) {
|
||
addCandidate({ provider: primary.provider, model: primary.model }, false);
|
||
}
|
||
|
||
return candidates;
|
||
}
|
||
|
||
const lastProbeAttempt = new Map<string, number>();
|
||
const MIN_PROBE_INTERVAL_MS = 30_000; // 30 seconds between probes per key
|
||
const PROBE_MARGIN_MS = 2 * 60 * 1000;
|
||
const PROBE_SCOPE_DELIMITER = "::";
|
||
|
||
function resolveProbeThrottleKey(provider: string, agentDir?: string): string {
|
||
const scope = String(agentDir ?? "").trim();
|
||
return scope ? `${scope}${PROBE_SCOPE_DELIMITER}${provider}` : provider;
|
||
}
|
||
|
||
function shouldProbePrimaryDuringCooldown(params: {
|
||
isPrimary: boolean;
|
||
hasFallbackCandidates: boolean;
|
||
now: number;
|
||
throttleKey: string;
|
||
authStore: ReturnType<typeof ensureAuthProfileStore>;
|
||
profileIds: string[];
|
||
}): boolean {
|
||
if (!params.isPrimary || !params.hasFallbackCandidates) {
|
||
return false;
|
||
}
|
||
|
||
const lastProbe = lastProbeAttempt.get(params.throttleKey) ?? 0;
|
||
if (params.now - lastProbe < MIN_PROBE_INTERVAL_MS) {
|
||
return false;
|
||
}
|
||
|
||
const soonest = getSoonestCooldownExpiry(params.authStore, params.profileIds);
|
||
if (soonest === null || !Number.isFinite(soonest)) {
|
||
return true;
|
||
}
|
||
|
||
// Probe when cooldown already expired or within the configured margin.
|
||
return params.now >= soonest - PROBE_MARGIN_MS;
|
||
}
|
||
|
||
/** @internal – exposed for unit tests only */
|
||
export const _probeThrottleInternals = {
|
||
lastProbeAttempt,
|
||
MIN_PROBE_INTERVAL_MS,
|
||
PROBE_MARGIN_MS,
|
||
resolveProbeThrottleKey,
|
||
} as const;
|
||
|
||
export async function runWithModelFallback<T>(params: {
|
||
cfg: OpenClawConfig | undefined;
|
||
provider: string;
|
||
model: string;
|
||
agentDir?: string;
|
||
/** Optional explicit fallbacks list; when provided (even empty), replaces agents.defaults.model.fallbacks. */
|
||
fallbacksOverride?: string[];
|
||
run: (provider: string, model: string) => Promise<T>;
|
||
onError?: ModelFallbackErrorHandler;
|
||
}): Promise<ModelFallbackRunResult<T>> {
|
||
const candidates = resolveFallbackCandidates({
|
||
cfg: params.cfg,
|
||
provider: params.provider,
|
||
model: params.model,
|
||
fallbacksOverride: params.fallbacksOverride,
|
||
});
|
||
const authStore = params.cfg
|
||
? ensureAuthProfileStore(params.agentDir, { allowKeychainPrompt: false })
|
||
: null;
|
||
const attempts: FallbackAttempt[] = [];
|
||
let lastError: unknown;
|
||
|
||
const hasFallbackCandidates = candidates.length > 1;
|
||
|
||
for (let i = 0; i < candidates.length; i += 1) {
|
||
const candidate = candidates[i];
|
||
if (authStore) {
|
||
const profileIds = resolveAuthProfileOrder({
|
||
cfg: params.cfg,
|
||
store: authStore,
|
||
provider: candidate.provider,
|
||
});
|
||
const isAnyProfileAvailable = profileIds.some((id) => !isProfileInCooldown(authStore, id));
|
||
|
||
if (profileIds.length > 0 && !isAnyProfileAvailable) {
|
||
// All profiles for this provider are in cooldown.
|
||
// For the primary model (i === 0), probe it if the soonest cooldown
|
||
// expiry is close or already past. This avoids staying on a fallback
|
||
// model long after the real rate-limit window clears.
|
||
const now = Date.now();
|
||
const probeThrottleKey = resolveProbeThrottleKey(candidate.provider, params.agentDir);
|
||
const shouldProbe = shouldProbePrimaryDuringCooldown({
|
||
isPrimary: i === 0,
|
||
hasFallbackCandidates,
|
||
now,
|
||
throttleKey: probeThrottleKey,
|
||
authStore,
|
||
profileIds,
|
||
});
|
||
if (!shouldProbe) {
|
||
const inferredReason =
|
||
resolveProfilesUnavailableReason({
|
||
store: authStore,
|
||
profileIds,
|
||
now,
|
||
}) ?? "rate_limit";
|
||
// Skip without attempting
|
||
attempts.push({
|
||
provider: candidate.provider,
|
||
model: candidate.model,
|
||
error: `Provider ${candidate.provider} is in cooldown (all profiles unavailable)`,
|
||
reason: inferredReason,
|
||
});
|
||
continue;
|
||
}
|
||
// Primary model probe: attempt it despite cooldown to detect recovery.
|
||
// If it fails, the error is caught below and we fall through to the
|
||
// next candidate as usual.
|
||
lastProbeAttempt.set(probeThrottleKey, now);
|
||
}
|
||
}
|
||
try {
|
||
const result = await params.run(candidate.provider, candidate.model);
|
||
return {
|
||
result,
|
||
provider: candidate.provider,
|
||
model: candidate.model,
|
||
attempts,
|
||
};
|
||
} catch (err) {
|
||
if (shouldRethrowAbort(err)) {
|
||
throw err;
|
||
}
|
||
// Context overflow errors should be handled by the inner runner's
|
||
// compaction/retry logic, not by model fallback. If one escapes as a
|
||
// throw, rethrow it immediately rather than trying a different model
|
||
// that may have a smaller context window and fail worse.
|
||
const errMessage = err instanceof Error ? err.message : String(err);
|
||
if (isLikelyContextOverflowError(errMessage)) {
|
||
throw err;
|
||
}
|
||
const normalized =
|
||
coerceToFailoverError(err, {
|
||
provider: candidate.provider,
|
||
model: candidate.model,
|
||
}) ?? err;
|
||
if (!isFailoverError(normalized)) {
|
||
throw err;
|
||
}
|
||
|
||
lastError = normalized;
|
||
const described = describeFailoverError(normalized);
|
||
attempts.push({
|
||
provider: candidate.provider,
|
||
model: candidate.model,
|
||
error: described.message,
|
||
reason: described.reason,
|
||
status: described.status,
|
||
code: described.code,
|
||
});
|
||
await params.onError?.({
|
||
provider: candidate.provider,
|
||
model: candidate.model,
|
||
error: normalized,
|
||
attempt: i + 1,
|
||
total: candidates.length,
|
||
});
|
||
}
|
||
}
|
||
|
||
throwFallbackFailureSummary({
|
||
attempts,
|
||
candidates,
|
||
lastError,
|
||
label: "models",
|
||
formatAttempt: (attempt) =>
|
||
`${attempt.provider}/${attempt.model}: ${attempt.error}${
|
||
attempt.reason ? ` (${attempt.reason})` : ""
|
||
}`,
|
||
});
|
||
}
|
||
|
||
export async function runWithImageModelFallback<T>(params: {
|
||
cfg: OpenClawConfig | undefined;
|
||
modelOverride?: string;
|
||
run: (provider: string, model: string) => Promise<T>;
|
||
onError?: ModelFallbackErrorHandler;
|
||
}): Promise<ModelFallbackRunResult<T>> {
|
||
const candidates = resolveImageFallbackCandidates({
|
||
cfg: params.cfg,
|
||
defaultProvider: DEFAULT_PROVIDER,
|
||
modelOverride: params.modelOverride,
|
||
});
|
||
if (candidates.length === 0) {
|
||
throw new Error(
|
||
"No image model configured. Set agents.defaults.imageModel.primary or agents.defaults.imageModel.fallbacks.",
|
||
);
|
||
}
|
||
|
||
const attempts: FallbackAttempt[] = [];
|
||
let lastError: unknown;
|
||
|
||
for (let i = 0; i < candidates.length; i += 1) {
|
||
const candidate = candidates[i];
|
||
try {
|
||
const result = await params.run(candidate.provider, candidate.model);
|
||
return {
|
||
result,
|
||
provider: candidate.provider,
|
||
model: candidate.model,
|
||
attempts,
|
||
};
|
||
} catch (err) {
|
||
if (shouldRethrowAbort(err)) {
|
||
throw err;
|
||
}
|
||
lastError = err;
|
||
attempts.push({
|
||
provider: candidate.provider,
|
||
model: candidate.model,
|
||
error: err instanceof Error ? err.message : String(err),
|
||
});
|
||
await params.onError?.({
|
||
provider: candidate.provider,
|
||
model: candidate.model,
|
||
error: err,
|
||
attempt: i + 1,
|
||
total: candidates.length,
|
||
});
|
||
}
|
||
}
|
||
|
||
throwFallbackFailureSummary({
|
||
attempts,
|
||
candidates,
|
||
lastError,
|
||
label: "image models",
|
||
formatAttempt: (attempt) => `${attempt.provider}/${attempt.model}: ${attempt.error}`,
|
||
});
|
||
}
|