* fix: enforce embedding model token limit to prevent 8192 overflow - Replace EMBEDDING_APPROX_CHARS_PER_TOKEN=1 with UTF-8 byte length estimation (safe upper bound for tokenizer output) - Add EMBEDDING_MODEL_MAX_TOKENS=8192 hard cap - Add splitChunkToTokenLimit() that binary-searches for the largest safe split point, with surrogate pair handling - Add enforceChunkTokenLimit() wrapper called in indexFile() after chunkMarkdown(), before any embedding API call - Fixes: session files with large JSONL entries could produce chunks exceeding text-embedding-3-small's 8192 token limit Tests: 2 new colocated tests in manager.embedding-token-limit.test.ts - Verifies oversized ASCII chunks are split to <=8192 bytes each - Verifies multibyte (emoji) content batching respects byte limits * fix: make embedding token limit provider-aware - Add optional maxInputTokens to EmbeddingProvider interface - Each provider (openai, gemini, voyage) reports its own limit - Known-limits map as fallback: openai 8192, gemini 2048, voyage 32K - Resolution: provider field > known map > default 8192 - Backward compatible: local/llama uses fallback * fix: enforce embedding input size limits (#13455) (thanks @rodrigouroz) --------- Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
36 lines
1.2 KiB
TypeScript
36 lines
1.2 KiB
TypeScript
import type { EmbeddingProvider } from "./embeddings.js";
/**
 * Input-token limit returned when neither the provider itself nor the
 * known-model table below supplies a more specific value.
 */
const DEFAULT_EMBEDDING_MAX_INPUT_TOKENS = 8192;

/**
 * Best-effort table of per-model input-token limits.
 *
 * Keys have the form `providerId:model` and must be written in lowercase,
 * because lookups lowercase the composed key before indexing this table.
 * The table is read-only; add a new entry here to teach the resolver about
 * another model.
 */
const KNOWN_EMBEDDING_MAX_INPUT_TOKENS: Readonly<Record<string, number>> = {
  "openai:text-embedding-3-small": 8192,
  "openai:text-embedding-3-large": 8192,
  "openai:text-embedding-ada-002": 8191,
  "gemini:text-embedding-004": 2048,
  "voyage:voyage-3": 32000,
  "voyage:voyage-3-lite": 16000,
  "voyage:voyage-code-3": 32000,
};

/**
 * Resolves the maximum number of input tokens for an embedding provider.
 *
 * Resolution order:
 * 1. `provider.maxInputTokens`, when it is a positive finite number.
 * 2. The known-model table, keyed by the lowercased `id:model` pair.
 * 3. A conservative 2048 for any other model of the `gemini` provider.
 * 4. The module-wide default.
 *
 * @param provider - Embedding provider whose `id`, `model`, and optional
 *   `maxInputTokens` are inspected.
 * @returns The resolved input-token limit; always a positive finite number.
 */
export function resolveEmbeddingMaxInputTokens(provider: EmbeddingProvider): number {
  const reported = provider.maxInputTokens;
  // Zero, negative, NaN, and infinite values are not usable as a limit, so
  // they are treated as "not reported" and resolution falls through to the
  // table and defaults below instead of handing the bad value to callers.
  if (typeof reported === "number" && Number.isFinite(reported) && reported > 0) {
    return reported;
  }

  // Provider/model mapping is best-effort; different providers use different
  // limits and we prefer to be conservative when we don't know.
  const key = `${provider.id}:${provider.model}`.toLowerCase();
  const known = KNOWN_EMBEDDING_MAX_INPUT_TOKENS[key];
  if (typeof known === "number") {
    return known;
  }

  // Provider-specific conservative fallbacks. This prevents us from accidentally
  // using the OpenAI default for providers with much smaller limits.
  if (provider.id.toLowerCase() === "gemini") {
    return 2048;
  }

  return DEFAULT_EMBEDDING_MAX_INPUT_TOKENS;
}