* fix: enforce embedding model token limit to prevent 8192 overflow
  - Replace EMBEDDING_APPROX_CHARS_PER_TOKEN=1 with UTF-8 byte length estimation (a safe upper bound on tokenizer output)
  - Add EMBEDDING_MODEL_MAX_TOKENS=8192 hard cap
  - Add splitChunkToTokenLimit(), which binary-searches for the largest safe split point, with surrogate-pair handling
  - Add enforceChunkTokenLimit() wrapper called in indexFile() after chunkMarkdown(), before any embedding API call
  - Fixes: session files with large JSONL entries could produce chunks exceeding text-embedding-3-small's 8192-token limit

  Tests: 2 new colocated tests in manager.embedding-token-limit.test.ts
  - Verifies oversized ASCII chunks are split to <= 8192 bytes each
  - Verifies multibyte (emoji) content batching respects byte limits

* fix: make embedding token limit provider-aware
  - Add optional maxInputTokens to the EmbeddingProvider interface
  - Each provider (openai, gemini, voyage) reports its own limit
  - Known-limits map as fallback: openai 8192, gemini 2048, voyage 32K
  - Resolution order: provider field > known-limits map > default 8192
  - Backward compatible: local/llama uses the fallback

* fix: enforce embedding input size limits (#13455) (thanks @rodrigouroz)

---------

Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
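The splitting logic described above fits together roughly as follows — a minimal sketch, assuming the limit is enforced via UTF-8 byte length as the commit message states. The names (splitChunkToTokenLimit, enforceChunkTokenLimit, EMBEDDING_MODEL_MAX_TOKENS) come from the commit message; the bodies are illustrative, not the shipped implementation:

```ts
// Sketch only: names come from the commit message above; bodies are
// illustrative, not the actual implementation.

const EMBEDDING_MODEL_MAX_TOKENS = 8192;

// UTF-8 byte length is a safe upper bound on token count: a BPE token
// always covers at least one byte, so bytes >= tokens.
const estimateTokens = (text: string): number => Buffer.byteLength(text, "utf8");

function splitChunkToTokenLimit(
  chunk: string,
  maxTokens: number = EMBEDDING_MODEL_MAX_TOKENS,
): string[] {
  if (estimateTokens(chunk) <= maxTokens) {
    return [chunk];
  }
  // Binary-search the largest prefix (in UTF-16 code units) whose UTF-8
  // encoding still fits within the limit. The full chunk doesn't fit, so
  // the answer lies strictly below chunk.length.
  let lo = 1;
  let hi = chunk.length - 1;
  while (lo < hi) {
    const mid = Math.ceil((lo + hi) / 2);
    if (estimateTokens(chunk.slice(0, mid)) <= maxTokens) {
      lo = mid;
    } else {
      hi = mid - 1;
    }
  }
  let splitAt = lo;
  // Surrogate-pair handling: never cut between a high and low surrogate,
  // which would mangle emoji and other astral-plane characters.
  const last = chunk.charCodeAt(splitAt - 1);
  if (splitAt > 1 && last >= 0xd800 && last <= 0xdbff) {
    splitAt -= 1;
  }
  return [chunk.slice(0, splitAt), ...splitChunkToTokenLimit(chunk.slice(splitAt), maxTokens)];
}

// Wrapper applied to chunkMarkdown() output in indexFile(), before any
// embedding API call.
const enforceChunkTokenLimit = (chunks: string[]): string[] =>
  chunks.flatMap((chunk) => splitChunkToTokenLimit(chunk));
```

Because byte count never undercounts tokens, any piece that passes the byte check is guaranteed to fit the model's real tokenizer, at the cost of splitting somewhat more aggressively than strictly necessary.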
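The provider-aware resolution from the second commit might look like the sketch below. Only maxInputTokens, the provider ids, and the limit values are stated above; the surrounding interface shape is an assumption:

```ts
// Sketch: the EmbeddingProvider shape here is assumed; the commit message
// only specifies the optional maxInputTokens field and the fallback limits.

interface EmbeddingProvider {
  id: string;
  model: string;
  maxInputTokens?: number; // providers may report their own input limit
  embedQuery(text: string): Promise<number[]>;
  embedBatch(texts: string[]): Promise<number[][]>;
}

// Known-limits fallback for providers that don't report a limit themselves.
const KNOWN_MAX_INPUT_TOKENS: Record<string, number> = {
  openai: 8192,
  gemini: 2048,
  voyage: 32_000,
};

const DEFAULT_MAX_INPUT_TOKENS = 8192;

// Resolution order: provider field > known-limits map > default 8192.
// Providers absent from both (e.g. local/llama) fall through to the
// default, which keeps existing configs working unchanged.
function resolveMaxInputTokens(provider: EmbeddingProvider): number {
  return (
    provider.maxInputTokens ??
    KNOWN_MAX_INPUT_TOKENS[provider.id] ??
    DEFAULT_MAX_INPUT_TOKENS
  );
}
```

The mocked provider in the test below reports maxInputTokens: 8192, exercising the first branch of this resolution.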
121 lines · 3.9 KiB · TypeScript
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";

import { getMemorySearchManager, type MemoryIndexManager } from "./index.js";

const embedBatch = vi.fn(async (texts: string[]) => texts.map(() => [0, 1, 0]));
const embedQuery = vi.fn(async () => [0, 1, 0]);

vi.mock("./embeddings.js", () => ({
  createEmbeddingProvider: async () => ({
    requestedProvider: "openai",
    provider: {
      id: "mock",
      model: "mock-embed",
      maxInputTokens: 8192,
      embedQuery,
      embedBatch,
    },
  }),
}));

describe("memory embedding token limits", () => {
  let workspaceDir: string;
  let indexPath: string;
  let manager: MemoryIndexManager | null = null;

  beforeEach(async () => {
    embedBatch.mockReset();
    embedQuery.mockReset();
    embedBatch.mockImplementation(async (texts: string[]) => texts.map(() => [0, 1, 0]));
    embedQuery.mockImplementation(async () => [0, 1, 0]);
    workspaceDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-mem-token-"));
    indexPath = path.join(workspaceDir, "index.sqlite");
    await fs.mkdir(path.join(workspaceDir, "memory"));
  });

  afterEach(async () => {
    if (manager) {
      await manager.close();
      manager = null;
    }
    await fs.rm(workspaceDir, { recursive: true, force: true });
  });

  it("splits oversized chunks so each embedding input stays <= 8192 UTF-8 bytes", async () => {
    const content = "x".repeat(9500);
    await fs.writeFile(path.join(workspaceDir, "memory", "2026-01-09.md"), content);

    const cfg = {
      agents: {
        defaults: {
          workspace: workspaceDir,
          memorySearch: {
            provider: "openai",
            model: "mock-embed",
            store: { path: indexPath },
            chunking: { tokens: 10_000, overlap: 0 },
            sync: { watch: false, onSessionStart: false, onSearch: false },
            query: { minScore: 0 },
          },
        },
        list: [{ id: "main", default: true }],
      },
    };

    const result = await getMemorySearchManager({ cfg, agentId: "main" });
    expect(result.manager).not.toBeNull();
    if (!result.manager) {
      throw new Error("manager missing");
    }
    manager = result.manager;
    await manager.sync({ force: true });

    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
    expect(inputs.length).toBeGreaterThan(1);
    expect(
      Math.max(...inputs.map((input) => Buffer.byteLength(input, "utf8"))),
    ).toBeLessThanOrEqual(8192);
  });

  it("uses UTF-8 byte estimates when batching multibyte chunks", async () => {
    const line = "😀".repeat(1800);
    const content = `${line}\n${line}\n${line}`;
    await fs.writeFile(path.join(workspaceDir, "memory", "2026-01-10.md"), content);

    const cfg = {
      agents: {
        defaults: {
          workspace: workspaceDir,
          memorySearch: {
            provider: "openai",
            model: "mock-embed",
            store: { path: indexPath },
            chunking: { tokens: 1000, overlap: 0 },
            sync: { watch: false, onSessionStart: false, onSearch: false },
            query: { minScore: 0 },
          },
        },
        list: [{ id: "main", default: true }],
      },
    };

    const result = await getMemorySearchManager({ cfg, agentId: "main" });
    expect(result.manager).not.toBeNull();
    if (!result.manager) {
      throw new Error("manager missing");
    }
    manager = result.manager;
    await manager.sync({ force: true });

    const batchSizes = embedBatch.mock.calls.map(
      (call) => (call[0] as string[] | undefined)?.length ?? 0,
    );
    expect(batchSizes.length).toBe(3);
    expect(batchSizes.every((size) => size === 1)).toBe(true);
    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
    expect(inputs.every((input) => Buffer.byteLength(input, "utf8") <= 8192)).toBe(true);
  });
});