diff --git a/src/memory/manager-embedding-ops.ts b/src/memory/manager-embedding-ops.ts index 6606c3aea..79d5d35be 100644 --- a/src/memory/manager-embedding-ops.ts +++ b/src/memory/manager-embedding-ops.ts @@ -265,18 +265,10 @@ class MemoryManagerEmbeddingOps { return this.embedChunksInBatches(chunks); } - private async embedChunksWithVoyageBatch( - chunks: MemoryChunk[], - entry: MemoryFileEntry | SessionFileEntry, - source: MemorySource, - ): Promise { - const voyage = this.voyage; - if (!voyage) { - return this.embedChunksInBatches(chunks); - } - if (chunks.length === 0) { - return []; - } + private collectCachedEmbeddings(chunks: MemoryChunk[]): { + embeddings: number[][]; + missing: Array<{ index: number; chunk: MemoryChunk }>; + } { const cached = this.loadEmbeddingCache(chunks.map((chunk) => chunk.hash)); const embeddings: number[][] = Array.from({ length: chunks.length }, () => []); const missing: Array<{ index: number; chunk: MemoryChunk }> = []; @@ -291,25 +283,87 @@ class MemoryManagerEmbeddingOps { } } + return { embeddings, missing }; + } + + private buildBatchCustomId(params: { + source: MemorySource; + entry: MemoryFileEntry | SessionFileEntry; + chunk: MemoryChunk; + index: number; + }): string { + return hashText( + `${params.source}:${params.entry.path}:${params.chunk.startLine}:${params.chunk.endLine}:${params.chunk.hash}:${params.index}`, + ); + } + + private buildBatchRequests(params: { + missing: Array<{ index: number; chunk: MemoryChunk }>; + entry: MemoryFileEntry | SessionFileEntry; + source: MemorySource; + build: (chunk: MemoryChunk) => Omit; + }): { requests: T[]; mapping: Map } { + const requests: T[] = []; + const mapping = new Map(); + + for (const item of params.missing) { + const chunk = item.chunk; + const customId = this.buildBatchCustomId({ + source: params.source, + entry: params.entry, + chunk, + index: item.index, + }); + mapping.set(customId, { index: item.index, hash: chunk.hash }); + const built = params.build(chunk); + requests.push({ custom_id: customId, ...built } as T); + } + + return { requests, mapping }; + } + + private applyBatchEmbeddings(params: { + byCustomId: Map; + mapping: Map; + embeddings: number[][]; + }): void { + const toCache: Array<{ hash: string; embedding: number[] }> = []; + for (const [customId, embedding] of params.byCustomId.entries()) { + const mapped = params.mapping.get(customId); + if (!mapped) { + continue; + } + params.embeddings[mapped.index] = embedding; + toCache.push({ hash: mapped.hash, embedding }); + } + this.upsertEmbeddingCache(toCache); + } + + private async embedChunksWithVoyageBatch( + chunks: MemoryChunk[], + entry: MemoryFileEntry | SessionFileEntry, + source: MemorySource, + ): Promise { + const voyage = this.voyage; + if (!voyage) { + return this.embedChunksInBatches(chunks); + } + if (chunks.length === 0) { + return []; + } + const { embeddings, missing } = this.collectCachedEmbeddings(chunks); if (missing.length === 0) { return embeddings; } - const requests: VoyageBatchRequest[] = []; - const mapping = new Map(); - for (const item of missing) { - const chunk = item.chunk; - const customId = hashText( - `${source}:${entry.path}:${chunk.startLine}:${chunk.endLine}:${chunk.hash}:${item.index}`, - ); - mapping.set(customId, { index: item.index, hash: chunk.hash }); - requests.push({ - custom_id: customId, - body: { - input: chunk.text, - }, - }); - } + const { requests, mapping } = this.buildBatchRequests({ + missing, + entry, + source, + build: (chunk) => ({ + body: { input: chunk.text }, + }), + }); const batchResult = await this.runBatchWithFallback({ provider: "voyage", run: async () => @@ -328,18 +382,7 @@ class MemoryManagerEmbeddingOps { if (Array.isArray(batchResult)) { return batchResult; } - const byCustomId = batchResult; - - const toCache: Array<{ hash: string; embedding: number[] }> = []; - for (const [customId, embedding] of byCustomId.entries()) { - const mapped = mapping.get(customId); - if (!mapped) { - continue; - } - embeddings[mapped.index] = embedding; - toCache.push({ hash: mapped.hash, embedding }); - } - this.upsertEmbeddingCache(toCache); + this.applyBatchEmbeddings({ byCustomId: batchResult, mapping, embeddings }); return embeddings; } @@ -355,42 +398,24 @@ class MemoryManagerEmbeddingOps { if (chunks.length === 0) { return []; } - const cached = this.loadEmbeddingCache(chunks.map((chunk) => chunk.hash)); - const embeddings: number[][] = Array.from({ length: chunks.length }, () => []); - const missing: Array<{ index: number; chunk: MemoryChunk }> = []; - - for (let i = 0; i < chunks.length; i += 1) { - const chunk = chunks[i]; - const hit = chunk?.hash ? cached.get(chunk.hash) : undefined; - if (hit && hit.length > 0) { - embeddings[i] = hit; - } else if (chunk) { - missing.push({ index: i, chunk }); - } - } - + const { embeddings, missing } = this.collectCachedEmbeddings(chunks); if (missing.length === 0) { return embeddings; } - const requests: OpenAiBatchRequest[] = []; - const mapping = new Map(); - for (const item of missing) { - const chunk = item.chunk; - const customId = hashText( - `${source}:${entry.path}:${chunk.startLine}:${chunk.endLine}:${chunk.hash}:${item.index}`, - ); - mapping.set(customId, { index: item.index, hash: chunk.hash }); - requests.push({ - custom_id: customId, + const { requests, mapping } = this.buildBatchRequests({ + missing, + entry, + source, + build: (chunk) => ({ method: "POST", url: OPENAI_BATCH_ENDPOINT, body: { model: this.openAi?.model ?? this.provider.model, input: chunk.text, }, - }); - } + }), + }); const batchResult = await this.runBatchWithFallback({ provider: "openai", run: async () => @@ -409,18 +434,7 @@ class MemoryManagerEmbeddingOps { if (Array.isArray(batchResult)) { return batchResult; } - const byCustomId = batchResult; - - const toCache: Array<{ hash: string; embedding: number[] }> = []; - for (const [customId, embedding] of byCustomId.entries()) { - const mapped = mapping.get(customId); - if (!mapped) { - continue; - } - embeddings[mapped.index] = embedding; - toCache.push({ hash: mapped.hash, embedding }); - } - this.upsertEmbeddingCache(toCache); + this.applyBatchEmbeddings({ byCustomId: batchResult, mapping, embeddings }); return embeddings; } @@ -436,38 +450,20 @@ class MemoryManagerEmbeddingOps { if (chunks.length === 0) { return []; } - const cached = this.loadEmbeddingCache(chunks.map((chunk) => chunk.hash)); - const embeddings: number[][] = Array.from({ length: chunks.length }, () => []); - const missing: Array<{ index: number; chunk: MemoryChunk }> = []; - - for (let i = 0; i < chunks.length; i += 1) { - const chunk = chunks[i]; - const hit = chunk?.hash ? cached.get(chunk.hash) : undefined; - if (hit && hit.length > 0) { - embeddings[i] = hit; - } else if (chunk) { - missing.push({ index: i, chunk }); - } - } - + const { embeddings, missing } = this.collectCachedEmbeddings(chunks); if (missing.length === 0) { return embeddings; } - const requests: GeminiBatchRequest[] = []; - const mapping = new Map(); - for (const item of missing) { - const chunk = item.chunk; - const customId = hashText( - `${source}:${entry.path}:${chunk.startLine}:${chunk.endLine}:${chunk.hash}:${item.index}`, - ); - mapping.set(customId, { index: item.index, hash: chunk.hash }); - requests.push({ - custom_id: customId, + const { requests, mapping } = this.buildBatchRequests({ + missing, + entry, + source, + build: (chunk) => ({ content: { parts: [{ text: chunk.text }] }, taskType: "RETRIEVAL_DOCUMENT", - }); - } + }), + }); const batchResult = await this.runBatchWithFallback({ provider: "gemini", @@ -487,18 +483,7 @@ class MemoryManagerEmbeddingOps { if (Array.isArray(batchResult)) { return batchResult; } - const byCustomId = batchResult; - - const toCache: Array<{ hash: string; embedding: number[] }> = []; - for (const [customId, embedding] of byCustomId.entries()) { - const mapped = mapping.get(customId); - if (!mapped) { - continue; - } - embeddings[mapped.index] = embedding; - toCache.push({ hash: mapped.hash, embedding }); - } - this.upsertEmbeddingCache(toCache); + this.applyBatchEmbeddings({ byCustomId: batchResult, mapping, embeddings }); return embeddings; }