From b703ea3675d6d1896ebc705096da506a2289d44d Mon Sep 17 00:00:00 2001 From: Val Alexander <68980965+BunsDev@users.noreply.github.com> Date: Sat, 21 Feb 2026 14:42:18 -0600 Subject: [PATCH] fix: prevent compaction "prompt too long" errors (#22921) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * includes: prompt overhead in compaction safeguard calculation. Subtracts SUMMARIZATION_OVERHEAD_TOKENS from maxChunkTokens in both the main summarization path and the dropped-messages summarization path. This ensures the chunk budget leaves room for the prompt overhead that generateSummary wraps around each chunk. * adds: budget for overhead tokens to use an effectiveMax instead of maxTokens naïvely. - Added `SUMMARIZATION_OVERHEAD_TOKENS = 4096` — a budget for the tokens that `generateSummary` adds on top of the serialized conversation (system prompt, `<conversation>` tags, summarization instructions, `<previous-summary>` block, and reasoning: "high" thinking budget). - `chunkMessagesByMaxTokens` now divides `maxTokens` by `SAFETY_MARGIN` (1.2) before comparing against estimated token counts. Previously, the safety margin was only used in `computeAdaptiveChunkRatio` and `isOversizedForSummary` but not in the actual chunking loop — so chunks could be built that fit the estimated budget but exceeded the real budget once the API tokenized them properly. --- src/agents/compaction.ts | 13 +++++++++++-- src/agents/pi-extensions/compaction-safeguard.ts | 13 ++++++++++--- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/agents/compaction.ts b/src/agents/compaction.ts index d60d1af2a..80021e7ad 100644 --- a/src/agents/compaction.ts +++ b/src/agents/compaction.ts @@ -68,6 +68,11 @@ export function splitMessagesByTokenShare( return chunks; } +// Overhead reserved for summarization prompt, system prompt, previous summary, +// and serialization wrappers (<conversation> tags, instructions, etc.). 
+// generateSummary uses reasoning: "high" which also consumes context budget. +export const SUMMARIZATION_OVERHEAD_TOKENS = 4096; + export function chunkMessagesByMaxTokens( messages: AgentMessage[], maxTokens: number, @@ -76,13 +81,17 @@ export function chunkMessagesByMaxTokens( return []; } + // Apply safety margin to compensate for estimateTokens() underestimation + // (chars/4 heuristic misses multi-byte chars, special tokens, code tokens, etc.) + const effectiveMax = Math.max(1, Math.floor(maxTokens / SAFETY_MARGIN)); + const chunks: AgentMessage[][] = []; let currentChunk: AgentMessage[] = []; let currentTokens = 0; for (const message of messages) { const messageTokens = estimateTokens(message); - if (currentChunk.length > 0 && currentTokens + messageTokens > maxTokens) { + if (currentChunk.length > 0 && currentTokens + messageTokens > effectiveMax) { chunks.push(currentChunk); currentChunk = []; currentTokens = 0; @@ -91,7 +100,7 @@ export function chunkMessagesByMaxTokens( currentChunk.push(message); currentTokens += messageTokens; - if (messageTokens > maxTokens) { + if (messageTokens > effectiveMax) { // Split oversized messages to avoid unbounded chunk growth. 
chunks.push(currentChunk); currentChunk = []; diff --git a/src/agents/pi-extensions/compaction-safeguard.ts b/src/agents/pi-extensions/compaction-safeguard.ts index 12c6627e4..ed0f0434c 100644 --- a/src/agents/pi-extensions/compaction-safeguard.ts +++ b/src/agents/pi-extensions/compaction-safeguard.ts @@ -7,6 +7,7 @@ import { BASE_CHUNK_RATIO, MIN_CHUNK_RATIO, SAFETY_MARGIN, + SUMMARIZATION_OVERHEAD_TOKENS, computeAdaptiveChunkRatio, estimateMessagesTokens, isOversizedForSummary, @@ -268,7 +269,8 @@ export default function compactionSafeguardExtension(api: ExtensionAPI): void { ); const droppedMaxChunkTokens = Math.max( 1, - Math.floor(contextWindowTokens * droppedChunkRatio), + Math.floor(contextWindowTokens * droppedChunkRatio) - + SUMMARIZATION_OVERHEAD_TOKENS, ); droppedSummary = await summarizeInStages({ messages: pruned.droppedMessagesList, @@ -293,10 +295,15 @@ export default function compactionSafeguardExtension(api: ExtensionAPI): void { } } - // Use adaptive chunk ratio based on message sizes + // Use adaptive chunk ratio based on message sizes, reserving headroom for + // the summarization prompt, system prompt, previous summary, and reasoning budget + // that generateSummary adds on top of the serialized conversation chunk. const allMessages = [...messagesToSummarize, ...turnPrefixMessages]; const adaptiveRatio = computeAdaptiveChunkRatio(allMessages, contextWindowTokens); - const maxChunkTokens = Math.max(1, Math.floor(contextWindowTokens * adaptiveRatio)); + const maxChunkTokens = Math.max( + 1, + Math.floor(contextWindowTokens * adaptiveRatio) - SUMMARIZATION_OVERHEAD_TOKENS, + ); const reserveTokens = Math.max(1, Math.floor(preparation.settings.reserveTokens)); // Feed dropped-messages summary as previousSummary so the main summarization