fix: prevent compaction "prompt too long" errors (#22921)
* Include prompt overhead in the compaction safeguard calculation: subtract SUMMARIZATION_OVERHEAD_TOKENS from maxChunkTokens in both the main summarization path and the dropped-messages summarization path. This ensures the chunk budget leaves room for the prompt overhead that generateSummary wraps around each chunk. * Budget for overhead tokens by using an effectiveMax rather than using maxTokens naïvely. - Added `SUMMARIZATION_OVERHEAD_TOKENS = 4096` — a budget for the tokens that `generateSummary` adds on top of the serialized conversation (system prompt, `<conversation>` tags, summarization instructions, `<previous-summary>` block, and the reasoning: "high" thinking budget). - `chunkMessagesByMaxTokens` now divides `maxTokens` by `SAFETY_MARGIN` (1.2) before comparing against estimated token counts. Previously, the safety margin was applied only in `computeAdaptiveChunkRatio` and `isOversizedForSummary`, not in the actual chunking loop — so chunks could be built that fit the estimated budget yet exceeded the real budget once the API tokenized them properly.
This commit is contained in:
@@ -68,6 +68,11 @@ export function splitMessagesByTokenShare(
|
||||
return chunks;
|
||||
}
|
||||
|
||||
// Overhead reserved for summarization prompt, system prompt, previous summary,
|
||||
// and serialization wrappers (<conversation> tags, instructions, etc.).
|
||||
// generateSummary uses reasoning: "high" which also consumes context budget.
|
||||
export const SUMMARIZATION_OVERHEAD_TOKENS = 4096;
|
||||
|
||||
export function chunkMessagesByMaxTokens(
|
||||
messages: AgentMessage[],
|
||||
maxTokens: number,
|
||||
@@ -76,13 +81,17 @@ export function chunkMessagesByMaxTokens(
|
||||
return [];
|
||||
}
|
||||
|
||||
// Apply safety margin to compensate for estimateTokens() underestimation
|
||||
// (chars/4 heuristic misses multi-byte chars, special tokens, code tokens, etc.)
|
||||
const effectiveMax = Math.max(1, Math.floor(maxTokens / SAFETY_MARGIN));
|
||||
|
||||
const chunks: AgentMessage[][] = [];
|
||||
let currentChunk: AgentMessage[] = [];
|
||||
let currentTokens = 0;
|
||||
|
||||
for (const message of messages) {
|
||||
const messageTokens = estimateTokens(message);
|
||||
if (currentChunk.length > 0 && currentTokens + messageTokens > maxTokens) {
|
||||
if (currentChunk.length > 0 && currentTokens + messageTokens > effectiveMax) {
|
||||
chunks.push(currentChunk);
|
||||
currentChunk = [];
|
||||
currentTokens = 0;
|
||||
@@ -91,7 +100,7 @@ export function chunkMessagesByMaxTokens(
|
||||
currentChunk.push(message);
|
||||
currentTokens += messageTokens;
|
||||
|
||||
if (messageTokens > maxTokens) {
|
||||
if (messageTokens > effectiveMax) {
|
||||
// Split oversized messages to avoid unbounded chunk growth.
|
||||
chunks.push(currentChunk);
|
||||
currentChunk = [];
|
||||
|
||||
@@ -7,6 +7,7 @@ import {
|
||||
BASE_CHUNK_RATIO,
|
||||
MIN_CHUNK_RATIO,
|
||||
SAFETY_MARGIN,
|
||||
SUMMARIZATION_OVERHEAD_TOKENS,
|
||||
computeAdaptiveChunkRatio,
|
||||
estimateMessagesTokens,
|
||||
isOversizedForSummary,
|
||||
@@ -268,7 +269,8 @@ export default function compactionSafeguardExtension(api: ExtensionAPI): void {
|
||||
);
|
||||
const droppedMaxChunkTokens = Math.max(
|
||||
1,
|
||||
Math.floor(contextWindowTokens * droppedChunkRatio),
|
||||
Math.floor(contextWindowTokens * droppedChunkRatio) -
|
||||
SUMMARIZATION_OVERHEAD_TOKENS,
|
||||
);
|
||||
droppedSummary = await summarizeInStages({
|
||||
messages: pruned.droppedMessagesList,
|
||||
@@ -293,10 +295,15 @@ export default function compactionSafeguardExtension(api: ExtensionAPI): void {
|
||||
}
|
||||
}
|
||||
|
||||
// Use adaptive chunk ratio based on message sizes
|
||||
// Use adaptive chunk ratio based on message sizes, reserving headroom for
|
||||
// the summarization prompt, system prompt, previous summary, and reasoning budget
|
||||
// that generateSummary adds on top of the serialized conversation chunk.
|
||||
const allMessages = [...messagesToSummarize, ...turnPrefixMessages];
|
||||
const adaptiveRatio = computeAdaptiveChunkRatio(allMessages, contextWindowTokens);
|
||||
const maxChunkTokens = Math.max(1, Math.floor(contextWindowTokens * adaptiveRatio));
|
||||
const maxChunkTokens = Math.max(
|
||||
1,
|
||||
Math.floor(contextWindowTokens * adaptiveRatio) - SUMMARIZATION_OVERHEAD_TOKENS,
|
||||
);
|
||||
const reserveTokens = Math.max(1, Math.floor(preparation.settings.reserveTokens));
|
||||
|
||||
// Feed dropped-messages summary as previousSummary so the main summarization
|
||||
|
||||
Reference in New Issue
Block a user