fix: prevent compaction "prompt too long" errors (#22921)

* Includes prompt overhead in the compaction safeguard calculation.

Subtracts SUMMARIZATION_OVERHEAD_TOKENS from maxChunkTokens in both the main summarization path and the dropped-messages summarization path.

This ensures the chunk budget leaves room for the prompt overhead that generateSummary wraps around each chunk.

* Adds a token budget for prompt overhead, so chunking uses an effectiveMax instead of naïvely comparing against maxTokens.

- Added `SUMMARIZATION_OVERHEAD_TOKENS = 4096` — a budget for the tokens that `generateSummary` adds on top of the serialized conversation (system prompt, `<conversation>` tags, summarization instructions, `<previous-summary>` block, and reasoning: "high" thinking budget).
- `chunkMessagesByMaxTokens` now divides `maxTokens` by `SAFETY_MARGIN` (1.2) before comparing against estimated token counts. Previously, the safety margin was only used in `computeAdaptiveChunkRatio` and `isOversizedForSummary` but not in the actual chunking loop — so chunks could be built that fit the estimated budget but exceeded the real budget once the API tokenized them properly.
This commit is contained in:
Val Alexander
2026-02-21 14:42:18 -06:00
committed by GitHub
parent ac633366ce
commit b703ea3675
2 changed files with 21 additions and 5 deletions

View File

@@ -68,6 +68,11 @@ export function splitMessagesByTokenShare(
return chunks;
}
// Overhead reserved for summarization prompt, system prompt, previous summary,
// and serialization wrappers (<conversation> tags, instructions, etc.).
// generateSummary uses reasoning: "high" which also consumes context budget.
export const SUMMARIZATION_OVERHEAD_TOKENS = 4096;
export function chunkMessagesByMaxTokens(
messages: AgentMessage[],
maxTokens: number,
@@ -76,13 +81,17 @@ export function chunkMessagesByMaxTokens(
return [];
}
// Apply safety margin to compensate for estimateTokens() underestimation
// (chars/4 heuristic misses multi-byte chars, special tokens, code tokens, etc.)
const effectiveMax = Math.max(1, Math.floor(maxTokens / SAFETY_MARGIN));
const chunks: AgentMessage[][] = [];
let currentChunk: AgentMessage[] = [];
let currentTokens = 0;
for (const message of messages) {
const messageTokens = estimateTokens(message);
if (currentChunk.length > 0 && currentTokens + messageTokens > maxTokens) {
if (currentChunk.length > 0 && currentTokens + messageTokens > effectiveMax) {
chunks.push(currentChunk);
currentChunk = [];
currentTokens = 0;
@@ -91,7 +100,7 @@ export function chunkMessagesByMaxTokens(
currentChunk.push(message);
currentTokens += messageTokens;
if (messageTokens > maxTokens) {
if (messageTokens > effectiveMax) {
// Split oversized messages to avoid unbounded chunk growth.
chunks.push(currentChunk);
currentChunk = [];

View File

@@ -7,6 +7,7 @@ import {
BASE_CHUNK_RATIO,
MIN_CHUNK_RATIO,
SAFETY_MARGIN,
SUMMARIZATION_OVERHEAD_TOKENS,
computeAdaptiveChunkRatio,
estimateMessagesTokens,
isOversizedForSummary,
@@ -268,7 +269,8 @@ export default function compactionSafeguardExtension(api: ExtensionAPI): void {
);
const droppedMaxChunkTokens = Math.max(
1,
Math.floor(contextWindowTokens * droppedChunkRatio),
Math.floor(contextWindowTokens * droppedChunkRatio) -
SUMMARIZATION_OVERHEAD_TOKENS,
);
droppedSummary = await summarizeInStages({
messages: pruned.droppedMessagesList,
@@ -293,10 +295,15 @@ export default function compactionSafeguardExtension(api: ExtensionAPI): void {
}
}
// Use adaptive chunk ratio based on message sizes
// Use adaptive chunk ratio based on message sizes, reserving headroom for
// the summarization prompt, system prompt, previous summary, and reasoning budget
// that generateSummary adds on top of the serialized conversation chunk.
const allMessages = [...messagesToSummarize, ...turnPrefixMessages];
const adaptiveRatio = computeAdaptiveChunkRatio(allMessages, contextWindowTokens);
const maxChunkTokens = Math.max(1, Math.floor(contextWindowTokens * adaptiveRatio));
const maxChunkTokens = Math.max(
1,
Math.floor(contextWindowTokens * adaptiveRatio) - SUMMARIZATION_OVERHEAD_TOKENS,
);
const reserveTokens = Math.max(1, Math.floor(preparation.settings.reserveTokens));
// Feed dropped-messages summary as previousSummary so the main summarization