From b703ea3675d6d1896ebc705096da506a2289d44d Mon Sep 17 00:00:00 2001
From: Val Alexander <68980965+BunsDev@users.noreply.github.com>
Date: Sat, 21 Feb 2026 14:42:18 -0600
Subject: [PATCH] fix: prevent compaction "prompt too long" errors (#22921)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* includes: prompt overhead in compaction safeguard calculation.

Subtracts SUMMARIZATION_OVERHEAD_TOKENS from maxChunkTokens in both the main summarization path and the dropped-messages summarization path.

This ensures the chunk budget leaves room for the prompt overhead that generateSummary wraps around each chunk.

* adds: budget for overhead tokens to use an effectiveMax instead of maxTokens naïvely.

- Added `SUMMARIZATION_OVERHEAD_TOKENS = 4096` — a budget for the tokens that `generateSummary` adds on top of the serialized conversation (system prompt, `<conversation>` tags, summarization instructions, `<previous-summary>` block, and reasoning: "high" thinking budget).
- `chunkMessagesByMaxTokens` now divides `maxTokens` by `SAFETY_MARGIN` (1.2) before comparing against estimated token counts. Previously, the safety margin was only used in `computeAdaptiveChunkRatio` and `isOversizedForSummary` but not in the actual chunking loop — so chunks could be built that fit the estimated budget but exceeded the real budget once the API tokenized them properly.
---
 src/agents/compaction.ts                         | 13 +++++++++++--
 src/agents/pi-extensions/compaction-safeguard.ts | 13 ++++++++++---
 2 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/src/agents/compaction.ts b/src/agents/compaction.ts
index d60d1af2a..80021e7ad 100644
--- a/src/agents/compaction.ts
+++ b/src/agents/compaction.ts
@@ -68,6 +68,11 @@ export function splitMessagesByTokenShare(
   return chunks;
 }
 
+// Overhead reserved for summarization prompt, system prompt, previous summary,
+// and serialization wrappers (<conversation> tags, instructions, etc.).
+// generateSummary uses reasoning: "high" which also consumes context budget.
+export const SUMMARIZATION_OVERHEAD_TOKENS = 4096;
+
 export function chunkMessagesByMaxTokens(
   messages: AgentMessage[],
   maxTokens: number,
@@ -76,13 +81,17 @@ export function chunkMessagesByMaxTokens(
     return [];
   }
 
+  // Apply safety margin to compensate for estimateTokens() underestimation
+  // (chars/4 heuristic misses multi-byte chars, special tokens, code tokens, etc.)
+  const effectiveMax = Math.max(1, Math.floor(maxTokens / SAFETY_MARGIN));
+
   const chunks: AgentMessage[][] = [];
   let currentChunk: AgentMessage[] = [];
   let currentTokens = 0;
 
   for (const message of messages) {
     const messageTokens = estimateTokens(message);
-    if (currentChunk.length > 0 && currentTokens + messageTokens > maxTokens) {
+    if (currentChunk.length > 0 && currentTokens + messageTokens > effectiveMax) {
       chunks.push(currentChunk);
       currentChunk = [];
       currentTokens = 0;
@@ -91,7 +100,7 @@ export function chunkMessagesByMaxTokens(
     currentChunk.push(message);
     currentTokens += messageTokens;
 
-    if (messageTokens > maxTokens) {
+    if (messageTokens > effectiveMax) {
       // Split oversized messages to avoid unbounded chunk growth.
       chunks.push(currentChunk);
       currentChunk = [];
diff --git a/src/agents/pi-extensions/compaction-safeguard.ts b/src/agents/pi-extensions/compaction-safeguard.ts
index 12c6627e4..ed0f0434c 100644
--- a/src/agents/pi-extensions/compaction-safeguard.ts
+++ b/src/agents/pi-extensions/compaction-safeguard.ts
@@ -7,6 +7,7 @@ import {
   BASE_CHUNK_RATIO,
   MIN_CHUNK_RATIO,
   SAFETY_MARGIN,
+  SUMMARIZATION_OVERHEAD_TOKENS,
   computeAdaptiveChunkRatio,
   estimateMessagesTokens,
   isOversizedForSummary,
@@ -268,7 +269,8 @@ export default function compactionSafeguardExtension(api: ExtensionAPI): void {
                 );
                 const droppedMaxChunkTokens = Math.max(
                   1,
-                  Math.floor(contextWindowTokens * droppedChunkRatio),
+                  Math.floor(contextWindowTokens * droppedChunkRatio) -
+                    SUMMARIZATION_OVERHEAD_TOKENS,
                 );
                 droppedSummary = await summarizeInStages({
                   messages: pruned.droppedMessagesList,
@@ -293,10 +295,15 @@ export default function compactionSafeguardExtension(api: ExtensionAPI): void {
         }
       }
 
-      // Use adaptive chunk ratio based on message sizes
+      // Use adaptive chunk ratio based on message sizes, reserving headroom for
+      // the summarization prompt, system prompt, previous summary, and reasoning budget
+      // that generateSummary adds on top of the serialized conversation chunk.
       const allMessages = [...messagesToSummarize, ...turnPrefixMessages];
       const adaptiveRatio = computeAdaptiveChunkRatio(allMessages, contextWindowTokens);
-      const maxChunkTokens = Math.max(1, Math.floor(contextWindowTokens * adaptiveRatio));
+      const maxChunkTokens = Math.max(
+        1,
+        Math.floor(contextWindowTokens * adaptiveRatio) - SUMMARIZATION_OVERHEAD_TOKENS,
+      );
       const reserveTokens = Math.max(1, Math.floor(preparation.settings.reserveTokens));
 
       // Feed dropped-messages summary as previousSummary so the main summarization