fix(pi): stop history image reinjection token blowup

This commit is contained in:
Peter Steinberger
2026-02-26 16:38:09 +01:00
parent 9a4b2266cc
commit c81e9866ff
6 changed files with 69 additions and 259 deletions

View File

@@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Pi image-token usage: stop re-injecting history image blocks on each turn, process image references from the current prompt only, and prune already-answered user-image blocks from stored history to prevent runaway token growth. (#27602)
- Security/Gateway node pairing: pin paired-device `platform`/`deviceFamily` metadata across reconnects and bind those fields into device-auth signatures, so reconnect metadata spoofing cannot expand node command allowlists without explicit repair pairing. This ships in the next npm release (`2026.2.26`). Thanks @76embiid21 for reporting.
- Security/Sandbox path alias guard: reject broken symlink targets by resolving through existing ancestors and failing closed on out-of-root targets, preventing workspace-only `apply_patch` writes from escaping sandbox/workspace boundaries via dangling symlinks. This ships in the next npm release (`2026.2.26`). Thanks @tdjackey for reporting.
- Security/Workspace FS boundary aliases: harden canonical boundary resolution for non-existent-leaf symlink aliases while preserving valid in-root aliases, preventing first-write workspace escapes via out-of-root symlink targets. This ships in the next npm release (`2026.2.26`). Thanks @tdjackey for reporting.

View File

@@ -232,6 +232,10 @@ await session.prompt(effectivePrompt, { images: imageResult.images });
The SDK handles the full agent loop: sending to LLM, executing tool calls, streaming responses.
Image injection is prompt-local: OpenClaw loads image references from the current prompt and
passes them to the model via `images` for that turn only. It never re-scans older history turns
to re-inject image payloads, which would otherwise inflate token usage on every turn.
## Tool Architecture
### Tool Pipeline

View File

@@ -3,24 +3,29 @@ import type { ImageContent } from "@mariozechner/pi-ai";
import { describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../../../config/config.js";
import {
injectHistoryImagesIntoMessages,
PRUNED_HISTORY_IMAGE_MARKER,
pruneProcessedHistoryImages,
resolveAttemptFsWorkspaceOnly,
resolvePromptBuildHookResult,
resolvePromptModeForSession,
} from "./attempt.js";
describe("injectHistoryImagesIntoMessages", () => {
describe("pruneProcessedHistoryImages", () => {
const image: ImageContent = { type: "image", data: "abc", mimeType: "image/png" };
it("injects history images and converts string content", () => {
it("prunes image blocks from user messages that already have assistant replies", () => {
const messages: AgentMessage[] = [
{
role: "user",
content: "See /tmp/photo.png",
content: [{ type: "text", text: "See /tmp/photo.png" }, { ...image }],
} as AgentMessage,
{
role: "assistant",
content: "got it",
} as unknown as AgentMessage,
];
const didMutate = injectHistoryImagesIntoMessages(messages, new Map([[0, [image]]]));
const didMutate = pruneProcessedHistoryImages(messages);
expect(didMutate).toBe(true);
const firstUser = messages[0] as Extract<AgentMessage, { role: "user" }> | undefined;
@@ -28,10 +33,10 @@ describe("injectHistoryImagesIntoMessages", () => {
const content = firstUser?.content as Array<{ type: string; text?: string; data?: string }>;
expect(content).toHaveLength(2);
expect(content[0]?.type).toBe("text");
expect(content[1]).toMatchObject({ type: "image", data: "abc" });
expect(content[1]).toMatchObject({ type: "text", text: PRUNED_HISTORY_IMAGE_MARKER });
});
it("avoids duplicating existing image content", () => {
it("does not prune latest user message when no assistant response exists yet", () => {
const messages: AgentMessage[] = [
{
role: "user",
@@ -39,7 +44,7 @@ describe("injectHistoryImagesIntoMessages", () => {
} as AgentMessage,
];
const didMutate = injectHistoryImagesIntoMessages(messages, new Map([[0, [image]]]));
const didMutate = pruneProcessedHistoryImages(messages);
expect(didMutate).toBe(false);
const first = messages[0] as Extract<AgentMessage, { role: "user" }> | undefined;
@@ -47,21 +52,22 @@ describe("injectHistoryImagesIntoMessages", () => {
throw new Error("expected array content");
}
expect(first.content).toHaveLength(2);
expect(first.content[1]).toMatchObject({ type: "image", data: "abc" });
});
it("ignores non-user messages and out-of-range indices", () => {
it("does not change messages when no assistant turn exists", () => {
const messages: AgentMessage[] = [
{
role: "assistant",
role: "user",
content: "noop",
} as unknown as AgentMessage,
} as AgentMessage,
];
const didMutate = injectHistoryImagesIntoMessages(messages, new Map([[1, [image]]]));
const didMutate = pruneProcessedHistoryImages(messages);
expect(didMutate).toBe(false);
const firstAssistant = messages[0] as Extract<AgentMessage, { role: "assistant" }> | undefined;
expect(firstAssistant?.content).toBe("noop");
const firstUser = messages[0] as Extract<AgentMessage, { role: "user" }> | undefined;
expect(firstUser?.content).toBe("noop");
});
});

View File

@@ -1,7 +1,6 @@
import fs from "node:fs/promises";
import os from "node:os";
import type { AgentMessage } from "@mariozechner/pi-agent-core";
import type { ImageContent } from "@mariozechner/pi-ai";
import { streamSimple } from "@mariozechner/pi-ai";
import {
createAgentSession,
@@ -128,51 +127,45 @@ type PromptBuildHookRunner = {
) => Promise<PluginHookBeforeAgentStartResult | undefined>;
};
export function injectHistoryImagesIntoMessages(
messages: AgentMessage[],
historyImagesByIndex: Map<number, ImageContent[]>,
): boolean {
if (historyImagesByIndex.size === 0) {
export const PRUNED_HISTORY_IMAGE_MARKER = "[image data removed - already processed by model]";
/**
* Prunes image blocks from user messages that already have an assistant response after them.
* This is a one-way cleanup for previously persisted history image data.
*/
export function pruneProcessedHistoryImages(messages: AgentMessage[]): boolean {
let lastAssistantIndex = -1;
for (let i = messages.length - 1; i >= 0; i--) {
if (messages[i]?.role === "assistant") {
lastAssistantIndex = i;
break;
}
}
if (lastAssistantIndex < 0) {
return false;
}
let didMutate = false;
for (const [msgIndex, images] of historyImagesByIndex) {
// Bounds check: ensure index is valid before accessing
if (msgIndex < 0 || msgIndex >= messages.length) {
let didMutate = false;
for (let i = 0; i < lastAssistantIndex; i++) {
const message = messages[i];
if (!message || message.role !== "user" || !Array.isArray(message.content)) {
continue;
}
const msg = messages[msgIndex];
if (msg && msg.role === "user") {
// Convert string content to array format if needed
if (typeof msg.content === "string") {
msg.content = [{ type: "text", text: msg.content }];
didMutate = true;
for (let j = 0; j < message.content.length; j++) {
const block = message.content[j];
if (!block || typeof block !== "object") {
continue;
}
if (Array.isArray(msg.content)) {
// Check for existing image content to avoid duplicates across turns
const existingImageData = new Set(
msg.content
.filter(
(c): c is ImageContent =>
c != null &&
typeof c === "object" &&
c.type === "image" &&
typeof c.data === "string",
)
.map((c) => c.data),
);
for (const img of images) {
// Only add if this image isn't already in the message
if (!existingImageData.has(img.data)) {
msg.content.push(img);
didMutate = true;
}
}
if ((block as { type?: string }).type !== "image") {
continue;
}
message.content[j] = {
type: "text",
text: PRUNED_HISTORY_IMAGE_MARKER,
} as (typeof message.content)[number];
didMutate = true;
}
}
return didMutate;
}
@@ -1092,16 +1085,20 @@ export async function runEmbeddedAttempt(
}
try {
// One-time migration: prune image blocks from already-processed user turns.
// This prevents old persisted base64 payloads from bloating subsequent prompts.
const didPruneImages = pruneProcessedHistoryImages(activeSession.messages);
if (didPruneImages) {
activeSession.agent.replaceMessages(activeSession.messages);
}
// Detect and load images referenced in the prompt for vision-capable models.
// This eliminates the need for an explicit "view" tool call by injecting
// images directly into the prompt when the model supports it.
// Also scans conversation history to enable follow-up questions about earlier images.
// Images are prompt-local only (pi-like behavior).
const imageResult = await detectAndLoadPromptImages({
prompt: effectivePrompt,
workspaceDir: effectiveWorkspace,
model: params.model,
existingImages: params.images,
historyMessages: activeSession.messages,
maxBytes: MAX_IMAGE_BYTES,
maxDimensionPx: resolveImageSanitizationLimits(params.config).maxDimensionPx,
workspaceOnly: effectiveFsWorkspaceOnly,
@@ -1112,21 +1109,10 @@ export async function runEmbeddedAttempt(
: undefined,
});
// Inject history images into their original message positions.
// This ensures the model sees images in context (e.g., "compare to the first image").
const didMutate = injectHistoryImagesIntoMessages(
activeSession.messages,
imageResult.historyImagesByIndex,
);
if (didMutate) {
// Persist message mutations (e.g., injected history images) so we don't re-scan/reload.
activeSession.agent.replaceMessages(activeSession.messages);
}
cacheTrace?.recordStage("prompt:images", {
prompt: effectivePrompt,
messages: activeSession.messages,
note: `images: prompt=${imageResult.images.length} history=${imageResult.historyImagesByIndex.size}`,
note: `images: prompt=${imageResult.images.length}`,
});
// Diagnostic: log context sizes before prompt to help debug early overflow errors.
@@ -1143,7 +1129,6 @@ export async function runEmbeddedAttempt(
`historyImageBlocks=${sessionSummary.totalImageBlocks} ` +
`systemPromptChars=${systemLen} promptChars=${promptLen} ` +
`promptImages=${imageResult.images.length} ` +
`historyImageMessages=${imageResult.historyImagesByIndex.size} ` +
`provider=${params.provider}/${params.modelId} sessionFile=${params.sessionFile}`,
);
}

View File

@@ -256,25 +256,15 @@ describe("detectAndLoadPromptImages", () => {
expect(result.detectedRefs).toHaveLength(0);
});
it("skips history messages that already include image content", async () => {
it("returns no detected refs when prompt has no image references", async () => {
const result = await detectAndLoadPromptImages({
prompt: "no images here",
workspaceDir: "/tmp",
model: { input: ["text", "image"] },
historyMessages: [
{
role: "user",
content: [
{ type: "text", text: "See /tmp/should-not-load.png" },
{ type: "image", data: "abc", mimeType: "image/png" },
],
},
],
});
expect(result.detectedRefs).toHaveLength(0);
expect(result.images).toHaveLength(0);
expect(result.historyImagesByIndex.size).toBe(0);
});
it("blocks prompt image refs outside workspace when sandbox workspaceOnly is enabled", async () => {
@@ -309,43 +299,4 @@ describe("detectAndLoadPromptImages", () => {
await fs.rm(stateDir, { recursive: true, force: true });
}
});
it("blocks history image refs outside workspace when sandbox workspaceOnly is enabled", async () => {
const stateDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-native-image-sandbox-"));
const sandboxRoot = path.join(stateDir, "sandbox");
const agentRoot = path.join(stateDir, "agent");
await fs.mkdir(sandboxRoot, { recursive: true });
await fs.mkdir(agentRoot, { recursive: true });
const pngB64 =
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII=";
await fs.writeFile(path.join(agentRoot, "secret.png"), Buffer.from(pngB64, "base64"));
const sandbox = createUnsafeMountedSandbox({ sandboxRoot, agentRoot });
const bridge = sandbox.fsBridge;
if (!bridge) {
throw new Error("sandbox fs bridge missing");
}
try {
const result = await detectAndLoadPromptImages({
prompt: "No inline image in this turn.",
workspaceDir: sandboxRoot,
model: { input: ["text", "image"] },
workspaceOnly: true,
historyMessages: [
{
role: "user",
content: [{ type: "text", text: "Previous image /agent/secret.png" }],
},
],
sandbox: { root: sandbox.workspaceDir, bridge },
});
expect(result.detectedRefs).toHaveLength(1);
expect(result.loadedCount).toBe(0);
expect(result.skippedCount).toBe(1);
expect(result.historyImagesByIndex.size).toBe(0);
} finally {
await fs.rm(stateDir, { recursive: true, force: true });
}
});
});

View File

@@ -36,8 +36,6 @@ export interface DetectedImageRef {
type: "path" | "url";
/** The resolved/normalized path or URL */
resolved: string;
/** Index of the message this ref was found in (for history images) */
messageIndex?: number;
}
/**
@@ -268,93 +266,6 @@ export function modelSupportsImages(model: { input?: string[] }): boolean {
return model.input?.includes("image") ?? false;
}
function extractTextFromMessage(message: unknown): string {
if (!message || typeof message !== "object") {
return "";
}
const content = (message as { content?: unknown }).content;
if (typeof content === "string") {
return content;
}
if (!Array.isArray(content)) {
return "";
}
const textParts: string[] = [];
for (const part of content) {
if (!part || typeof part !== "object") {
continue;
}
const record = part as Record<string, unknown>;
if (record.type === "text" && typeof record.text === "string") {
textParts.push(record.text);
}
}
return textParts.join("\n").trim();
}
/**
* Extracts image references from conversation history messages.
* Scans user messages for image paths/URLs that can be loaded.
* Each ref includes the messageIndex so images can be injected at their original location.
*
* Note: Global deduplication is intentional - if the same image appears in multiple
* messages, we only inject it at the FIRST occurrence. This is sufficient because:
* 1. The model sees all message content including the image
* 2. Later references to "the image" or "that picture" will work since it's in context
* 3. Injecting duplicates would waste tokens and potentially hit size limits
*/
function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] {
const allRefs: DetectedImageRef[] = [];
const seen = new Set<string>();
const messageHasImageContent = (msg: unknown): boolean => {
if (!msg || typeof msg !== "object") {
return false;
}
const content = (msg as { content?: unknown }).content;
if (!Array.isArray(content)) {
return false;
}
return content.some(
(part) =>
part != null && typeof part === "object" && (part as { type?: string }).type === "image",
);
};
for (let i = 0; i < messages.length; i++) {
const msg = messages[i];
if (!msg || typeof msg !== "object") {
continue;
}
const message = msg as { role?: string };
// Only scan user messages for image references
if (message.role !== "user") {
continue;
}
// Skip if message already has image content (prevents reloading each turn)
if (messageHasImageContent(msg)) {
continue;
}
const text = extractTextFromMessage(msg);
if (!text) {
continue;
}
const refs = detectImageReferences(text);
for (const ref of refs) {
const key = ref.resolved.toLowerCase();
if (seen.has(key)) {
continue;
}
seen.add(key);
allRefs.push({ ...ref, messageIndex: i });
}
}
return allRefs;
}
/**
* Detects and loads images referenced in a prompt for models with vision capability.
*
@@ -362,18 +273,14 @@ function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] {
* loads them, and returns them as ImageContent array ready to be passed to
* the model's prompt method.
*
* Also scans conversation history for images from previous turns and returns
* them mapped by message index so they can be injected at their original location.
*
* @param params Configuration for image detection and loading
* @returns Object with loaded images for current prompt and history images by message index
* @returns Object with loaded images for current prompt only
*/
export async function detectAndLoadPromptImages(params: {
prompt: string;
workspaceDir: string;
model: { input?: string[] };
existingImages?: ImageContent[];
historyMessages?: unknown[];
maxBytes?: number;
maxDimensionPx?: number;
workspaceOnly?: boolean;
@@ -381,8 +288,6 @@ export async function detectAndLoadPromptImages(params: {
}): Promise<{
/** Images for the current prompt (existingImages + detected in current prompt) */
images: ImageContent[];
/** Images from history messages, keyed by message index */
historyImagesByIndex: Map<number, ImageContent[]>;
detectedRefs: DetectedImageRef[];
loadedCount: number;
skippedCount: number;
@@ -391,7 +296,6 @@ export async function detectAndLoadPromptImages(params: {
if (!modelSupportsImages(params.model)) {
return {
images: [],
historyImagesByIndex: new Map(),
detectedRefs: [],
loadedCount: 0,
skippedCount: 0,
@@ -399,38 +303,20 @@ export async function detectAndLoadPromptImages(params: {
}
// Detect images from current prompt
const promptRefs = detectImageReferences(params.prompt);
// Detect images from conversation history (with message indices)
const historyRefs = params.historyMessages ? detectImagesFromHistory(params.historyMessages) : [];
// Deduplicate: if an image is in the current prompt, don't also load it from history.
// Current prompt images are passed via the `images` parameter to prompt(), while history
// images are injected into their original message positions. We don't want the same
// image loaded and sent twice (wasting tokens and potentially causing confusion).
const seenPaths = new Set(promptRefs.map((r) => r.resolved.toLowerCase()));
const uniqueHistoryRefs = historyRefs.filter((r) => !seenPaths.has(r.resolved.toLowerCase()));
const allRefs = [...promptRefs, ...uniqueHistoryRefs];
const allRefs = detectImageReferences(params.prompt);
if (allRefs.length === 0) {
return {
images: params.existingImages ?? [],
historyImagesByIndex: new Map(),
detectedRefs: [],
loadedCount: 0,
skippedCount: 0,
};
}
log.debug(
`Native image: detected ${allRefs.length} image refs (${promptRefs.length} in prompt, ${uniqueHistoryRefs.length} in history)`,
);
log.debug(`Native image: detected ${allRefs.length} image refs in prompt`);
// Load images for current prompt
const promptImages: ImageContent[] = [...(params.existingImages ?? [])];
// Load images for history, grouped by message index
const historyImagesByIndex = new Map<number, ImageContent[]>();
let loadedCount = 0;
let skippedCount = 0;
@@ -442,18 +328,7 @@ export async function detectAndLoadPromptImages(params: {
sandbox: params.sandbox,
});
if (image) {
if (ref.messageIndex !== undefined) {
// History image - add to the appropriate message index
const existing = historyImagesByIndex.get(ref.messageIndex);
if (existing) {
existing.push(image);
} else {
historyImagesByIndex.set(ref.messageIndex, [image]);
}
} else {
// Current prompt image
promptImages.push(image);
}
promptImages.push(image);
loadedCount++;
log.debug(`Native image: loaded ${ref.type} ${ref.resolved}`);
} else {
@@ -469,21 +344,9 @@ export async function detectAndLoadPromptImages(params: {
"prompt:images",
imageSanitization,
);
const sanitizedHistoryImagesByIndex = new Map<number, ImageContent[]>();
for (const [index, images] of historyImagesByIndex) {
const sanitized = await sanitizeImagesWithLog(
images,
`history:images:${index}`,
imageSanitization,
);
if (sanitized.length > 0) {
sanitizedHistoryImagesByIndex.set(index, sanitized);
}
}
return {
images: sanitizedPromptImages,
historyImagesByIndex: sanitizedHistoryImagesByIndex,
detectedRefs: allRefs,
loadedCount,
skippedCount,