fix(pi): stop history image reinjection token blowup

This commit is contained in:
Peter Steinberger
2026-02-26 16:38:09 +01:00
parent 9a4b2266cc
commit c81e9866ff
6 changed files with 69 additions and 259 deletions

View File

@@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Pi image-token usage: stop re-injecting history image blocks on each turn, process image references from the current prompt only, and prune already-answered user-image blocks from stored history to prevent runaway token growth. (#27602)
- Security/Gateway node pairing: pin paired-device `platform`/`deviceFamily` metadata across reconnects and bind those fields into device-auth signatures, so reconnect metadata spoofing cannot expand node command allowlists without explicit repair pairing. This ships in the next npm release (`2026.2.26`). Thanks @76embiid21 for reporting.
- Security/Sandbox path alias guard: reject broken symlink targets by resolving through existing ancestors and failing closed on out-of-root targets, preventing workspace-only `apply_patch` writes from escaping sandbox/workspace boundaries via dangling symlinks. This ships in the next npm release (`2026.2.26`). Thanks @tdjackey for reporting.
- Security/Workspace FS boundary aliases: harden canonical boundary resolution for non-existent-leaf symlink aliases while preserving valid in-root aliases, preventing first-write workspace escapes via out-of-root symlink targets. This ships in the next npm release (`2026.2.26`). Thanks @tdjackey for reporting.

View File

@@ -232,6 +232,10 @@ await session.prompt(effectivePrompt, { images: imageResult.images });
The SDK handles the full agent loop: sending to LLM, executing tool calls, streaming responses.
Image injection is prompt-local: OpenClaw loads image references from the current prompt and
passes them to the model via `images` for that turn only. It never re-scans older history turns
to re-inject image payloads, which would otherwise inflate token usage on every turn.
## Tool Architecture
### Tool Pipeline

View File

@@ -3,24 +3,29 @@ import type { ImageContent } from "@mariozechner/pi-ai";
import { describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../../../config/config.js";
import {
injectHistoryImagesIntoMessages,
PRUNED_HISTORY_IMAGE_MARKER,
pruneProcessedHistoryImages,
resolveAttemptFsWorkspaceOnly,
resolvePromptBuildHookResult,
resolvePromptModeForSession,
} from "./attempt.js";
describe("injectHistoryImagesIntoMessages", () => {
describe("pruneProcessedHistoryImages", () => {
const image: ImageContent = { type: "image", data: "abc", mimeType: "image/png" };
it("injects history images and converts string content", () => {
it("prunes image blocks from user messages that already have assistant replies", () => {
const messages: AgentMessage[] = [
{
role: "user",
content: "See /tmp/photo.png",
content: [{ type: "text", text: "See /tmp/photo.png" }, { ...image }],
} as AgentMessage,
{
role: "assistant",
content: "got it",
} as unknown as AgentMessage,
];
const didMutate = injectHistoryImagesIntoMessages(messages, new Map([[0, [image]]]));
const didMutate = pruneProcessedHistoryImages(messages);
expect(didMutate).toBe(true);
const firstUser = messages[0] as Extract<AgentMessage, { role: "user" }> | undefined;
@@ -28,10 +33,10 @@ describe("injectHistoryImagesIntoMessages", () => {
const content = firstUser?.content as Array<{ type: string; text?: string; data?: string }>;
expect(content).toHaveLength(2);
expect(content[0]?.type).toBe("text");
expect(content[1]).toMatchObject({ type: "image", data: "abc" });
expect(content[1]).toMatchObject({ type: "text", text: PRUNED_HISTORY_IMAGE_MARKER });
});
it("avoids duplicating existing image content", () => {
it("does not prune latest user message when no assistant response exists yet", () => {
const messages: AgentMessage[] = [
{
role: "user",
@@ -39,7 +44,7 @@ describe("injectHistoryImagesIntoMessages", () => {
} as AgentMessage,
];
const didMutate = injectHistoryImagesIntoMessages(messages, new Map([[0, [image]]]));
const didMutate = pruneProcessedHistoryImages(messages);
expect(didMutate).toBe(false);
const first = messages[0] as Extract<AgentMessage, { role: "user" }> | undefined;
@@ -47,21 +52,22 @@ describe("injectHistoryImagesIntoMessages", () => {
throw new Error("expected array content");
}
expect(first.content).toHaveLength(2);
expect(first.content[1]).toMatchObject({ type: "image", data: "abc" });
});
it("ignores non-user messages and out-of-range indices", () => {
it("does not change messages when no assistant turn exists", () => {
const messages: AgentMessage[] = [
{
role: "assistant",
role: "user",
content: "noop",
} as unknown as AgentMessage,
} as AgentMessage,
];
const didMutate = injectHistoryImagesIntoMessages(messages, new Map([[1, [image]]]));
const didMutate = pruneProcessedHistoryImages(messages);
expect(didMutate).toBe(false);
const firstAssistant = messages[0] as Extract<AgentMessage, { role: "assistant" }> | undefined;
expect(firstAssistant?.content).toBe("noop");
const firstUser = messages[0] as Extract<AgentMessage, { role: "user" }> | undefined;
expect(firstUser?.content).toBe("noop");
});
});

View File

@@ -1,7 +1,6 @@
import fs from "node:fs/promises";
import os from "node:os";
import type { AgentMessage } from "@mariozechner/pi-agent-core";
import type { ImageContent } from "@mariozechner/pi-ai";
import { streamSimple } from "@mariozechner/pi-ai";
import {
createAgentSession,
@@ -128,51 +127,45 @@ type PromptBuildHookRunner = {
) => Promise<PluginHookBeforeAgentStartResult | undefined>;
};
export function injectHistoryImagesIntoMessages(
messages: AgentMessage[],
historyImagesByIndex: Map<number, ImageContent[]>,
): boolean {
if (historyImagesByIndex.size === 0) {
export const PRUNED_HISTORY_IMAGE_MARKER = "[image data removed - already processed by model]";
/**
* Prunes image blocks from user messages that already have an assistant response after them.
* This is a one-way cleanup for previously persisted history image data.
*/
export function pruneProcessedHistoryImages(messages: AgentMessage[]): boolean {
let lastAssistantIndex = -1;
for (let i = messages.length - 1; i >= 0; i--) {
if (messages[i]?.role === "assistant") {
lastAssistantIndex = i;
break;
}
}
if (lastAssistantIndex < 0) {
return false;
}
let didMutate = false;
for (const [msgIndex, images] of historyImagesByIndex) {
// Bounds check: ensure index is valid before accessing
if (msgIndex < 0 || msgIndex >= messages.length) {
let didMutate = false;
for (let i = 0; i < lastAssistantIndex; i++) {
const message = messages[i];
if (!message || message.role !== "user" || !Array.isArray(message.content)) {
continue;
}
const msg = messages[msgIndex];
if (msg && msg.role === "user") {
// Convert string content to array format if needed
if (typeof msg.content === "string") {
msg.content = [{ type: "text", text: msg.content }];
didMutate = true;
for (let j = 0; j < message.content.length; j++) {
const block = message.content[j];
if (!block || typeof block !== "object") {
continue;
}
if (Array.isArray(msg.content)) {
// Check for existing image content to avoid duplicates across turns
const existingImageData = new Set(
msg.content
.filter(
(c): c is ImageContent =>
c != null &&
typeof c === "object" &&
c.type === "image" &&
typeof c.data === "string",
)
.map((c) => c.data),
);
for (const img of images) {
// Only add if this image isn't already in the message
if (!existingImageData.has(img.data)) {
msg.content.push(img);
didMutate = true;
}
}
if ((block as { type?: string }).type !== "image") {
continue;
}
message.content[j] = {
type: "text",
text: PRUNED_HISTORY_IMAGE_MARKER,
} as (typeof message.content)[number];
didMutate = true;
}
}
return didMutate;
}
@@ -1092,16 +1085,20 @@ export async function runEmbeddedAttempt(
}
try {
// One-time migration: prune image blocks from already-processed user turns.
// This prevents old persisted base64 payloads from bloating subsequent prompts.
const didPruneImages = pruneProcessedHistoryImages(activeSession.messages);
if (didPruneImages) {
activeSession.agent.replaceMessages(activeSession.messages);
}
// Detect and load images referenced in the prompt for vision-capable models.
// This eliminates the need for an explicit "view" tool call by injecting
// images directly into the prompt when the model supports it.
// Also scans conversation history to enable follow-up questions about earlier images.
// Images are prompt-local only (pi-like behavior).
const imageResult = await detectAndLoadPromptImages({
prompt: effectivePrompt,
workspaceDir: effectiveWorkspace,
model: params.model,
existingImages: params.images,
historyMessages: activeSession.messages,
maxBytes: MAX_IMAGE_BYTES,
maxDimensionPx: resolveImageSanitizationLimits(params.config).maxDimensionPx,
workspaceOnly: effectiveFsWorkspaceOnly,
@@ -1112,21 +1109,10 @@ export async function runEmbeddedAttempt(
: undefined,
});
// Inject history images into their original message positions.
// This ensures the model sees images in context (e.g., "compare to the first image").
const didMutate = injectHistoryImagesIntoMessages(
activeSession.messages,
imageResult.historyImagesByIndex,
);
if (didMutate) {
// Persist message mutations (e.g., injected history images) so we don't re-scan/reload.
activeSession.agent.replaceMessages(activeSession.messages);
}
cacheTrace?.recordStage("prompt:images", {
prompt: effectivePrompt,
messages: activeSession.messages,
note: `images: prompt=${imageResult.images.length} history=${imageResult.historyImagesByIndex.size}`,
note: `images: prompt=${imageResult.images.length}`,
});
// Diagnostic: log context sizes before prompt to help debug early overflow errors.
@@ -1143,7 +1129,6 @@ export async function runEmbeddedAttempt(
`historyImageBlocks=${sessionSummary.totalImageBlocks} ` +
`systemPromptChars=${systemLen} promptChars=${promptLen} ` +
`promptImages=${imageResult.images.length} ` +
`historyImageMessages=${imageResult.historyImagesByIndex.size} ` +
`provider=${params.provider}/${params.modelId} sessionFile=${params.sessionFile}`,
);
}

View File

@@ -256,25 +256,15 @@ describe("detectAndLoadPromptImages", () => {
expect(result.detectedRefs).toHaveLength(0);
});
it("skips history messages that already include image content", async () => {
it("returns no detected refs when prompt has no image references", async () => {
const result = await detectAndLoadPromptImages({
prompt: "no images here",
workspaceDir: "/tmp",
model: { input: ["text", "image"] },
historyMessages: [
{
role: "user",
content: [
{ type: "text", text: "See /tmp/should-not-load.png" },
{ type: "image", data: "abc", mimeType: "image/png" },
],
},
],
});
expect(result.detectedRefs).toHaveLength(0);
expect(result.images).toHaveLength(0);
expect(result.historyImagesByIndex.size).toBe(0);
});
it("blocks prompt image refs outside workspace when sandbox workspaceOnly is enabled", async () => {
@@ -309,43 +299,4 @@ describe("detectAndLoadPromptImages", () => {
await fs.rm(stateDir, { recursive: true, force: true });
}
});
it("blocks history image refs outside workspace when sandbox workspaceOnly is enabled", async () => {
const stateDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-native-image-sandbox-"));
const sandboxRoot = path.join(stateDir, "sandbox");
const agentRoot = path.join(stateDir, "agent");
await fs.mkdir(sandboxRoot, { recursive: true });
await fs.mkdir(agentRoot, { recursive: true });
const pngB64 =
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII=";
await fs.writeFile(path.join(agentRoot, "secret.png"), Buffer.from(pngB64, "base64"));
const sandbox = createUnsafeMountedSandbox({ sandboxRoot, agentRoot });
const bridge = sandbox.fsBridge;
if (!bridge) {
throw new Error("sandbox fs bridge missing");
}
try {
const result = await detectAndLoadPromptImages({
prompt: "No inline image in this turn.",
workspaceDir: sandboxRoot,
model: { input: ["text", "image"] },
workspaceOnly: true,
historyMessages: [
{
role: "user",
content: [{ type: "text", text: "Previous image /agent/secret.png" }],
},
],
sandbox: { root: sandbox.workspaceDir, bridge },
});
expect(result.detectedRefs).toHaveLength(1);
expect(result.loadedCount).toBe(0);
expect(result.skippedCount).toBe(1);
expect(result.historyImagesByIndex.size).toBe(0);
} finally {
await fs.rm(stateDir, { recursive: true, force: true });
}
});
});

View File

@@ -36,8 +36,6 @@ export interface DetectedImageRef {
type: "path" | "url";
/** The resolved/normalized path or URL */
resolved: string;
/** Index of the message this ref was found in (for history images) */
messageIndex?: number;
}
/**
@@ -268,93 +266,6 @@ export function modelSupportsImages(model: { input?: string[] }): boolean {
return model.input?.includes("image") ?? false;
}
function extractTextFromMessage(message: unknown): string {
if (!message || typeof message !== "object") {
return "";
}
const content = (message as { content?: unknown }).content;
if (typeof content === "string") {
return content;
}
if (!Array.isArray(content)) {
return "";
}
const textParts: string[] = [];
for (const part of content) {
if (!part || typeof part !== "object") {
continue;
}
const record = part as Record<string, unknown>;
if (record.type === "text" && typeof record.text === "string") {
textParts.push(record.text);
}
}
return textParts.join("\n").trim();
}
/**
* Extracts image references from conversation history messages.
* Scans user messages for image paths/URLs that can be loaded.
* Each ref includes the messageIndex so images can be injected at their original location.
*
* Note: Global deduplication is intentional - if the same image appears in multiple
* messages, we only inject it at the FIRST occurrence. This is sufficient because:
* 1. The model sees all message content including the image
* 2. Later references to "the image" or "that picture" will work since it's in context
* 3. Injecting duplicates would waste tokens and potentially hit size limits
*/
function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] {
const allRefs: DetectedImageRef[] = [];
const seen = new Set<string>();
const messageHasImageContent = (msg: unknown): boolean => {
if (!msg || typeof msg !== "object") {
return false;
}
const content = (msg as { content?: unknown }).content;
if (!Array.isArray(content)) {
return false;
}
return content.some(
(part) =>
part != null && typeof part === "object" && (part as { type?: string }).type === "image",
);
};
for (let i = 0; i < messages.length; i++) {
const msg = messages[i];
if (!msg || typeof msg !== "object") {
continue;
}
const message = msg as { role?: string };
// Only scan user messages for image references
if (message.role !== "user") {
continue;
}
// Skip if message already has image content (prevents reloading each turn)
if (messageHasImageContent(msg)) {
continue;
}
const text = extractTextFromMessage(msg);
if (!text) {
continue;
}
const refs = detectImageReferences(text);
for (const ref of refs) {
const key = ref.resolved.toLowerCase();
if (seen.has(key)) {
continue;
}
seen.add(key);
allRefs.push({ ...ref, messageIndex: i });
}
}
return allRefs;
}
/**
* Detects and loads images referenced in a prompt for models with vision capability.
*
@@ -362,18 +273,14 @@ function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] {
* loads them, and returns them as ImageContent array ready to be passed to
* the model's prompt method.
*
* Also scans conversation history for images from previous turns and returns
* them mapped by message index so they can be injected at their original location.
*
* @param params Configuration for image detection and loading
* @returns Object with loaded images for current prompt and history images by message index
* @returns Object with loaded images for current prompt only
*/
export async function detectAndLoadPromptImages(params: {
prompt: string;
workspaceDir: string;
model: { input?: string[] };
existingImages?: ImageContent[];
historyMessages?: unknown[];
maxBytes?: number;
maxDimensionPx?: number;
workspaceOnly?: boolean;
@@ -381,8 +288,6 @@ export async function detectAndLoadPromptImages(params: {
}): Promise<{
/** Images for the current prompt (existingImages + detected in current prompt) */
images: ImageContent[];
/** Images from history messages, keyed by message index */
historyImagesByIndex: Map<number, ImageContent[]>;
detectedRefs: DetectedImageRef[];
loadedCount: number;
skippedCount: number;
@@ -391,7 +296,6 @@ export async function detectAndLoadPromptImages(params: {
if (!modelSupportsImages(params.model)) {
return {
images: [],
historyImagesByIndex: new Map(),
detectedRefs: [],
loadedCount: 0,
skippedCount: 0,
@@ -399,38 +303,20 @@ export async function detectAndLoadPromptImages(params: {
}
// Detect images from current prompt
const promptRefs = detectImageReferences(params.prompt);
// Detect images from conversation history (with message indices)
const historyRefs = params.historyMessages ? detectImagesFromHistory(params.historyMessages) : [];
// Deduplicate: if an image is in the current prompt, don't also load it from history.
// Current prompt images are passed via the `images` parameter to prompt(), while history
// images are injected into their original message positions. We don't want the same
// image loaded and sent twice (wasting tokens and potentially causing confusion).
const seenPaths = new Set(promptRefs.map((r) => r.resolved.toLowerCase()));
const uniqueHistoryRefs = historyRefs.filter((r) => !seenPaths.has(r.resolved.toLowerCase()));
const allRefs = [...promptRefs, ...uniqueHistoryRefs];
const allRefs = detectImageReferences(params.prompt);
if (allRefs.length === 0) {
return {
images: params.existingImages ?? [],
historyImagesByIndex: new Map(),
detectedRefs: [],
loadedCount: 0,
skippedCount: 0,
};
}
log.debug(
`Native image: detected ${allRefs.length} image refs (${promptRefs.length} in prompt, ${uniqueHistoryRefs.length} in history)`,
);
log.debug(`Native image: detected ${allRefs.length} image refs in prompt`);
// Load images for current prompt
const promptImages: ImageContent[] = [...(params.existingImages ?? [])];
// Load images for history, grouped by message index
const historyImagesByIndex = new Map<number, ImageContent[]>();
let loadedCount = 0;
let skippedCount = 0;
@@ -442,18 +328,7 @@ export async function detectAndLoadPromptImages(params: {
sandbox: params.sandbox,
});
if (image) {
if (ref.messageIndex !== undefined) {
// History image - add to the appropriate message index
const existing = historyImagesByIndex.get(ref.messageIndex);
if (existing) {
existing.push(image);
} else {
historyImagesByIndex.set(ref.messageIndex, [image]);
}
} else {
// Current prompt image
promptImages.push(image);
}
promptImages.push(image);
loadedCount++;
log.debug(`Native image: loaded ${ref.type} ${ref.resolved}`);
} else {
@@ -469,21 +344,9 @@ export async function detectAndLoadPromptImages(params: {
"prompt:images",
imageSanitization,
);
const sanitizedHistoryImagesByIndex = new Map<number, ImageContent[]>();
for (const [index, images] of historyImagesByIndex) {
const sanitized = await sanitizeImagesWithLog(
images,
`history:images:${index}`,
imageSanitization,
);
if (sanitized.length > 0) {
sanitizedHistoryImagesByIndex.set(index, sanitized);
}
}
return {
images: sanitizedPromptImages,
historyImagesByIndex: sanitizedHistoryImagesByIndex,
detectedRefs: allRefs,
loadedCount,
skippedCount,