diff --git a/CHANGELOG.md b/CHANGELOG.md index 62a533849..6770bcebf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -149,6 +149,7 @@ Docs: https://docs.openclaw.ai - Telegram/multi-account default routing clarity: warn only for ambiguous (2+) account setups without an explicit default, add `openclaw doctor` warnings for missing/invalid multi-account defaults across channels, and document explicit-default guidance for channel routing and Telegram config. (#32544) thanks @Sid-Qin. - Telegram/plugin outbound hook parity: run `message_sending` + `message_sent` in Telegram reply delivery, include reply-path hook metadata (`mediaUrls`, `threadId`), and report `message_sent.success=false` when hooks blank text and no outbound message is delivered. (#32649) Thanks @KimGLee. - CLI/Coding-agent reliability: switch default `claude-cli` non-interactive args to `--permission-mode bypassPermissions`, auto-normalize legacy `--dangerously-skip-permissions` backend overrides to the modern permission-mode form, align coding-agent + live-test docs with the non-PTY Claude path, and emit session system-event heartbeat notices when CLI watchdog no-output timeouts terminate runs. (#28610, #31149, #34055). Thanks @niceysam, @cryptomaltese and @vincentkoc. +- Gateway/OpenAI chat completions: parse active-turn `image_url` content parts (including parameterized data URIs and guarded URL sources), forward them as multimodal `images`, accept image-only user turns, enforce per-request image-part/byte budgets, default URL-based image fetches to disabled unless explicitly enabled by config, and redact image base64 data in cache-trace/provider payload diagnostics. (#17685) Thanks @vincentkoc - ACP/ACPX session bootstrap: retry with `sessions new` when `sessions ensure` returns no session identifiers so ACP spawns avoid `NO_SESSION`/`ACP_TURN_FAILED` failures on affected agents. (#28786, #31338, #34055). Thanks @Sid-Qin and @vincentkoc. 
- ACP/sessions_spawn parent stream visibility: add `streamTo: "parent"` for `runtime: "acp"` to forward initial child-run progress/no-output/completion updates back into the requester session as system events (instead of direct child delivery), and emit a tail-able session-scoped relay log (`.acp-stream.jsonl`, returned as `streamLogPath` when available), improving orchestrator visibility for blocked or long-running harness turns. (#34310, #29909; reopened from #34055). Thanks @vincentkoc. - Agents/bootstrap truncation warning handling: unify bootstrap budget/truncation analysis across embedded + CLI runtime, `/context`, and `openclaw doctor`; add `agents.defaults.bootstrapPromptTruncationWarning` (`off|once|always`, default `once`) and persist warning-signature metadata so truncation warnings are consistent and deduped across turns. (#32769) Thanks @gumadeiras. diff --git a/src/agents/anthropic-payload-log.test.ts b/src/agents/anthropic-payload-log.test.ts new file mode 100644 index 000000000..c97eda2f2 --- /dev/null +++ b/src/agents/anthropic-payload-log.test.ts @@ -0,0 +1,49 @@ +import crypto from "node:crypto"; +import type { StreamFn } from "@mariozechner/pi-agent-core"; +import { describe, expect, it } from "vitest"; +import { createAnthropicPayloadLogger } from "./anthropic-payload-log.js"; + +describe("createAnthropicPayloadLogger", () => { + it("redacts image base64 payload data before writing logs", async () => { + const lines: string[] = []; + const logger = createAnthropicPayloadLogger({ + env: { OPENCLAW_ANTHROPIC_PAYLOAD_LOG: "1" }, + writer: { + filePath: "memory", + write: (line) => lines.push(line), + }, + }); + expect(logger).not.toBeNull(); + + const payload = { + messages: [ + { + role: "user", + content: [ + { + type: "image", + source: { type: "base64", media_type: "image/png", data: "QUJDRA==" }, + }, + ], + }, + ], + }; + const streamFn: StreamFn = ((_, __, options) => { + options?.onPayload?.(payload); + return {} as never; + }) as 
StreamFn; + + const wrapped = logger?.wrapStreamFn(streamFn); + await wrapped?.({ api: "anthropic-messages" } as never, { messages: [] } as never, {}); + + const event = JSON.parse(lines[0]?.trim() ?? "{}") as Record; + const message = ((event.payload as { messages?: unknown[] } | undefined)?.messages ?? + []) as Array>; + const source = (((message[0]?.content as Array> | undefined) ?? [])[0] + ?.source ?? {}) as Record; + expect(source.data).toBe(""); + expect(source.bytes).toBe(4); + expect(source.sha256).toBe(crypto.createHash("sha256").update("QUJDRA==").digest("hex")); + expect(event.payloadDigest).toBeDefined(); + }); +}); diff --git a/src/agents/anthropic-payload-log.ts b/src/agents/anthropic-payload-log.ts index 03c2cbc1c..882a85f0f 100644 --- a/src/agents/anthropic-payload-log.ts +++ b/src/agents/anthropic-payload-log.ts @@ -7,6 +7,7 @@ import { createSubsystemLogger } from "../logging/subsystem.js"; import { resolveUserPath } from "../utils.js"; import { parseBooleanValue } from "../utils/boolean.js"; import { safeJsonStringify } from "../utils/safe-json.js"; +import { redactImageDataForDiagnostics } from "./payload-redaction.js"; import { getQueuedFileWriter, type QueuedFileWriter } from "./queued-file-writer.js"; type PayloadLogStage = "request" | "usage"; @@ -103,6 +104,7 @@ export function createAnthropicPayloadLogger(params: { modelId?: string; modelApi?: string | null; workspaceDir?: string; + writer?: PayloadLogWriter; }): AnthropicPayloadLogger | null { const env = params.env ?? process.env; const cfg = resolvePayloadLogConfig(env); @@ -110,7 +112,7 @@ export function createAnthropicPayloadLogger(params: { return null; } - const writer = getWriter(cfg.filePath); + const writer = params.writer ?? 
getWriter(cfg.filePath); const base: Omit = { runId: params.runId, sessionId: params.sessionId, @@ -135,12 +137,13 @@ export function createAnthropicPayloadLogger(params: { return streamFn(model, context, options); } const nextOnPayload = (payload: unknown) => { + const redactedPayload = redactImageDataForDiagnostics(payload); record({ ...base, ts: new Date().toISOString(), stage: "request", - payload, - payloadDigest: digest(payload), + payload: redactedPayload, + payloadDigest: digest(redactedPayload), }); options?.onPayload?.(payload); }; diff --git a/src/agents/cache-trace.test.ts b/src/agents/cache-trace.test.ts index c2aae1455..be49e93a3 100644 --- a/src/agents/cache-trace.test.ts +++ b/src/agents/cache-trace.test.ts @@ -1,3 +1,4 @@ +import crypto from "node:crypto"; import { describe, expect, it } from "vitest"; import type { OpenClawConfig } from "../config/config.js"; import { resolveUserPath } from "../utils.js"; @@ -89,4 +90,58 @@ describe("createCacheTrace", () => { expect(trace).toBeNull(); }); + + it("redacts image data from options and messages before writing", () => { + const lines: string[] = []; + const trace = createCacheTrace({ + cfg: { + diagnostics: { + cacheTrace: { + enabled: true, + }, + }, + }, + env: {}, + writer: { + filePath: "memory", + write: (line) => lines.push(line), + }, + }); + + trace?.recordStage("stream:context", { + options: { + images: [{ type: "image", mimeType: "image/png", data: "QUJDRA==" }], + }, + messages: [ + { + role: "user", + content: [ + { + type: "image", + source: { type: "base64", media_type: "image/jpeg", data: "U0VDUkVU" }, + }, + ], + }, + ] as unknown as [], + }); + + const event = JSON.parse(lines[0]?.trim() ?? "{}") as Record; + const optionsImages = ( + ((event.options as { images?: unknown[] } | undefined)?.images ?? 
[]) as Array< + Record + > + )[0]; + expect(optionsImages?.data).toBe(""); + expect(optionsImages?.bytes).toBe(4); + expect(optionsImages?.sha256).toBe( + crypto.createHash("sha256").update("QUJDRA==").digest("hex"), + ); + + const firstMessage = ((event.messages as Array> | undefined) ?? [])[0]; + const source = (((firstMessage?.content as Array> | undefined) ?? [])[0] + ?.source ?? {}) as Record; + expect(source.data).toBe(""); + expect(source.bytes).toBe(6); + expect(source.sha256).toBe(crypto.createHash("sha256").update("U0VDUkVU").digest("hex")); + }); }); diff --git a/src/agents/cache-trace.ts b/src/agents/cache-trace.ts index 1edfd086f..508461450 100644 --- a/src/agents/cache-trace.ts +++ b/src/agents/cache-trace.ts @@ -6,6 +6,7 @@ import { resolveStateDir } from "../config/paths.js"; import { resolveUserPath } from "../utils.js"; import { parseBooleanValue } from "../utils/boolean.js"; import { safeJsonStringify } from "../utils/safe-json.js"; +import { redactImageDataForDiagnostics } from "./payload-redaction.js"; import { getQueuedFileWriter, type QueuedFileWriter } from "./queued-file-writer.js"; export type CacheTraceStage = @@ -198,7 +199,7 @@ export function createCacheTrace(params: CacheTraceInit): CacheTrace | null { event.systemDigest = digest(payload.system); } if (payload.options) { - event.options = payload.options; + event.options = redactImageDataForDiagnostics(payload.options) as Record; } if (payload.model) { event.model = payload.model; @@ -212,7 +213,7 @@ export function createCacheTrace(params: CacheTraceInit): CacheTrace | null { event.messageFingerprints = summary.messageFingerprints; event.messagesDigest = summary.messagesDigest; if (cfg.includeMessages) { - event.messages = messages; + event.messages = redactImageDataForDiagnostics(messages) as AgentMessage[]; } } diff --git a/src/agents/payload-redaction.ts b/src/agents/payload-redaction.ts new file mode 100644 index 000000000..ab6b29496 --- /dev/null +++ 
b/src/agents/payload-redaction.ts @@ -0,0 +1,64 @@ +import crypto from "node:crypto"; +import { estimateBase64DecodedBytes } from "../media/base64.js"; + +export const REDACTED_IMAGE_DATA = ""; + +function toLowerTrimmed(value: unknown): string { + return typeof value === "string" ? value.trim().toLowerCase() : ""; +} + +function hasImageMime(record: Record): boolean { + const candidates = [ + toLowerTrimmed(record.mimeType), + toLowerTrimmed(record.media_type), + toLowerTrimmed(record.mime_type), + ]; + return candidates.some((value) => value.startsWith("image/")); +} + +function shouldRedactImageData(record: Record): record is Record { + if (typeof record.data !== "string") { + return false; + } + const type = toLowerTrimmed(record.type); + return type === "image" || hasImageMime(record); +} + +function digestBase64Payload(data: string): string { + return crypto.createHash("sha256").update(data).digest("hex"); +} + +/** + * Redacts image/base64 payload data from diagnostic objects before persistence. 
+ */ +export function redactImageDataForDiagnostics(value: unknown): unknown { + const seen = new WeakSet(); + + const visit = (input: unknown): unknown => { + if (Array.isArray(input)) { + return input.map((entry) => visit(entry)); + } + if (!input || typeof input !== "object") { + return input; + } + if (seen.has(input)) { + return "[Circular]"; + } + seen.add(input); + + const record = input as Record; + const out: Record = {}; + for (const [key, val] of Object.entries(record)) { + out[key] = visit(val); + } + + if (shouldRedactImageData(record)) { + out.data = REDACTED_IMAGE_DATA; + out.bytes = estimateBase64DecodedBytes(record.data); + out.sha256 = digestBase64Payload(record.data); + } + return out; + }; + + return visit(value); +} diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index 78e6e461c..911d08620 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -384,6 +384,26 @@ export const FIELD_HELP: Record = { "Disables Control UI device identity checks and relies on token/password only. 
Use only for short-lived debugging on trusted networks, then turn it off immediately.", "gateway.http.endpoints.chatCompletions.enabled": "Enable the OpenAI-compatible `POST /v1/chat/completions` endpoint (default: false).", + "gateway.http.endpoints.chatCompletions.maxBodyBytes": + "Max request body size in bytes for `/v1/chat/completions` (default: 20MB).", + "gateway.http.endpoints.chatCompletions.maxImageParts": + "Max number of `image_url` parts accepted from the latest user message (default: 8).", + "gateway.http.endpoints.chatCompletions.maxTotalImageBytes": + "Max cumulative decoded bytes across all `image_url` parts in one request (default: 20MB).", + "gateway.http.endpoints.chatCompletions.images": + "Image fetch/validation controls for OpenAI-compatible `image_url` parts.", + "gateway.http.endpoints.chatCompletions.images.allowUrl": + "Allow server-side URL fetches for `image_url` parts (default: false; data URIs remain supported).", + "gateway.http.endpoints.chatCompletions.images.urlAllowlist": + "Optional hostname allowlist for `image_url` URL fetches; supports exact hosts and `*.example.com` wildcards.", + "gateway.http.endpoints.chatCompletions.images.allowedMimes": + "Allowed MIME types for `image_url` parts (case-insensitive list).", + "gateway.http.endpoints.chatCompletions.images.maxBytes": + "Max bytes per fetched/decoded `image_url` image (default: 10MB).", + "gateway.http.endpoints.chatCompletions.images.maxRedirects": + "Max HTTP redirects allowed when fetching `image_url` URLs (default: 3).", + "gateway.http.endpoints.chatCompletions.images.timeoutMs": + "Timeout in milliseconds for `image_url` URL fetches (default: 10000).", "gateway.reload.mode": 'Controls how config edits are applied: "off" ignores live edits, "restart" always restarts, "hot" applies in-process, and "hybrid" tries hot then restarts if required. 
Keep "hybrid" for safest routine updates.', "gateway.reload.debounceMs": "Debounce window (ms) before applying config changes.", diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts index e9a3330a4..9454df66f 100644 --- a/src/config/schema.labels.ts +++ b/src/config/schema.labels.ts @@ -249,6 +249,23 @@ export const FIELD_LABELS: Record = { "gateway.controlUi.allowInsecureAuth": "Insecure Control UI Auth Toggle", "gateway.controlUi.dangerouslyDisableDeviceAuth": "Dangerously Disable Control UI Device Auth", "gateway.http.endpoints.chatCompletions.enabled": "OpenAI Chat Completions Endpoint", + "gateway.http.endpoints.chatCompletions.maxBodyBytes": "OpenAI Chat Completions Max Body Bytes", + "gateway.http.endpoints.chatCompletions.maxImageParts": "OpenAI Chat Completions Max Image Parts", + "gateway.http.endpoints.chatCompletions.maxTotalImageBytes": + "OpenAI Chat Completions Max Total Image Bytes", + "gateway.http.endpoints.chatCompletions.images": "OpenAI Chat Completions Image Limits", + "gateway.http.endpoints.chatCompletions.images.allowUrl": + "OpenAI Chat Completions Allow Image URLs", + "gateway.http.endpoints.chatCompletions.images.urlAllowlist": + "OpenAI Chat Completions Image URL Allowlist", + "gateway.http.endpoints.chatCompletions.images.allowedMimes": + "OpenAI Chat Completions Image MIME Allowlist", + "gateway.http.endpoints.chatCompletions.images.maxBytes": + "OpenAI Chat Completions Image Max Bytes", + "gateway.http.endpoints.chatCompletions.images.maxRedirects": + "OpenAI Chat Completions Image Max Redirects", + "gateway.http.endpoints.chatCompletions.images.timeoutMs": + "OpenAI Chat Completions Image Timeout (ms)", "gateway.reload.mode": "Config Reload Mode", "gateway.reload.debounceMs": "Config Reload Debounce (ms)", "gateway.nodes.browser.mode": "Gateway Node Browser Mode", diff --git a/src/config/types.gateway.ts b/src/config/types.gateway.ts index 421a1f187..0adb9d98b 100644 --- a/src/config/types.gateway.ts +++ 
b/src/config/types.gateway.ts @@ -203,6 +203,41 @@ export type GatewayHttpChatCompletionsConfig = { * Default: false when absent. */ enabled?: boolean; + /** + * Max request body size in bytes for `/v1/chat/completions`. + * Default: 20MB. + */ + maxBodyBytes?: number; + /** + * Max number of `image_url` parts processed from the latest user message. + * Default: 8. + */ + maxImageParts?: number; + /** + * Max cumulative decoded image bytes for all `image_url` parts in one request. + * Default: 20MB. + */ + maxTotalImageBytes?: number; + /** Image input controls for `image_url` parts. */ + images?: GatewayHttpChatCompletionsImagesConfig; +}; + +export type GatewayHttpChatCompletionsImagesConfig = { + /** Allow URL fetches for `image_url` parts. Default: false. */ + allowUrl?: boolean; + /** + * Optional hostname allowlist for URL fetches. + * Supports exact hosts and `*.example.com` wildcards. + */ + urlAllowlist?: string[]; + /** Allowed MIME types (case-insensitive). */ + allowedMimes?: string[]; + /** Max bytes per image. Default: 10MB. */ + maxBytes?: number; + /** Max redirects when fetching a URL. Default: 3. */ + maxRedirects?: number; + /** Fetch timeout in ms. Default: 10s. 
*/ + timeoutMs?: number; }; export type GatewayHttpResponsesConfig = { diff --git a/src/config/zod-schema.ts b/src/config/zod-schema.ts index fafbad012..4d49e0428 100644 --- a/src/config/zod-schema.ts +++ b/src/config/zod-schema.ts @@ -708,6 +708,15 @@ export const OpenClawSchema = z chatCompletions: z .object({ enabled: z.boolean().optional(), + maxBodyBytes: z.number().int().positive().optional(), + maxImageParts: z.number().int().nonnegative().optional(), + maxTotalImageBytes: z.number().int().positive().optional(), + images: z + .object({ + ...ResponsesEndpointUrlFetchShape, + }) + .strict() + .optional(), }) .strict() .optional(), diff --git a/src/gateway/openai-http.test.ts b/src/gateway/openai-http.test.ts index c9d429521..f3ab97093 100644 --- a/src/gateway/openai-http.test.ts +++ b/src/gateway/openai-http.test.ts @@ -133,6 +133,7 @@ describe("OpenAI-compatible HTTP API (e2e)", () => { sessionKey?: string; message?: string; extraSystemPrompt?: string; + images?: Array<{ type: string; data: string; mimeType: string }>; } | undefined; const getFirstAgentMessage = () => getFirstAgentCall()?.message ?? 
""; @@ -251,6 +252,223 @@ describe("OpenAI-compatible HTTP API (e2e)", () => { await res.text(); } + { + const imageData = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAA"; + mockAgentOnce([{ text: "looks good" }]); + const res = await postChatCompletions(port, { + model: "openclaw", + messages: [ + { + role: "user", + content: [ + { type: "text", text: "describe this" }, + { + type: "image_url", + image_url: { url: `data:image/png;base64,${imageData}` }, + }, + ], + }, + ], + }); + expect(res.status).toBe(200); + + const firstCall = getFirstAgentCall(); + expect(firstCall?.message).toBe("describe this"); + expect(firstCall?.images).toEqual([ + { type: "image", data: imageData, mimeType: "image/png" }, + ]); + await res.text(); + } + + { + const imageData = "QUJDRA=="; + mockAgentOnce([{ text: "supports data-uri params" }]); + const res = await postChatCompletions(port, { + model: "openclaw", + messages: [ + { + role: "user", + content: [ + { type: "text", text: "with metadata params" }, + { + type: "image_url", + image_url: { url: `data:image/png;charset=utf-8;base64,${imageData}` }, + }, + ], + }, + ], + }); + expect(res.status).toBe(200); + + const firstCall = getFirstAgentCall(); + expect(firstCall?.images).toEqual([ + { type: "image", data: imageData, mimeType: "image/png" }, + ]); + await res.text(); + } + + { + agentCommand.mockClear(); + const res = await postChatCompletions(port, { + model: "openclaw", + messages: [ + { + role: "user", + content: [ + { + type: "image_url", + image_url: { url: "https://example.com/image.png" }, + }, + ], + }, + ], + }); + expect(res.status).toBe(400); + const json = (await res.json()) as Record; + expect((json.error as Record | undefined)?.type).toBe( + "invalid_request_error", + ); + expect(agentCommand).toHaveBeenCalledTimes(0); + } + + { + mockAgentOnce([{ text: "I can see the image" }]); + const res = await postChatCompletions(port, { + model: "openclaw", + messages: [ + { + role: "user", + content: [ + { + type: 
"image_url", + image_url: { url: "data:image/jpeg;base64,QUJDRA==" }, + }, + ], + }, + ], + }); + expect(res.status).toBe(200); + + const firstCall = getFirstAgentCall(); + expect(firstCall?.message).toContain("User sent image(s) with no text."); + expect(firstCall?.images).toEqual([ + { type: "image", data: "QUJDRA==", mimeType: "image/jpeg" }, + ]); + await res.text(); + } + + { + mockAgentOnce([{ text: "follow up answer" }]); + const res = await postChatCompletions(port, { + model: "openclaw", + messages: [ + { + role: "user", + content: [ + { type: "image_url", image_url: { url: "data:image/png;base64,QUJDRA==" } }, + ], + }, + { role: "assistant", content: "I can see it." }, + { role: "user", content: "What color was it?" }, + ], + }); + expect(res.status).toBe(200); + + const firstCall = getFirstAgentCall(); + expect(firstCall?.images).toBeUndefined(); + expect(firstCall?.message ?? "").not.toContain("User sent image(s) with no text."); + await res.text(); + } + + { + mockAgentOnce([{ text: "latest image only" }]); + const res = await postChatCompletions(port, { + model: "openclaw", + messages: [ + { + role: "user", + content: [ + { type: "text", text: "first" }, + { type: "image_url", image_url: { url: "data:image/png;base64,QUFBQQ==" } }, + ], + }, + { role: "assistant", content: "noted" }, + { + role: "user", + content: [ + { type: "text", text: "second" }, + { type: "image_url", image_url: { url: "data:image/png;base64,QkJCQg==" } }, + ], + }, + ], + }); + expect(res.status).toBe(200); + + const firstCall = getFirstAgentCall(); + expect(firstCall?.images).toEqual([ + { type: "image", data: "QkJCQg==", mimeType: "image/png" }, + ]); + await res.text(); + } + + { + const largeMessage = "x".repeat(1_200_000); + mockAgentOnce([{ text: "accepted" }]); + const res = await postChatCompletions(port, { + model: "openclaw", + messages: [{ role: "user", content: largeMessage }], + }); + expect(res.status).toBe(200); + await res.text(); + } + + { + 
agentCommand.mockClear(); + const res = await postChatCompletions(port, { + model: "openclaw", + messages: [ + { + role: "user", + content: [ + { + type: "image_url", + image_url: { url: "data:application/pdf;base64,QUJDRA==" }, + }, + ], + }, + ], + }); + expect(res.status).toBe(400); + const json = (await res.json()) as Record; + expect((json.error as Record | undefined)?.type).toBe( + "invalid_request_error", + ); + expect(agentCommand).toHaveBeenCalledTimes(0); + } + + { + agentCommand.mockClear(); + const manyImageParts = Array.from({ length: 9 }).map(() => ({ + type: "image_url", + image_url: { url: "data:image/png;base64,QUJDRA==" }, + })); + const res = await postChatCompletions(port, { + model: "openclaw", + messages: [ + { + role: "user", + content: manyImageParts, + }, + ], + }); + expect(res.status).toBe(400); + const json = (await res.json()) as Record; + expect((json.error as Record | undefined)?.type).toBe( + "invalid_request_error", + ); + expect(agentCommand).toHaveBeenCalledTimes(0); + } + { mockAgentOnce([{ text: "I am Claude" }]); const res = await postChatCompletions(port, { @@ -327,6 +545,35 @@ describe("OpenAI-compatible HTTP API (e2e)", () => { await res.text(); } + { + mockAgentOnce([{ text: "tool follow-up ok" }]); + const res = await postChatCompletions(port, { + model: "openclaw", + messages: [ + { + role: "user", + content: [ + { type: "text", text: "look at this" }, + { type: "image_url", image_url: { url: "https://example.com/image.png" } }, + ], + }, + { role: "assistant", content: "Checking the image." }, + { role: "tool", content: "Vision tool says it is blue." 
}, + ], + }); + expect(res.status).toBe(200); + + const firstCall = getFirstAgentCall(); + expect(firstCall?.images).toBeUndefined(); + const message = getFirstAgentMessage(); + expectMessageContext(message, { + history: ["User: look at this", "Assistant: Checking the image."], + current: ["Tool: Vision tool says it is blue."], + }); + expect(message).not.toContain("User sent image(s) with no text."); + await res.text(); + } + { mockAgentOnce([{ text: "hello" }]); const json = await postSyncUserMessage("hi"); diff --git a/src/gateway/openai-http.ts b/src/gateway/openai-http.ts index 10e8d713f..1f37dfb1f 100644 --- a/src/gateway/openai-http.ts +++ b/src/gateway/openai-http.ts @@ -2,8 +2,21 @@ import { randomUUID } from "node:crypto"; import type { IncomingMessage, ServerResponse } from "node:http"; import { createDefaultDeps } from "../cli/deps.js"; import { agentCommandFromIngress } from "../commands/agent.js"; +import type { ImageContent } from "../commands/agent/types.js"; +import type { GatewayHttpChatCompletionsConfig } from "../config/types.gateway.js"; import { emitAgentEvent, onAgentEvent } from "../infra/agent-events.js"; import { logWarn } from "../logger.js"; +import { estimateBase64DecodedBytes } from "../media/base64.js"; +import { + DEFAULT_INPUT_IMAGE_MAX_BYTES, + DEFAULT_INPUT_IMAGE_MIMES, + DEFAULT_INPUT_MAX_REDIRECTS, + DEFAULT_INPUT_TIMEOUT_MS, + extractImageContentFromSource, + normalizeMimeList, + type InputImageLimits, + type InputImageSource, +} from "../media/input-files.js"; import { defaultRuntime } from "../runtime.js"; import { resolveAssistantStreamDeltaText } from "./agent-event-assistant-text.js"; import { @@ -18,6 +31,7 @@ import { resolveGatewayRequestContext } from "./http-utils.js"; type OpenAiHttpOptions = { auth: ResolvedGatewayAuth; + config?: GatewayHttpChatCompletionsConfig; maxBodyBytes?: number; trustedProxies?: string[]; allowRealIpFallback?: boolean; @@ -37,12 +51,64 @@ type OpenAiChatCompletionRequest = { user?: unknown; 
}; +const DEFAULT_OPENAI_CHAT_COMPLETIONS_BODY_BYTES = 20 * 1024 * 1024; +const IMAGE_ONLY_USER_MESSAGE = "User sent image(s) with no text."; +const DEFAULT_OPENAI_MAX_IMAGE_PARTS = 8; +const DEFAULT_OPENAI_MAX_TOTAL_IMAGE_BYTES = 20 * 1024 * 1024; +const DEFAULT_OPENAI_IMAGE_LIMITS: InputImageLimits = { + allowUrl: false, + allowedMimes: new Set(DEFAULT_INPUT_IMAGE_MIMES), + maxBytes: DEFAULT_INPUT_IMAGE_MAX_BYTES, + maxRedirects: DEFAULT_INPUT_MAX_REDIRECTS, + timeoutMs: DEFAULT_INPUT_TIMEOUT_MS, +}; + +type ResolvedOpenAiChatCompletionsLimits = { + maxBodyBytes: number; + maxImageParts: number; + maxTotalImageBytes: number; + images: InputImageLimits; +}; + +function normalizeHostnameAllowlist(values: string[] | undefined): string[] | undefined { + if (!values || values.length === 0) { + return undefined; + } + const normalized = values.map((value) => value.trim()).filter((value) => value.length > 0); + return normalized.length > 0 ? normalized : undefined; +} + +function resolveOpenAiChatCompletionsLimits( + config: GatewayHttpChatCompletionsConfig | undefined, +): ResolvedOpenAiChatCompletionsLimits { + const imageConfig = config?.images; + return { + maxBodyBytes: config?.maxBodyBytes ?? DEFAULT_OPENAI_CHAT_COMPLETIONS_BODY_BYTES, + maxImageParts: + typeof config?.maxImageParts === "number" + ? Math.max(0, Math.floor(config.maxImageParts)) + : DEFAULT_OPENAI_MAX_IMAGE_PARTS, + maxTotalImageBytes: + typeof config?.maxTotalImageBytes === "number" + ? Math.max(1, Math.floor(config.maxTotalImageBytes)) + : DEFAULT_OPENAI_MAX_TOTAL_IMAGE_BYTES, + images: { + allowUrl: imageConfig?.allowUrl ?? DEFAULT_OPENAI_IMAGE_LIMITS.allowUrl, + urlAllowlist: normalizeHostnameAllowlist(imageConfig?.urlAllowlist), + allowedMimes: normalizeMimeList(imageConfig?.allowedMimes, DEFAULT_INPUT_IMAGE_MIMES), + maxBytes: imageConfig?.maxBytes ?? DEFAULT_INPUT_IMAGE_MAX_BYTES, + maxRedirects: imageConfig?.maxRedirects ?? DEFAULT_INPUT_MAX_REDIRECTS, + timeoutMs: imageConfig?.timeoutMs ?? 
DEFAULT_INPUT_TIMEOUT_MS, + }, + }; +} + function writeSse(res: ServerResponse, data: unknown) { res.write(`data: ${JSON.stringify(data)}\n\n`); } function buildAgentCommandInput(params: { - prompt: { message: string; extraSystemPrompt?: string }; + prompt: { message: string; extraSystemPrompt?: string; images?: ImageContent[] }; sessionKey: string; runId: string; messageChannel: string; @@ -50,6 +116,7 @@ function buildAgentCommandInput(params: { return { message: params.prompt.message, extraSystemPrompt: params.prompt.extraSystemPrompt, + images: params.prompt.images, sessionKey: params.sessionKey, runId: params.runId, deliver: false as const, @@ -123,7 +190,142 @@ function extractTextContent(content: unknown): string { return ""; } -function buildAgentPrompt(messagesUnknown: unknown): { +function resolveImageUrlPart(part: unknown): string | undefined { + if (!part || typeof part !== "object") { + return undefined; + } + const imageUrl = (part as { image_url?: unknown }).image_url; + if (typeof imageUrl === "string") { + const trimmed = imageUrl.trim(); + return trimmed.length > 0 ? trimmed : undefined; + } + if (!imageUrl || typeof imageUrl !== "object") { + return undefined; + } + const rawUrl = (imageUrl as { url?: unknown }).url; + if (typeof rawUrl !== "string") { + return undefined; + } + const trimmed = rawUrl.trim(); + return trimmed.length > 0 ? 
trimmed : undefined; +} + +function extractImageUrls(content: unknown): string[] { + if (!Array.isArray(content)) { + return []; + } + const urls: string[] = []; + for (const part of content) { + if (!part || typeof part !== "object") { + continue; + } + if ((part as { type?: unknown }).type !== "image_url") { + continue; + } + const url = resolveImageUrlPart(part); + if (url) { + urls.push(url); + } + } + return urls; +} + +type ActiveTurnContext = { + activeTurnIndex: number; + activeUserMessageIndex: number; + urls: string[]; +}; + +function parseImageUrlToSource(url: string): InputImageSource { + const dataUriMatch = /^data:([^,]*?),(.*)$/is.exec(url); + if (dataUriMatch) { + const metadata = dataUriMatch[1]?.trim() ?? ""; + const data = dataUriMatch[2] ?? ""; + const metadataParts = metadata + .split(";") + .map((part) => part.trim()) + .filter(Boolean); + const isBase64 = metadataParts.some((part) => part.toLowerCase() === "base64"); + if (!isBase64) { + throw new Error("image_url data URI must be base64 encoded"); + } + if (!data.trim()) { + throw new Error("image_url data URI is missing payload data"); + } + const mediaTypeRaw = metadataParts.find((part) => part.includes("/")); + return { + type: "base64", + mediaType: mediaTypeRaw, + data, + }; + } + return { type: "url", url }; +} + +function resolveActiveTurnContext(messagesUnknown: unknown): ActiveTurnContext { + const messages = asMessages(messagesUnknown); + for (let i = messages.length - 1; i >= 0; i -= 1) { + const msg = messages[i]; + if (!msg || typeof msg !== "object") { + continue; + } + const role = typeof msg.role === "string" ? msg.role.trim() : ""; + const normalizedRole = role === "function" ? "tool" : role; + if (normalizedRole !== "user" && normalizedRole !== "tool") { + continue; + } + return { + activeTurnIndex: i, + activeUserMessageIndex: normalizedRole === "user" ? i : -1, + urls: normalizedRole === "user" ? 
extractImageUrls(msg.content) : [], + }; + } + return { activeTurnIndex: -1, activeUserMessageIndex: -1, urls: [] }; +} + +async function resolveImagesForRequest( + activeTurnContext: Pick, + limits: ResolvedOpenAiChatCompletionsLimits, +): Promise { + const urls = activeTurnContext.urls; + if (urls.length === 0) { + return []; + } + if (urls.length > limits.maxImageParts) { + throw new Error(`Too many image_url parts (${urls.length}; limit ${limits.maxImageParts})`); + } + + const images: ImageContent[] = []; + let totalBytes = 0; + for (const url of urls) { + const source = parseImageUrlToSource(url); + if (source.type === "base64") { + totalBytes += estimateBase64DecodedBytes(source.data); + if (totalBytes > limits.maxTotalImageBytes) { + throw new Error( + `Total image payload too large (${totalBytes}; limit ${limits.maxTotalImageBytes})`, + ); + } + } + + const image = await extractImageContentFromSource(source, limits.images); + if (source.type !== "base64") { + totalBytes += estimateBase64DecodedBytes(image.data); + } + if (totalBytes > limits.maxTotalImageBytes) { + throw new Error( + `Total image payload too large (${totalBytes}; limit ${limits.maxTotalImageBytes})`, + ); + } + images.push(image); + } + return images; +} + +function buildAgentPrompt( + messagesUnknown: unknown, + activeUserMessageIndex: number, +): { message: string; extraSystemPrompt?: string; } { @@ -132,17 +334,20 @@ function buildAgentPrompt(messagesUnknown: unknown): { const systemParts: string[] = []; const conversationEntries: ConversationEntry[] = []; - for (const msg of messages) { + for (const [i, msg] of messages.entries()) { if (!msg || typeof msg !== "object") { continue; } const role = typeof msg.role === "string" ? 
msg.role.trim() : ""; const content = extractTextContent(msg.content).trim(); - if (!role || !content) { + const hasImage = extractImageUrls(msg.content).length > 0; + if (!role) { continue; } if (role === "system" || role === "developer") { - systemParts.push(content); + if (content) { + systemParts.push(content); + } continue; } @@ -151,6 +356,16 @@ function buildAgentPrompt(messagesUnknown: unknown): { continue; } + + // Keep the image-only placeholder scoped to the active user turn so we don't + // mention historical image-only turns whose bytes are intentionally not replayed. + const messageContent = + normalizedRole === "user" && !content && hasImage && i === activeUserMessageIndex + ? IMAGE_ONLY_USER_MESSAGE + : content; + if (!messageContent) { + continue; + } + const name = typeof msg.name === "string" ? msg.name.trim() : ""; const sender = normalizedRole === "assistant" @@ -163,7 +378,7 @@ function buildAgentPrompt(messagesUnknown: unknown): { conversationEntries.push({ role: normalizedRole, - entry: { sender, body: content }, + entry: { sender, body: messageContent }, }); } @@ -199,13 +414,14 @@ export async function handleOpenAiHttpRequest( res: ServerResponse, opts: OpenAiHttpOptions, ): Promise<boolean> { + const limits = resolveOpenAiChatCompletionsLimits(opts.config); const handled = await handleGatewayPostJsonEndpoint(req, res, { pathname: "/v1/chat/completions", auth: opts.auth, trustedProxies: opts.trustedProxies, allowRealIpFallback: opts.allowRealIpFallback, rateLimiter: opts.rateLimiter, - maxBodyBytes: opts.maxBodyBytes ?? 1024 * 1024, + maxBodyBytes: opts.maxBodyBytes ??
limits.maxBodyBytes, }); if (handled === false) { return false; @@ -227,8 +443,23 @@ export async function handleOpenAiHttpRequest( defaultMessageChannel: "webchat", useMessageChannelHeader: true, }); - const prompt = buildAgentPrompt(payload.messages); - if (!prompt.message) { + const activeTurnContext = resolveActiveTurnContext(payload.messages); + const prompt = buildAgentPrompt(payload.messages, activeTurnContext.activeUserMessageIndex); + let images: ImageContent[] = []; + try { + images = await resolveImagesForRequest(activeTurnContext, limits); + } catch (err) { + logWarn(`openai-compat: invalid image_url content: ${String(err)}`); + sendJson(res, 400, { + error: { + message: "Invalid image_url content in `messages`.", + type: "invalid_request_error", + }, + }); + return true; + } + + if (!prompt.message && images.length === 0) { sendJson(res, 400, { error: { message: "Missing user message in `messages`.", @@ -241,7 +472,11 @@ export async function handleOpenAiHttpRequest( const runId = `chatcmpl_${randomUUID()}`; const deps = createDefaultDeps(); const commandInput = buildAgentCommandInput({ - prompt, + prompt: { + message: prompt.message, + extraSystemPrompt: prompt.extraSystemPrompt, + images: images.length > 0 ? 
images : undefined, + }, sessionKey, runId, messageChannel, diff --git a/src/gateway/server-http.ts b/src/gateway/server-http.ts index fa3383b41..41911f35b 100644 --- a/src/gateway/server-http.ts +++ b/src/gateway/server-http.ts @@ -509,6 +509,7 @@ export function createGatewayHttpServer(opts: { controlUiBasePath: string; controlUiRoot?: ControlUiRootState; openAiChatCompletionsEnabled: boolean; + openAiChatCompletionsConfig?: import("../config/types.gateway.js").GatewayHttpChatCompletionsConfig; openResponsesEnabled: boolean; openResponsesConfig?: import("../config/types.gateway.js").GatewayHttpResponsesConfig; strictTransportSecurityHeader?: string; @@ -527,6 +528,7 @@ export function createGatewayHttpServer(opts: { controlUiBasePath, controlUiRoot, openAiChatCompletionsEnabled, + openAiChatCompletionsConfig, openResponsesEnabled, openResponsesConfig, strictTransportSecurityHeader, @@ -610,6 +612,7 @@ export function createGatewayHttpServer(opts: { run: () => handleOpenAiHttpRequest(req, res, { auth: resolvedAuth, + config: openAiChatCompletionsConfig, trustedProxies, allowRealIpFallback, rateLimiter, diff --git a/src/gateway/server-runtime-config.ts b/src/gateway/server-runtime-config.ts index d6352edf6..2722d36ac 100644 --- a/src/gateway/server-runtime-config.ts +++ b/src/gateway/server-runtime-config.ts @@ -23,6 +23,7 @@ export type GatewayRuntimeConfig = { bindHost: string; controlUiEnabled: boolean; openAiChatCompletionsEnabled: boolean; + openAiChatCompletionsConfig?: import("../config/types.gateway.js").GatewayHttpChatCompletionsConfig; openResponsesEnabled: boolean; openResponsesConfig?: import("../config/types.gateway.js").GatewayHttpResponsesConfig; strictTransportSecurityHeader?: string; @@ -73,10 +74,9 @@ export async function resolveGatewayRuntimeConfig(params: { } const controlUiEnabled = params.controlUiEnabled ?? params.cfg.gateway?.controlUi?.enabled ?? 
true; + const openAiChatCompletionsConfig = params.cfg.gateway?.http?.endpoints?.chatCompletions; const openAiChatCompletionsEnabled = - params.openAiChatCompletionsEnabled ?? - params.cfg.gateway?.http?.endpoints?.chatCompletions?.enabled ?? - false; + params.openAiChatCompletionsEnabled ?? openAiChatCompletionsConfig?.enabled ?? false; const openResponsesConfig = params.cfg.gateway?.http?.endpoints?.responses; const openResponsesEnabled = params.openResponsesEnabled ?? openResponsesConfig?.enabled ?? false; const strictTransportSecurityConfig = @@ -168,6 +168,9 @@ export async function resolveGatewayRuntimeConfig(params: { bindHost, controlUiEnabled, openAiChatCompletionsEnabled, + openAiChatCompletionsConfig: openAiChatCompletionsConfig + ? { ...openAiChatCompletionsConfig, enabled: openAiChatCompletionsEnabled } + : undefined, openResponsesEnabled, openResponsesConfig: openResponsesConfig ? { ...openResponsesConfig, enabled: openResponsesEnabled } diff --git a/src/gateway/server-runtime-state.ts b/src/gateway/server-runtime-state.ts index 46111c99c..9054b3a2a 100644 --- a/src/gateway/server-runtime-state.ts +++ b/src/gateway/server-runtime-state.ts @@ -43,6 +43,7 @@ export async function createGatewayRuntimeState(params: { controlUiBasePath: string; controlUiRoot?: ControlUiRootState; openAiChatCompletionsEnabled: boolean; + openAiChatCompletionsConfig?: import("../config/types.gateway.js").GatewayHttpChatCompletionsConfig; openResponsesEnabled: boolean; openResponsesConfig?: import("../config/types.gateway.js").GatewayHttpResponsesConfig; strictTransportSecurityHeader?: string; @@ -146,6 +147,7 @@ export async function createGatewayRuntimeState(params: { controlUiBasePath: params.controlUiBasePath, controlUiRoot: params.controlUiRoot, openAiChatCompletionsEnabled: params.openAiChatCompletionsEnabled, + openAiChatCompletionsConfig: params.openAiChatCompletionsConfig, openResponsesEnabled: params.openResponsesEnabled, openResponsesConfig: 
params.openResponsesConfig, strictTransportSecurityHeader: params.strictTransportSecurityHeader, diff --git a/src/gateway/server.impl.ts b/src/gateway/server.impl.ts index 1e08eb0c7..2e816c67d 100644 --- a/src/gateway/server.impl.ts +++ b/src/gateway/server.impl.ts @@ -487,6 +487,7 @@ export async function startGatewayServer( bindHost, controlUiEnabled, openAiChatCompletionsEnabled, + openAiChatCompletionsConfig, openResponsesEnabled, openResponsesConfig, strictTransportSecurityHeader, @@ -571,6 +572,7 @@ export async function startGatewayServer( controlUiBasePath, controlUiRoot: controlUiRootState, openAiChatCompletionsEnabled, + openAiChatCompletionsConfig, openResponsesEnabled, openResponsesConfig, strictTransportSecurityHeader,