diff --git a/CHANGELOG.md b/CHANGELOG.md index f318f24c4..4beecf85f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai - fix(agents): validate AbortSignal instances before calling AbortSignal.any() (#7277) (thanks @Elarwei001) - fix(webchat): respect user scroll position during streaming and refresh (#7226) (thanks @marcomarandiz) +- Media understanding: skip binary media from file text extraction. (#7475) Thanks @AlexZhangji. - Security: guard skill installer downloads with SSRF checks (block private/localhost URLs). - Media understanding: apply SSRF guardrails to provider fetches; allow private baseUrl overrides explicitly. - Tests: stub SSRF DNS pinning in web auto-reply + Gemini video coverage. (#6619) Thanks @joshp123. diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts index 26bc8886a..c101f06d2 100644 --- a/src/media-understanding/apply.test.ts +++ b/src/media-understanding/apply.test.ts @@ -528,18 +528,16 @@ describe("applyMediaUnderstanding", () => { expect(ctx.BodyForCommands).toBe("audio ok"); }); - it("treats text-like audio attachments as CSV (comma wins over tabs)", async () => { + it("treats text-like attachments as CSV (comma wins over tabs)", async () => { const { applyMediaUnderstanding } = await loadApply(); const dir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-media-")); - const csvPath = path.join(dir, "data.mp3"); + const csvPath = path.join(dir, "data.bin"); const csvText = '"a","b"\t"c"\n"1","2"\t"3"'; - const csvBuffer = Buffer.concat([Buffer.from([0xff, 0xfe]), Buffer.from(csvText, "utf16le")]); - await fs.writeFile(csvPath, csvBuffer); + await fs.writeFile(csvPath, csvText); const ctx: MsgContext = { - Body: "", + Body: "", MediaPath: csvPath, - MediaType: "audio/mpeg", }; const cfg: OpenClawConfig = { tools: { @@ -554,21 +552,20 @@ describe("applyMediaUnderstanding", () => { const result = await applyMediaUnderstanding({ ctx, cfg }); expect(result.appliedFile).toBe(true); - expect(ctx.Body).toContain(''); + expect(ctx.Body).toContain(''); expect(ctx.Body).toContain('"a","b"\t"c"'); }); it("infers TSV when tabs are present without commas", async () => { const { applyMediaUnderstanding } = await loadApply(); const dir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-media-")); - const tsvPath = path.join(dir, "report.mp3"); + const tsvPath = path.join(dir, "report.bin"); const tsvText = "a\tb\tc\n1\t2\t3"; await fs.writeFile(tsvPath, tsvText); const ctx: MsgContext = { - Body: "", + Body: "", MediaPath: tsvPath, - MediaType: "audio/mpeg", }; const cfg: OpenClawConfig = { tools: { @@ -583,21 +580,20 @@ describe("applyMediaUnderstanding", () => { const result = await applyMediaUnderstanding({ ctx, cfg }); expect(result.appliedFile).toBe(true); - expect(ctx.Body).toContain(''); + expect(ctx.Body).toContain(''); expect(ctx.Body).toContain("a\tb\tc"); }); - it("treats cp1252-like audio attachments as text", async () => { + it("treats cp1252-like attachments as text", async () => { const { applyMediaUnderstanding } = await loadApply(); const dir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-media-")); - const filePath = path.join(dir, "legacy.mp3"); + const filePath = path.join(dir, "legacy.bin"); const cp1252Bytes = Buffer.from([0x93, 0x48, 0x69, 0x94, 0x20, 0x54, 0x65, 0x73, 0x74]); await fs.writeFile(filePath, cp1252Bytes); const ctx: MsgContext = { - Body: "", + Body: "", MediaPath: filePath, - MediaType: "audio/mpeg", }; const cfg: OpenClawConfig = { tools: { @@ -645,17 +641,16 @@ describe("applyMediaUnderstanding", () => { expect(ctx.Body).not.toContain(" { + it("respects configured allowedMimes for text-like attachments", async () => { const { applyMediaUnderstanding } = await loadApply(); const dir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-media-")); - const tsvPath = path.join(dir, "report.mp3"); + const tsvPath = path.join(dir, "report.bin"); const tsvText = "a\tb\tc\n1\t2\t3"; await fs.writeFile(tsvPath, tsvText); const ctx: MsgContext = { - Body: "", + Body: "", MediaPath: tsvPath, - MediaType: "audio/mpeg", }; const cfg: OpenClawConfig = { gateway: { @@ -679,7 +674,7 @@ describe("applyMediaUnderstanding", () => { const result = await applyMediaUnderstanding({ ctx, cfg }); expect(result.appliedFile).toBe(false); - expect(ctx.Body).toBe(""); + expect(ctx.Body).toBe(""); expect(ctx.Body).not.toContain(" { const result = await applyMediaUnderstanding({ ctx, cfg }); + const body = ctx.Body ?? ""; expect(result.appliedFile).toBe(true); - expect(ctx.Body).toContain("</file>"); - expect(ctx.Body).toContain("<file"); - expect((ctx.Body.match(/<\/file>/g) ?? []).length).toBe(1); + expect(body).toContain("</file>"); + expect(body).toContain("<file"); + expect((body.match(/<\/file>/g) ?? []).length).toBe(1); }); it("normalizes MIME types to prevent attribute injection", async () => { diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index e4ec4aab0..766549afc 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -317,6 +317,13 @@ function resolveTextMimeFromName(name?: string): string | undefined { return TEXT_EXT_MIME.get(ext); } +function isBinaryMediaMime(mime?: string): boolean { + if (!mime) { + return false; + } + return mime.startsWith("image/") || mime.startsWith("audio/") || mime.startsWith("video/"); +} + async function extractFileBlocks(params: { attachments: ReturnType; cache: ReturnType; @@ -337,7 +344,7 @@ async function extractFileBlocks(params: { } const forcedTextMime = resolveTextMimeFromName(attachment.path ?? attachment.url ?? ""); const kind = forcedTextMime ? "document" : resolveAttachmentKind(attachment); - if (!forcedTextMime && (kind === "image" || kind === "video")) { + if (!forcedTextMime && (kind === "image" || kind === "video" || kind === "audio")) { continue; } if (!limits.allowUrl && attachment.url && !attachment.path) { @@ -361,16 +368,17 @@ async function extractFileBlocks(params: { } const nameHint = bufferResult?.fileName ?? attachment.path ?? attachment.url; const forcedTextMimeResolved = forcedTextMime ?? resolveTextMimeFromName(nameHint ?? ""); + const rawMime = bufferResult?.mime ?? attachment.mime; + const normalizedRawMime = normalizeMimeType(rawMime); + if (!forcedTextMimeResolved && isBinaryMediaMime(normalizedRawMime)) { + continue; + } const utf16Charset = resolveUtf16Charset(bufferResult?.buffer); const textSample = decodeTextSample(bufferResult?.buffer); const textLike = Boolean(utf16Charset) || looksLikeUtf8Text(bufferResult?.buffer); - if (!forcedTextMimeResolved && kind === "audio" && !textLike) { - continue; - } const guessedDelimited = textLike ? guessDelimitedMime(textSample) : undefined; const textHint = forcedTextMimeResolved ?? guessedDelimited ?? (textLike ? "text/plain" : undefined); - const rawMime = bufferResult?.mime ?? attachment.mime; const mimeType = sanitizeMimeType(textHint ?? normalizeMimeType(rawMime)); // Log when MIME type is overridden from non-text to text for auditability if (textHint && rawMime && !rawMime.startsWith("text/")) {