fix(web_fetch): cap response body before parsing

This commit is contained in:
Peter Steinberger
2026-02-16 01:19:04 +01:00
parent fd3d452f1f
commit 166cf6a3e0
7 changed files with 284 additions and 8 deletions

View File

@@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai
- Gateway/Send: return an actionable error when `send` targets internal-only `webchat`, guiding callers to use `chat.send` or a deliverable channel. (#15703) Thanks @rodrigouroz.
- Gateway/Agent: reject malformed `agent:`-prefixed session keys (for example, `agent:main`) in `agent` and `agent.identity.get` instead of silently resolving them to the default agent, preventing accidental cross-session routing. (#15707) Thanks @rodrigouroz.
- Gateway/Security: redact sensitive session/path details from `status` responses for non-admin clients; full details remain available to `operator.admin`. (#8590) Thanks @fr33d3m0n.
- Web Fetch/Security: cap downloaded response body size before HTML parsing to prevent memory exhaustion from oversized or deeply nested pages. Thanks @xuemian168.
- Agents: return an explicit timeout error reply when an embedded run times out before producing any payloads, preventing silent dropped turns during slow cache-refresh transitions. (#16659) Thanks @liaosvcaf and @vignesh07.
- Agents/OpenAI: force `store=true` for direct OpenAI Responses/Codex runs to preserve multi-turn server-side conversation state, while leaving proxy/non-OpenAI endpoints unchanged. (#16803) Thanks @mark9232 and @vignesh07.
- Agents/Context: apply configured model `contextWindow` overrides after provider discovery so `lookupContextTokens()` honors operator config values (including discovery-failure paths). (#17404) Thanks @michaelbship and @vignesh07.

View File

@@ -224,6 +224,7 @@ Fetch a URL and extract readable content.
enabled: true,
maxChars: 50000,
maxCharsCap: 50000,
maxResponseBytes: 2000000,
timeoutSeconds: 30,
cacheTtlMinutes: 15,
maxRedirects: 3,
@@ -256,6 +257,7 @@ Notes:
- `web_fetch` sends a Chrome-like User-Agent and `Accept-Language` by default; override `userAgent` if needed.
- `web_fetch` blocks private/internal hostnames and re-checks redirects (limit with `maxRedirects`).
- `maxChars` is clamped to `tools.web.fetch.maxCharsCap`.
- `web_fetch` caps the downloaded response body size to `tools.web.fetch.maxResponseBytes` before parsing; oversized responses are truncated and include a warning.
- `web_fetch` is best-effort extraction; some sites will need the browser tool.
- See [Firecrawl](/tools/firecrawl) for key setup and service details.
- Responses are cached (default 15 minutes) to reduce repeated fetches.

View File

@@ -1,5 +1,8 @@
// Output format for extracted page content.
export type ExtractMode = "markdown" | "text";
// Guardrails applied before handing HTML to Readability/DOM parsing: documents
// larger than this many chars, or with deeper estimated tag nesting, skip
// Readability and take the cheaper fallback path (see extractReadableContent).
const READABILITY_MAX_HTML_CHARS = 1_000_000;
const READABILITY_MAX_ESTIMATED_NESTING_DEPTH = 3_000;
let readabilityDepsPromise:
| Promise<{
Readability: typeof import("@mozilla/readability").Readability;
@@ -107,6 +110,100 @@ export function truncateText(
return { text: value.slice(0, maxChars), truncated: true };
}
// True for the characters we accept inside a tag name: ASCII letters, digits, ':' and '-'.
function isHtmlTagNameCharCode(code: number): boolean {
  return (
    (code >= 65 && code <= 90) || // A-Z
    (code >= 97 && code <= 122) || // a-z
    (code >= 48 && code <= 57) || // 0-9
    code === 58 || // :
    code === 45 // -
  );
}

// Best-effort self-closing check: peek at most 200 chars ahead for the tag's
// closing '>' and report whether the character immediately before it is '/'.
function htmlTagLooksSelfClosing(html: string, from: number): boolean {
  const windowEnd = Math.min(html.length, from + 200);
  for (let k = from; k < windowEnd; k += 1) {
    if (html.charCodeAt(k) === 62 /* '>' */) {
      return html.charCodeAt(k - 1) === 47; // '/'
    }
  }
  return false;
}

function exceedsEstimatedHtmlNestingDepth(html: string, maxDepth: number): boolean {
  // Cheap heuristic to skip Readability+DOM parsing on pathological HTML (deep
  // nesting => stack/memory blowups). Not an HTML parser; tuned to catch
  // attacker-controlled "<div><div>..." cases.
  const voidElements = new Set([
    "area", "base", "br", "col", "embed", "hr", "img", "input",
    "link", "meta", "param", "source", "track", "wbr",
  ]);
  const length = html.length;
  let openDepth = 0;
  for (let pos = 0; pos < length; pos += 1) {
    if (html.charCodeAt(pos) !== 60 /* '<' */) {
      continue;
    }
    const afterBracket = html.charCodeAt(pos + 1);
    if (afterBracket === 33 /* '!' */ || afterBracket === 63 /* '?' */) {
      continue; // comments/doctype (<!...>) and processing instructions (<?...>)
    }
    let cursor = pos + 1;
    const isClosingTag = html.charCodeAt(cursor) === 47; // '/'
    if (isClosingTag) {
      cursor += 1;
    }
    // Tolerate whitespace between the bracket and the tag name.
    while (cursor < length && html.charCodeAt(cursor) <= 32) {
      cursor += 1;
    }
    const nameStart = cursor;
    while (cursor < length && isHtmlTagNameCharCode(html.charCodeAt(cursor))) {
      cursor += 1;
    }
    const tagName = html.slice(nameStart, cursor).toLowerCase();
    if (!tagName) {
      continue; // stray '<' that does not start a tag
    }
    if (isClosingTag) {
      openDepth = openDepth > 0 ? openDepth - 1 : 0;
      continue;
    }
    // Void and self-closing elements never push nesting depth.
    if (voidElements.has(tagName) || htmlTagLooksSelfClosing(html, cursor)) {
      continue;
    }
    openDepth += 1;
    if (openDepth > maxDepth) {
      return true;
    }
  }
  return false;
}
export async function extractReadableContent(params: {
html: string;
url: string;
@@ -120,6 +217,12 @@ export async function extractReadableContent(params: {
}
return rendered;
};
if (
params.html.length > READABILITY_MAX_HTML_CHARS ||
exceedsEstimatedHtmlNestingDepth(params.html, READABILITY_MAX_ESTIMATED_NESTING_DEPTH)
) {
return fallback();
}
try {
const { Readability, parseHTML } = await loadReadabilityDeps();
const { document } = parseHTML(params.html);

View File

@@ -0,0 +1,66 @@
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import * as ssrf from "../../infra/net/ssrf.js";
import { createWebFetchTool } from "./web-tools.js";
// Avoid dynamic-importing heavy readability deps in this unit test suite.
// Stub extractReadableContent so the suite never dynamic-imports the heavy
// @mozilla/readability deps; everything else is re-exported unchanged.
vi.mock("./web-fetch-utils.js", async () => {
  const actual =
    await vi.importActual<typeof import("./web-fetch-utils.js")>("./web-fetch-utils.js");
  return {
    ...actual,
    extractReadableContent: vi.fn().mockResolvedValue({
      title: "HTML Page",
      text: "HTML Page\n\nContent here.",
    }),
  };
});
// DNS lookup stub wired into the SSRF pinned-hostname resolver in beforeEach.
const lookupMock = vi.fn();
// Capture the real implementation before vi.spyOn replaces it on the namespace.
const resolvePinnedHostname = ssrf.resolvePinnedHostname;
// Minimal tool config: caching and Firecrawl disabled, small response-body cap.
// NOTE(review): 1024 is below the resolver's documented minimum of 32_000, so
// the effective cap is clamped up to 32 KB — the truncation test still works,
// just at the larger limit. Confirm against resolveFetchMaxResponseBytes.
const baseToolConfig = {
  config: {
    tools: {
      web: { fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false }, maxResponseBytes: 1024 } },
    },
  },
} as const;
describe("web_fetch response size limits", () => {
  // Remember the real fetch so each test can restore the global afterwards.
  const originalFetch = global.fetch;
  beforeEach(() => {
    lookupMock.mockResolvedValue([{ address: "93.184.216.34", family: 4 }]);
    // Route SSRF hostname pinning through the stubbed DNS lookup.
    vi.spyOn(ssrf, "resolvePinnedHostname").mockImplementation((hostname) =>
      resolvePinnedHostname(hostname, lookupMock),
    );
  });
  afterEach(() => {
    // @ts-expect-error restore
    global.fetch = originalFetch;
    lookupMock.mockReset();
    vi.restoreAllMocks();
  });
  it("caps response bytes and does not hang on endless streams", async () => {
    // A stream whose pull() re-enqueues the same chunk forever: without a byte
    // cap the tool would buffer it indefinitely.
    const htmlBytes = new TextEncoder().encode("<html><body><div>hi</div></body></html>");
    const endlessBody = new ReadableStream<Uint8Array>({
      pull(controller) {
        controller.enqueue(htmlBytes);
      },
    });
    const mockedResponse = new Response(endlessBody, {
      status: 200,
      headers: { "content-type": "text/html; charset=utf-8" },
    });
    const fetchSpy = vi.fn().mockResolvedValue(mockedResponse);
    // @ts-expect-error mock fetch
    global.fetch = fetchSpy;
    const tool = createWebFetchTool(baseToolConfig);
    const result = await tool?.execute?.("call", { url: "https://example.com/stream" });
    expect(result?.details?.warning).toContain("Response body truncated");
  });
});

View File

@@ -33,8 +33,12 @@ export { extractReadableContent } from "./web-fetch-utils.js";
const EXTRACT_MODES = ["markdown", "text"] as const;
// Default cap on extracted text returned to callers (clamped via maxCharsCap).
const DEFAULT_FETCH_MAX_CHARS = 50_000;
// Default and hard min/max bounds (bytes) for the downloaded-body size cap.
const DEFAULT_FETCH_MAX_RESPONSE_BYTES = 2_000_000;
const FETCH_MAX_RESPONSE_BYTES_MIN = 32_000;
const FETCH_MAX_RESPONSE_BYTES_MAX = 10_000_000;
const DEFAULT_FETCH_MAX_REDIRECTS = 3;
// Caps applied when surfacing error-response bodies in error details.
const DEFAULT_ERROR_MAX_CHARS = 4_000;
const DEFAULT_ERROR_MAX_BYTES = 64_000;
const DEFAULT_FIRECRAWL_BASE_URL = "https://api.firecrawl.dev";
// 48 hours, in milliseconds.
const DEFAULT_FIRECRAWL_MAX_AGE_MS = 172_800_000;
const DEFAULT_FETCH_USER_AGENT =
@@ -108,6 +112,18 @@ function resolveFetchMaxCharsCap(fetch?: WebFetchConfig): number {
return Math.max(100, Math.floor(raw));
}
// Resolve the response-body byte cap from config: non-numeric/non-positive
// values fall back to the default; valid values are floored and clamped into
// the supported [min, max] window.
function resolveFetchMaxResponseBytes(fetch?: WebFetchConfig): number {
  let configured: number | undefined;
  if (fetch && "maxResponseBytes" in fetch && typeof fetch.maxResponseBytes === "number") {
    configured = fetch.maxResponseBytes;
  }
  if (configured === undefined || !Number.isFinite(configured) || configured <= 0) {
    return DEFAULT_FETCH_MAX_RESPONSE_BYTES;
  }
  const floored = Math.floor(configured);
  if (floored < FETCH_MAX_RESPONSE_BYTES_MIN) {
    return FETCH_MAX_RESPONSE_BYTES_MIN;
  }
  if (floored > FETCH_MAX_RESPONSE_BYTES_MAX) {
    return FETCH_MAX_RESPONSE_BYTES_MAX;
  }
  return floored;
}
function resolveFirecrawlConfig(fetch?: WebFetchConfig): FirecrawlFetchConfig {
if (!fetch || typeof fetch !== "object") {
return undefined;
@@ -413,6 +429,7 @@ async function runWebFetch(params: {
url: string;
extractMode: ExtractMode;
maxChars: number;
maxResponseBytes: number;
maxRedirects: number;
timeoutSeconds: number;
cacheTtlMs: number;
@@ -530,7 +547,8 @@ async function runWebFetch(params: {
writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
return payload;
}
const rawDetail = await readResponseText(res);
const rawDetailResult = await readResponseText(res, { maxBytes: DEFAULT_ERROR_MAX_BYTES });
const rawDetail = rawDetailResult.text;
const detail = formatWebFetchErrorDetail({
detail: rawDetail,
contentType: res.headers.get("content-type"),
@@ -542,7 +560,11 @@ async function runWebFetch(params: {
const contentType = res.headers.get("content-type") ?? "application/octet-stream";
const normalizedContentType = normalizeContentType(contentType) ?? "application/octet-stream";
const body = await readResponseText(res);
const bodyResult = await readResponseText(res, { maxBytes: params.maxResponseBytes });
const body = bodyResult.text;
const responseTruncatedWarning = bodyResult.truncated
? `Response body truncated after ${params.maxResponseBytes} bytes.`
: undefined;
let title: string | undefined;
let extractor = "raw";
@@ -593,6 +615,7 @@ async function runWebFetch(params: {
const wrapped = wrapWebFetchContent(text, params.maxChars);
const wrappedTitle = title ? wrapWebFetchField(title) : undefined;
const wrappedWarning = wrapWebFetchField(responseTruncatedWarning);
const payload = {
url: params.url, // Keep raw for tool chaining
finalUrl, // Keep raw
@@ -613,6 +636,7 @@ async function runWebFetch(params: {
fetchedAt: new Date().toISOString(),
tookMs: Date.now() - start,
text: wrapped.text,
warning: wrappedWarning,
};
writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
return payload;
@@ -695,6 +719,7 @@ export function createWebFetchTool(options?: {
const userAgent =
(fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) ||
DEFAULT_FETCH_USER_AGENT;
const maxResponseBytes = resolveFetchMaxResponseBytes(fetch);
return {
label: "Web Fetch",
name: "web_fetch",
@@ -715,6 +740,7 @@ export function createWebFetchTool(options?: {
DEFAULT_FETCH_MAX_CHARS,
maxCharsCap,
),
maxResponseBytes,
maxRedirects: resolveMaxRedirects(fetch?.maxRedirects, DEFAULT_FETCH_MAX_REDIRECTS),
timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS),
cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),

View File

@@ -486,7 +486,8 @@ async function runPerplexitySearch(params: {
});
if (!res.ok) {
const detail = await readResponseText(res);
const detailResult = await readResponseText(res, { maxBytes: 64_000 });
const detail = detailResult.text;
throw new Error(`Perplexity API error (${res.status}): ${detail || res.statusText}`);
}
@@ -535,7 +536,8 @@ async function runGrokSearch(params: {
});
if (!res.ok) {
const detail = await readResponseText(res);
const detailResult = await readResponseText(res, { maxBytes: 64_000 });
const detail = detailResult.text;
throw new Error(`xAI API error (${res.status}): ${detail || res.statusText}`);
}
@@ -665,7 +667,8 @@ async function runWebSearch(params: {
});
if (!res.ok) {
const detail = await readResponseText(res);
const detailResult = await readResponseText(res, { maxBytes: 64_000 });
const detail = detailResult.text;
throw new Error(`Brave Search API error (${res.status}): ${detail || res.statusText}`);
}

View File

@@ -86,10 +86,85 @@ export function withTimeout(signal: AbortSignal | undefined, timeoutMs: number):
return controller.signal;
}
export async function readResponseText(res: Response): Promise<string> {
export type ReadResponseTextResult = {
  /** Decoded body text (possibly truncated at the byte cap). */
  text: string;
  /** True when the body was cut off before being fully consumed. */
  truncated: boolean;
  /** Bytes consumed on the streaming path; approximated as UTF-16 code units on the non-streaming fallback path. */
  bytesRead: number;
};

/**
 * Read a Response body as text, optionally capping how much is consumed.
 *
 * When `options.maxBytes` is a finite positive number and the body exposes a
 * WHATWG reader, the stream is read incrementally and cancelled once the cap
 * is hit, so endless/oversized bodies cannot exhaust memory. Otherwise falls
 * back to `res.text()`; the cap is still enforced on that path by slicing the
 * decoded text (code units are a lower bound on encoded bytes).
 *
 * Never throws: read/decoding failures yield whatever was decoded so far (or
 * an empty result).
 */
export async function readResponseText(
  res: Response,
  options?: { maxBytes?: number },
): Promise<ReadResponseTextResult> {
  const maxBytesRaw = options?.maxBytes;
  // Only finite, positive caps are honored; anything else means "no cap".
  const maxBytes =
    typeof maxBytesRaw === "number" && Number.isFinite(maxBytesRaw) && maxBytesRaw > 0
      ? Math.floor(maxBytesRaw)
      : undefined;
  const body = (res as unknown as { body?: unknown }).body;
  if (
    maxBytes &&
    body &&
    typeof body === "object" &&
    "getReader" in body &&
    typeof (body as { getReader: () => unknown }).getReader === "function"
  ) {
    // Streaming path: decode chunk by chunk, stopping at the byte cap.
    const reader = (body as ReadableStream<Uint8Array>).getReader();
    const decoder = new TextDecoder();
    let bytesRead = 0;
    let truncated = false;
    const parts: string[] = [];
    try {
      while (true) {
        const { value, done } = await reader.read();
        if (done) {
          break;
        }
        if (!value || value.byteLength === 0) {
          continue;
        }
        let chunk = value;
        if (bytesRead + chunk.byteLength > maxBytes) {
          // Keep only the bytes that fit under the cap.
          const remaining = Math.max(0, maxBytes - bytesRead);
          if (remaining <= 0) {
            truncated = true;
            break;
          }
          chunk = chunk.subarray(0, remaining);
          truncated = true;
        }
        bytesRead += chunk.byteLength;
        // stream:true carries partial multi-byte sequences across chunks.
        parts.push(decoder.decode(chunk, { stream: true }));
        if (truncated || bytesRead >= maxBytes) {
          truncated = true;
          break;
        }
      }
    } catch {
      // Best-effort: return whatever we decoded so far.
    } finally {
      if (truncated) {
        // Tell the source we are done so it can stop producing (e.g. endless streams).
        try {
          await reader.cancel();
        } catch {
          // ignore
        }
      }
    }
    // Flush any buffered partial sequence out of the decoder.
    parts.push(decoder.decode());
    return { text: parts.join(""), truncated, bytesRead };
  }
  try {
    const text = await res.text();
    if (maxBytes !== undefined && text.length > maxBytes) {
      // Defense in depth: enforce the cap even when the body exposes no
      // reader. `length` counts UTF-16 code units, which never exceeds the
      // encoded byte count, so this slice is at least as strict in characters
      // while bounding memory; bytesRead is approximate here.
      return { text: text.slice(0, maxBytes), truncated: true, bytesRead: maxBytes };
    }
    return { text, truncated: false, bytesRead: text.length };
  } catch {
    // Body unavailable (already consumed, aborted, ...): report empty rather than throwing.
    return { text: "", truncated: false, bytesRead: 0 };
  }
}