diff --git a/CHANGELOG.md b/CHANGELOG.md index 4179961a1..0859cdaf6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai - Gateway/Send: return an actionable error when `send` targets internal-only `webchat`, guiding callers to use `chat.send` or a deliverable channel. (#15703) Thanks @rodrigouroz. - Gateway/Agent: reject malformed `agent:`-prefixed session keys (for example, `agent:main`) in `agent` and `agent.identity.get` instead of silently resolving them to the default agent, preventing accidental cross-session routing. (#15707) Thanks @rodrigouroz. - Gateway/Security: redact sensitive session/path details from `status` responses for non-admin clients; full details remain available to `operator.admin`. (#8590) Thanks @fr33d3m0n. +- Web Fetch/Security: cap downloaded response body size before HTML parsing to prevent memory exhaustion from oversized or deeply nested pages. Thanks @xuemian168. - Agents: return an explicit timeout error reply when an embedded run times out before producing any payloads, preventing silent dropped turns during slow cache-refresh transitions. (#16659) Thanks @liaosvcaf and @vignesh07. - Agents/OpenAI: force `store=true` for direct OpenAI Responses/Codex runs to preserve multi-turn server-side conversation state, while leaving proxy/non-OpenAI endpoints unchanged. (#16803) Thanks @mark9232 and @vignesh07. - Agents/Context: apply configured model `contextWindow` overrides after provider discovery so `lookupContextTokens()` honors operator config values (including discovery-failure paths). (#17404) Thanks @michaelbship and @vignesh07. diff --git a/docs/tools/web.md b/docs/tools/web.md index 859e6144c..b0e295cd2 100644 --- a/docs/tools/web.md +++ b/docs/tools/web.md @@ -224,6 +224,7 @@ Fetch a URL and extract readable content. enabled: true, maxChars: 50000, maxCharsCap: 50000, + maxResponseBytes: 2000000, timeoutSeconds: 30, cacheTtlMinutes: 15, maxRedirects: 3, @@ -256,6 +257,7 @@ Notes: - `web_fetch` sends a Chrome-like User-Agent and `Accept-Language` by default; override `userAgent` if needed. - `web_fetch` blocks private/internal hostnames and re-checks redirects (limit with `maxRedirects`). - `maxChars` is clamped to `tools.web.fetch.maxCharsCap`. +- `web_fetch` caps the downloaded response body size to `tools.web.fetch.maxResponseBytes` before parsing; oversized responses are truncated and include a warning. - `web_fetch` is best-effort extraction; some sites will need the browser tool. - See [Firecrawl](/tools/firecrawl) for key setup and service details. - Responses are cached (default 15 minutes) to reduce repeated fetches. diff --git a/src/agents/tools/web-fetch-utils.ts b/src/agents/tools/web-fetch-utils.ts index 09716e2cd..a9ef9d5ba 100644 --- a/src/agents/tools/web-fetch-utils.ts +++ b/src/agents/tools/web-fetch-utils.ts @@ -1,5 +1,8 @@ export type ExtractMode = "markdown" | "text"; +const READABILITY_MAX_HTML_CHARS = 1_000_000; +const READABILITY_MAX_ESTIMATED_NESTING_DEPTH = 3_000; + let readabilityDepsPromise: | Promise<{ Readability: typeof import("@mozilla/readability").Readability; @@ -107,6 +110,100 @@ export function truncateText( return { text: value.slice(0, maxChars), truncated: true }; } +function exceedsEstimatedHtmlNestingDepth(html: string, maxDepth: number): boolean { + // Cheap heuristic to skip Readability+DOM parsing on pathological HTML (deep nesting => stack/memory blowups). + // Not an HTML parser; tuned to catch attacker-controlled "
..." cases. + const voidTags = new Set([ + "area", + "base", + "br", + "col", + "embed", + "hr", + "img", + "input", + "link", + "meta", + "param", + "source", + "track", + "wbr", + ]); + + let depth = 0; + const len = html.length; + for (let i = 0; i < len; i++) { + if (html.charCodeAt(i) !== 60) { + continue; // '<' + } + const next = html.charCodeAt(i + 1); + if (next === 33 || next === 63) { + continue; // or + } + + let j = i + 1; + let closing = false; + if (html.charCodeAt(j) === 47) { + closing = true; + j += 1; + } + + while (j < len && html.charCodeAt(j) <= 32) { + j += 1; + } + + const nameStart = j; + while (j < len) { + const c = html.charCodeAt(j); + const isNameChar = + (c >= 65 && c <= 90) || // A-Z + (c >= 97 && c <= 122) || // a-z + (c >= 48 && c <= 57) || // 0-9 + c === 58 || // : + c === 45; // - + if (!isNameChar) { + break; + } + j += 1; + } + + const tagName = html.slice(nameStart, j).toLowerCase(); + if (!tagName) { + continue; + } + + if (closing) { + depth = Math.max(0, depth - 1); + continue; + } + + if (voidTags.has(tagName)) { + continue; + } + + // Best-effort self-closing detection: scan a short window for "/>". + let selfClosing = false; + for (let k = j; k < len && k < j + 200; k++) { + const c = html.charCodeAt(k); + if (c === 62) { + if (html.charCodeAt(k - 1) === 47) { + selfClosing = true; + } + break; + } + } + if (selfClosing) { + continue; + } + + depth += 1; + if (depth > maxDepth) { + return true; + } + } + return false; +} + export async function extractReadableContent(params: { html: string; url: string; @@ -120,6 +217,12 @@ export async function extractReadableContent(params: { } return rendered; }; + if ( + params.html.length > READABILITY_MAX_HTML_CHARS || + exceedsEstimatedHtmlNestingDepth(params.html, READABILITY_MAX_ESTIMATED_NESTING_DEPTH) + ) { + return fallback(); + } try { const { Readability, parseHTML } = await loadReadabilityDeps(); const { document } = parseHTML(params.html); diff --git a/src/agents/tools/web-fetch.response-limit.test.ts b/src/agents/tools/web-fetch.response-limit.test.ts new file mode 100644 index 000000000..2755fd0b1 --- /dev/null +++ b/src/agents/tools/web-fetch.response-limit.test.ts @@ -0,0 +1,66 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import * as ssrf from "../../infra/net/ssrf.js"; +import { createWebFetchTool } from "./web-tools.js"; + +// Avoid dynamic-importing heavy readability deps in this unit test suite. +vi.mock("./web-fetch-utils.js", async () => { + const actual = + await vi.importActual("./web-fetch-utils.js"); + return { + ...actual, + extractReadableContent: vi.fn().mockResolvedValue({ + title: "HTML Page", + text: "HTML Page\n\nContent here.", + }), + }; +}); + +const lookupMock = vi.fn(); +const resolvePinnedHostname = ssrf.resolvePinnedHostname; +const baseToolConfig = { + config: { + tools: { + web: { fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false }, maxResponseBytes: 1024 } }, + }, + }, +} as const; + +describe("web_fetch response size limits", () => { + const priorFetch = global.fetch; + + beforeEach(() => { + lookupMock.mockResolvedValue([{ address: "93.184.216.34", family: 4 }]); + vi.spyOn(ssrf, "resolvePinnedHostname").mockImplementation((hostname) => + resolvePinnedHostname(hostname, lookupMock), + ); + }); + + afterEach(() => { + // @ts-expect-error restore + global.fetch = priorFetch; + lookupMock.mockReset(); + vi.restoreAllMocks(); + }); + + it("caps response bytes and does not hang on endless streams", async () => { + const chunk = new TextEncoder().encode("
hi
"); + const stream = new ReadableStream({ + pull(controller) { + controller.enqueue(chunk); + }, + }); + const response = new Response(stream, { + status: 200, + headers: { "content-type": "text/html; charset=utf-8" }, + }); + + const fetchSpy = vi.fn().mockResolvedValue(response); + // @ts-expect-error mock fetch + global.fetch = fetchSpy; + + const tool = createWebFetchTool(baseToolConfig); + const result = await tool?.execute?.("call", { url: "https://example.com/stream" }); + + expect(result?.details?.warning).toContain("Response body truncated"); + }); +}); diff --git a/src/agents/tools/web-fetch.ts b/src/agents/tools/web-fetch.ts index a703aa54f..b92fec9db 100644 --- a/src/agents/tools/web-fetch.ts +++ b/src/agents/tools/web-fetch.ts @@ -33,8 +33,12 @@ export { extractReadableContent } from "./web-fetch-utils.js"; const EXTRACT_MODES = ["markdown", "text"] as const; const DEFAULT_FETCH_MAX_CHARS = 50_000; +const DEFAULT_FETCH_MAX_RESPONSE_BYTES = 2_000_000; +const FETCH_MAX_RESPONSE_BYTES_MIN = 32_000; +const FETCH_MAX_RESPONSE_BYTES_MAX = 10_000_000; const DEFAULT_FETCH_MAX_REDIRECTS = 3; const DEFAULT_ERROR_MAX_CHARS = 4_000; +const DEFAULT_ERROR_MAX_BYTES = 64_000; const DEFAULT_FIRECRAWL_BASE_URL = "https://api.firecrawl.dev"; const DEFAULT_FIRECRAWL_MAX_AGE_MS = 172_800_000; const DEFAULT_FETCH_USER_AGENT = @@ -108,6 +112,18 @@ function resolveFetchMaxCharsCap(fetch?: WebFetchConfig): number { return Math.max(100, Math.floor(raw)); } +function resolveFetchMaxResponseBytes(fetch?: WebFetchConfig): number { + const raw = + fetch && "maxResponseBytes" in fetch && typeof fetch.maxResponseBytes === "number" + ? fetch.maxResponseBytes + : undefined; + if (typeof raw !== "number" || !Number.isFinite(raw) || raw <= 0) { + return DEFAULT_FETCH_MAX_RESPONSE_BYTES; + } + const value = Math.floor(raw); + return Math.min(FETCH_MAX_RESPONSE_BYTES_MAX, Math.max(FETCH_MAX_RESPONSE_BYTES_MIN, value)); +} + function resolveFirecrawlConfig(fetch?: WebFetchConfig): FirecrawlFetchConfig { if (!fetch || typeof fetch !== "object") { return undefined; @@ -413,6 +429,7 @@ async function runWebFetch(params: { url: string; extractMode: ExtractMode; maxChars: number; + maxResponseBytes: number; maxRedirects: number; timeoutSeconds: number; cacheTtlMs: number; @@ -530,7 +547,8 @@ async function runWebFetch(params: { writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs); return payload; } - const rawDetail = await readResponseText(res); + const rawDetailResult = await readResponseText(res, { maxBytes: DEFAULT_ERROR_MAX_BYTES }); + const rawDetail = rawDetailResult.text; const detail = formatWebFetchErrorDetail({ detail: rawDetail, contentType: res.headers.get("content-type"), @@ -542,7 +560,11 @@ async function runWebFetch(params: { const contentType = res.headers.get("content-type") ?? "application/octet-stream"; const normalizedContentType = normalizeContentType(contentType) ?? "application/octet-stream"; - const body = await readResponseText(res); + const bodyResult = await readResponseText(res, { maxBytes: params.maxResponseBytes }); + const body = bodyResult.text; + const responseTruncatedWarning = bodyResult.truncated + ? `Response body truncated after ${params.maxResponseBytes} bytes.` + : undefined; let title: string | undefined; let extractor = "raw"; @@ -593,6 +615,7 @@ async function runWebFetch(params: { const wrapped = wrapWebFetchContent(text, params.maxChars); const wrappedTitle = title ? wrapWebFetchField(title) : undefined; + const wrappedWarning = wrapWebFetchField(responseTruncatedWarning); const payload = { url: params.url, // Keep raw for tool chaining finalUrl, // Keep raw @@ -613,6 +636,7 @@ async function runWebFetch(params: { fetchedAt: new Date().toISOString(), tookMs: Date.now() - start, text: wrapped.text, + warning: wrappedWarning, }; writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs); return payload; @@ -695,6 +719,7 @@ export function createWebFetchTool(options?: { const userAgent = (fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) || DEFAULT_FETCH_USER_AGENT; + const maxResponseBytes = resolveFetchMaxResponseBytes(fetch); return { label: "Web Fetch", name: "web_fetch", @@ -715,6 +740,7 @@ export function createWebFetchTool(options?: { DEFAULT_FETCH_MAX_CHARS, maxCharsCap, ), + maxResponseBytes, maxRedirects: resolveMaxRedirects(fetch?.maxRedirects, DEFAULT_FETCH_MAX_REDIRECTS), timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS), cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES), diff --git a/src/agents/tools/web-search.ts b/src/agents/tools/web-search.ts index f2e059f43..be174b951 100644 --- a/src/agents/tools/web-search.ts +++ b/src/agents/tools/web-search.ts @@ -486,7 +486,8 @@ async function runPerplexitySearch(params: { }); if (!res.ok) { - const detail = await readResponseText(res); + const detailResult = await readResponseText(res, { maxBytes: 64_000 }); + const detail = detailResult.text; throw new Error(`Perplexity API error (${res.status}): ${detail || res.statusText}`); } @@ -535,7 +536,8 @@ async function runGrokSearch(params: { }); if (!res.ok) { - const detail = await readResponseText(res); + const detailResult = await readResponseText(res, { maxBytes: 64_000 }); + const detail = detailResult.text; throw new Error(`xAI API error (${res.status}): ${detail || res.statusText}`); } @@ -665,7 +667,8 @@ async function runWebSearch(params: { }); if (!res.ok) { - const detail = await readResponseText(res); + const detailResult = await readResponseText(res, { maxBytes: 64_000 }); + const detail = detailResult.text; throw new Error(`Brave Search API error (${res.status}): ${detail || res.statusText}`); } diff --git a/src/agents/tools/web-shared.ts b/src/agents/tools/web-shared.ts index 2a7353796..da0fbb38b 100644 --- a/src/agents/tools/web-shared.ts +++ b/src/agents/tools/web-shared.ts @@ -86,10 +86,85 @@ export function withTimeout(signal: AbortSignal | undefined, timeoutMs: number): return controller.signal; } -export async function readResponseText(res: Response): Promise { +export type ReadResponseTextResult = { + text: string; + truncated: boolean; + bytesRead: number; +}; + +export async function readResponseText( + res: Response, + options?: { maxBytes?: number }, +): Promise { + const maxBytesRaw = options?.maxBytes; + const maxBytes = + typeof maxBytesRaw === "number" && Number.isFinite(maxBytesRaw) && maxBytesRaw > 0 + ? Math.floor(maxBytesRaw) + : undefined; + + const body = (res as unknown as { body?: unknown }).body; + if ( + maxBytes && + body && + typeof body === "object" && + "getReader" in body && + typeof (body as { getReader: () => unknown }).getReader === "function" + ) { + const reader = (body as ReadableStream).getReader(); + const decoder = new TextDecoder(); + let bytesRead = 0; + let truncated = false; + const parts: string[] = []; + + try { + while (true) { + const { value, done } = await reader.read(); + if (done) { + break; + } + if (!value || value.byteLength === 0) { + continue; + } + + let chunk = value; + if (bytesRead + chunk.byteLength > maxBytes) { + const remaining = Math.max(0, maxBytes - bytesRead); + if (remaining <= 0) { + truncated = true; + break; + } + chunk = chunk.subarray(0, remaining); + truncated = true; + } + + bytesRead += chunk.byteLength; + parts.push(decoder.decode(chunk, { stream: true })); + + if (truncated || bytesRead >= maxBytes) { + truncated = true; + break; + } + } + } catch { + // Best-effort: return whatever we decoded so far. + } finally { + if (truncated) { + try { + await reader.cancel(); + } catch { + // ignore + } + } + } + + parts.push(decoder.decode()); + return { text: parts.join(""), truncated, bytesRead }; + } + try { - return await res.text(); + const text = await res.text(); + return { text, truncated: false, bytesRead: text.length }; } catch { - return ""; + return { text: "", truncated: false, bytesRead: 0 }; } }