fix(web_fetch): cap response body before parsing
This commit is contained in:
@@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Gateway/Send: return an actionable error when `send` targets internal-only `webchat`, guiding callers to use `chat.send` or a deliverable channel. (#15703) Thanks @rodrigouroz.
|
||||
- Gateway/Agent: reject malformed `agent:`-prefixed session keys (for example, `agent:main`) in `agent` and `agent.identity.get` instead of silently resolving them to the default agent, preventing accidental cross-session routing. (#15707) Thanks @rodrigouroz.
|
||||
- Gateway/Security: redact sensitive session/path details from `status` responses for non-admin clients; full details remain available to `operator.admin`. (#8590) Thanks @fr33d3m0n.
|
||||
- Web Fetch/Security: cap downloaded response body size before HTML parsing to prevent memory exhaustion from oversized or deeply nested pages. Thanks @xuemian168.
|
||||
- Agents: return an explicit timeout error reply when an embedded run times out before producing any payloads, preventing silent dropped turns during slow cache-refresh transitions. (#16659) Thanks @liaosvcaf and @vignesh07.
|
||||
- Agents/OpenAI: force `store=true` for direct OpenAI Responses/Codex runs to preserve multi-turn server-side conversation state, while leaving proxy/non-OpenAI endpoints unchanged. (#16803) Thanks @mark9232 and @vignesh07.
|
||||
- Agents/Context: apply configured model `contextWindow` overrides after provider discovery so `lookupContextTokens()` honors operator config values (including discovery-failure paths). (#17404) Thanks @michaelbship and @vignesh07.
|
||||
|
||||
@@ -224,6 +224,7 @@ Fetch a URL and extract readable content.
|
||||
enabled: true,
|
||||
maxChars: 50000,
|
||||
maxCharsCap: 50000,
|
||||
maxResponseBytes: 2000000,
|
||||
timeoutSeconds: 30,
|
||||
cacheTtlMinutes: 15,
|
||||
maxRedirects: 3,
|
||||
@@ -256,6 +257,7 @@ Notes:
|
||||
- `web_fetch` sends a Chrome-like User-Agent and `Accept-Language` by default; override `userAgent` if needed.
|
||||
- `web_fetch` blocks private/internal hostnames and re-checks redirects (limit with `maxRedirects`).
|
||||
- `maxChars` is clamped to `tools.web.fetch.maxCharsCap`.
|
||||
- `web_fetch` caps the downloaded response body size to `tools.web.fetch.maxResponseBytes` (default 2,000,000 bytes; values are clamped to the range 32,000–10,000,000) before parsing; oversized responses are truncated and include a warning.
|
||||
- `web_fetch` is best-effort extraction; some sites will need the browser tool.
|
||||
- See [Firecrawl](/tools/firecrawl) for key setup and service details.
|
||||
- Responses are cached (default 15 minutes) to reduce repeated fetches.
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
export type ExtractMode = "markdown" | "text";
|
||||
|
||||
const READABILITY_MAX_HTML_CHARS = 1_000_000;
|
||||
const READABILITY_MAX_ESTIMATED_NESTING_DEPTH = 3_000;
|
||||
|
||||
let readabilityDepsPromise:
|
||||
| Promise<{
|
||||
Readability: typeof import("@mozilla/readability").Readability;
|
||||
@@ -107,6 +110,100 @@ export function truncateText(
|
||||
return { text: value.slice(0, maxChars), truncated: true };
|
||||
}
|
||||
|
||||
/**
 * Cheap, parser-free estimate of whether `html` nests elements deeper than `maxDepth`.
 *
 * Intended as a guard so pathological markup (attacker-controlled "<div><div>..." chains)
 * can be detected before the document is handed to Readability/DOM parsing, where deep
 * nesting can blow up the stack or memory. This is a heuristic, not an HTML parser:
 * comment bodies, script text, and quoted attribute values are scanned like any other text.
 *
 * @param html - Raw markup to scan.
 * @param maxDepth - Largest estimated depth that is still considered acceptable.
 * @returns `true` as soon as the running depth estimate exceeds `maxDepth`, else `false`.
 */
function exceedsEstimatedHtmlNestingDepth(html: string, maxDepth: number): boolean {
  // How far past the tag name we look for the closing ">" when checking for "/>".
  const selfClosingScanWindow = 200;

  // Elements that never contain children; their opening tag must not add depth.
  const childlessTags = new Set([
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
  ]);

  const isTagNameCode = (code: number): boolean => {
    if (code >= 97 && code <= 122) {
      return true; // a-z
    }
    if (code >= 65 && code <= 90) {
      return true; // A-Z
    }
    if (code >= 48 && code <= 57) {
      return true; // 0-9
    }
    return code === 45 || code === 58; // '-' or ':'
  };

  const total = html.length;
  let openCount = 0;

  // Visit every "<" in order; each one is examined independently of the previous tag.
  for (let lt = html.indexOf("<"); lt !== -1; lt = html.indexOf("<", lt + 1)) {
    const marker = html.charCodeAt(lt + 1);
    if (marker === 33 || marker === 63) {
      continue; // "<!" (doctype/comment) or "<?" (processing instruction)
    }

    const isClosing = marker === 47; // "</"
    let cursor = isClosing ? lt + 2 : lt + 1;

    // Tolerate whitespace/control characters between the bracket and the name.
    while (cursor < total && html.charCodeAt(cursor) <= 32) {
      cursor += 1;
    }

    const nameBegin = cursor;
    while (cursor < total && isTagNameCode(html.charCodeAt(cursor))) {
      cursor += 1;
    }
    if (cursor === nameBegin) {
      continue; // No tag name follows, so this "<" is not treated as a tag.
    }

    if (isClosing) {
      // Unbalanced closing tags never push the estimate below zero.
      if (openCount > 0) {
        openCount -= 1;
      }
      continue;
    }

    if (childlessTags.has(html.slice(nameBegin, cursor).toLowerCase())) {
      continue;
    }

    // Best-effort self-closing detection: the first ">" inside a short window
    // must be immediately preceded by "/". A ">" directly after the name has
    // a name character before it, so it can never count as self-closing.
    const lookahead = html.slice(cursor, cursor + selfClosingScanWindow);
    const gt = lookahead.indexOf(">");
    if (gt > 0 && lookahead.charCodeAt(gt - 1) === 47) {
      continue;
    }

    openCount += 1;
    if (openCount > maxDepth) {
      return true;
    }
  }

  return false;
}
|
||||
|
||||
export async function extractReadableContent(params: {
|
||||
html: string;
|
||||
url: string;
|
||||
@@ -120,6 +217,12 @@ export async function extractReadableContent(params: {
|
||||
}
|
||||
return rendered;
|
||||
};
|
||||
if (
|
||||
params.html.length > READABILITY_MAX_HTML_CHARS ||
|
||||
exceedsEstimatedHtmlNestingDepth(params.html, READABILITY_MAX_ESTIMATED_NESTING_DEPTH)
|
||||
) {
|
||||
return fallback();
|
||||
}
|
||||
try {
|
||||
const { Readability, parseHTML } = await loadReadabilityDeps();
|
||||
const { document } = parseHTML(params.html);
|
||||
|
||||
66
src/agents/tools/web-fetch.response-limit.test.ts
Normal file
66
src/agents/tools/web-fetch.response-limit.test.ts
Normal file
@@ -0,0 +1,66 @@
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import * as ssrf from "../../infra/net/ssrf.js";
import { createWebFetchTool } from "./web-tools.js";

// Avoid dynamic-importing heavy readability deps in this unit test suite.
vi.mock("./web-fetch-utils.js", async () => {
  const actual =
    await vi.importActual<typeof import("./web-fetch-utils.js")>("./web-fetch-utils.js");
  return {
    ...actual,
    extractReadableContent: vi.fn().mockResolvedValue({
      title: "HTML Page",
      text: "HTML Page\n\nContent here.",
    }),
  };
});

const lookupMock = vi.fn();
const resolvePinnedHostname = ssrf.resolvePinnedHostname;

// The tool clamps `maxResponseBytes` to a 32_000-byte minimum, so a smaller value here
// would be silently raised. Use the minimum itself so the configured value is the
// limit that is actually enforced.
const TEST_MAX_RESPONSE_BYTES = 32_000;

const baseToolConfig = {
  config: {
    tools: {
      web: {
        fetch: {
          cacheTtlMinutes: 0,
          firecrawl: { enabled: false },
          maxResponseBytes: TEST_MAX_RESPONSE_BYTES,
        },
      },
    },
  },
} as const;

describe("web_fetch response size limits", () => {
  const priorFetch = global.fetch;

  beforeEach(() => {
    // Resolve every hostname to a public address so the SSRF guard lets the request through.
    lookupMock.mockResolvedValue([{ address: "93.184.216.34", family: 4 }]);
    vi.spyOn(ssrf, "resolvePinnedHostname").mockImplementation((hostname) =>
      resolvePinnedHostname(hostname, lookupMock),
    );
  });

  afterEach(() => {
    // @ts-expect-error restore
    global.fetch = priorFetch;
    lookupMock.mockReset();
    vi.restoreAllMocks();
  });

  it("caps response bytes and does not hang on endless streams", async () => {
    // A stream that never closes: every pull yields another copy of the same chunk.
    const chunk = new TextEncoder().encode("<html><body><div>hi</div></body></html>");
    const stream = new ReadableStream<Uint8Array>({
      pull(controller) {
        controller.enqueue(chunk);
      },
    });
    const response = new Response(stream, {
      status: 200,
      headers: { "content-type": "text/html; charset=utf-8" },
    });

    const fetchSpy = vi.fn().mockResolvedValue(response);
    // @ts-expect-error mock fetch
    global.fetch = fetchSpy;

    const tool = createWebFetchTool(baseToolConfig);
    const result = await tool?.execute?.("call", { url: "https://example.com/stream" });

    expect(result?.details?.warning).toContain("Response body truncated");
  });
});
|
||||
@@ -33,8 +33,12 @@ export { extractReadableContent } from "./web-fetch-utils.js";
|
||||
const EXTRACT_MODES = ["markdown", "text"] as const;
|
||||
|
||||
const DEFAULT_FETCH_MAX_CHARS = 50_000;
|
||||
const DEFAULT_FETCH_MAX_RESPONSE_BYTES = 2_000_000;
|
||||
const FETCH_MAX_RESPONSE_BYTES_MIN = 32_000;
|
||||
const FETCH_MAX_RESPONSE_BYTES_MAX = 10_000_000;
|
||||
const DEFAULT_FETCH_MAX_REDIRECTS = 3;
|
||||
const DEFAULT_ERROR_MAX_CHARS = 4_000;
|
||||
const DEFAULT_ERROR_MAX_BYTES = 64_000;
|
||||
const DEFAULT_FIRECRAWL_BASE_URL = "https://api.firecrawl.dev";
|
||||
const DEFAULT_FIRECRAWL_MAX_AGE_MS = 172_800_000;
|
||||
const DEFAULT_FETCH_USER_AGENT =
|
||||
@@ -108,6 +112,18 @@ function resolveFetchMaxCharsCap(fetch?: WebFetchConfig): number {
|
||||
return Math.max(100, Math.floor(raw));
|
||||
}
|
||||
|
||||
/**
 * Resolves the byte cap applied to downloaded `web_fetch` response bodies.
 *
 * Falls back to `DEFAULT_FETCH_MAX_RESPONSE_BYTES` when the config is missing or the
 * configured value is not a positive finite number. Otherwise the value is floored and
 * clamped to `[FETCH_MAX_RESPONSE_BYTES_MIN, FETCH_MAX_RESPONSE_BYTES_MAX]`.
 *
 * @param fetch - Optional `tools.web.fetch` configuration.
 * @returns The effective maximum number of response bytes to read.
 */
function resolveFetchMaxResponseBytes(fetch?: WebFetchConfig): number {
  if (!fetch || !("maxResponseBytes" in fetch)) {
    return DEFAULT_FETCH_MAX_RESPONSE_BYTES;
  }

  const configured = fetch.maxResponseBytes;
  if (typeof configured !== "number" || !Number.isFinite(configured) || configured <= 0) {
    return DEFAULT_FETCH_MAX_RESPONSE_BYTES;
  }

  const wholeBytes = Math.floor(configured);
  if (wholeBytes < FETCH_MAX_RESPONSE_BYTES_MIN) {
    return FETCH_MAX_RESPONSE_BYTES_MIN;
  }
  if (wholeBytes > FETCH_MAX_RESPONSE_BYTES_MAX) {
    return FETCH_MAX_RESPONSE_BYTES_MAX;
  }
  return wholeBytes;
}
|
||||
|
||||
function resolveFirecrawlConfig(fetch?: WebFetchConfig): FirecrawlFetchConfig {
|
||||
if (!fetch || typeof fetch !== "object") {
|
||||
return undefined;
|
||||
@@ -413,6 +429,7 @@ async function runWebFetch(params: {
|
||||
url: string;
|
||||
extractMode: ExtractMode;
|
||||
maxChars: number;
|
||||
maxResponseBytes: number;
|
||||
maxRedirects: number;
|
||||
timeoutSeconds: number;
|
||||
cacheTtlMs: number;
|
||||
@@ -530,7 +547,8 @@ async function runWebFetch(params: {
|
||||
writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
|
||||
return payload;
|
||||
}
|
||||
const rawDetail = await readResponseText(res);
|
||||
const rawDetailResult = await readResponseText(res, { maxBytes: DEFAULT_ERROR_MAX_BYTES });
|
||||
const rawDetail = rawDetailResult.text;
|
||||
const detail = formatWebFetchErrorDetail({
|
||||
detail: rawDetail,
|
||||
contentType: res.headers.get("content-type"),
|
||||
@@ -542,7 +560,11 @@ async function runWebFetch(params: {
|
||||
|
||||
const contentType = res.headers.get("content-type") ?? "application/octet-stream";
|
||||
const normalizedContentType = normalizeContentType(contentType) ?? "application/octet-stream";
|
||||
const body = await readResponseText(res);
|
||||
const bodyResult = await readResponseText(res, { maxBytes: params.maxResponseBytes });
|
||||
const body = bodyResult.text;
|
||||
const responseTruncatedWarning = bodyResult.truncated
|
||||
? `Response body truncated after ${params.maxResponseBytes} bytes.`
|
||||
: undefined;
|
||||
|
||||
let title: string | undefined;
|
||||
let extractor = "raw";
|
||||
@@ -593,6 +615,7 @@ async function runWebFetch(params: {
|
||||
|
||||
const wrapped = wrapWebFetchContent(text, params.maxChars);
|
||||
const wrappedTitle = title ? wrapWebFetchField(title) : undefined;
|
||||
const wrappedWarning = wrapWebFetchField(responseTruncatedWarning);
|
||||
const payload = {
|
||||
url: params.url, // Keep raw for tool chaining
|
||||
finalUrl, // Keep raw
|
||||
@@ -613,6 +636,7 @@ async function runWebFetch(params: {
|
||||
fetchedAt: new Date().toISOString(),
|
||||
tookMs: Date.now() - start,
|
||||
text: wrapped.text,
|
||||
warning: wrappedWarning,
|
||||
};
|
||||
writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
|
||||
return payload;
|
||||
@@ -695,6 +719,7 @@ export function createWebFetchTool(options?: {
|
||||
const userAgent =
|
||||
(fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) ||
|
||||
DEFAULT_FETCH_USER_AGENT;
|
||||
const maxResponseBytes = resolveFetchMaxResponseBytes(fetch);
|
||||
return {
|
||||
label: "Web Fetch",
|
||||
name: "web_fetch",
|
||||
@@ -715,6 +740,7 @@ export function createWebFetchTool(options?: {
|
||||
DEFAULT_FETCH_MAX_CHARS,
|
||||
maxCharsCap,
|
||||
),
|
||||
maxResponseBytes,
|
||||
maxRedirects: resolveMaxRedirects(fetch?.maxRedirects, DEFAULT_FETCH_MAX_REDIRECTS),
|
||||
timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS),
|
||||
cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),
|
||||
|
||||
@@ -486,7 +486,8 @@ async function runPerplexitySearch(params: {
|
||||
});
|
||||
|
||||
if (!res.ok) {
|
||||
const detail = await readResponseText(res);
|
||||
const detailResult = await readResponseText(res, { maxBytes: 64_000 });
|
||||
const detail = detailResult.text;
|
||||
throw new Error(`Perplexity API error (${res.status}): ${detail || res.statusText}`);
|
||||
}
|
||||
|
||||
@@ -535,7 +536,8 @@ async function runGrokSearch(params: {
|
||||
});
|
||||
|
||||
if (!res.ok) {
|
||||
const detail = await readResponseText(res);
|
||||
const detailResult = await readResponseText(res, { maxBytes: 64_000 });
|
||||
const detail = detailResult.text;
|
||||
throw new Error(`xAI API error (${res.status}): ${detail || res.statusText}`);
|
||||
}
|
||||
|
||||
@@ -665,7 +667,8 @@ async function runWebSearch(params: {
|
||||
});
|
||||
|
||||
if (!res.ok) {
|
||||
const detail = await readResponseText(res);
|
||||
const detailResult = await readResponseText(res, { maxBytes: 64_000 });
|
||||
const detail = detailResult.text;
|
||||
throw new Error(`Brave Search API error (${res.status}): ${detail || res.statusText}`);
|
||||
}
|
||||
|
||||
|
||||
@@ -86,10 +86,85 @@ export function withTimeout(signal: AbortSignal | undefined, timeoutMs: number):
|
||||
return controller.signal;
|
||||
}
|
||||
|
||||
export async function readResponseText(res: Response): Promise<string> {
|
||||
/** Outcome of {@link readResponseText}. */
export type ReadResponseTextResult = {
  /** Decoded body text (possibly partial when `truncated` is true or a read failed). */
  text: string;
  /** True only when body data beyond the `maxBytes` limit was discarded. */
  truncated: boolean;
  /**
   * Number of body bytes represented by `text`. On the non-streaming path this is the
   * UTF-8 size of the decoded text, which can differ slightly from the wire size.
   */
  bytesRead: number;
};

/**
 * Reads a fetch `Response` body as text, optionally stopping after `maxBytes` bytes.
 *
 * When `options.maxBytes` resolves to a positive integer and the body exposes
 * `getReader()`, the body is consumed chunk by chunk, decoded as UTF-8, and the stream is
 * cancelled once data beyond the limit is seen. Otherwise the whole body is read with
 * `res.text()` and no limit is applied.
 *
 * Never throws: a failed read yields whatever was decoded so far (streaming path) or an
 * empty result (non-streaming path).
 *
 * @param res - Response whose body should be read.
 * @param options - `maxBytes`: byte cap. Non-numeric, non-finite, or non-positive values
 *   (including values that floor to 0) disable the cap.
 * @returns The decoded text, whether data was dropped, and how many bytes were kept.
 */
export async function readResponseText(
  res: Response,
  options?: { maxBytes?: number },
): Promise<ReadResponseTextResult> {
  const maxBytesRaw = options?.maxBytes;
  const maxBytes =
    typeof maxBytesRaw === "number" && Number.isFinite(maxBytesRaw) && maxBytesRaw > 0
      ? Math.floor(maxBytesRaw)
      : undefined;

  // Duck-type the body so Response-like objects without a stream take the fallback path.
  const body = (res as unknown as { body?: unknown }).body;
  if (
    maxBytes &&
    body &&
    typeof body === "object" &&
    "getReader" in body &&
    typeof (body as { getReader: () => unknown }).getReader === "function"
  ) {
    const reader = (body as ReadableStream<Uint8Array>).getReader();
    const decoder = new TextDecoder();
    let bytesRead = 0;
    let truncated = false;
    const parts: string[] = [];

    try {
      while (true) {
        const { value, done } = await reader.read();
        if (done) {
          break;
        }
        if (!value || value.byteLength === 0) {
          continue;
        }

        const remaining = maxBytes - bytesRead;
        if (remaining <= 0) {
          // The limit was hit exactly by earlier chunks and more data just arrived:
          // only now do we know that something is being dropped.
          truncated = true;
          break;
        }

        let chunk = value;
        if (chunk.byteLength > remaining) {
          chunk = chunk.subarray(0, remaining);
          truncated = true;
        }

        bytesRead += chunk.byteLength;
        parts.push(decoder.decode(chunk, { stream: true }));

        if (truncated) {
          break;
        }
        // When bytesRead === maxBytes we deliberately read once more, so a body that is
        // exactly maxBytes long is not misreported as truncated.
      }
    } catch {
      // Best-effort: return whatever we decoded so far.
    } finally {
      if (truncated) {
        // Stop the download; the rest of the body is not wanted.
        try {
          await reader.cancel();
        } catch {
          // ignore
        }
      }
    }

    if (!truncated) {
      // Flush the decoder. Skipped after truncation because the cut may land inside a
      // multi-byte sequence, and flushing would append a spurious U+FFFD.
      parts.push(decoder.decode());
    }
    return { text: parts.join(""), truncated, bytesRead };
  }

  try {
    const text = await res.text();
    // text.length counts UTF-16 code units; report UTF-8 bytes to match the streaming path.
    return { text, truncated: false, bytesRead: new TextEncoder().encode(text).byteLength };
  } catch {
    return { text: "", truncated: false, bytesRead: 0 };
  }
}
|
||||
|
||||
Reference in New Issue
Block a user