fix(web_fetch): cap response body before parsing

This commit is contained in:
Peter Steinberger
2026-02-16 01:19:04 +01:00
parent fd3d452f1f
commit 166cf6a3e0
7 changed files with 284 additions and 8 deletions

View File

@@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai
- Gateway/Send: return an actionable error when `send` targets internal-only `webchat`, guiding callers to use `chat.send` or a deliverable channel. (#15703) Thanks @rodrigouroz.
- Gateway/Agent: reject malformed `agent:`-prefixed session keys (for example, `agent:main`) in `agent` and `agent.identity.get` instead of silently resolving them to the default agent, preventing accidental cross-session routing. (#15707) Thanks @rodrigouroz.
- Gateway/Security: redact sensitive session/path details from `status` responses for non-admin clients; full details remain available to `operator.admin`. (#8590) Thanks @fr33d3m0n.
- Web Fetch/Security: cap downloaded response body size before HTML parsing to prevent memory exhaustion from oversized or deeply nested pages. Thanks @xuemian168.
- Agents: return an explicit timeout error reply when an embedded run times out before producing any payloads, preventing silent dropped turns during slow cache-refresh transitions. (#16659) Thanks @liaosvcaf and @vignesh07.
- Agents/OpenAI: force `store=true` for direct OpenAI Responses/Codex runs to preserve multi-turn server-side conversation state, while leaving proxy/non-OpenAI endpoints unchanged. (#16803) Thanks @mark9232 and @vignesh07.
- Agents/Context: apply configured model `contextWindow` overrides after provider discovery so `lookupContextTokens()` honors operator config values (including discovery-failure paths). (#17404) Thanks @michaelbship and @vignesh07.

View File

@@ -224,6 +224,7 @@ Fetch a URL and extract readable content.
enabled: true,
maxChars: 50000,
maxCharsCap: 50000,
maxResponseBytes: 2000000,
timeoutSeconds: 30,
cacheTtlMinutes: 15,
maxRedirects: 3,
@@ -256,6 +257,7 @@ Notes:
- `web_fetch` sends a Chrome-like User-Agent and `Accept-Language` by default; override `userAgent` if needed.
- `web_fetch` blocks private/internal hostnames and re-checks redirects (limit with `maxRedirects`).
- `maxChars` is clamped to `tools.web.fetch.maxCharsCap`.
- `web_fetch` caps the downloaded response body size to `tools.web.fetch.maxResponseBytes` before parsing; oversized responses are truncated and include a warning.
- `web_fetch` is best-effort extraction; some sites will need the browser tool.
- See [Firecrawl](/tools/firecrawl) for key setup and service details.
- Responses are cached (default 15 minutes) to reduce repeated fetches.

View File

@@ -1,5 +1,8 @@
// Output format for extracted page content.
export type ExtractMode = "markdown" | "text";
// Guardrails applied before handing HTML to Readability/DOM parsing: documents
// larger than this many chars, or with deeper estimated tag nesting, skip
// Readability and take the cheaper fallback path (see extractReadableContent).
const READABILITY_MAX_HTML_CHARS = 1_000_000;
const READABILITY_MAX_ESTIMATED_NESTING_DEPTH = 3_000;
let readabilityDepsPromise:
| Promise<{
Readability: typeof import("@mozilla/readability").Readability;
@@ -107,6 +110,100 @@ export function truncateText(
return { text: value.slice(0, maxChars), truncated: true };
}
// True for the characters we accept inside a tag name: ASCII letters, digits, ':' and '-'.
function isHtmlTagNameCharCode(code: number): boolean {
  return (
    (code >= 65 && code <= 90) || // A-Z
    (code >= 97 && code <= 122) || // a-z
    (code >= 48 && code <= 57) || // 0-9
    code === 58 || // :
    code === 45 // -
  );
}

// Best-effort self-closing check: peek at most 200 chars ahead for the tag's
// closing '>' and report whether the character immediately before it is '/'.
function htmlTagLooksSelfClosing(html: string, from: number): boolean {
  const windowEnd = Math.min(html.length, from + 200);
  for (let k = from; k < windowEnd; k += 1) {
    if (html.charCodeAt(k) === 62 /* '>' */) {
      return html.charCodeAt(k - 1) === 47; // '/'
    }
  }
  return false;
}

function exceedsEstimatedHtmlNestingDepth(html: string, maxDepth: number): boolean {
  // Cheap heuristic to skip Readability+DOM parsing on pathological HTML (deep
  // nesting => stack/memory blowups). Not an HTML parser; tuned to catch
  // attacker-controlled "<div><div>..." cases.
  const voidElements = new Set([
    "area", "base", "br", "col", "embed", "hr", "img", "input",
    "link", "meta", "param", "source", "track", "wbr",
  ]);
  const length = html.length;
  let openDepth = 0;
  for (let pos = 0; pos < length; pos += 1) {
    if (html.charCodeAt(pos) !== 60 /* '<' */) {
      continue;
    }
    const afterBracket = html.charCodeAt(pos + 1);
    if (afterBracket === 33 /* '!' */ || afterBracket === 63 /* '?' */) {
      continue; // comments/doctype (<!...>) and processing instructions (<?...>)
    }
    let cursor = pos + 1;
    const isClosingTag = html.charCodeAt(cursor) === 47; // '/'
    if (isClosingTag) {
      cursor += 1;
    }
    // Tolerate whitespace between the bracket and the tag name.
    while (cursor < length && html.charCodeAt(cursor) <= 32) {
      cursor += 1;
    }
    const nameStart = cursor;
    while (cursor < length && isHtmlTagNameCharCode(html.charCodeAt(cursor))) {
      cursor += 1;
    }
    const tagName = html.slice(nameStart, cursor).toLowerCase();
    if (!tagName) {
      continue; // stray '<' that does not start a tag
    }
    if (isClosingTag) {
      openDepth = openDepth > 0 ? openDepth - 1 : 0;
      continue;
    }
    // Void and self-closing elements never push nesting depth.
    if (voidElements.has(tagName) || htmlTagLooksSelfClosing(html, cursor)) {
      continue;
    }
    openDepth += 1;
    if (openDepth > maxDepth) {
      return true;
    }
  }
  return false;
}
export async function extractReadableContent(params: {
html: string;
url: string;
@@ -120,6 +217,12 @@ export async function extractReadableContent(params: {
}
return rendered;
};
if (
params.html.length > READABILITY_MAX_HTML_CHARS ||
exceedsEstimatedHtmlNestingDepth(params.html, READABILITY_MAX_ESTIMATED_NESTING_DEPTH)
) {
return fallback();
}
try {
const { Readability, parseHTML } = await loadReadabilityDeps();
const { document } = parseHTML(params.html);

View File

@@ -0,0 +1,66 @@
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import * as ssrf from "../../infra/net/ssrf.js";
import { createWebFetchTool } from "./web-tools.js";
// Avoid dynamic-importing heavy readability deps in this unit test suite.
// Stub extractReadableContent so the suite never dynamic-imports the heavy
// @mozilla/readability deps; everything else is re-exported unchanged.
vi.mock("./web-fetch-utils.js", async () => {
  const actual =
    await vi.importActual<typeof import("./web-fetch-utils.js")>("./web-fetch-utils.js");
  return {
    ...actual,
    extractReadableContent: vi.fn().mockResolvedValue({
      title: "HTML Page",
      text: "HTML Page\n\nContent here.",
    }),
  };
});
// DNS lookup stub wired into the SSRF pinned-hostname resolver in beforeEach.
const lookupMock = vi.fn();
// Capture the real implementation before vi.spyOn replaces it on the namespace.
const resolvePinnedHostname = ssrf.resolvePinnedHostname;
// Minimal tool config: caching and Firecrawl disabled, small response-body cap.
// NOTE(review): 1024 is below the resolver's documented minimum of 32_000, so
// the effective cap is clamped up to 32 KB — the truncation test still works,
// just at the larger limit. Confirm against resolveFetchMaxResponseBytes.
const baseToolConfig = {
  config: {
    tools: {
      web: { fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false }, maxResponseBytes: 1024 } },
    },
  },
} as const;
describe("web_fetch response size limits", () => {
  // Remember the real fetch so each test can restore the global afterwards.
  const originalFetch = global.fetch;
  beforeEach(() => {
    lookupMock.mockResolvedValue([{ address: "93.184.216.34", family: 4 }]);
    // Route SSRF hostname pinning through the stubbed DNS lookup.
    vi.spyOn(ssrf, "resolvePinnedHostname").mockImplementation((hostname) =>
      resolvePinnedHostname(hostname, lookupMock),
    );
  });
  afterEach(() => {
    // @ts-expect-error restore
    global.fetch = originalFetch;
    lookupMock.mockReset();
    vi.restoreAllMocks();
  });
  it("caps response bytes and does not hang on endless streams", async () => {
    // A stream whose pull() re-enqueues the same chunk forever: without a byte
    // cap the tool would buffer it indefinitely.
    const htmlBytes = new TextEncoder().encode("<html><body><div>hi</div></body></html>");
    const endlessBody = new ReadableStream<Uint8Array>({
      pull(controller) {
        controller.enqueue(htmlBytes);
      },
    });
    const mockedResponse = new Response(endlessBody, {
      status: 200,
      headers: { "content-type": "text/html; charset=utf-8" },
    });
    const fetchSpy = vi.fn().mockResolvedValue(mockedResponse);
    // @ts-expect-error mock fetch
    global.fetch = fetchSpy;
    const tool = createWebFetchTool(baseToolConfig);
    const result = await tool?.execute?.("call", { url: "https://example.com/stream" });
    expect(result?.details?.warning).toContain("Response body truncated");
  });
});

View File

@@ -33,8 +33,12 @@ export { extractReadableContent } from "./web-fetch-utils.js";
const EXTRACT_MODES = ["markdown", "text"] as const;
// Default cap on extracted text returned to callers (clamped via maxCharsCap).
const DEFAULT_FETCH_MAX_CHARS = 50_000;
// Default and hard min/max bounds (bytes) for the downloaded-body size cap.
const DEFAULT_FETCH_MAX_RESPONSE_BYTES = 2_000_000;
const FETCH_MAX_RESPONSE_BYTES_MIN = 32_000;
const FETCH_MAX_RESPONSE_BYTES_MAX = 10_000_000;
const DEFAULT_FETCH_MAX_REDIRECTS = 3;
// Caps applied when surfacing error-response bodies in error details.
const DEFAULT_ERROR_MAX_CHARS = 4_000;
const DEFAULT_ERROR_MAX_BYTES = 64_000;
const DEFAULT_FIRECRAWL_BASE_URL = "https://api.firecrawl.dev";
// 48 hours, in milliseconds.
const DEFAULT_FIRECRAWL_MAX_AGE_MS = 172_800_000;
const DEFAULT_FETCH_USER_AGENT =
@@ -108,6 +112,18 @@ function resolveFetchMaxCharsCap(fetch?: WebFetchConfig): number {
return Math.max(100, Math.floor(raw));
}
// Resolve the response-body byte cap from config: non-numeric/non-positive
// values fall back to the default; valid values are floored and clamped into
// the supported [min, max] window.
function resolveFetchMaxResponseBytes(fetch?: WebFetchConfig): number {
  let configured: number | undefined;
  if (fetch && "maxResponseBytes" in fetch && typeof fetch.maxResponseBytes === "number") {
    configured = fetch.maxResponseBytes;
  }
  if (configured === undefined || !Number.isFinite(configured) || configured <= 0) {
    return DEFAULT_FETCH_MAX_RESPONSE_BYTES;
  }
  const floored = Math.floor(configured);
  if (floored < FETCH_MAX_RESPONSE_BYTES_MIN) {
    return FETCH_MAX_RESPONSE_BYTES_MIN;
  }
  if (floored > FETCH_MAX_RESPONSE_BYTES_MAX) {
    return FETCH_MAX_RESPONSE_BYTES_MAX;
  }
  return floored;
}
function resolveFirecrawlConfig(fetch?: WebFetchConfig): FirecrawlFetchConfig {
if (!fetch || typeof fetch !== "object") {
return undefined;
@@ -413,6 +429,7 @@ async function runWebFetch(params: {
url: string;
extractMode: ExtractMode;
maxChars: number;
maxResponseBytes: number;
maxRedirects: number;
timeoutSeconds: number;
cacheTtlMs: number;
@@ -530,7 +547,8 @@ async function runWebFetch(params: {
writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
return payload;
}
const rawDetail = await readResponseText(res);
const rawDetailResult = await readResponseText(res, { maxBytes: DEFAULT_ERROR_MAX_BYTES });
const rawDetail = rawDetailResult.text;
const detail = formatWebFetchErrorDetail({
detail: rawDetail,
contentType: res.headers.get("content-type"),
@@ -542,7 +560,11 @@ async function runWebFetch(params: {
const contentType = res.headers.get("content-type") ?? "application/octet-stream";
const normalizedContentType = normalizeContentType(contentType) ?? "application/octet-stream";
const body = await readResponseText(res);
const bodyResult = await readResponseText(res, { maxBytes: params.maxResponseBytes });
const body = bodyResult.text;
const responseTruncatedWarning = bodyResult.truncated
? `Response body truncated after ${params.maxResponseBytes} bytes.`
: undefined;
let title: string | undefined;
let extractor = "raw";
@@ -593,6 +615,7 @@ async function runWebFetch(params: {
const wrapped = wrapWebFetchContent(text, params.maxChars);
const wrappedTitle = title ? wrapWebFetchField(title) : undefined;
const wrappedWarning = wrapWebFetchField(responseTruncatedWarning);
const payload = {
url: params.url, // Keep raw for tool chaining
finalUrl, // Keep raw
@@ -613,6 +636,7 @@ async function runWebFetch(params: {
fetchedAt: new Date().toISOString(),
tookMs: Date.now() - start,
text: wrapped.text,
warning: wrappedWarning,
};
writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
return payload;
@@ -695,6 +719,7 @@ export function createWebFetchTool(options?: {
const userAgent =
(fetch && "userAgent" in fetch && typeof fetch.userAgent === "string" && fetch.userAgent) ||
DEFAULT_FETCH_USER_AGENT;
const maxResponseBytes = resolveFetchMaxResponseBytes(fetch);
return {
label: "Web Fetch",
name: "web_fetch",
@@ -715,6 +740,7 @@ export function createWebFetchTool(options?: {
DEFAULT_FETCH_MAX_CHARS,
maxCharsCap,
),
maxResponseBytes,
maxRedirects: resolveMaxRedirects(fetch?.maxRedirects, DEFAULT_FETCH_MAX_REDIRECTS),
timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS),
cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),

View File

@@ -486,7 +486,8 @@ async function runPerplexitySearch(params: {
});
if (!res.ok) {
const detail = await readResponseText(res);
const detailResult = await readResponseText(res, { maxBytes: 64_000 });
const detail = detailResult.text;
throw new Error(`Perplexity API error (${res.status}): ${detail || res.statusText}`);
}
@@ -535,7 +536,8 @@ async function runGrokSearch(params: {
});
if (!res.ok) {
const detail = await readResponseText(res);
const detailResult = await readResponseText(res, { maxBytes: 64_000 });
const detail = detailResult.text;
throw new Error(`xAI API error (${res.status}): ${detail || res.statusText}`);
}
@@ -665,7 +667,8 @@ async function runWebSearch(params: {
});
if (!res.ok) {
const detail = await readResponseText(res);
const detailResult = await readResponseText(res, { maxBytes: 64_000 });
const detail = detailResult.text;
throw new Error(`Brave Search API error (${res.status}): ${detail || res.statusText}`);
}

View File

@@ -86,10 +86,85 @@ export function withTimeout(signal: AbortSignal | undefined, timeoutMs: number):
return controller.signal;
}
export async function readResponseText(res: Response): Promise<string> {
export type ReadResponseTextResult = {
  /** Decoded body text (possibly truncated at the byte cap). */
  text: string;
  /** True when the body was cut off before being fully consumed. */
  truncated: boolean;
  /** Bytes consumed on the streaming path; approximated as UTF-16 code units on the non-streaming fallback path. */
  bytesRead: number;
};

/**
 * Read a Response body as text, optionally capping how much is consumed.
 *
 * When `options.maxBytes` is a finite positive number and the body exposes a
 * WHATWG reader, the stream is read incrementally and cancelled once the cap
 * is hit, so endless/oversized bodies cannot exhaust memory. Otherwise falls
 * back to `res.text()`; the cap is still enforced on that path by slicing the
 * decoded text (code units are a lower bound on encoded bytes).
 *
 * Never throws: read/decoding failures yield whatever was decoded so far (or
 * an empty result).
 */
export async function readResponseText(
  res: Response,
  options?: { maxBytes?: number },
): Promise<ReadResponseTextResult> {
  const maxBytesRaw = options?.maxBytes;
  // Only finite, positive caps are honored; anything else means "no cap".
  const maxBytes =
    typeof maxBytesRaw === "number" && Number.isFinite(maxBytesRaw) && maxBytesRaw > 0
      ? Math.floor(maxBytesRaw)
      : undefined;
  const body = (res as unknown as { body?: unknown }).body;
  if (
    maxBytes &&
    body &&
    typeof body === "object" &&
    "getReader" in body &&
    typeof (body as { getReader: () => unknown }).getReader === "function"
  ) {
    // Streaming path: decode chunk by chunk, stopping at the byte cap.
    const reader = (body as ReadableStream<Uint8Array>).getReader();
    const decoder = new TextDecoder();
    let bytesRead = 0;
    let truncated = false;
    const parts: string[] = [];
    try {
      while (true) {
        const { value, done } = await reader.read();
        if (done) {
          break;
        }
        if (!value || value.byteLength === 0) {
          continue;
        }
        let chunk = value;
        if (bytesRead + chunk.byteLength > maxBytes) {
          // Keep only the bytes that fit under the cap.
          const remaining = Math.max(0, maxBytes - bytesRead);
          if (remaining <= 0) {
            truncated = true;
            break;
          }
          chunk = chunk.subarray(0, remaining);
          truncated = true;
        }
        bytesRead += chunk.byteLength;
        // stream:true carries partial multi-byte sequences across chunks.
        parts.push(decoder.decode(chunk, { stream: true }));
        if (truncated || bytesRead >= maxBytes) {
          truncated = true;
          break;
        }
      }
    } catch {
      // Best-effort: return whatever we decoded so far.
    } finally {
      if (truncated) {
        // Tell the source we are done so it can stop producing (e.g. endless streams).
        try {
          await reader.cancel();
        } catch {
          // ignore
        }
      }
    }
    // Flush any buffered partial sequence out of the decoder.
    parts.push(decoder.decode());
    return { text: parts.join(""), truncated, bytesRead };
  }
  try {
    const text = await res.text();
    if (maxBytes !== undefined && text.length > maxBytes) {
      // Defense in depth: enforce the cap even when the body exposes no
      // reader. `length` counts UTF-16 code units, which never exceeds the
      // encoded byte count, so this slice is at least as strict in characters
      // while bounding memory; bytesRead is approximate here.
      return { text: text.slice(0, maxBytes), truncated: true, bytesRead: maxBytes };
    }
    return { text, truncated: false, bytesRead: text.length };
  } catch {
    // Body unavailable (already consumed, aborted, ...): report empty rather than throwing.
    return { text: "", truncated: false, bytesRead: 0 };
  }
}