From cdc1ef85e8f64eb66d4706f62dbc398e15fde524 Mon Sep 17 00:00:00 2001
From: Brian Mendonca <bmendonca3@gatech.edu>
Date: Mon, 2 Mar 2026 18:37:07 -0700
Subject: [PATCH] Feishu: cache failing probes (#29970)

* Feishu: cache failing probes

* Changelog: add Feishu probe failure backoff note

---------

Co-authored-by: bmendonca3 <208517100+bmendonca3@users.noreply.github.com>
Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
---
 CHANGELOG.md                        |  1 +
 extensions/feishu/src/probe.test.ts | 58 ++++++++++++------
 extensions/feishu/src/probe.ts      | 94 +++++++++++++++++------------
 3 files changed, 96 insertions(+), 57 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 893cf928c..dd3e22224 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -185,6 +185,7 @@ Docs: https://docs.openclaw.ai
 - Feishu/Inbound ordering: serialize message handling per chat while preserving cross-chat concurrency to avoid same-chat race drops under bursty inbound traffic. (#31807)
 - Feishu/Dedup restart resilience: warm persistent dedup state into memory on monitor startup so retry events after gateway restart stay suppressed without requiring initial on-disk probe misses. (#31605)
 - Feishu/Typing notification suppression: skip typing keepalive reaction re-adds when the indicator is already active, preventing duplicate notification pings from repeated identical emoji adds. (#31580)
+- Feishu/Probe failure backoff: cache API and timeout probe failures for one minute per account key while preserving abort-aware probe timeouts, reducing repeated health-check retries during transient credential/network outages. (#29970)
 - BlueBubbles/Message metadata: harden send response ID extraction, include sender identity in DM context, and normalize inbound `message_id` selection to avoid duplicate ID metadata. (#23970) Thanks @tyler6204.
 - Docker/Image health checks: add Dockerfile `HEALTHCHECK` that probes gateway `GET /healthz` so container runtimes can mark unhealthy instances without requiring auth credentials in the probe command. (#11478) Thanks @U-C4N and @vincentkoc.
 - Docker/Sandbox bootstrap hardening: make `OPENCLAW_SANDBOX` opt-in parsing explicit (`1|true|yes|on`), support custom Docker socket paths via `OPENCLAW_DOCKER_SOCKET`, defer docker.sock exposure until sandbox prerequisites pass, and reset/roll back persisted sandbox mode to `off` when setup is skipped or partially fails to avoid stale broken sandbox state. (#29974) Thanks @jamtujest and @vincentkoc.
diff --git a/extensions/feishu/src/probe.test.ts b/extensions/feishu/src/probe.test.ts
index 521b0b4d6..e46929959 100644
--- a/extensions/feishu/src/probe.test.ts
+++ b/extensions/feishu/src/probe.test.ts
@@ -59,7 +59,7 @@ describe("probeFeishu", () => {
     expect(requestFn).toHaveBeenCalledTimes(1);
   });
 
-  it("uses explicit timeout for bot info request", async () => {
+  it("passes the probe timeout to the Feishu request", async () => {
     const requestFn = setupClient({
       code: 0,
       bot: { bot_name: "TestBot", open_id: "ou_abc123" },
@@ -105,7 +105,6 @@ describe("probeFeishu", () => {
     expect(result).toMatchObject({ ok: false, error: "probe aborted" });
     expect(createFeishuClientMock).not.toHaveBeenCalled();
   });
-
   it("returns cached result on subsequent calls within TTL", async () => {
     const requestFn = setupClient({
       code: 0,
@@ -133,7 +132,7 @@ describe("probeFeishu", () => {
       await probeFeishu(creds);
       expect(requestFn).toHaveBeenCalledTimes(1);
 
-      // Advance time past the 10-minute TTL
+      // Advance time past the success TTL
       vi.advanceTimersByTime(10 * 60 * 1000 + 1);
 
       await probeFeishu(creds);
@@ -143,29 +142,48 @@ describe("probeFeishu", () => {
     }
   });
 
-  it("does not cache failed probe results (API error)", async () => {
-    const requestFn = makeRequestFn({ code: 99, msg: "token expired" });
-    createFeishuClientMock.mockReturnValue({ request: requestFn });
+  it("caches failed probe results (API error) for the error TTL", async () => {
+    vi.useFakeTimers();
+    try {
+      const requestFn = makeRequestFn({ code: 99, msg: "token expired" });
+      createFeishuClientMock.mockReturnValue({ request: requestFn });
 
-    const creds = { appId: "cli_123", appSecret: "secret" };
-    const first = await probeFeishu(creds);
-    expect(first).toMatchObject({ ok: false, error: "API error: token expired" });
+      const creds = { appId: "cli_123", appSecret: "secret" };
+      const first = await probeFeishu(creds);
+      const second = await probeFeishu(creds);
+      expect(first).toMatchObject({ ok: false, error: "API error: token expired" });
+      expect(second).toMatchObject({ ok: false, error: "API error: token expired" });
+      expect(requestFn).toHaveBeenCalledTimes(1);
 
-    // Second call should make a fresh request since failures are not cached
-    await probeFeishu(creds);
-    expect(requestFn).toHaveBeenCalledTimes(2);
+      vi.advanceTimersByTime(60 * 1000 + 1);
+
+      await probeFeishu(creds);
+      expect(requestFn).toHaveBeenCalledTimes(2);
+    } finally {
+      vi.useRealTimers();
+    }
   });
 
-  it("does not cache results when request throws", async () => {
-    const requestFn = vi.fn().mockRejectedValue(new Error("network error"));
-    createFeishuClientMock.mockReturnValue({ request: requestFn });
+  it("caches thrown request errors for the error TTL", async () => {
+    vi.useFakeTimers();
+    try {
+      const requestFn = vi.fn().mockRejectedValue(new Error("network error"));
+      createFeishuClientMock.mockReturnValue({ request: requestFn });
 
-    const creds = { appId: "cli_123", appSecret: "secret" };
-    const first = await probeFeishu(creds);
-    expect(first).toMatchObject({ ok: false, error: "network error" });
+      const creds = { appId: "cli_123", appSecret: "secret" };
+      const first = await probeFeishu(creds);
+      const second = await probeFeishu(creds);
+      expect(first).toMatchObject({ ok: false, error: "network error" });
+      expect(second).toMatchObject({ ok: false, error: "network error" });
+      expect(requestFn).toHaveBeenCalledTimes(1);
 
-    await probeFeishu(creds);
-    expect(requestFn).toHaveBeenCalledTimes(2);
+      vi.advanceTimersByTime(60 * 1000 + 1);
+
+      await probeFeishu(creds);
+      expect(requestFn).toHaveBeenCalledTimes(2);
+    } finally {
+      vi.useRealTimers();
+    }
   });
 
   it("caches per account independently", async () => {
diff --git a/extensions/feishu/src/probe.ts b/extensions/feishu/src/probe.ts
index 31da461f8..e4b8d76f0 100644
--- a/extensions/feishu/src/probe.ts
+++ b/extensions/feishu/src/probe.ts
@@ -2,15 +2,16 @@ import { raceWithTimeoutAndAbort } from "./async.js";
 import { createFeishuClient, type FeishuClientCredentials } from "./client.js";
 import type { FeishuProbeResult } from "./types.js";
 
-/** Cache successful probe results to reduce API calls (bot info is static).
+/** Cache probe results to reduce repeated health-check calls.
  * Gateway health checks call probeFeishu() every minute; without caching this
  * burns ~43,200 calls/month, easily exceeding Feishu's free-tier quota.
- * A 10-min TTL cuts that to ~4,320 calls/month. (#26684) */
+ * Successful bot info is effectively static, while failures are cached briefly
+ * to avoid hammering the API during transient outages. */
 const probeCache = new Map<string, { result: FeishuProbeResult; expiresAt: number }>();
-const PROBE_CACHE_TTL_MS = 10 * 60 * 1000; // 10 minutes
+const PROBE_SUCCESS_TTL_MS = 10 * 60 * 1000; // 10 minutes
+const PROBE_ERROR_TTL_MS = 60 * 1000; // 1 minute
 const MAX_PROBE_CACHE_SIZE = 64;
 export const FEISHU_PROBE_REQUEST_TIMEOUT_MS = 10_000;
-
 export type ProbeFeishuOptions = {
   timeoutMs?: number;
   abortSignal?: AbortSignal;
@@ -23,6 +24,21 @@ type FeishuBotInfoResponse = {
   data?: { bot?: { bot_name?: string; open_id?: string } };
 };
 
+function setCachedProbeResult(
+  cacheKey: string,
+  result: FeishuProbeResult,
+  ttlMs: number,
+): FeishuProbeResult {
+  probeCache.set(cacheKey, { result, expiresAt: Date.now() + ttlMs });
+  if (probeCache.size > MAX_PROBE_CACHE_SIZE) {
+    const oldest = probeCache.keys().next().value;
+    if (oldest !== undefined) {
+      probeCache.delete(oldest);
+    }
+  }
+  return result;
+}
+
 export async function probeFeishu(
   creds?: FeishuClientCredentials,
   options: ProbeFeishuOptions = {},
@@ -78,11 +94,15 @@ export async function probeFeishu(
       };
     }
     if (responseResult.status === "timeout") {
-      return {
-        ok: false,
-        appId: creds.appId,
-        error: `probe timed out after ${timeoutMs}ms`,
-      };
+      return setCachedProbeResult(
+        cacheKey,
+        {
+          ok: false,
+          appId: creds.appId,
+          error: `probe timed out after ${timeoutMs}ms`,
+        },
+        PROBE_ERROR_TTL_MS,
+      );
     }
 
     const response = responseResult.value;
@@ -95,38 +115,38 @@ export async function probeFeishu(
     }
 
     if (response.code !== 0) {
-      return {
-        ok: false,
-        appId: creds.appId,
-        error: `API error: ${response.msg || `code ${response.code}`}`,
-      };
+      return setCachedProbeResult(
+        cacheKey,
+        {
+          ok: false,
+          appId: creds.appId,
+          error: `API error: ${response.msg || `code ${response.code}`}`,
+        },
+        PROBE_ERROR_TTL_MS,
+      );
     }
 
     const bot = response.bot || response.data?.bot;
-    const result: FeishuProbeResult = {
-      ok: true,
-      appId: creds.appId,
-      botName: bot?.bot_name,
-      botOpenId: bot?.open_id,
-    };
-
-    // Cache successful results only
-    probeCache.set(cacheKey, { result, expiresAt: Date.now() + PROBE_CACHE_TTL_MS });
-    // Evict oldest entry if cache exceeds max size
-    if (probeCache.size > MAX_PROBE_CACHE_SIZE) {
-      const oldest = probeCache.keys().next().value;
-      if (oldest !== undefined) {
-        probeCache.delete(oldest);
-      }
-    }
-
-    return result;
+    return setCachedProbeResult(
+      cacheKey,
+      {
+        ok: true,
+        appId: creds.appId,
+        botName: bot?.bot_name,
+        botOpenId: bot?.open_id,
+      },
+      PROBE_SUCCESS_TTL_MS,
+    );
   } catch (err) {
-    return {
-      ok: false,
-      appId: creds.appId,
-      error: err instanceof Error ? err.message : String(err),
-    };
+    return setCachedProbeResult(
+      cacheKey,
+      {
+        ok: false,
+        appId: creds.appId,
+        error: err instanceof Error ? err.message : String(err),
+      },
+      PROBE_ERROR_TTL_MS,
+    );
   }
 }