From cdc1ef85e8f64eb66d4706f62dbc398e15fde524 Mon Sep 17 00:00:00 2001 From: Brian Mendonca Date: Mon, 2 Mar 2026 18:37:07 -0700 Subject: [PATCH] Feishu: cache failing probes (#29970) * Feishu: cache failing probes * Changelog: add Feishu probe failure backoff note --------- Co-authored-by: bmendonca3 <208517100+bmendonca3@users.noreply.github.com> Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com> --- CHANGELOG.md | 1 + extensions/feishu/src/probe.test.ts | 58 ++++++++++++------ extensions/feishu/src/probe.ts | 94 +++++++++++++++++------------ 3 files changed, 96 insertions(+), 57 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 893cf928c..dd3e22224 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -185,6 +185,7 @@ Docs: https://docs.openclaw.ai - Feishu/Inbound ordering: serialize message handling per chat while preserving cross-chat concurrency to avoid same-chat race drops under bursty inbound traffic. (#31807) - Feishu/Dedup restart resilience: warm persistent dedup state into memory on monitor startup so retry events after gateway restart stay suppressed without requiring initial on-disk probe misses. (#31605) - Feishu/Typing notification suppression: skip typing keepalive reaction re-adds when the indicator is already active, preventing duplicate notification pings from repeated identical emoji adds. (#31580) +- Feishu/Probe failure backoff: cache API and timeout probe failures for one minute per account key while preserving abort-aware probe timeouts, reducing repeated health-check retries during transient credential/network outages. (#29970) - BlueBubbles/Message metadata: harden send response ID extraction, include sender identity in DM context, and normalize inbound `message_id` selection to avoid duplicate ID metadata. (#23970) Thanks @tyler6204. - Docker/Image health checks: add Dockerfile `HEALTHCHECK` that probes gateway `GET /healthz` so container runtimes can mark unhealthy instances without requiring auth credentials in the probe command. (#11478) Thanks @U-C4N and @vincentkoc. - Docker/Sandbox bootstrap hardening: make `OPENCLAW_SANDBOX` opt-in parsing explicit (`1|true|yes|on`), support custom Docker socket paths via `OPENCLAW_DOCKER_SOCKET`, defer docker.sock exposure until sandbox prerequisites pass, and reset/roll back persisted sandbox mode to `off` when setup is skipped or partially fails to avoid stale broken sandbox state. (#29974) Thanks @jamtujest and @vincentkoc. diff --git a/extensions/feishu/src/probe.test.ts b/extensions/feishu/src/probe.test.ts index 521b0b4d6..e46929959 100644 --- a/extensions/feishu/src/probe.test.ts +++ b/extensions/feishu/src/probe.test.ts @@ -59,7 +59,7 @@ describe("probeFeishu", () => { expect(requestFn).toHaveBeenCalledTimes(1); }); - it("uses explicit timeout for bot info request", async () => { + it("passes the probe timeout to the Feishu request", async () => { const requestFn = setupClient({ code: 0, bot: { bot_name: "TestBot", open_id: "ou_abc123" }, @@ -105,7 +105,6 @@ describe("probeFeishu", () => { expect(result).toMatchObject({ ok: false, error: "probe aborted" }); expect(createFeishuClientMock).not.toHaveBeenCalled(); }); - it("returns cached result on subsequent calls within TTL", async () => { const requestFn = setupClient({ code: 0, @@ -133,7 +132,7 @@ describe("probeFeishu", () => { await probeFeishu(creds); expect(requestFn).toHaveBeenCalledTimes(1); - // Advance time past the 10-minute TTL + // Advance time past the success TTL vi.advanceTimersByTime(10 * 60 * 1000 + 1); await probeFeishu(creds); @@ -143,29 +142,48 @@ describe("probeFeishu", () => { } }); - it("does not cache failed probe results (API error)", async () => { - const requestFn = makeRequestFn({ code: 99, msg: "token expired" }); - createFeishuClientMock.mockReturnValue({ request: requestFn }); + it("caches failed probe results (API error) for the error TTL", async () => { + vi.useFakeTimers(); + try { + const requestFn = makeRequestFn({ code: 99, msg: "token expired" }); + createFeishuClientMock.mockReturnValue({ request: requestFn }); - const creds = { appId: "cli_123", appSecret: "secret" }; - const first = await probeFeishu(creds); - expect(first).toMatchObject({ ok: false, error: "API error: token expired" }); + const creds = { appId: "cli_123", appSecret: "secret" }; + const first = await probeFeishu(creds); + const second = await probeFeishu(creds); + expect(first).toMatchObject({ ok: false, error: "API error: token expired" }); + expect(second).toMatchObject({ ok: false, error: "API error: token expired" }); + expect(requestFn).toHaveBeenCalledTimes(1); - // Second call should make a fresh request since failures are not cached - await probeFeishu(creds); - expect(requestFn).toHaveBeenCalledTimes(2); + vi.advanceTimersByTime(60 * 1000 + 1); + + await probeFeishu(creds); + expect(requestFn).toHaveBeenCalledTimes(2); + } finally { + vi.useRealTimers(); + } }); - it("does not cache results when request throws", async () => { - const requestFn = vi.fn().mockRejectedValue(new Error("network error")); - createFeishuClientMock.mockReturnValue({ request: requestFn }); + it("caches thrown request errors for the error TTL", async () => { + vi.useFakeTimers(); + try { + const requestFn = vi.fn().mockRejectedValue(new Error("network error")); + createFeishuClientMock.mockReturnValue({ request: requestFn }); - const creds = { appId: "cli_123", appSecret: "secret" }; - const first = await probeFeishu(creds); - expect(first).toMatchObject({ ok: false, error: "network error" }); + const creds = { appId: "cli_123", appSecret: "secret" }; + const first = await probeFeishu(creds); + const second = await probeFeishu(creds); + expect(first).toMatchObject({ ok: false, error: "network error" }); + expect(second).toMatchObject({ ok: false, error: "network error" }); + expect(requestFn).toHaveBeenCalledTimes(1); - await probeFeishu(creds); - expect(requestFn).toHaveBeenCalledTimes(2); + vi.advanceTimersByTime(60 * 1000 + 1); + + await probeFeishu(creds); + expect(requestFn).toHaveBeenCalledTimes(2); + } finally { + vi.useRealTimers(); + } }); it("caches per account independently", async () => { diff --git a/extensions/feishu/src/probe.ts b/extensions/feishu/src/probe.ts index 31da461f8..e4b8d76f0 100644 --- a/extensions/feishu/src/probe.ts +++ b/extensions/feishu/src/probe.ts @@ -2,15 +2,16 @@ import { raceWithTimeoutAndAbort } from "./async.js"; import { createFeishuClient, type FeishuClientCredentials } from "./client.js"; import type { FeishuProbeResult } from "./types.js"; -/** Cache successful probe results to reduce API calls (bot info is static). +/** Cache probe results to reduce repeated health-check calls. * Gateway health checks call probeFeishu() every minute; without caching this * burns ~43,200 calls/month, easily exceeding Feishu's free-tier quota. - * A 10-min TTL cuts that to ~4,320 calls/month. (#26684) */ + * Successful bot info is effectively static, while failures are cached briefly + * to avoid hammering the API during transient outages. */ const probeCache = new Map(); -const PROBE_CACHE_TTL_MS = 10 * 60 * 1000; // 10 minutes +const PROBE_SUCCESS_TTL_MS = 10 * 60 * 1000; // 10 minutes +const PROBE_ERROR_TTL_MS = 60 * 1000; // 1 minute const MAX_PROBE_CACHE_SIZE = 64; export const FEISHU_PROBE_REQUEST_TIMEOUT_MS = 10_000; - export type ProbeFeishuOptions = { timeoutMs?: number; abortSignal?: AbortSignal; @@ -23,6 +24,21 @@ type FeishuBotInfoResponse = { data?: { bot?: { bot_name?: string; open_id?: string } }; }; +function setCachedProbeResult( + cacheKey: string, + result: FeishuProbeResult, + ttlMs: number, +): FeishuProbeResult { + probeCache.set(cacheKey, { result, expiresAt: Date.now() + ttlMs }); + if (probeCache.size > MAX_PROBE_CACHE_SIZE) { + const oldest = probeCache.keys().next().value; + if (oldest !== undefined) { + probeCache.delete(oldest); + } + } + return result; +} + export async function probeFeishu( creds?: FeishuClientCredentials, options: ProbeFeishuOptions = {}, @@ -78,11 +94,15 @@ export async function probeFeishu( }; } if (responseResult.status === "timeout") { - return { - ok: false, - appId: creds.appId, - error: `probe timed out after ${timeoutMs}ms`, - }; + return setCachedProbeResult( + cacheKey, + { + ok: false, + appId: creds.appId, + error: `probe timed out after ${timeoutMs}ms`, + }, + PROBE_ERROR_TTL_MS, + ); } const response = responseResult.value; @@ -95,38 +115,38 @@ export async function probeFeishu( } if (response.code !== 0) { - return { - ok: false, - appId: creds.appId, - error: `API error: ${response.msg || `code ${response.code}`}`, - }; + return setCachedProbeResult( + cacheKey, + { + ok: false, + appId: creds.appId, + error: `API error: ${response.msg || `code ${response.code}`}`, + }, + PROBE_ERROR_TTL_MS, + ); } const bot = response.bot || response.data?.bot; - const result: FeishuProbeResult = { - ok: true, - appId: creds.appId, - botName: bot?.bot_name, - botOpenId: bot?.open_id, - }; - - // Cache successful results only - probeCache.set(cacheKey, { result, expiresAt: Date.now() + PROBE_CACHE_TTL_MS }); - // Evict oldest entry if cache exceeds max size - if (probeCache.size > MAX_PROBE_CACHE_SIZE) { - const oldest = probeCache.keys().next().value; - if (oldest !== undefined) { - probeCache.delete(oldest); - } - } - - return result; + return setCachedProbeResult( + cacheKey, + { + ok: true, + appId: creds.appId, + botName: bot?.bot_name, + botOpenId: bot?.open_id, + }, + PROBE_SUCCESS_TTL_MS, + ); } catch (err) { - return { - ok: false, - appId: creds.appId, - error: err instanceof Error ? err.message : String(err), - }; + return setCachedProbeResult( + cacheKey, + { + ok: false, + appId: creds.appId, + error: err instanceof Error ? err.message : String(err), + }, + PROBE_ERROR_TTL_MS, + ); } }