From b6ea5895b683bc5948f7d1e344346101877bd373 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 17 Jan 2026 09:33:53 +0000 Subject: [PATCH] fix: gate image tool and deepgram audio payload --- CHANGELOG.md | 1 + src/agents/tools/image-tool.test.ts | 21 +++++ src/agents/tools/image-tool.ts | 94 +++++++++++++++++-- ...uzzy-model-matches-model-directive.test.ts | 7 +- src/gateway/ws-log.ts | 4 +- .../providers/deepgram/audio.live.test.ts | 30 +++--- .../providers/deepgram/audio.test.ts | 2 +- src/media-understanding/runner.ts | 6 +- src/web/auto-reply/monitor/process-message.ts | 5 +- 9 files changed, 139 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f77fa9f29..ac86fa449 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -81,6 +81,7 @@ Docs: https://docs.clawd.bot - WhatsApp: scope self-chat response prefix; inject pending-only group history and clear after any processed message. - WhatsApp: include `linked` field in `describeAccount`. - Agents: drop unsigned Gemini tool calls and avoid JSON Schema `format` keyword collisions. +- Agents: hide the image tool when the primary model already supports images. - Agents: avoid duplicate sends by replying with `NO_REPLY` after `message` tool sends. - Auth: inherit/merge sub-agent auth profiles from the main agent. - Gateway: resolve local auth for security probe and validate gateway token/password file modes. (#1011, #1022) — thanks @ivanrvpereira, @kkarimi. diff --git a/src/agents/tools/image-tool.test.ts b/src/agents/tools/image-tool.test.ts index 5e22f7353..255e20183 100644 --- a/src/agents/tools/image-tool.test.ts +++ b/src/agents/tools/image-tool.test.ts @@ -102,6 +102,27 @@ describe("image tool implicit imageModel config", () => { }); }); + it("disables image tool when primary model already supports images", async () => { + const agentDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-image-")); + const cfg: ClawdbotConfig = { + agents: { + defaults: { + model: { primary: "acme/vision-1" }, + imageModel: { primary: "openai/gpt-5-mini" }, + }, + }, + models: { + providers: { + acme: { + models: [{ id: "vision-1", input: ["text", "image"] }], + }, + }, + }, + }; + expect(resolveImageModelConfigForTool({ cfg, agentDir })).toBeNull(); + expect(createImageTool({ config: cfg, agentDir })).toBeNull(); + }); + it("sandboxes image paths like the read tool", async () => { const stateDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-image-sandbox-")); const agentDir = path.join(stateDir, "agent"); diff --git a/src/agents/tools/image-tool.ts b/src/agents/tools/image-tool.ts index a013e98e4..52627e547 100644 --- a/src/agents/tools/image-tool.ts +++ b/src/agents/tools/image-tool.ts @@ -1,3 +1,4 @@ +import fsSync from "node:fs"; import fs from "node:fs/promises"; import path from "node:path"; @@ -19,7 +20,7 @@ import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js"; import { minimaxUnderstandImage } from "../minimax-vlm.js"; import { getApiKeyForModel, resolveEnvApiKey } from "../model-auth.js"; import { runWithImageModelFallback } from "../model-fallback.js"; -import { parseModelRef } from "../model-selection.js"; +import { normalizeProviderId, resolveConfiguredModelRef } from "../model-selection.js"; import { ensureClawdbotModelsJson } from "../models-config.js"; import { assertSandboxPath } from "../sandbox-paths.js"; import type { AnyAgentTool } from "./common.js"; @@ -42,12 +43,15 @@ function resolveDefaultModelRef(cfg?: ClawdbotConfig): { provider: string; model: string; } { - const modelConfig = cfg?.agents?.defaults?.model as { primary?: string } | string | undefined; - const raw = typeof modelConfig === "string" ? modelConfig.trim() : modelConfig?.primary?.trim(); - const parsed = - parseModelRef(raw ?? "", DEFAULT_PROVIDER) ?? - ({ provider: DEFAULT_PROVIDER, model: DEFAULT_MODEL } as const); - return { provider: parsed.provider, model: parsed.model }; + if (cfg) { + const resolved = resolveConfiguredModelRef({ + cfg, + defaultProvider: DEFAULT_PROVIDER, + defaultModel: DEFAULT_MODEL, + }); + return { provider: resolved.provider, model: resolved.model }; + } + return { provider: DEFAULT_PROVIDER, model: DEFAULT_MODEL }; } function hasAuthForProvider(params: { provider: string; agentDir: string }): boolean { @@ -58,6 +62,77 @@ function hasAuthForProvider(params: { provider: string; agentDir: string }): boo return listProfilesForProvider(store, params.provider).length > 0; } +type ProviderModelEntry = { + id?: string; + input?: string[]; +}; + +type ProviderConfigLike = { + models?: ProviderModelEntry[]; +}; + +function resolveProviderConfig( + providers: Record | undefined, + provider: string, +): ProviderConfigLike | null { + if (!providers) return null; + const normalized = normalizeProviderId(provider); + for (const [key, value] of Object.entries(providers)) { + if (normalizeProviderId(key) === normalized) return value; + } + return null; +} + +function resolveModelSupportsImages(params: { + providerConfig: ProviderConfigLike | null; + modelId: string; +}): boolean | null { + const models = params.providerConfig?.models; + if (!Array.isArray(models) || models.length === 0) return null; + const trimmedId = params.modelId.trim(); + if (!trimmedId) return null; + const match = + models.find((model) => String(model?.id ?? "").trim() === trimmedId) ?? + models.find( + (model) => + String(model?.id ?? "") + .trim() + .toLowerCase() === trimmedId.toLowerCase(), + ); + if (!match) return null; + const input = Array.isArray(match.input) ? match.input : []; + return input.includes("image"); +} + +function resolvePrimaryModelSupportsImages(params: { + cfg?: ClawdbotConfig; + agentDir: string; +}): boolean | null { + if (!params.cfg) return null; + const primary = resolveDefaultModelRef(params.cfg); + const providerConfig = resolveProviderConfig( + params.cfg.models?.providers as Record | undefined, + primary.provider, + ); + const fromConfig = resolveModelSupportsImages({ + providerConfig, + modelId: primary.model, + }); + if (fromConfig !== null) return fromConfig; + try { + const modelsPath = path.join(params.agentDir, "models.json"); + const raw = fsSync.readFileSync(modelsPath, "utf8"); + const parsed = JSON.parse(raw) as { providers?: Record }; + const provider = resolveProviderConfig(parsed.providers, primary.provider); + return resolveModelSupportsImages({ + providerConfig: provider, + modelId: primary.model, + }); + } catch { + return null; + } +} + /** * Resolve the effective image model config for the `image` tool. * @@ -70,6 +145,11 @@ export function resolveImageModelConfigForTool(params: { cfg?: ClawdbotConfig; agentDir: string; }): ImageModelConfig | null { + const primarySupportsImages = resolvePrimaryModelSupportsImages({ + cfg: params.cfg, + agentDir: params.agentDir, + }); + if (primarySupportsImages === true) return null; const explicit = coerceImageModelConfig(params.cfg); if (explicit.primary?.trim() || (explicit.fallbacks?.length ?? 0) > 0) { return explicit; diff --git a/src/auto-reply/reply.directive.directive-behavior.supports-fuzzy-model-matches-model-directive.test.ts b/src/auto-reply/reply.directive.directive-behavior.supports-fuzzy-model-matches-model-directive.test.ts index 28191da4a..c78919f97 100644 --- a/src/auto-reply/reply.directive.directive-behavior.supports-fuzzy-model-matches-model-directive.test.ts +++ b/src/auto-reply/reply.directive.directive-behavior.supports-fuzzy-model-matches-model-directive.test.ts @@ -107,7 +107,12 @@ describe("directive behavior", () => { const storePath = path.join(home, "sessions.json"); await getReplyFromConfig( - { Body: "/model kimi-k2-0905-preview", From: "+1222", To: "+1222", CommandAuthorized: true }, + { + Body: "/model kimi-k2-0905-preview", + From: "+1222", + To: "+1222", + CommandAuthorized: true, + }, {}, { agents: { diff --git a/src/gateway/ws-log.ts b/src/gateway/ws-log.ts index 31f88584a..934578c5b 100644 --- a/src/gateway/ws-log.ts +++ b/src/gateway/ws-log.ts @@ -67,7 +67,9 @@ export function formatForLog(value: unknown): string { : JSON.stringify(value); if (!str) return ""; const redacted = redactSensitiveText(str, WS_LOG_REDACT_OPTIONS); - return redacted.length > LOG_VALUE_LIMIT ? `${redacted.slice(0, LOG_VALUE_LIMIT)}...` : redacted; + return redacted.length > LOG_VALUE_LIMIT + ? `${redacted.slice(0, LOG_VALUE_LIMIT)}...` + : redacted; } catch { return String(value); } diff --git a/src/media-understanding/providers/deepgram/audio.live.test.ts b/src/media-understanding/providers/deepgram/audio.live.test.ts index cf2223732..9040e982a 100644 --- a/src/media-understanding/providers/deepgram/audio.live.test.ts +++ b/src/media-understanding/providers/deepgram/audio.live.test.ts @@ -31,21 +31,17 @@ async function fetchSampleBuffer(url: string, timeoutMs: number): Promise { - it( - "transcribes sample audio", - async () => { - const buffer = await fetchSampleBuffer(SAMPLE_URL, 15000); - const result = await transcribeDeepgramAudio({ - buffer, - fileName: "sample.wav", - mime: "audio/wav", - apiKey: DEEPGRAM_KEY, - model: DEEPGRAM_MODEL, - baseUrl: DEEPGRAM_BASE_URL, - timeoutMs: 20000, - }); - expect(result.text.trim().length).toBeGreaterThan(0); - }, - 30000, - ); + it("transcribes sample audio", async () => { + const buffer = await fetchSampleBuffer(SAMPLE_URL, 15000); + const result = await transcribeDeepgramAudio({ + buffer, + fileName: "sample.wav", + mime: "audio/wav", + apiKey: DEEPGRAM_KEY, + model: DEEPGRAM_MODEL, + baseUrl: DEEPGRAM_BASE_URL, + timeoutMs: 20000, + }); + expect(result.text.trim().length).toBeGreaterThan(0); + }, 30000); }); diff --git a/src/media-understanding/providers/deepgram/audio.test.ts b/src/media-understanding/providers/deepgram/audio.test.ts index 737649cbe..17af8443c 100644 --- a/src/media-understanding/providers/deepgram/audio.test.ts +++ b/src/media-understanding/providers/deepgram/audio.test.ts @@ -84,6 +84,6 @@ describe("transcribeDeepgramAudio", () => { expect(headers.get("authorization")).toBe("Token test-key"); expect(headers.get("x-custom")).toBe("1"); expect(headers.get("content-type")).toBe("audio/wav"); - expect(Buffer.isBuffer(seenInit?.body)).toBe(true); + expect(seenInit?.body).toBeInstanceOf(Uint8Array); }); }); diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index 4e1b192af..90e5cc597 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -293,9 +293,9 @@ async function runProviderEntry(params: { const providerConfig = cfg.models?.providers?.[providerId]; const baseUrl = entry.baseUrl ?? params.config?.baseUrl ?? providerConfig?.baseUrl; const mergedHeaders = { - ...(providerConfig?.headers ?? {}), - ...(params.config?.headers ?? {}), - ...(entry.headers ?? {}), + ...providerConfig?.headers, + ...params.config?.headers, + ...entry.headers, }; const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined; const providerQuery = resolveProviderQuery({ diff --git a/src/web/auto-reply/monitor/process-message.ts b/src/web/auto-reply/monitor/process-message.ts index f5d7ee198..7081afe9d 100644 --- a/src/web/auto-reply/monitor/process-message.ts +++ b/src/web/auto-reply/monitor/process-message.ts @@ -16,7 +16,10 @@ import { import { dispatchReplyWithBufferedBlockDispatcher } from "../../../auto-reply/reply/provider-dispatcher.js"; import type { getReplyFromConfig } from "../../../auto-reply/reply.js"; import type { ReplyPayload } from "../../../auto-reply/types.js"; -import { hasInlineCommandTokens, isControlCommandMessage } from "../../../auto-reply/command-detection.js"; +import { + hasInlineCommandTokens, + isControlCommandMessage, +} from "../../../auto-reply/command-detection.js"; import { finalizeInboundContext } from "../../../auto-reply/reply/inbound-context.js"; import { toLocationContext } from "../../../channels/location.js"; import type { loadConfig } from "../../../config/config.js";