Files
Moltbot/src/agents/tools/image-tool.ts
2026-01-06 00:56:58 +00:00

193 lines
5.6 KiB
TypeScript

import {
type Api,
type AssistantMessage,
type Context,
complete,
type Model,
} from "@mariozechner/pi-ai";
import {
discoverAuthStorage,
discoverModels,
} from "@mariozechner/pi-coding-agent";
import { Type } from "@sinclair/typebox";
import type { ClawdbotConfig } from "../../config/config.js";
import { resolveUserPath } from "../../utils.js";
import { loadWebMedia } from "../../web/media.js";
import { resolveClawdbotAgentDir } from "../agent-paths.js";
import { getApiKeyForModel } from "../model-auth.js";
import { runWithImageModelFallback } from "../model-fallback.js";
import { ensureClawdbotModelsJson } from "../models-config.js";
import { extractAssistantText } from "../pi-embedded-utils.js";
import type { AnyAgentTool } from "./common.js";
const DEFAULT_PROMPT = "Describe the image.";
function ensureImageToolConfigured(cfg?: ClawdbotConfig): boolean {
const imageModel = cfg?.agent?.imageModel as
| { primary?: string; fallbacks?: string[] }
| string
| undefined;
const primary =
typeof imageModel === "string" ? imageModel.trim() : imageModel?.primary;
const fallbacks =
typeof imageModel === "object" ? (imageModel?.fallbacks ?? []) : [];
return Boolean(primary?.trim() || fallbacks.length > 0);
}
function pickMaxBytes(
cfg?: ClawdbotConfig,
maxBytesMb?: number,
): number | undefined {
if (
typeof maxBytesMb === "number" &&
Number.isFinite(maxBytesMb) &&
maxBytesMb > 0
) {
return Math.floor(maxBytesMb * 1024 * 1024);
}
const configured = cfg?.agent?.mediaMaxMb;
if (
typeof configured === "number" &&
Number.isFinite(configured) &&
configured > 0
) {
return Math.floor(configured * 1024 * 1024);
}
return undefined;
}
function buildImageContext(
prompt: string,
base64: string,
mimeType: string,
): Context {
return {
messages: [
{
role: "user",
content: [
{ type: "text", text: prompt },
{ type: "image", data: base64, mimeType },
],
timestamp: Date.now(),
},
],
};
}
async function runImagePrompt(params: {
cfg?: ClawdbotConfig;
modelOverride?: string;
prompt: string;
base64: string;
mimeType: string;
}): Promise<{ text: string; provider: string; model: string }> {
const agentDir = resolveClawdbotAgentDir();
await ensureClawdbotModelsJson(params.cfg);
const authStorage = discoverAuthStorage(agentDir);
const modelRegistry = discoverModels(authStorage, agentDir);
const result = await runWithImageModelFallback({
cfg: params.cfg,
modelOverride: params.modelOverride,
run: async (provider, modelId) => {
const model = modelRegistry.find(provider, modelId) as Model<Api> | null;
if (!model) {
throw new Error(`Unknown model: ${provider}/${modelId}`);
}
if (!model.input?.includes("image")) {
throw new Error(
`Model does not support images: ${provider}/${modelId}`,
);
}
const apiKeyInfo = await getApiKeyForModel({
model,
cfg: params.cfg,
});
authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey);
const context = buildImageContext(
params.prompt,
params.base64,
params.mimeType,
);
const message = (await complete(model, context, {
apiKey: apiKeyInfo.apiKey,
maxTokens: 512,
temperature: 0,
})) as AssistantMessage;
return message;
},
});
const text = extractAssistantText(result.result);
return {
text: text || "(no text returned)",
provider: result.provider,
model: result.model,
};
}
export function createImageTool(options?: {
config?: ClawdbotConfig;
}): AnyAgentTool | null {
if (!ensureImageToolConfigured(options?.config)) return null;
return {
label: "Image",
name: "image",
description:
"Analyze an image with the configured image model (agent.imageModel). Provide a prompt and image path or URL.",
parameters: Type.Object({
prompt: Type.Optional(Type.String()),
image: Type.String(),
model: Type.Optional(Type.String()),
maxBytesMb: Type.Optional(Type.Number()),
}),
execute: async (_toolCallId, args) => {
const record =
args && typeof args === "object"
? (args as Record<string, unknown>)
: {};
const imageRaw =
typeof record.image === "string" ? record.image.trim() : "";
if (!imageRaw) throw new Error("image required");
const promptRaw =
typeof record.prompt === "string" && record.prompt.trim()
? record.prompt.trim()
: DEFAULT_PROMPT;
const modelOverride =
typeof record.model === "string" && record.model.trim()
? record.model.trim()
: undefined;
const maxBytesMb =
typeof record.maxBytesMb === "number" ? record.maxBytesMb : undefined;
const maxBytes = pickMaxBytes(options?.config, maxBytesMb);
const resolvedImage = imageRaw.startsWith("~")
? resolveUserPath(imageRaw)
: imageRaw;
const media = await loadWebMedia(resolvedImage, maxBytes);
if (media.kind !== "image") {
throw new Error(`Unsupported media type: ${media.kind}`);
}
const mimeType = media.contentType ?? "image/png";
const base64 = media.buffer.toString("base64");
const result = await runImagePrompt({
cfg: options?.config,
modelOverride,
prompt: promptRaw,
base64,
mimeType,
});
return {
content: [{ type: "text", text: result.text }],
details: {
model: `${result.provider}/${result.model}`,
image: resolvedImage,
},
};
},
};
}