diff --git a/src/tts/tts-core.ts b/src/tts/tts-core.ts new file mode 100644 index 000000000..da7b17877 --- /dev/null +++ b/src/tts/tts-core.ts @@ -0,0 +1,673 @@ +import { completeSimple, type TextContent } from "@mariozechner/pi-ai"; +import { EdgeTTS } from "node-edge-tts"; +import { rmSync } from "node:fs"; +import type { OpenClawConfig } from "../config/config.js"; +import type { + ResolvedTtsConfig, + ResolvedTtsModelOverrides, + TtsDirectiveOverrides, + TtsDirectiveParseResult, +} from "./tts.js"; +import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js"; +import { + buildModelAliasIndex, + resolveDefaultModelForAgent, + resolveModelRefFromString, + type ModelRef, +} from "../agents/model-selection.js"; +import { resolveModel } from "../agents/pi-embedded-runner/model.js"; + +const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io"; +const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes + +export function isValidVoiceId(voiceId: string): boolean { + return /^[a-zA-Z0-9]{10,40}$/.test(voiceId); +} + +function normalizeElevenLabsBaseUrl(baseUrl: string): string { + const trimmed = baseUrl.trim(); + if (!trimmed) { + return DEFAULT_ELEVENLABS_BASE_URL; + } + return trimmed.replace(/\/+$/, ""); +} + +function requireInRange(value: number, min: number, max: number, label: string): void { + if (!Number.isFinite(value) || value < min || value > max) { + throw new Error(`${label} must be between ${min} and ${max}`); + } +} + +function assertElevenLabsVoiceSettings(settings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]) { + requireInRange(settings.stability, 0, 1, "stability"); + requireInRange(settings.similarityBoost, 0, 1, "similarityBoost"); + requireInRange(settings.style, 0, 1, "style"); + requireInRange(settings.speed, 0.5, 2, "speed"); +} + +function normalizeLanguageCode(code?: string): string | undefined { + const trimmed = code?.trim(); + if (!trimmed) { + return undefined; + } + const normalized = trimmed.toLowerCase(); + if (!/^[a-z]{2}$/.test(normalized)) { + throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)"); + } + return normalized; +} + +function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined { + const trimmed = mode?.trim(); + if (!trimmed) { + return undefined; + } + const normalized = trimmed.toLowerCase(); + if (normalized === "auto" || normalized === "on" || normalized === "off") { + return normalized; + } + throw new Error("applyTextNormalization must be one of: auto, on, off"); +} + +function normalizeSeed(seed?: number): number | undefined { + if (seed == null) { + return undefined; + } + const next = Math.floor(seed); + if (!Number.isFinite(next) || next < 0 || next > 4_294_967_295) { + throw new Error("seed must be between 0 and 4294967295"); + } + return next; +} + +function parseBooleanValue(value: string): boolean | undefined { + const normalized = value.trim().toLowerCase(); + if (["true", "1", "yes", "on"].includes(normalized)) { + return true; + } + if (["false", "0", "no", "off"].includes(normalized)) { + return false; + } + return undefined; +} + +function parseNumberValue(value: string): number | undefined { + const parsed = Number.parseFloat(value); + return Number.isFinite(parsed) ? parsed : undefined; +} + +export function parseTtsDirectives( + text: string, + policy: ResolvedTtsModelOverrides, +): TtsDirectiveParseResult { + if (!policy.enabled) { + return { cleanedText: text, overrides: {}, warnings: [], hasDirective: false }; + } + + const overrides: TtsDirectiveOverrides = {}; + const warnings: string[] = []; + let cleanedText = text; + let hasDirective = false; + + const blockRegex = /\[\[tts:text\]\]([\s\S]*?)\[\[\/tts:text\]\]/gi; + cleanedText = cleanedText.replace(blockRegex, (_match, inner: string) => { + hasDirective = true; + if (policy.allowText && overrides.ttsText == null) { + overrides.ttsText = inner.trim(); + } + return ""; + }); + + const directiveRegex = /\[\[tts:([^\]]+)\]\]/gi; + cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => { + hasDirective = true; + const tokens = body.split(/\s+/).filter(Boolean); + for (const token of tokens) { + const eqIndex = token.indexOf("="); + if (eqIndex === -1) { + continue; + } + const rawKey = token.slice(0, eqIndex).trim(); + const rawValue = token.slice(eqIndex + 1).trim(); + if (!rawKey || !rawValue) { + continue; + } + const key = rawKey.toLowerCase(); + try { + switch (key) { + case "provider": + if (!policy.allowProvider) { + break; + } + if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") { + overrides.provider = rawValue; + } else { + warnings.push(`unsupported provider "${rawValue}"`); + } + break; + case "voice": + case "openai_voice": + case "openaivoice": + if (!policy.allowVoice) { + break; + } + if (isValidOpenAIVoice(rawValue)) { + overrides.openai = { ...overrides.openai, voice: rawValue }; + } else { + warnings.push(`invalid OpenAI voice "${rawValue}"`); + } + break; + case "voiceid": + case "voice_id": + case "elevenlabs_voice": + case "elevenlabsvoice": + if (!policy.allowVoice) { + break; + } + if (isValidVoiceId(rawValue)) { + overrides.elevenlabs = { ...overrides.elevenlabs, voiceId: rawValue }; + } else { + warnings.push(`invalid ElevenLabs voiceId "${rawValue}"`); + } + break; + case "model": + case "modelid": + case "model_id": + case "elevenlabs_model": + case "elevenlabsmodel": + case "openai_model": + case "openaimodel": + if (!policy.allowModelId) { + break; + } + if (isValidOpenAIModel(rawValue)) { + overrides.openai = { ...overrides.openai, model: rawValue }; + } else { + overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue }; + } + break; + case "stability": + if (!policy.allowVoiceSettings) { + break; + } + { + const value = parseNumberValue(rawValue); + if (value == null) { + warnings.push("invalid stability value"); + break; + } + requireInRange(value, 0, 1, "stability"); + overrides.elevenlabs = { + ...overrides.elevenlabs, + voiceSettings: { ...overrides.elevenlabs?.voiceSettings, stability: value }, + }; + } + break; + case "similarity": + case "similarityboost": + case "similarity_boost": + if (!policy.allowVoiceSettings) { + break; + } + { + const value = parseNumberValue(rawValue); + if (value == null) { + warnings.push("invalid similarityBoost value"); + break; + } + requireInRange(value, 0, 1, "similarityBoost"); + overrides.elevenlabs = { + ...overrides.elevenlabs, + voiceSettings: { ...overrides.elevenlabs?.voiceSettings, similarityBoost: value }, + }; + } + break; + case "style": + if (!policy.allowVoiceSettings) { + break; + } + { + const value = parseNumberValue(rawValue); + if (value == null) { + warnings.push("invalid style value"); + break; + } + requireInRange(value, 0, 1, "style"); + overrides.elevenlabs = { + ...overrides.elevenlabs, + voiceSettings: { ...overrides.elevenlabs?.voiceSettings, style: value }, + }; + } + break; + case "speed": + if (!policy.allowVoiceSettings) { + break; + } + { + const value = parseNumberValue(rawValue); + if (value == null) { + warnings.push("invalid speed value"); + break; + } + requireInRange(value, 0.5, 2, "speed"); + overrides.elevenlabs = { + ...overrides.elevenlabs, + voiceSettings: { ...overrides.elevenlabs?.voiceSettings, speed: value }, + }; + } + break; + case "speakerboost": + case "speaker_boost": + case "usespeakerboost": + case "use_speaker_boost": + if (!policy.allowVoiceSettings) { + break; + } + { + const value = parseBooleanValue(rawValue); + if (value == null) { + warnings.push("invalid useSpeakerBoost value"); + break; + } + overrides.elevenlabs = { + ...overrides.elevenlabs, + voiceSettings: { ...overrides.elevenlabs?.voiceSettings, useSpeakerBoost: value }, + }; + } + break; + case "normalize": + case "applytextnormalization": + case "apply_text_normalization": + if (!policy.allowNormalization) { + break; + } + overrides.elevenlabs = { + ...overrides.elevenlabs, + applyTextNormalization: normalizeApplyTextNormalization(rawValue), + }; + break; + case "language": + case "languagecode": + case "language_code": + if (!policy.allowNormalization) { + break; + } + overrides.elevenlabs = { + ...overrides.elevenlabs, + languageCode: normalizeLanguageCode(rawValue), + }; + break; + case "seed": + if (!policy.allowSeed) { + break; + } + overrides.elevenlabs = { + ...overrides.elevenlabs, + seed: normalizeSeed(Number.parseInt(rawValue, 10)), + }; + break; + default: + break; + } + } catch (err) { + warnings.push((err as Error).message); + } + } + return ""; + }); + + return { + cleanedText, + ttsText: overrides.ttsText, + hasDirective, + overrides, + warnings, + }; +} + +export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const; + +/** + * Custom OpenAI-compatible TTS endpoint. + * When set, model/voice validation is relaxed to allow non-OpenAI models. + * Example: OPENAI_TTS_BASE_URL=http://localhost:8880/v1 + * + * Note: Read at runtime (not module load) to support config.env loading. + */ +function getOpenAITtsBaseUrl(): string { + return (process.env.OPENAI_TTS_BASE_URL?.trim() || "https://api.openai.com/v1").replace( + /\/+$/, + "", + ); +} + +function isCustomOpenAIEndpoint(): boolean { + return getOpenAITtsBaseUrl() !== "https://api.openai.com/v1"; +} +export const OPENAI_TTS_VOICES = [ + "alloy", + "ash", + "ballad", + "cedar", + "coral", + "echo", + "fable", + "juniper", + "marin", + "onyx", + "nova", + "sage", + "shimmer", + "verse", +] as const; + +type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number]; + +export function isValidOpenAIModel(model: string): boolean { + // Allow any model when using custom endpoint (e.g., Kokoro, LocalAI) + if (isCustomOpenAIEndpoint()) { + return true; + } + return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]); +} + +export function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice { + // Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices) + if (isCustomOpenAIEndpoint()) { + return true; + } + return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice); +} + +type SummarizeResult = { + summary: string; + latencyMs: number; + inputLength: number; + outputLength: number; +}; + +type SummaryModelSelection = { + ref: ModelRef; + source: "summaryModel" | "default"; +}; + +function resolveSummaryModelRef( + cfg: OpenClawConfig, + config: ResolvedTtsConfig, +): SummaryModelSelection { + const defaultRef = resolveDefaultModelForAgent({ cfg }); + const override = config.summaryModel?.trim(); + if (!override) { + return { ref: defaultRef, source: "default" }; + } + + const aliasIndex = buildModelAliasIndex({ cfg, defaultProvider: defaultRef.provider }); + const resolved = resolveModelRefFromString({ + raw: override, + defaultProvider: defaultRef.provider, + aliasIndex, + }); + if (!resolved) { + return { ref: defaultRef, source: "default" }; + } + return { ref: resolved.ref, source: "summaryModel" }; +} + +function isTextContentBlock(block: { type: string }): block is TextContent { + return block.type === "text"; +} + +export async function summarizeText(params: { + text: string; + targetLength: number; + cfg: OpenClawConfig; + config: ResolvedTtsConfig; + timeoutMs: number; +}): Promise { + const { text, targetLength, cfg, config, timeoutMs } = params; + if (targetLength < 100 || targetLength > 10_000) { + throw new Error(`Invalid targetLength: ${targetLength}`); + } + + const startTime = Date.now(); + const { ref } = resolveSummaryModelRef(cfg, config); + const resolved = resolveModel(ref.provider, ref.model, undefined, cfg); + if (!resolved.model) { + throw new Error(resolved.error ?? `Unknown summary model: ${ref.provider}/${ref.model}`); + } + const apiKey = requireApiKey( + await getApiKeyForModel({ model: resolved.model, cfg }), + ref.provider, + ); + + try { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), timeoutMs); + + try { + const res = await completeSimple( + resolved.model, + { + messages: [ + { + role: "user", + content: + `You are an assistant that summarizes texts concisely while keeping the most important information. ` + + `Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style. ` + + `Reply only with the summary, without additional explanations.\n\n` + + `\n${text}\n`, + timestamp: Date.now(), + }, + ], + }, + { + apiKey, + maxTokens: Math.ceil(targetLength / 2), + temperature: 0.3, + signal: controller.signal, + }, + ); + + const summary = res.content + .filter(isTextContentBlock) + .map((block) => block.text.trim()) + .filter(Boolean) + .join(" ") + .trim(); + + if (!summary) { + throw new Error("No summary returned"); + } + + return { + summary, + latencyMs: Date.now() - startTime, + inputLength: text.length, + outputLength: summary.length, + }; + } finally { + clearTimeout(timeout); + } + } catch (err) { + const error = err as Error; + if (error.name === "AbortError") { + throw new Error("Summarization timed out", { cause: err }); + } + throw err; + } +} + +export function scheduleCleanup( + tempDir: string, + delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS, +): void { + const timer = setTimeout(() => { + try { + rmSync(tempDir, { recursive: true, force: true }); + } catch { + // ignore cleanup errors + } + }, delayMs); + timer.unref(); +} + +export async function elevenLabsTTS(params: { + text: string; + apiKey: string; + baseUrl: string; + voiceId: string; + modelId: string; + outputFormat: string; + seed?: number; + applyTextNormalization?: "auto" | "on" | "off"; + languageCode?: string; + voiceSettings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]; + timeoutMs: number; +}): Promise { + const { + text, + apiKey, + baseUrl, + voiceId, + modelId, + outputFormat, + seed, + applyTextNormalization, + languageCode, + voiceSettings, + timeoutMs, + } = params; + if (!isValidVoiceId(voiceId)) { + throw new Error("Invalid voiceId format"); + } + assertElevenLabsVoiceSettings(voiceSettings); + const normalizedLanguage = normalizeLanguageCode(languageCode); + const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization); + const normalizedSeed = normalizeSeed(seed); + + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), timeoutMs); + + try { + const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`); + if (outputFormat) { + url.searchParams.set("output_format", outputFormat); + } + + const response = await fetch(url.toString(), { + method: "POST", + headers: { + "xi-api-key": apiKey, + "Content-Type": "application/json", + Accept: "audio/mpeg", + }, + body: JSON.stringify({ + text, + model_id: modelId, + seed: normalizedSeed, + apply_text_normalization: normalizedNormalization, + language_code: normalizedLanguage, + voice_settings: { + stability: voiceSettings.stability, + similarity_boost: voiceSettings.similarityBoost, + style: voiceSettings.style, + use_speaker_boost: voiceSettings.useSpeakerBoost, + speed: voiceSettings.speed, + }, + }), + signal: controller.signal, + }); + + if (!response.ok) { + throw new Error(`ElevenLabs API error (${response.status})`); + } + + return Buffer.from(await response.arrayBuffer()); + } finally { + clearTimeout(timeout); + } +} + +export async function openaiTTS(params: { + text: string; + apiKey: string; + model: string; + voice: string; + responseFormat: "mp3" | "opus" | "pcm"; + timeoutMs: number; +}): Promise { + const { text, apiKey, model, voice, responseFormat, timeoutMs } = params; + + if (!isValidOpenAIModel(model)) { + throw new Error(`Invalid model: ${model}`); + } + if (!isValidOpenAIVoice(voice)) { + throw new Error(`Invalid voice: ${voice}`); + } + + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), timeoutMs); + + try { + const response = await fetch(`${getOpenAITtsBaseUrl()}/audio/speech`, { + method: "POST", + headers: { + Authorization: `Bearer ${apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model, + input: text, + voice, + response_format: responseFormat, + }), + signal: controller.signal, + }); + + if (!response.ok) { + throw new Error(`OpenAI TTS API error (${response.status})`); + } + + return Buffer.from(await response.arrayBuffer()); + } finally { + clearTimeout(timeout); + } +} + +export function inferEdgeExtension(outputFormat: string): string { + const normalized = outputFormat.toLowerCase(); + if (normalized.includes("webm")) { + return ".webm"; + } + if (normalized.includes("ogg")) { + return ".ogg"; + } + if (normalized.includes("opus")) { + return ".opus"; + } + if (normalized.includes("wav") || normalized.includes("riff") || normalized.includes("pcm")) { + return ".wav"; + } + return ".mp3"; +} + +export async function edgeTTS(params: { + text: string; + outputPath: string; + config: ResolvedTtsConfig["edge"]; + timeoutMs: number; +}): Promise { + const { text, outputPath, config, timeoutMs } = params; + const tts = new EdgeTTS({ + voice: config.voice, + lang: config.lang, + outputFormat: config.outputFormat, + saveSubtitles: config.saveSubtitles, + proxy: config.proxy, + rate: config.rate, + pitch: config.pitch, + volume: config.volume, + timeout: config.timeoutMs ?? timeoutMs, + }); + await tts.ttsPromise(text, outputPath); +} diff --git a/src/tts/tts.ts b/src/tts/tts.ts index 4b4b3197c..b1177ce55 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -1,5 +1,3 @@ -import { completeSimple, type TextContent } from "@mariozechner/pi-ai"; -import { EdgeTTS } from "node-edge-tts"; import { existsSync, mkdirSync, @@ -22,25 +20,31 @@ import type { TtsProvider, TtsModelOverrideConfig, } from "../config/types.tts.js"; -import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js"; -import { - buildModelAliasIndex, - resolveDefaultModelForAgent, - resolveModelRefFromString, - type ModelRef, -} from "../agents/model-selection.js"; -import { resolveModel } from "../agents/pi-embedded-runner/model.js"; import { normalizeChannelId } from "../channels/plugins/index.js"; import { logVerbose } from "../globals.js"; import { stripMarkdown } from "../line/markdown-to-line.js"; import { isVoiceCompatibleAudio } from "../media/audio.js"; import { CONFIG_DIR, resolveUserPath } from "../utils.js"; +import { + edgeTTS, + elevenLabsTTS, + inferEdgeExtension, + isValidOpenAIModel, + isValidOpenAIVoice, + isValidVoiceId, + OPENAI_TTS_MODELS, + OPENAI_TTS_VOICES, + openaiTTS, + parseTtsDirectives, + scheduleCleanup, + summarizeText, +} from "./tts-core.js"; +export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "./tts-core.js"; const DEFAULT_TIMEOUT_MS = 30_000; const DEFAULT_TTS_MAX_LENGTH = 1500; const DEFAULT_TTS_SUMMARIZE = true; const DEFAULT_MAX_TEXT_LENGTH = 4096; -const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io"; const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE"; @@ -138,7 +142,7 @@ type TtsUserPrefs = { }; }; -type ResolvedTtsModelOverrides = { +export type ResolvedTtsModelOverrides = { enabled: boolean; allowText: boolean; allowProvider: boolean; @@ -149,7 +153,7 @@ type ResolvedTtsModelOverrides = { allowSeed: boolean; }; -type TtsDirectiveOverrides = { +export type TtsDirectiveOverrides = { ttsText?: string; provider?: TtsProvider; openai?: { @@ -166,7 +170,7 @@ type TtsDirectiveOverrides = { }; }; -type TtsDirectiveParseResult = { +export type TtsDirectiveParseResult = { cleanedText: string; ttsText?: string; hasDirective: boolean; @@ -515,655 +519,6 @@ export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: Tts return Boolean(resolveTtsApiKey(config, provider)); } -function isValidVoiceId(voiceId: string): boolean { - return /^[a-zA-Z0-9]{10,40}$/.test(voiceId); -} - -function normalizeElevenLabsBaseUrl(baseUrl: string): string { - const trimmed = baseUrl.trim(); - if (!trimmed) { - return DEFAULT_ELEVENLABS_BASE_URL; - } - return trimmed.replace(/\/+$/, ""); -} - -function requireInRange(value: number, min: number, max: number, label: string): void { - if (!Number.isFinite(value) || value < min || value > max) { - throw new Error(`${label} must be between ${min} and ${max}`); - } -} - -function assertElevenLabsVoiceSettings(settings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]) { - requireInRange(settings.stability, 0, 1, "stability"); - requireInRange(settings.similarityBoost, 0, 1, "similarityBoost"); - requireInRange(settings.style, 0, 1, "style"); - requireInRange(settings.speed, 0.5, 2, "speed"); -} - -function normalizeLanguageCode(code?: string): string | undefined { - const trimmed = code?.trim(); - if (!trimmed) { - return undefined; - } - const normalized = trimmed.toLowerCase(); - if (!/^[a-z]{2}$/.test(normalized)) { - throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)"); - } - return normalized; -} - -function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined { - const trimmed = mode?.trim(); - if (!trimmed) { - return undefined; - } - const normalized = trimmed.toLowerCase(); - if (normalized === "auto" || normalized === "on" || normalized === "off") { - return normalized; - } - throw new Error("applyTextNormalization must be one of: auto, on, off"); -} - -function normalizeSeed(seed?: number): number | undefined { - if (seed == null) { - return undefined; - } - const next = Math.floor(seed); - if (!Number.isFinite(next) || next < 0 || next > 4_294_967_295) { - throw new Error("seed must be between 0 and 4294967295"); - } - return next; -} - -function parseBooleanValue(value: string): boolean | undefined { - const normalized = value.trim().toLowerCase(); - if (["true", "1", "yes", "on"].includes(normalized)) { - return true; - } - if (["false", "0", "no", "off"].includes(normalized)) { - return false; - } - return undefined; -} - -function parseNumberValue(value: string): number | undefined { - const parsed = Number.parseFloat(value); - return Number.isFinite(parsed) ? parsed : undefined; -} - -function parseTtsDirectives( - text: string, - policy: ResolvedTtsModelOverrides, -): TtsDirectiveParseResult { - if (!policy.enabled) { - return { cleanedText: text, overrides: {}, warnings: [], hasDirective: false }; - } - - const overrides: TtsDirectiveOverrides = {}; - const warnings: string[] = []; - let cleanedText = text; - let hasDirective = false; - - const blockRegex = /\[\[tts:text\]\]([\s\S]*?)\[\[\/tts:text\]\]/gi; - cleanedText = cleanedText.replace(blockRegex, (_match, inner: string) => { - hasDirective = true; - if (policy.allowText && overrides.ttsText == null) { - overrides.ttsText = inner.trim(); - } - return ""; - }); - - const directiveRegex = /\[\[tts:([^\]]+)\]\]/gi; - cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => { - hasDirective = true; - const tokens = body.split(/\s+/).filter(Boolean); - for (const token of tokens) { - const eqIndex = token.indexOf("="); - if (eqIndex === -1) { - continue; - } - const rawKey = token.slice(0, eqIndex).trim(); - const rawValue = token.slice(eqIndex + 1).trim(); - if (!rawKey || !rawValue) { - continue; - } - const key = rawKey.toLowerCase(); - try { - switch (key) { - case "provider": - if (!policy.allowProvider) { - break; - } - if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") { - overrides.provider = rawValue; - } else { - warnings.push(`unsupported provider "${rawValue}"`); - } - break; - case "voice": - case "openai_voice": - case "openaivoice": - if (!policy.allowVoice) { - break; - } - if (isValidOpenAIVoice(rawValue)) { - overrides.openai = { ...overrides.openai, voice: rawValue }; - } else { - warnings.push(`invalid OpenAI voice "${rawValue}"`); - } - break; - case "voiceid": - case "voice_id": - case "elevenlabs_voice": - case "elevenlabsvoice": - if (!policy.allowVoice) { - break; - } - if (isValidVoiceId(rawValue)) { - overrides.elevenlabs = { ...overrides.elevenlabs, voiceId: rawValue }; - } else { - warnings.push(`invalid ElevenLabs voiceId "${rawValue}"`); - } - break; - case "model": - case "modelid": - case "model_id": - case "elevenlabs_model": - case "elevenlabsmodel": - case "openai_model": - case "openaimodel": - if (!policy.allowModelId) { - break; - } - if (isValidOpenAIModel(rawValue)) { - overrides.openai = { ...overrides.openai, model: rawValue }; - } else { - overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue }; - } - break; - case "stability": - if (!policy.allowVoiceSettings) { - break; - } - { - const value = parseNumberValue(rawValue); - if (value == null) { - warnings.push("invalid stability value"); - break; - } - requireInRange(value, 0, 1, "stability"); - overrides.elevenlabs = { - ...overrides.elevenlabs, - voiceSettings: { ...overrides.elevenlabs?.voiceSettings, stability: value }, - }; - } - break; - case "similarity": - case "similarityboost": - case "similarity_boost": - if (!policy.allowVoiceSettings) { - break; - } - { - const value = parseNumberValue(rawValue); - if (value == null) { - warnings.push("invalid similarityBoost value"); - break; - } - requireInRange(value, 0, 1, "similarityBoost"); - overrides.elevenlabs = { - ...overrides.elevenlabs, - voiceSettings: { ...overrides.elevenlabs?.voiceSettings, similarityBoost: value }, - }; - } - break; - case "style": - if (!policy.allowVoiceSettings) { - break; - } - { - const value = parseNumberValue(rawValue); - if (value == null) { - warnings.push("invalid style value"); - break; - } - requireInRange(value, 0, 1, "style"); - overrides.elevenlabs = { - ...overrides.elevenlabs, - voiceSettings: { ...overrides.elevenlabs?.voiceSettings, style: value }, - }; - } - break; - case "speed": - if (!policy.allowVoiceSettings) { - break; - } - { - const value = parseNumberValue(rawValue); - if (value == null) { - warnings.push("invalid speed value"); - break; - } - requireInRange(value, 0.5, 2, "speed"); - overrides.elevenlabs = { - ...overrides.elevenlabs, - voiceSettings: { ...overrides.elevenlabs?.voiceSettings, speed: value }, - }; - } - break; - case "speakerboost": - case "speaker_boost": - case "usespeakerboost": - case "use_speaker_boost": - if (!policy.allowVoiceSettings) { - break; - } - { - const value = parseBooleanValue(rawValue); - if (value == null) { - warnings.push("invalid useSpeakerBoost value"); - break; - } - overrides.elevenlabs = { - ...overrides.elevenlabs, - voiceSettings: { ...overrides.elevenlabs?.voiceSettings, useSpeakerBoost: value }, - }; - } - break; - case "normalize": - case "applytextnormalization": - case "apply_text_normalization": - if (!policy.allowNormalization) { - break; - } - overrides.elevenlabs = { - ...overrides.elevenlabs, - applyTextNormalization: normalizeApplyTextNormalization(rawValue), - }; - break; - case "language": - case "languagecode": - case "language_code": - if (!policy.allowNormalization) { - break; - } - overrides.elevenlabs = { - ...overrides.elevenlabs, - languageCode: normalizeLanguageCode(rawValue), - }; - break; - case "seed": - if (!policy.allowSeed) { - break; - } - overrides.elevenlabs = { - ...overrides.elevenlabs, - seed: normalizeSeed(Number.parseInt(rawValue, 10)), - }; - break; - default: - break; - } - } catch (err) { - warnings.push((err as Error).message); - } - } - return ""; - }); - - return { - cleanedText, - ttsText: overrides.ttsText, - hasDirective, - overrides, - warnings, - }; -} - -export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const; - -/** - * Custom OpenAI-compatible TTS endpoint. - * When set, model/voice validation is relaxed to allow non-OpenAI models. - * Example: OPENAI_TTS_BASE_URL=http://localhost:8880/v1 - * - * Note: Read at runtime (not module load) to support config.env loading. - */ -function getOpenAITtsBaseUrl(): string { - return (process.env.OPENAI_TTS_BASE_URL?.trim() || "https://api.openai.com/v1").replace( - /\/+$/, - "", - ); -} - -function isCustomOpenAIEndpoint(): boolean { - return getOpenAITtsBaseUrl() !== "https://api.openai.com/v1"; -} -export const OPENAI_TTS_VOICES = [ - "alloy", - "ash", - "ballad", - "cedar", - "coral", - "echo", - "fable", - "juniper", - "marin", - "onyx", - "nova", - "sage", - "shimmer", - "verse", -] as const; - -type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number]; - -function isValidOpenAIModel(model: string): boolean { - // Allow any model when using custom endpoint (e.g., Kokoro, LocalAI) - if (isCustomOpenAIEndpoint()) { - return true; - } - return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]); -} - -function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice { - // Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices) - if (isCustomOpenAIEndpoint()) { - return true; - } - return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice); -} - -type SummarizeResult = { - summary: string; - latencyMs: number; - inputLength: number; - outputLength: number; -}; - -type SummaryModelSelection = { - ref: ModelRef; - source: "summaryModel" | "default"; -}; - -function resolveSummaryModelRef( - cfg: OpenClawConfig, - config: ResolvedTtsConfig, -): SummaryModelSelection { - const defaultRef = resolveDefaultModelForAgent({ cfg }); - const override = config.summaryModel?.trim(); - if (!override) { - return { ref: defaultRef, source: "default" }; - } - - const aliasIndex = buildModelAliasIndex({ cfg, defaultProvider: defaultRef.provider }); - const resolved = resolveModelRefFromString({ - raw: override, - defaultProvider: defaultRef.provider, - aliasIndex, - }); - if (!resolved) { - return { ref: defaultRef, source: "default" }; - } - return { ref: resolved.ref, source: "summaryModel" }; -} - -function isTextContentBlock(block: { type: string }): block is TextContent { - return block.type === "text"; -} - -async function summarizeText(params: { - text: string; - targetLength: number; - cfg: OpenClawConfig; - config: ResolvedTtsConfig; - timeoutMs: number; -}): Promise { - const { text, targetLength, cfg, config, timeoutMs } = params; - if (targetLength < 100 || targetLength > 10_000) { - throw new Error(`Invalid targetLength: ${targetLength}`); - } - - const startTime = Date.now(); - const { ref } = resolveSummaryModelRef(cfg, config); - const resolved = resolveModel(ref.provider, ref.model, undefined, cfg); - if (!resolved.model) { - throw new Error(resolved.error ?? `Unknown summary model: ${ref.provider}/${ref.model}`); - } - const apiKey = requireApiKey( - await getApiKeyForModel({ model: resolved.model, cfg }), - ref.provider, - ); - - try { - const controller = new AbortController(); - const timeout = setTimeout(controller.abort.bind(controller), timeoutMs); - - try { - const res = await completeSimple( - resolved.model, - { - messages: [ - { - role: "user", - content: - `You are an assistant that summarizes texts concisely while keeping the most important information. ` + - `Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style. ` + - `Reply only with the summary, without additional explanations.\n\n` + - `\n${text}\n`, - timestamp: Date.now(), - }, - ], - }, - { - apiKey, - maxTokens: Math.ceil(targetLength / 2), - temperature: 0.3, - signal: controller.signal, - }, - ); - - const summary = res.content - .filter(isTextContentBlock) - .map((block) => block.text.trim()) - .filter(Boolean) - .join(" ") - .trim(); - - if (!summary) { - throw new Error("No summary returned"); - } - - return { - summary, - latencyMs: Date.now() - startTime, - inputLength: text.length, - outputLength: summary.length, - }; - } finally { - clearTimeout(timeout); - } - } catch (err) { - const error = err as Error; - if (error.name === "AbortError") { - throw new Error("Summarization timed out", { cause: err }); - } - throw err; - } -} - -function scheduleCleanup(tempDir: string, delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS): void { - const timer = setTimeout(() => { - try { - rmSync(tempDir, { recursive: true, force: true }); - } catch { - // ignore cleanup errors - } - }, delayMs); - timer.unref(); -} - -async function elevenLabsTTS(params: { - text: string; - apiKey: string; - baseUrl: string; - voiceId: string; - modelId: string; - outputFormat: string; - seed?: number; - applyTextNormalization?: "auto" | "on" | "off"; - languageCode?: string; - voiceSettings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]; - timeoutMs: number; -}): Promise { - const { - text, - apiKey, - baseUrl, - voiceId, - modelId, - outputFormat, - seed, - applyTextNormalization, - languageCode, - voiceSettings, - timeoutMs, - } = params; - if (!isValidVoiceId(voiceId)) { - throw new Error("Invalid voiceId format"); - } - assertElevenLabsVoiceSettings(voiceSettings); - const normalizedLanguage = normalizeLanguageCode(languageCode); - const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization); - const normalizedSeed = normalizeSeed(seed); - - const controller = new AbortController(); - const timeout = setTimeout(controller.abort.bind(controller), timeoutMs); - - try { - const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`); - if (outputFormat) { - url.searchParams.set("output_format", outputFormat); - } - - const response = await fetch(url.toString(), { - method: "POST", - headers: { - "xi-api-key": apiKey, - "Content-Type": "application/json", - Accept: "audio/mpeg", - }, - body: JSON.stringify({ - text, - model_id: modelId, - seed: normalizedSeed, - apply_text_normalization: normalizedNormalization, - language_code: normalizedLanguage, - voice_settings: { - stability: voiceSettings.stability, - similarity_boost: voiceSettings.similarityBoost, - style: voiceSettings.style, - use_speaker_boost: voiceSettings.useSpeakerBoost, - speed: voiceSettings.speed, - }, - }), - signal: controller.signal, - }); - - if (!response.ok) { - throw new Error(`ElevenLabs API error (${response.status})`); - } - - return Buffer.from(await response.arrayBuffer()); - } finally { - clearTimeout(timeout); - } -} - -async function openaiTTS(params: { - text: string; - apiKey: string; - model: string; - voice: string; - responseFormat: "mp3" | "opus" | "pcm"; - timeoutMs: number; -}): Promise { - const { text, apiKey, model, voice, responseFormat, timeoutMs } = params; - - if (!isValidOpenAIModel(model)) { - throw new Error(`Invalid model: ${model}`); - } - if (!isValidOpenAIVoice(voice)) { - throw new Error(`Invalid voice: ${voice}`); - } - - const controller = new AbortController(); - const timeout = setTimeout(controller.abort.bind(controller), timeoutMs); - - try { - const response = await fetch(`${getOpenAITtsBaseUrl()}/audio/speech`, { - method: "POST", - headers: { - Authorization: `Bearer ${apiKey}`, - "Content-Type": "application/json", - }, - body: JSON.stringify({ - model, - input: text, - voice, - response_format: responseFormat, - }), - signal: controller.signal, - }); - - if (!response.ok) { - throw new Error(`OpenAI TTS API error (${response.status})`); - } - - return Buffer.from(await response.arrayBuffer()); - } finally { - clearTimeout(timeout); - } -} - -function inferEdgeExtension(outputFormat: string): string { - const normalized = outputFormat.toLowerCase(); - if (normalized.includes("webm")) { - return ".webm"; - } - if (normalized.includes("ogg")) { - return ".ogg"; - } - if (normalized.includes("opus")) { - return ".opus"; - } - if (normalized.includes("wav") || normalized.includes("riff") || normalized.includes("pcm")) { - return ".wav"; - } - return ".mp3"; -} - -async function edgeTTS(params: { - text: string; - outputPath: string; - config: ResolvedTtsConfig["edge"]; - timeoutMs: number; -}): Promise { - const { text, outputPath, config, timeoutMs } = params; - const tts = new EdgeTTS({ - voice: config.voice, - lang: config.lang, - outputFormat: config.outputFormat, - saveSubtitles: config.saveSubtitles, - proxy: config.proxy, - rate: config.rate, - pitch: config.pitch, - volume: config.volume, - timeout: config.timeoutMs ?? timeoutMs, - }); - await tts.ttsPromise(text, outputPath); -} - export async function textToSpeech(params: { text: string; cfg: OpenClawConfig;