refactor(tts): extract directives and provider core

Peter Steinberger
2026-02-13 17:44:22 +00:00
parent 83bc73f4ea
commit 3f5e72835e
2 changed files with 691 additions and 663 deletions

src/tts/tts-core.ts (new file, 673 additions)

@@ -0,0 +1,673 @@
import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
import { EdgeTTS } from "node-edge-tts";
import { rmSync } from "node:fs";
import type { OpenClawConfig } from "../config/config.js";
import type {
ResolvedTtsConfig,
ResolvedTtsModelOverrides,
TtsDirectiveOverrides,
TtsDirectiveParseResult,
} from "./tts.js";
import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js";
import {
buildModelAliasIndex,
resolveDefaultModelForAgent,
resolveModelRefFromString,
type ModelRef,
} from "../agents/model-selection.js";
import { resolveModel } from "../agents/pi-embedded-runner/model.js";
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
export function isValidVoiceId(voiceId: string): boolean {
return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
}
function normalizeElevenLabsBaseUrl(baseUrl: string): string {
const trimmed = baseUrl.trim();
if (!trimmed) {
return DEFAULT_ELEVENLABS_BASE_URL;
}
return trimmed.replace(/\/+$/, "");
}
function requireInRange(value: number, min: number, max: number, label: string): void {
if (!Number.isFinite(value) || value < min || value > max) {
throw new Error(`${label} must be between ${min} and ${max}`);
}
}
function assertElevenLabsVoiceSettings(settings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]) {
requireInRange(settings.stability, 0, 1, "stability");
requireInRange(settings.similarityBoost, 0, 1, "similarityBoost");
requireInRange(settings.style, 0, 1, "style");
requireInRange(settings.speed, 0.5, 2, "speed");
}
function normalizeLanguageCode(code?: string): string | undefined {
const trimmed = code?.trim();
if (!trimmed) {
return undefined;
}
const normalized = trimmed.toLowerCase();
if (!/^[a-z]{2}$/.test(normalized)) {
throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)");
}
return normalized;
}
function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined {
const trimmed = mode?.trim();
if (!trimmed) {
return undefined;
}
const normalized = trimmed.toLowerCase();
if (normalized === "auto" || normalized === "on" || normalized === "off") {
return normalized;
}
throw new Error("applyTextNormalization must be one of: auto, on, off");
}
function normalizeSeed(seed?: number): number | undefined {
if (seed == null) {
return undefined;
}
const next = Math.floor(seed);
if (!Number.isFinite(next) || next < 0 || next > 4_294_967_295) {
throw new Error("seed must be between 0 and 4294967295");
}
return next;
}
function parseBooleanValue(value: string): boolean | undefined {
const normalized = value.trim().toLowerCase();
if (["true", "1", "yes", "on"].includes(normalized)) {
return true;
}
if (["false", "0", "no", "off"].includes(normalized)) {
return false;
}
return undefined;
}
function parseNumberValue(value: string): number | undefined {
const parsed = Number.parseFloat(value);
return Number.isFinite(parsed) ? parsed : undefined;
}
export function parseTtsDirectives(
text: string,
policy: ResolvedTtsModelOverrides,
): TtsDirectiveParseResult {
if (!policy.enabled) {
return { cleanedText: text, overrides: {}, warnings: [], hasDirective: false };
}
const overrides: TtsDirectiveOverrides = {};
const warnings: string[] = [];
let cleanedText = text;
let hasDirective = false;
const blockRegex = /\[\[tts:text\]\]([\s\S]*?)\[\[\/tts:text\]\]/gi;
cleanedText = cleanedText.replace(blockRegex, (_match, inner: string) => {
hasDirective = true;
if (policy.allowText && overrides.ttsText == null) {
overrides.ttsText = inner.trim();
}
return "";
});
const directiveRegex = /\[\[tts:([^\]]+)\]\]/gi;
cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => {
hasDirective = true;
const tokens = body.split(/\s+/).filter(Boolean);
for (const token of tokens) {
const eqIndex = token.indexOf("=");
if (eqIndex === -1) {
continue;
}
const rawKey = token.slice(0, eqIndex).trim();
const rawValue = token.slice(eqIndex + 1).trim();
if (!rawKey || !rawValue) {
continue;
}
const key = rawKey.toLowerCase();
try {
switch (key) {
case "provider":
if (!policy.allowProvider) {
break;
}
if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
overrides.provider = rawValue;
} else {
warnings.push(`unsupported provider "${rawValue}"`);
}
break;
case "voice":
case "openai_voice":
case "openaivoice":
if (!policy.allowVoice) {
break;
}
if (isValidOpenAIVoice(rawValue)) {
overrides.openai = { ...overrides.openai, voice: rawValue };
} else {
warnings.push(`invalid OpenAI voice "${rawValue}"`);
}
break;
case "voiceid":
case "voice_id":
case "elevenlabs_voice":
case "elevenlabsvoice":
if (!policy.allowVoice) {
break;
}
if (isValidVoiceId(rawValue)) {
overrides.elevenlabs = { ...overrides.elevenlabs, voiceId: rawValue };
} else {
warnings.push(`invalid ElevenLabs voiceId "${rawValue}"`);
}
break;
case "model":
case "modelid":
case "model_id":
case "elevenlabs_model":
case "elevenlabsmodel":
case "openai_model":
case "openaimodel":
if (!policy.allowModelId) {
break;
}
if (isValidOpenAIModel(rawValue)) {
overrides.openai = { ...overrides.openai, model: rawValue };
} else {
overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue };
}
break;
case "stability":
if (!policy.allowVoiceSettings) {
break;
}
{
const value = parseNumberValue(rawValue);
if (value == null) {
warnings.push("invalid stability value");
break;
}
requireInRange(value, 0, 1, "stability");
overrides.elevenlabs = {
...overrides.elevenlabs,
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, stability: value },
};
}
break;
case "similarity":
case "similarityboost":
case "similarity_boost":
if (!policy.allowVoiceSettings) {
break;
}
{
const value = parseNumberValue(rawValue);
if (value == null) {
warnings.push("invalid similarityBoost value");
break;
}
requireInRange(value, 0, 1, "similarityBoost");
overrides.elevenlabs = {
...overrides.elevenlabs,
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, similarityBoost: value },
};
}
break;
case "style":
if (!policy.allowVoiceSettings) {
break;
}
{
const value = parseNumberValue(rawValue);
if (value == null) {
warnings.push("invalid style value");
break;
}
requireInRange(value, 0, 1, "style");
overrides.elevenlabs = {
...overrides.elevenlabs,
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, style: value },
};
}
break;
case "speed":
if (!policy.allowVoiceSettings) {
break;
}
{
const value = parseNumberValue(rawValue);
if (value == null) {
warnings.push("invalid speed value");
break;
}
requireInRange(value, 0.5, 2, "speed");
overrides.elevenlabs = {
...overrides.elevenlabs,
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, speed: value },
};
}
break;
case "speakerboost":
case "speaker_boost":
case "usespeakerboost":
case "use_speaker_boost":
if (!policy.allowVoiceSettings) {
break;
}
{
const value = parseBooleanValue(rawValue);
if (value == null) {
warnings.push("invalid useSpeakerBoost value");
break;
}
overrides.elevenlabs = {
...overrides.elevenlabs,
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, useSpeakerBoost: value },
};
}
break;
case "normalize":
case "applytextnormalization":
case "apply_text_normalization":
if (!policy.allowNormalization) {
break;
}
overrides.elevenlabs = {
...overrides.elevenlabs,
applyTextNormalization: normalizeApplyTextNormalization(rawValue),
};
break;
case "language":
case "languagecode":
case "language_code":
if (!policy.allowNormalization) {
break;
}
overrides.elevenlabs = {
...overrides.elevenlabs,
languageCode: normalizeLanguageCode(rawValue),
};
break;
case "seed":
if (!policy.allowSeed) {
break;
}
overrides.elevenlabs = {
...overrides.elevenlabs,
seed: normalizeSeed(Number.parseInt(rawValue, 10)),
};
break;
default:
break;
}
} catch (err) {
warnings.push((err as Error).message);
}
}
return "";
});
return {
cleanedText,
ttsText: overrides.ttsText,
hasDirective,
overrides,
warnings,
};
}
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const;
/**
* Custom OpenAI-compatible TTS endpoint.
* When set, model/voice validation is relaxed to allow non-OpenAI models.
* Example: OPENAI_TTS_BASE_URL=http://localhost:8880/v1
*
* Note: Read at runtime (not module load) to support config.env loading.
*/
function getOpenAITtsBaseUrl(): string {
return (process.env.OPENAI_TTS_BASE_URL?.trim() || "https://api.openai.com/v1").replace(
/\/+$/,
"",
);
}
function isCustomOpenAIEndpoint(): boolean {
return getOpenAITtsBaseUrl() !== "https://api.openai.com/v1";
}
export const OPENAI_TTS_VOICES = [
"alloy",
"ash",
"ballad",
"cedar",
"coral",
"echo",
"fable",
"juniper",
"marin",
"onyx",
"nova",
"sage",
"shimmer",
"verse",
] as const;
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
export function isValidOpenAIModel(model: string): boolean {
// Allow any model when using custom endpoint (e.g., Kokoro, LocalAI)
if (isCustomOpenAIEndpoint()) {
return true;
}
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
}
export function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice {
// Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices)
if (isCustomOpenAIEndpoint()) {
return true;
}
return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
}
type SummarizeResult = {
summary: string;
latencyMs: number;
inputLength: number;
outputLength: number;
};
type SummaryModelSelection = {
ref: ModelRef;
source: "summaryModel" | "default";
};
function resolveSummaryModelRef(
cfg: OpenClawConfig,
config: ResolvedTtsConfig,
): SummaryModelSelection {
const defaultRef = resolveDefaultModelForAgent({ cfg });
const override = config.summaryModel?.trim();
if (!override) {
return { ref: defaultRef, source: "default" };
}
const aliasIndex = buildModelAliasIndex({ cfg, defaultProvider: defaultRef.provider });
const resolved = resolveModelRefFromString({
raw: override,
defaultProvider: defaultRef.provider,
aliasIndex,
});
if (!resolved) {
return { ref: defaultRef, source: "default" };
}
return { ref: resolved.ref, source: "summaryModel" };
}
function isTextContentBlock(block: { type: string }): block is TextContent {
return block.type === "text";
}
export async function summarizeText(params: {
text: string;
targetLength: number;
cfg: OpenClawConfig;
config: ResolvedTtsConfig;
timeoutMs: number;
}): Promise<SummarizeResult> {
const { text, targetLength, cfg, config, timeoutMs } = params;
if (targetLength < 100 || targetLength > 10_000) {
throw new Error(`Invalid targetLength: ${targetLength}`);
}
const startTime = Date.now();
const { ref } = resolveSummaryModelRef(cfg, config);
const resolved = resolveModel(ref.provider, ref.model, undefined, cfg);
if (!resolved.model) {
throw new Error(resolved.error ?? `Unknown summary model: ${ref.provider}/${ref.model}`);
}
const apiKey = requireApiKey(
await getApiKeyForModel({ model: resolved.model, cfg }),
ref.provider,
);
try {
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs);
try {
const res = await completeSimple(
resolved.model,
{
messages: [
{
role: "user",
content:
`You are an assistant that summarizes texts concisely while keeping the most important information. ` +
`Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style. ` +
`Reply only with the summary, without additional explanations.\n\n` +
`<text_to_summarize>\n${text}\n</text_to_summarize>`,
timestamp: Date.now(),
},
],
},
{
apiKey,
maxTokens: Math.ceil(targetLength / 2),
temperature: 0.3,
signal: controller.signal,
},
);
const summary = res.content
.filter(isTextContentBlock)
.map((block) => block.text.trim())
.filter(Boolean)
.join(" ")
.trim();
if (!summary) {
throw new Error("No summary returned");
}
return {
summary,
latencyMs: Date.now() - startTime,
inputLength: text.length,
outputLength: summary.length,
};
} finally {
clearTimeout(timeout);
}
} catch (err) {
const error = err as Error;
if (error.name === "AbortError") {
throw new Error("Summarization timed out", { cause: err });
}
throw err;
}
}
export function scheduleCleanup(
tempDir: string,
delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS,
): void {
const timer = setTimeout(() => {
try {
rmSync(tempDir, { recursive: true, force: true });
} catch {
// ignore cleanup errors
}
}, delayMs);
timer.unref();
}
export async function elevenLabsTTS(params: {
text: string;
apiKey: string;
baseUrl: string;
voiceId: string;
modelId: string;
outputFormat: string;
seed?: number;
applyTextNormalization?: "auto" | "on" | "off";
languageCode?: string;
voiceSettings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"];
timeoutMs: number;
}): Promise<Buffer> {
const {
text,
apiKey,
baseUrl,
voiceId,
modelId,
outputFormat,
seed,
applyTextNormalization,
languageCode,
voiceSettings,
timeoutMs,
} = params;
if (!isValidVoiceId(voiceId)) {
throw new Error("Invalid voiceId format");
}
assertElevenLabsVoiceSettings(voiceSettings);
const normalizedLanguage = normalizeLanguageCode(languageCode);
const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization);
const normalizedSeed = normalizeSeed(seed);
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs);
try {
const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`);
if (outputFormat) {
url.searchParams.set("output_format", outputFormat);
}
const response = await fetch(url.toString(), {
method: "POST",
headers: {
"xi-api-key": apiKey,
"Content-Type": "application/json",
Accept: "audio/mpeg",
},
body: JSON.stringify({
text,
model_id: modelId,
seed: normalizedSeed,
apply_text_normalization: normalizedNormalization,
language_code: normalizedLanguage,
voice_settings: {
stability: voiceSettings.stability,
similarity_boost: voiceSettings.similarityBoost,
style: voiceSettings.style,
use_speaker_boost: voiceSettings.useSpeakerBoost,
speed: voiceSettings.speed,
},
}),
signal: controller.signal,
});
if (!response.ok) {
throw new Error(`ElevenLabs API error (${response.status})`);
}
return Buffer.from(await response.arrayBuffer());
} finally {
clearTimeout(timeout);
}
}
export async function openaiTTS(params: {
text: string;
apiKey: string;
model: string;
voice: string;
responseFormat: "mp3" | "opus" | "pcm";
timeoutMs: number;
}): Promise<Buffer> {
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
if (!isValidOpenAIModel(model)) {
throw new Error(`Invalid model: ${model}`);
}
if (!isValidOpenAIVoice(voice)) {
throw new Error(`Invalid voice: ${voice}`);
}
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), timeoutMs);
try {
const response = await fetch(`${getOpenAITtsBaseUrl()}/audio/speech`, {
method: "POST",
headers: {
Authorization: `Bearer ${apiKey}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
model,
input: text,
voice,
response_format: responseFormat,
}),
signal: controller.signal,
});
if (!response.ok) {
throw new Error(`OpenAI TTS API error (${response.status})`);
}
return Buffer.from(await response.arrayBuffer());
} finally {
clearTimeout(timeout);
}
}
export function inferEdgeExtension(outputFormat: string): string {
const normalized = outputFormat.toLowerCase();
if (normalized.includes("webm")) {
return ".webm";
}
if (normalized.includes("ogg")) {
return ".ogg";
}
if (normalized.includes("opus")) {
return ".opus";
}
if (normalized.includes("wav") || normalized.includes("riff") || normalized.includes("pcm")) {
return ".wav";
}
return ".mp3";
}
export async function edgeTTS(params: {
text: string;
outputPath: string;
config: ResolvedTtsConfig["edge"];
timeoutMs: number;
}): Promise<void> {
const { text, outputPath, config, timeoutMs } = params;
const tts = new EdgeTTS({
voice: config.voice,
lang: config.lang,
outputFormat: config.outputFormat,
saveSubtitles: config.saveSubtitles,
proxy: config.proxy,
rate: config.rate,
pitch: config.pitch,
volume: config.volume,
timeout: config.timeoutMs ?? timeoutMs,
});
await tts.ttsPromise(text, outputPath);
}

src/tts/tts.ts

@@ -1,5 +1,3 @@
import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
import { EdgeTTS } from "node-edge-tts";
import {
existsSync,
mkdirSync,
@@ -22,25 +20,31 @@ import type {
TtsProvider,
TtsModelOverrideConfig,
} from "../config/types.tts.js";
import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js";
import {
buildModelAliasIndex,
resolveDefaultModelForAgent,
resolveModelRefFromString,
type ModelRef,
} from "../agents/model-selection.js";
import { resolveModel } from "../agents/pi-embedded-runner/model.js";
import { normalizeChannelId } from "../channels/plugins/index.js";
import { logVerbose } from "../globals.js";
import { stripMarkdown } from "../line/markdown-to-line.js";
import { isVoiceCompatibleAudio } from "../media/audio.js";
import { CONFIG_DIR, resolveUserPath } from "../utils.js";
import {
edgeTTS,
elevenLabsTTS,
inferEdgeExtension,
isValidOpenAIModel,
isValidOpenAIVoice,
isValidVoiceId,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
openaiTTS,
parseTtsDirectives,
scheduleCleanup,
summarizeText,
} from "./tts-core.js";
export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "./tts-core.js";
const DEFAULT_TIMEOUT_MS = 30_000;
const DEFAULT_TTS_MAX_LENGTH = 1500;
const DEFAULT_TTS_SUMMARIZE = true;
const DEFAULT_MAX_TEXT_LENGTH = 4096;
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
@@ -138,7 +142,7 @@ type TtsUserPrefs = {
};
};
type ResolvedTtsModelOverrides = {
export type ResolvedTtsModelOverrides = {
enabled: boolean;
allowText: boolean;
allowProvider: boolean;
@@ -149,7 +153,7 @@ type ResolvedTtsModelOverrides = {
allowSeed: boolean;
};
type TtsDirectiveOverrides = {
export type TtsDirectiveOverrides = {
ttsText?: string;
provider?: TtsProvider;
openai?: {
@@ -166,7 +170,7 @@ type TtsDirectiveOverrides = {
};
};
type TtsDirectiveParseResult = {
export type TtsDirectiveParseResult = {
cleanedText: string;
ttsText?: string;
hasDirective: boolean;
@@ -515,655 +519,6 @@ export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: Tts
return Boolean(resolveTtsApiKey(config, provider));
}
function isValidVoiceId(voiceId: string): boolean {
return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
}
function normalizeElevenLabsBaseUrl(baseUrl: string): string {
const trimmed = baseUrl.trim();
if (!trimmed) {
return DEFAULT_ELEVENLABS_BASE_URL;
}
return trimmed.replace(/\/+$/, "");
}
function requireInRange(value: number, min: number, max: number, label: string): void {
if (!Number.isFinite(value) || value < min || value > max) {
throw new Error(`${label} must be between ${min} and ${max}`);
}
}
function assertElevenLabsVoiceSettings(settings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]) {
requireInRange(settings.stability, 0, 1, "stability");
requireInRange(settings.similarityBoost, 0, 1, "similarityBoost");
requireInRange(settings.style, 0, 1, "style");
requireInRange(settings.speed, 0.5, 2, "speed");
}
function normalizeLanguageCode(code?: string): string | undefined {
const trimmed = code?.trim();
if (!trimmed) {
return undefined;
}
const normalized = trimmed.toLowerCase();
if (!/^[a-z]{2}$/.test(normalized)) {
throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)");
}
return normalized;
}
function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined {
const trimmed = mode?.trim();
if (!trimmed) {
return undefined;
}
const normalized = trimmed.toLowerCase();
if (normalized === "auto" || normalized === "on" || normalized === "off") {
return normalized;
}
throw new Error("applyTextNormalization must be one of: auto, on, off");
}
function normalizeSeed(seed?: number): number | undefined {
if (seed == null) {
return undefined;
}
const next = Math.floor(seed);
if (!Number.isFinite(next) || next < 0 || next > 4_294_967_295) {
throw new Error("seed must be between 0 and 4294967295");
}
return next;
}
function parseBooleanValue(value: string): boolean | undefined {
const normalized = value.trim().toLowerCase();
if (["true", "1", "yes", "on"].includes(normalized)) {
return true;
}
if (["false", "0", "no", "off"].includes(normalized)) {
return false;
}
return undefined;
}
function parseNumberValue(value: string): number | undefined {
const parsed = Number.parseFloat(value);
return Number.isFinite(parsed) ? parsed : undefined;
}
function parseTtsDirectives(
text: string,
policy: ResolvedTtsModelOverrides,
): TtsDirectiveParseResult {
if (!policy.enabled) {
return { cleanedText: text, overrides: {}, warnings: [], hasDirective: false };
}
const overrides: TtsDirectiveOverrides = {};
const warnings: string[] = [];
let cleanedText = text;
let hasDirective = false;
const blockRegex = /\[\[tts:text\]\]([\s\S]*?)\[\[\/tts:text\]\]/gi;
cleanedText = cleanedText.replace(blockRegex, (_match, inner: string) => {
hasDirective = true;
if (policy.allowText && overrides.ttsText == null) {
overrides.ttsText = inner.trim();
}
return "";
});
const directiveRegex = /\[\[tts:([^\]]+)\]\]/gi;
cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => {
hasDirective = true;
const tokens = body.split(/\s+/).filter(Boolean);
for (const token of tokens) {
const eqIndex = token.indexOf("=");
if (eqIndex === -1) {
continue;
}
const rawKey = token.slice(0, eqIndex).trim();
const rawValue = token.slice(eqIndex + 1).trim();
if (!rawKey || !rawValue) {
continue;
}
const key = rawKey.toLowerCase();
try {
switch (key) {
case "provider":
if (!policy.allowProvider) {
break;
}
if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
overrides.provider = rawValue;
} else {
warnings.push(`unsupported provider "${rawValue}"`);
}
break;
case "voice":
case "openai_voice":
case "openaivoice":
if (!policy.allowVoice) {
break;
}
if (isValidOpenAIVoice(rawValue)) {
overrides.openai = { ...overrides.openai, voice: rawValue };
} else {
warnings.push(`invalid OpenAI voice "${rawValue}"`);
}
break;
case "voiceid":
case "voice_id":
case "elevenlabs_voice":
case "elevenlabsvoice":
if (!policy.allowVoice) {
break;
}
if (isValidVoiceId(rawValue)) {
overrides.elevenlabs = { ...overrides.elevenlabs, voiceId: rawValue };
} else {
warnings.push(`invalid ElevenLabs voiceId "${rawValue}"`);
}
break;
case "model":
case "modelid":
case "model_id":
case "elevenlabs_model":
case "elevenlabsmodel":
case "openai_model":
case "openaimodel":
if (!policy.allowModelId) {
break;
}
if (isValidOpenAIModel(rawValue)) {
overrides.openai = { ...overrides.openai, model: rawValue };
} else {
overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue };
}
break;
case "stability":
if (!policy.allowVoiceSettings) {
break;
}
{
const value = parseNumberValue(rawValue);
if (value == null) {
warnings.push("invalid stability value");
break;
}
requireInRange(value, 0, 1, "stability");
overrides.elevenlabs = {
...overrides.elevenlabs,
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, stability: value },
};
}
break;
case "similarity":
case "similarityboost":
case "similarity_boost":
if (!policy.allowVoiceSettings) {
break;
}
{
const value = parseNumberValue(rawValue);
if (value == null) {
warnings.push("invalid similarityBoost value");
break;
}
requireInRange(value, 0, 1, "similarityBoost");
overrides.elevenlabs = {
...overrides.elevenlabs,
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, similarityBoost: value },
};
}
break;
case "style":
if (!policy.allowVoiceSettings) {
break;
}
{
const value = parseNumberValue(rawValue);
if (value == null) {
warnings.push("invalid style value");
break;
}
requireInRange(value, 0, 1, "style");
overrides.elevenlabs = {
...overrides.elevenlabs,
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, style: value },
};
}
break;
case "speed":
if (!policy.allowVoiceSettings) {
break;
}
{
const value = parseNumberValue(rawValue);
if (value == null) {
warnings.push("invalid speed value");
break;
}
requireInRange(value, 0.5, 2, "speed");
overrides.elevenlabs = {
...overrides.elevenlabs,
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, speed: value },
};
}
break;
case "speakerboost":
case "speaker_boost":
case "usespeakerboost":
case "use_speaker_boost":
if (!policy.allowVoiceSettings) {
break;
}
{
const value = parseBooleanValue(rawValue);
if (value == null) {
warnings.push("invalid useSpeakerBoost value");
break;
}
overrides.elevenlabs = {
...overrides.elevenlabs,
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, useSpeakerBoost: value },
};
}
break;
case "normalize":
case "applytextnormalization":
case "apply_text_normalization":
if (!policy.allowNormalization) {
break;
}
overrides.elevenlabs = {
...overrides.elevenlabs,
applyTextNormalization: normalizeApplyTextNormalization(rawValue),
};
break;
case "language":
case "languagecode":
case "language_code":
if (!policy.allowNormalization) {
break;
}
overrides.elevenlabs = {
...overrides.elevenlabs,
languageCode: normalizeLanguageCode(rawValue),
};
break;
case "seed":
if (!policy.allowSeed) {
break;
}
overrides.elevenlabs = {
...overrides.elevenlabs,
seed: normalizeSeed(Number.parseInt(rawValue, 10)),
};
break;
default:
break;
}
} catch (err) {
warnings.push((err as Error).message);
}
}
return "";
});
return {
cleanedText,
ttsText: overrides.ttsText,
hasDirective,
overrides,
warnings,
};
}
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const;
/**
* Custom OpenAI-compatible TTS endpoint.
* When set, model/voice validation is relaxed to allow non-OpenAI models.
* Example: OPENAI_TTS_BASE_URL=http://localhost:8880/v1
*
* Note: Read at runtime (not module load) to support config.env loading.
*/
function getOpenAITtsBaseUrl(): string {
return (process.env.OPENAI_TTS_BASE_URL?.trim() || "https://api.openai.com/v1").replace(
/\/+$/,
"",
);
}
function isCustomOpenAIEndpoint(): boolean {
return getOpenAITtsBaseUrl() !== "https://api.openai.com/v1";
}
export const OPENAI_TTS_VOICES = [
"alloy",
"ash",
"ballad",
"cedar",
"coral",
"echo",
"fable",
"juniper",
"marin",
"onyx",
"nova",
"sage",
"shimmer",
"verse",
] as const;
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
function isValidOpenAIModel(model: string): boolean {
// Allow any model when using custom endpoint (e.g., Kokoro, LocalAI)
if (isCustomOpenAIEndpoint()) {
return true;
}
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
}
function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice {
// Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices)
if (isCustomOpenAIEndpoint()) {
return true;
}
return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
}
type SummarizeResult = {
summary: string;
latencyMs: number;
inputLength: number;
outputLength: number;
};
type SummaryModelSelection = {
ref: ModelRef;
source: "summaryModel" | "default";
};
function resolveSummaryModelRef(
cfg: OpenClawConfig,
config: ResolvedTtsConfig,
): SummaryModelSelection {
const defaultRef = resolveDefaultModelForAgent({ cfg });
const override = config.summaryModel?.trim();
if (!override) {
return { ref: defaultRef, source: "default" };
}
const aliasIndex = buildModelAliasIndex({ cfg, defaultProvider: defaultRef.provider });
const resolved = resolveModelRefFromString({
raw: override,
defaultProvider: defaultRef.provider,
aliasIndex,
});
if (!resolved) {
return { ref: defaultRef, source: "default" };
}
return { ref: resolved.ref, source: "summaryModel" };
}
function isTextContentBlock(block: { type: string }): block is TextContent {
return block.type === "text";
}
async function summarizeText(params: {
text: string;
targetLength: number;
cfg: OpenClawConfig;
config: ResolvedTtsConfig;
timeoutMs: number;
}): Promise<SummarizeResult> {
const { text, targetLength, cfg, config, timeoutMs } = params;
if (targetLength < 100 || targetLength > 10_000) {
throw new Error(`Invalid targetLength: ${targetLength}`);
}
const startTime = Date.now();
const { ref } = resolveSummaryModelRef(cfg, config);
const resolved = resolveModel(ref.provider, ref.model, undefined, cfg);
if (!resolved.model) {
throw new Error(resolved.error ?? `Unknown summary model: ${ref.provider}/${ref.model}`);
}
const apiKey = requireApiKey(
await getApiKeyForModel({ model: resolved.model, cfg }),
ref.provider,
);
try {
const controller = new AbortController();
const timeout = setTimeout(controller.abort.bind(controller), timeoutMs);
try {
const res = await completeSimple(
resolved.model,
{
messages: [
{
role: "user",
content:
`You are an assistant that summarizes texts concisely while keeping the most important information. ` +
`Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style. ` +
`Reply only with the summary, without additional explanations.\n\n` +
`<text_to_summarize>\n${text}\n</text_to_summarize>`,
timestamp: Date.now(),
},
],
},
{
apiKey,
maxTokens: Math.ceil(targetLength / 2),
temperature: 0.3,
signal: controller.signal,
},
);
const summary = res.content
.filter(isTextContentBlock)
.map((block) => block.text.trim())
.filter(Boolean)
.join(" ")
.trim();
if (!summary) {
throw new Error("No summary returned");
}
return {
summary,
latencyMs: Date.now() - startTime,
inputLength: text.length,
outputLength: summary.length,
};
} finally {
clearTimeout(timeout);
}
} catch (err) {
const error = err as Error;
if (error.name === "AbortError") {
throw new Error("Summarization timed out", { cause: err });
}
throw err;
}
}
function scheduleCleanup(tempDir: string, delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS): void {
const timer = setTimeout(() => {
try {
rmSync(tempDir, { recursive: true, force: true });
} catch {
// ignore cleanup errors
}
}, delayMs);
timer.unref();
}
async function elevenLabsTTS(params: {
text: string;
apiKey: string;
baseUrl: string;
voiceId: string;
modelId: string;
outputFormat: string;
seed?: number;
applyTextNormalization?: "auto" | "on" | "off";
languageCode?: string;
voiceSettings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"];
timeoutMs: number;
}): Promise<Buffer> {
const {
text,
apiKey,
baseUrl,
voiceId,
modelId,
outputFormat,
seed,
applyTextNormalization,
languageCode,
voiceSettings,
timeoutMs,
} = params;
if (!isValidVoiceId(voiceId)) {
throw new Error("Invalid voiceId format");
}
assertElevenLabsVoiceSettings(voiceSettings);
const normalizedLanguage = normalizeLanguageCode(languageCode);
const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization);
const normalizedSeed = normalizeSeed(seed);
const controller = new AbortController();
const timeout = setTimeout(controller.abort.bind(controller), timeoutMs);
try {
const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`);
if (outputFormat) {
url.searchParams.set("output_format", outputFormat);
}
const response = await fetch(url.toString(), {
method: "POST",
headers: {
"xi-api-key": apiKey,
"Content-Type": "application/json",
Accept: "audio/mpeg",
},
body: JSON.stringify({
text,
model_id: modelId,
seed: normalizedSeed,
apply_text_normalization: normalizedNormalization,
language_code: normalizedLanguage,
voice_settings: {
stability: voiceSettings.stability,
similarity_boost: voiceSettings.similarityBoost,
style: voiceSettings.style,
use_speaker_boost: voiceSettings.useSpeakerBoost,
speed: voiceSettings.speed,
},
}),
signal: controller.signal,
});
if (!response.ok) {
throw new Error(`ElevenLabs API error (${response.status})`);
}
return Buffer.from(await response.arrayBuffer());
} finally {
clearTimeout(timeout);
}
}
async function openaiTTS(params: {
text: string;
apiKey: string;
model: string;
voice: string;
responseFormat: "mp3" | "opus" | "pcm";
timeoutMs: number;
}): Promise<Buffer> {
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
if (!isValidOpenAIModel(model)) {
throw new Error(`Invalid model: ${model}`);
}
if (!isValidOpenAIVoice(voice)) {
throw new Error(`Invalid voice: ${voice}`);
}
const controller = new AbortController();
const timeout = setTimeout(controller.abort.bind(controller), timeoutMs);
try {
const response = await fetch(`${getOpenAITtsBaseUrl()}/audio/speech`, {
method: "POST",
headers: {
Authorization: `Bearer ${apiKey}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
model,
input: text,
voice,
response_format: responseFormat,
}),
signal: controller.signal,
});
if (!response.ok) {
throw new Error(`OpenAI TTS API error (${response.status})`);
}
return Buffer.from(await response.arrayBuffer());
} finally {
clearTimeout(timeout);
}
}
function inferEdgeExtension(outputFormat: string): string {
const normalized = outputFormat.toLowerCase();
if (normalized.includes("webm")) {
return ".webm";
}
if (normalized.includes("ogg")) {
return ".ogg";
}
if (normalized.includes("opus")) {
return ".opus";
}
if (normalized.includes("wav") || normalized.includes("riff") || normalized.includes("pcm")) {
return ".wav";
}
return ".mp3";
}
async function edgeTTS(params: {
text: string;
outputPath: string;
config: ResolvedTtsConfig["edge"];
timeoutMs: number;
}): Promise<void> {
const { text, outputPath, config, timeoutMs } = params;
const tts = new EdgeTTS({
voice: config.voice,
lang: config.lang,
outputFormat: config.outputFormat,
saveSubtitles: config.saveSubtitles,
proxy: config.proxy,
rate: config.rate,
pitch: config.pitch,
volume: config.volume,
timeout: config.timeoutMs ?? timeoutMs,
});
await tts.ttsPromise(text, outputPath);
}
export async function textToSpeech(params: {
text: string;
cfg: OpenClawConfig;