refactor(tts): extract directives and provider core
This commit is contained in:
673
src/tts/tts-core.ts
Normal file
673
src/tts/tts-core.ts
Normal file
@@ -0,0 +1,673 @@
|
||||
import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
|
||||
import { EdgeTTS } from "node-edge-tts";
|
||||
import { rmSync } from "node:fs";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import type {
|
||||
ResolvedTtsConfig,
|
||||
ResolvedTtsModelOverrides,
|
||||
TtsDirectiveOverrides,
|
||||
TtsDirectiveParseResult,
|
||||
} from "./tts.js";
|
||||
import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js";
|
||||
import {
|
||||
buildModelAliasIndex,
|
||||
resolveDefaultModelForAgent,
|
||||
resolveModelRefFromString,
|
||||
type ModelRef,
|
||||
} from "../agents/model-selection.js";
|
||||
import { resolveModel } from "../agents/pi-embedded-runner/model.js";
|
||||
|
||||
// Default ElevenLabs API host, used when the configured baseUrl is empty.
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
// Delay before scheduleCleanup() removes a temp audio directory.
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
|
||||
|
||||
export function isValidVoiceId(voiceId: string): boolean {
|
||||
return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
|
||||
}
|
||||
|
||||
function normalizeElevenLabsBaseUrl(baseUrl: string): string {
|
||||
const trimmed = baseUrl.trim();
|
||||
if (!trimmed) {
|
||||
return DEFAULT_ELEVENLABS_BASE_URL;
|
||||
}
|
||||
return trimmed.replace(/\/+$/, "");
|
||||
}
|
||||
|
||||
function requireInRange(value: number, min: number, max: number, label: string): void {
|
||||
if (!Number.isFinite(value) || value < min || value > max) {
|
||||
throw new Error(`${label} must be between ${min} and ${max}`);
|
||||
}
|
||||
}
|
||||
|
||||
function assertElevenLabsVoiceSettings(settings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]) {
|
||||
requireInRange(settings.stability, 0, 1, "stability");
|
||||
requireInRange(settings.similarityBoost, 0, 1, "similarityBoost");
|
||||
requireInRange(settings.style, 0, 1, "style");
|
||||
requireInRange(settings.speed, 0.5, 2, "speed");
|
||||
}
|
||||
|
||||
function normalizeLanguageCode(code?: string): string | undefined {
|
||||
const trimmed = code?.trim();
|
||||
if (!trimmed) {
|
||||
return undefined;
|
||||
}
|
||||
const normalized = trimmed.toLowerCase();
|
||||
if (!/^[a-z]{2}$/.test(normalized)) {
|
||||
throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)");
|
||||
}
|
||||
return normalized;
|
||||
}
|
||||
|
||||
function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined {
|
||||
const trimmed = mode?.trim();
|
||||
if (!trimmed) {
|
||||
return undefined;
|
||||
}
|
||||
const normalized = trimmed.toLowerCase();
|
||||
if (normalized === "auto" || normalized === "on" || normalized === "off") {
|
||||
return normalized;
|
||||
}
|
||||
throw new Error("applyTextNormalization must be one of: auto, on, off");
|
||||
}
|
||||
|
||||
function normalizeSeed(seed?: number): number | undefined {
|
||||
if (seed == null) {
|
||||
return undefined;
|
||||
}
|
||||
const next = Math.floor(seed);
|
||||
if (!Number.isFinite(next) || next < 0 || next > 4_294_967_295) {
|
||||
throw new Error("seed must be between 0 and 4294967295");
|
||||
}
|
||||
return next;
|
||||
}
|
||||
|
||||
function parseBooleanValue(value: string): boolean | undefined {
|
||||
const normalized = value.trim().toLowerCase();
|
||||
if (["true", "1", "yes", "on"].includes(normalized)) {
|
||||
return true;
|
||||
}
|
||||
if (["false", "0", "no", "off"].includes(normalized)) {
|
||||
return false;
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function parseNumberValue(value: string): number | undefined {
|
||||
const parsed = Number.parseFloat(value);
|
||||
return Number.isFinite(parsed) ? parsed : undefined;
|
||||
}
|
||||
|
||||
/**
 * Extract inline `[[tts:...]]` directives from a message.
 *
 * Two forms are recognized:
 *  - `[[tts:text]]...[[/tts:text]]` blocks, whose inner text becomes the
 *    spoken-text override (first block wins) when `policy.allowText` is set;
 *  - `[[tts:key=value key2=value2]]` option directives, parsed token by token.
 *
 * All directives are stripped from the returned `cleanedText`, even when the
 * policy disallows them. Keys blocked by policy are silently dropped; invalid
 * values are reported via `warnings` instead of throwing.
 */
export function parseTtsDirectives(
  text: string,
  policy: ResolvedTtsModelOverrides,
): TtsDirectiveParseResult {
  // When directive handling is disabled, leave the text untouched.
  if (!policy.enabled) {
    return { cleanedText: text, overrides: {}, warnings: [], hasDirective: false };
  }

  const overrides: TtsDirectiveOverrides = {};
  const warnings: string[] = [];
  let cleanedText = text;
  let hasDirective = false;

  // Pass 1: strip [[tts:text]]...[[/tts:text]] blocks; keep the first inner text.
  const blockRegex = /\[\[tts:text\]\]([\s\S]*?)\[\[\/tts:text\]\]/gi;
  cleanedText = cleanedText.replace(blockRegex, (_match, inner: string) => {
    hasDirective = true;
    if (policy.allowText && overrides.ttsText == null) {
      overrides.ttsText = inner.trim();
    }
    return "";
  });

  // Pass 2: strip [[tts:key=value ...]] directives and apply allowed options.
  const directiveRegex = /\[\[tts:([^\]]+)\]\]/gi;
  cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => {
    hasDirective = true;
    // Tokens are whitespace-separated key=value pairs; bare words are ignored.
    const tokens = body.split(/\s+/).filter(Boolean);
    for (const token of tokens) {
      const eqIndex = token.indexOf("=");
      if (eqIndex === -1) {
        continue;
      }
      const rawKey = token.slice(0, eqIndex).trim();
      const rawValue = token.slice(eqIndex + 1).trim();
      if (!rawKey || !rawValue) {
        continue;
      }
      const key = rawKey.toLowerCase();
      // Validators below may throw; convert those into warnings per token.
      try {
        switch (key) {
          case "provider":
            if (!policy.allowProvider) {
              break;
            }
            if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
              overrides.provider = rawValue;
            } else {
              warnings.push(`unsupported provider "${rawValue}"`);
            }
            break;
          case "voice":
          case "openai_voice":
          case "openaivoice":
            if (!policy.allowVoice) {
              break;
            }
            if (isValidOpenAIVoice(rawValue)) {
              overrides.openai = { ...overrides.openai, voice: rawValue };
            } else {
              warnings.push(`invalid OpenAI voice "${rawValue}"`);
            }
            break;
          case "voiceid":
          case "voice_id":
          case "elevenlabs_voice":
          case "elevenlabsvoice":
            if (!policy.allowVoice) {
              break;
            }
            if (isValidVoiceId(rawValue)) {
              overrides.elevenlabs = { ...overrides.elevenlabs, voiceId: rawValue };
            } else {
              warnings.push(`invalid ElevenLabs voiceId "${rawValue}"`);
            }
            break;
          case "model":
          case "modelid":
          case "model_id":
          case "elevenlabs_model":
          case "elevenlabsmodel":
          case "openai_model":
          case "openaimodel":
            if (!policy.allowModelId) {
              break;
            }
            // A recognized OpenAI model ID targets OpenAI; anything else is
            // treated as an ElevenLabs model ID (no validation applied).
            if (isValidOpenAIModel(rawValue)) {
              overrides.openai = { ...overrides.openai, model: rawValue };
            } else {
              overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue };
            }
            break;
          case "stability":
            if (!policy.allowVoiceSettings) {
              break;
            }
            {
              const value = parseNumberValue(rawValue);
              if (value == null) {
                warnings.push("invalid stability value");
                break;
              }
              requireInRange(value, 0, 1, "stability");
              overrides.elevenlabs = {
                ...overrides.elevenlabs,
                voiceSettings: { ...overrides.elevenlabs?.voiceSettings, stability: value },
              };
            }
            break;
          case "similarity":
          case "similarityboost":
          case "similarity_boost":
            if (!policy.allowVoiceSettings) {
              break;
            }
            {
              const value = parseNumberValue(rawValue);
              if (value == null) {
                warnings.push("invalid similarityBoost value");
                break;
              }
              requireInRange(value, 0, 1, "similarityBoost");
              overrides.elevenlabs = {
                ...overrides.elevenlabs,
                voiceSettings: { ...overrides.elevenlabs?.voiceSettings, similarityBoost: value },
              };
            }
            break;
          case "style":
            if (!policy.allowVoiceSettings) {
              break;
            }
            {
              const value = parseNumberValue(rawValue);
              if (value == null) {
                warnings.push("invalid style value");
                break;
              }
              requireInRange(value, 0, 1, "style");
              overrides.elevenlabs = {
                ...overrides.elevenlabs,
                voiceSettings: { ...overrides.elevenlabs?.voiceSettings, style: value },
              };
            }
            break;
          case "speed":
            if (!policy.allowVoiceSettings) {
              break;
            }
            {
              const value = parseNumberValue(rawValue);
              if (value == null) {
                warnings.push("invalid speed value");
                break;
              }
              requireInRange(value, 0.5, 2, "speed");
              overrides.elevenlabs = {
                ...overrides.elevenlabs,
                voiceSettings: { ...overrides.elevenlabs?.voiceSettings, speed: value },
              };
            }
            break;
          case "speakerboost":
          case "speaker_boost":
          case "usespeakerboost":
          case "use_speaker_boost":
            if (!policy.allowVoiceSettings) {
              break;
            }
            {
              const value = parseBooleanValue(rawValue);
              if (value == null) {
                warnings.push("invalid useSpeakerBoost value");
                break;
              }
              overrides.elevenlabs = {
                ...overrides.elevenlabs,
                voiceSettings: { ...overrides.elevenlabs?.voiceSettings, useSpeakerBoost: value },
              };
            }
            break;
          case "normalize":
          case "applytextnormalization":
          case "apply_text_normalization":
            if (!policy.allowNormalization) {
              break;
            }
            overrides.elevenlabs = {
              ...overrides.elevenlabs,
              applyTextNormalization: normalizeApplyTextNormalization(rawValue),
            };
            break;
          case "language":
          case "languagecode":
          case "language_code":
            // NOTE(review): language is gated by allowNormalization, not a
            // dedicated flag — confirm that is intentional.
            if (!policy.allowNormalization) {
              break;
            }
            overrides.elevenlabs = {
              ...overrides.elevenlabs,
              languageCode: normalizeLanguageCode(rawValue),
            };
            break;
          case "seed":
            if (!policy.allowSeed) {
              break;
            }
            overrides.elevenlabs = {
              ...overrides.elevenlabs,
              seed: normalizeSeed(Number.parseInt(rawValue, 10)),
            };
            break;
          default:
            // Unknown keys are dropped silently.
            break;
        }
      } catch (err) {
        // Range/format validators throw; surface the message as a warning.
        warnings.push((err as Error).message);
      }
    }
    return "";
  });

  return {
    cleanedText,
    ttsText: overrides.ttsText,
    hasDirective,
    overrides,
    warnings,
  };
}
|
||||
|
||||
// Model IDs accepted by the official OpenAI TTS endpoint (validation is
// relaxed when a custom OPENAI_TTS_BASE_URL is configured).
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const;
|
||||
|
||||
/**
|
||||
* Custom OpenAI-compatible TTS endpoint.
|
||||
* When set, model/voice validation is relaxed to allow non-OpenAI models.
|
||||
* Example: OPENAI_TTS_BASE_URL=http://localhost:8880/v1
|
||||
*
|
||||
* Note: Read at runtime (not module load) to support config.env loading.
|
||||
*/
|
||||
function getOpenAITtsBaseUrl(): string {
|
||||
return (process.env.OPENAI_TTS_BASE_URL?.trim() || "https://api.openai.com/v1").replace(
|
||||
/\/+$/,
|
||||
"",
|
||||
);
|
||||
}
|
||||
|
||||
function isCustomOpenAIEndpoint(): boolean {
|
||||
return getOpenAITtsBaseUrl() !== "https://api.openai.com/v1";
|
||||
}
|
||||
// Voice names accepted by the official OpenAI TTS endpoint (validation is
// relaxed when a custom OPENAI_TTS_BASE_URL is configured).
export const OPENAI_TTS_VOICES = [
  "alloy",
  "ash",
  "ballad",
  "cedar",
  "coral",
  "echo",
  "fable",
  "juniper",
  "marin",
  "onyx",
  "nova",
  "sage",
  "shimmer",
  "verse",
] as const;

// Union of the known OpenAI voice names above.
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
|
||||
|
||||
export function isValidOpenAIModel(model: string): boolean {
|
||||
// Allow any model when using custom endpoint (e.g., Kokoro, LocalAI)
|
||||
if (isCustomOpenAIEndpoint()) {
|
||||
return true;
|
||||
}
|
||||
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
|
||||
}
|
||||
|
||||
export function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice {
|
||||
// Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices)
|
||||
if (isCustomOpenAIEndpoint()) {
|
||||
return true;
|
||||
}
|
||||
return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
|
||||
}
|
||||
|
||||
// Outcome of a summarization call, with simple telemetry fields.
type SummarizeResult = {
  summary: string; // condensed text returned by the model
  latencyMs: number; // wall-clock duration of the model call
  inputLength: number; // character count of the original text
  outputLength: number; // character count of the summary
};

// Which model will summarize, and where that choice came from.
type SummaryModelSelection = {
  ref: ModelRef;
  source: "summaryModel" | "default"; // explicit config override vs. agent default
};
|
||||
|
||||
function resolveSummaryModelRef(
|
||||
cfg: OpenClawConfig,
|
||||
config: ResolvedTtsConfig,
|
||||
): SummaryModelSelection {
|
||||
const defaultRef = resolveDefaultModelForAgent({ cfg });
|
||||
const override = config.summaryModel?.trim();
|
||||
if (!override) {
|
||||
return { ref: defaultRef, source: "default" };
|
||||
}
|
||||
|
||||
const aliasIndex = buildModelAliasIndex({ cfg, defaultProvider: defaultRef.provider });
|
||||
const resolved = resolveModelRefFromString({
|
||||
raw: override,
|
||||
defaultProvider: defaultRef.provider,
|
||||
aliasIndex,
|
||||
});
|
||||
if (!resolved) {
|
||||
return { ref: defaultRef, source: "default" };
|
||||
}
|
||||
return { ref: resolved.ref, source: "summaryModel" };
|
||||
}
|
||||
|
||||
function isTextContentBlock(block: { type: string }): block is TextContent {
|
||||
return block.type === "text";
|
||||
}
|
||||
|
||||
/**
 * Summarize `text` to roughly `targetLength` characters with an LLM.
 *
 * Model selection comes from `resolveSummaryModelRef` (config override or
 * agent default); the API key is resolved per model. The call is aborted
 * after `timeoutMs` via AbortController.
 *
 * @returns the summary plus latency and input/output character counts.
 * @throws Error when `targetLength` is outside [100, 10000], the model or
 *   API key cannot be resolved, the model returns no text, or the request
 *   times out ("Summarization timed out", original error attached as cause).
 */
export async function summarizeText(params: {
  text: string;
  targetLength: number;
  cfg: OpenClawConfig;
  config: ResolvedTtsConfig;
  timeoutMs: number;
}): Promise<SummarizeResult> {
  const { text, targetLength, cfg, config, timeoutMs } = params;
  if (targetLength < 100 || targetLength > 10_000) {
    throw new Error(`Invalid targetLength: ${targetLength}`);
  }

  const startTime = Date.now();
  const { ref } = resolveSummaryModelRef(cfg, config);
  const resolved = resolveModel(ref.provider, ref.model, undefined, cfg);
  if (!resolved.model) {
    throw new Error(resolved.error ?? `Unknown summary model: ${ref.provider}/${ref.model}`);
  }
  const apiKey = requireApiKey(
    await getApiKeyForModel({ model: resolved.model, cfg }),
    ref.provider,
  );

  try {
    // Abort the completion if it runs past timeoutMs.
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), timeoutMs);

    try {
      const res = await completeSimple(
        resolved.model,
        {
          messages: [
            {
              role: "user",
              content:
                `You are an assistant that summarizes texts concisely while keeping the most important information. ` +
                `Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style. ` +
                `Reply only with the summary, without additional explanations.\n\n` +
                `<text_to_summarize>\n${text}\n</text_to_summarize>`,
              timestamp: Date.now(),
            },
          ],
        },
        {
          apiKey,
          // Rough character→token budget for the requested summary length.
          maxTokens: Math.ceil(targetLength / 2),
          temperature: 0.3,
          signal: controller.signal,
        },
      );

      // Join all text blocks of the response into a single summary string.
      const summary = res.content
        .filter(isTextContentBlock)
        .map((block) => block.text.trim())
        .filter(Boolean)
        .join(" ")
        .trim();

      if (!summary) {
        throw new Error("No summary returned");
      }

      return {
        summary,
        latencyMs: Date.now() - startTime,
        inputLength: text.length,
        outputLength: summary.length,
      };
    } finally {
      clearTimeout(timeout);
    }
  } catch (err) {
    const error = err as Error;
    // Translate the abort into a clearer timeout error for callers.
    if (error.name === "AbortError") {
      throw new Error("Summarization timed out", { cause: err });
    }
    throw err;
  }
}
|
||||
|
||||
export function scheduleCleanup(
|
||||
tempDir: string,
|
||||
delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS,
|
||||
): void {
|
||||
const timer = setTimeout(() => {
|
||||
try {
|
||||
rmSync(tempDir, { recursive: true, force: true });
|
||||
} catch {
|
||||
// ignore cleanup errors
|
||||
}
|
||||
}, delayMs);
|
||||
timer.unref();
|
||||
}
|
||||
|
||||
/**
 * Synthesize speech via the ElevenLabs text-to-speech REST API.
 *
 * Validates the voice ID format, the voice-settings ranges, and the optional
 * language/normalization/seed inputs before issuing the request; the request
 * is aborted after `timeoutMs`.
 *
 * @returns the raw audio response body as a Buffer.
 * @throws Error on invalid inputs or a non-2xx API response.
 */
export async function elevenLabsTTS(params: {
  text: string;
  apiKey: string;
  baseUrl: string;
  voiceId: string;
  modelId: string;
  outputFormat: string;
  seed?: number;
  applyTextNormalization?: "auto" | "on" | "off";
  languageCode?: string;
  voiceSettings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"];
  timeoutMs: number;
}): Promise<Buffer> {
  const {
    text,
    apiKey,
    baseUrl,
    voiceId,
    modelId,
    outputFormat,
    seed,
    applyTextNormalization,
    languageCode,
    voiceSettings,
    timeoutMs,
  } = params;
  // Reject malformed voice IDs before they reach the URL path.
  if (!isValidVoiceId(voiceId)) {
    throw new Error("Invalid voiceId format");
  }
  assertElevenLabsVoiceSettings(voiceSettings);
  const normalizedLanguage = normalizeLanguageCode(languageCode);
  const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization);
  const normalizedSeed = normalizeSeed(seed);

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`);
    if (outputFormat) {
      url.searchParams.set("output_format", outputFormat);
    }

    const response = await fetch(url.toString(), {
      method: "POST",
      headers: {
        "xi-api-key": apiKey,
        "Content-Type": "application/json",
        Accept: "audio/mpeg",
      },
      // Optional fields serialize as undefined and are omitted from the JSON body.
      body: JSON.stringify({
        text,
        model_id: modelId,
        seed: normalizedSeed,
        apply_text_normalization: normalizedNormalization,
        language_code: normalizedLanguage,
        voice_settings: {
          stability: voiceSettings.stability,
          similarity_boost: voiceSettings.similarityBoost,
          style: voiceSettings.style,
          use_speaker_boost: voiceSettings.useSpeakerBoost,
          speed: voiceSettings.speed,
        },
      }),
      signal: controller.signal,
    });

    if (!response.ok) {
      throw new Error(`ElevenLabs API error (${response.status})`);
    }

    return Buffer.from(await response.arrayBuffer());
  } finally {
    clearTimeout(timeout);
  }
}
|
||||
|
||||
/**
 * Synthesize speech via the OpenAI (or OpenAI-compatible) /audio/speech API.
 *
 * Model and voice are validated (relaxed for custom endpoints — see
 * isValidOpenAIModel/isValidOpenAIVoice); the request is aborted after
 * `timeoutMs`.
 *
 * @returns the raw audio response body as a Buffer.
 * @throws Error on invalid model/voice or a non-2xx API response.
 */
export async function openaiTTS(params: {
  text: string;
  apiKey: string;
  model: string;
  voice: string;
  responseFormat: "mp3" | "opus" | "pcm";
  timeoutMs: number;
}): Promise<Buffer> {
  const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;

  if (!isValidOpenAIModel(model)) {
    throw new Error(`Invalid model: ${model}`);
  }
  if (!isValidOpenAIVoice(voice)) {
    throw new Error(`Invalid voice: ${voice}`);
  }

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const response = await fetch(`${getOpenAITtsBaseUrl()}/audio/speech`, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model,
        input: text,
        voice,
        response_format: responseFormat,
      }),
      signal: controller.signal,
    });

    if (!response.ok) {
      throw new Error(`OpenAI TTS API error (${response.status})`);
    }

    return Buffer.from(await response.arrayBuffer());
  } finally {
    clearTimeout(timeout);
  }
}
|
||||
|
||||
export function inferEdgeExtension(outputFormat: string): string {
|
||||
const normalized = outputFormat.toLowerCase();
|
||||
if (normalized.includes("webm")) {
|
||||
return ".webm";
|
||||
}
|
||||
if (normalized.includes("ogg")) {
|
||||
return ".ogg";
|
||||
}
|
||||
if (normalized.includes("opus")) {
|
||||
return ".opus";
|
||||
}
|
||||
if (normalized.includes("wav") || normalized.includes("riff") || normalized.includes("pcm")) {
|
||||
return ".wav";
|
||||
}
|
||||
return ".mp3";
|
||||
}
|
||||
|
||||
/**
 * Synthesize speech with Microsoft Edge TTS (node-edge-tts) and write the
 * audio to `outputPath`.
 *
 * All synthesis options come from the resolved edge config.
 * NOTE(review): `config.timeoutMs` takes precedence over the `timeoutMs`
 * parameter — confirm callers expect the config value to win.
 */
export async function edgeTTS(params: {
  text: string;
  outputPath: string;
  config: ResolvedTtsConfig["edge"];
  timeoutMs: number;
}): Promise<void> {
  const { text, outputPath, config, timeoutMs } = params;
  const tts = new EdgeTTS({
    voice: config.voice,
    lang: config.lang,
    outputFormat: config.outputFormat,
    saveSubtitles: config.saveSubtitles,
    proxy: config.proxy,
    rate: config.rate,
    pitch: config.pitch,
    volume: config.volume,
    timeout: config.timeoutMs ?? timeoutMs,
  });
  await tts.ttsPromise(text, outputPath);
}
|
||||
681
src/tts/tts.ts
681
src/tts/tts.ts
@@ -1,5 +1,3 @@
|
||||
import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
|
||||
import { EdgeTTS } from "node-edge-tts";
|
||||
import {
|
||||
existsSync,
|
||||
mkdirSync,
|
||||
@@ -22,25 +20,31 @@ import type {
|
||||
TtsProvider,
|
||||
TtsModelOverrideConfig,
|
||||
} from "../config/types.tts.js";
|
||||
import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js";
|
||||
import {
|
||||
buildModelAliasIndex,
|
||||
resolveDefaultModelForAgent,
|
||||
resolveModelRefFromString,
|
||||
type ModelRef,
|
||||
} from "../agents/model-selection.js";
|
||||
import { resolveModel } from "../agents/pi-embedded-runner/model.js";
|
||||
import { normalizeChannelId } from "../channels/plugins/index.js";
|
||||
import { logVerbose } from "../globals.js";
|
||||
import { stripMarkdown } from "../line/markdown-to-line.js";
|
||||
import { isVoiceCompatibleAudio } from "../media/audio.js";
|
||||
import { CONFIG_DIR, resolveUserPath } from "../utils.js";
|
||||
import {
|
||||
edgeTTS,
|
||||
elevenLabsTTS,
|
||||
inferEdgeExtension,
|
||||
isValidOpenAIModel,
|
||||
isValidOpenAIVoice,
|
||||
isValidVoiceId,
|
||||
OPENAI_TTS_MODELS,
|
||||
OPENAI_TTS_VOICES,
|
||||
openaiTTS,
|
||||
parseTtsDirectives,
|
||||
scheduleCleanup,
|
||||
summarizeText,
|
||||
} from "./tts-core.js";
|
||||
export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "./tts-core.js";
|
||||
|
||||
const DEFAULT_TIMEOUT_MS = 30_000;
|
||||
const DEFAULT_TTS_MAX_LENGTH = 1500;
|
||||
const DEFAULT_TTS_SUMMARIZE = true;
|
||||
const DEFAULT_MAX_TEXT_LENGTH = 4096;
|
||||
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
|
||||
|
||||
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
|
||||
const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
|
||||
@@ -138,7 +142,7 @@ type TtsUserPrefs = {
|
||||
};
|
||||
};
|
||||
|
||||
type ResolvedTtsModelOverrides = {
|
||||
export type ResolvedTtsModelOverrides = {
|
||||
enabled: boolean;
|
||||
allowText: boolean;
|
||||
allowProvider: boolean;
|
||||
@@ -149,7 +153,7 @@ type ResolvedTtsModelOverrides = {
|
||||
allowSeed: boolean;
|
||||
};
|
||||
|
||||
type TtsDirectiveOverrides = {
|
||||
export type TtsDirectiveOverrides = {
|
||||
ttsText?: string;
|
||||
provider?: TtsProvider;
|
||||
openai?: {
|
||||
@@ -166,7 +170,7 @@ type TtsDirectiveOverrides = {
|
||||
};
|
||||
};
|
||||
|
||||
type TtsDirectiveParseResult = {
|
||||
export type TtsDirectiveParseResult = {
|
||||
cleanedText: string;
|
||||
ttsText?: string;
|
||||
hasDirective: boolean;
|
||||
@@ -515,655 +519,6 @@ export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: Tts
|
||||
return Boolean(resolveTtsApiKey(config, provider));
|
||||
}
|
||||
|
||||
function isValidVoiceId(voiceId: string): boolean {
|
||||
return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
|
||||
}
|
||||
|
||||
function normalizeElevenLabsBaseUrl(baseUrl: string): string {
|
||||
const trimmed = baseUrl.trim();
|
||||
if (!trimmed) {
|
||||
return DEFAULT_ELEVENLABS_BASE_URL;
|
||||
}
|
||||
return trimmed.replace(/\/+$/, "");
|
||||
}
|
||||
|
||||
function requireInRange(value: number, min: number, max: number, label: string): void {
|
||||
if (!Number.isFinite(value) || value < min || value > max) {
|
||||
throw new Error(`${label} must be between ${min} and ${max}`);
|
||||
}
|
||||
}
|
||||
|
||||
function assertElevenLabsVoiceSettings(settings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]) {
|
||||
requireInRange(settings.stability, 0, 1, "stability");
|
||||
requireInRange(settings.similarityBoost, 0, 1, "similarityBoost");
|
||||
requireInRange(settings.style, 0, 1, "style");
|
||||
requireInRange(settings.speed, 0.5, 2, "speed");
|
||||
}
|
||||
|
||||
function normalizeLanguageCode(code?: string): string | undefined {
|
||||
const trimmed = code?.trim();
|
||||
if (!trimmed) {
|
||||
return undefined;
|
||||
}
|
||||
const normalized = trimmed.toLowerCase();
|
||||
if (!/^[a-z]{2}$/.test(normalized)) {
|
||||
throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)");
|
||||
}
|
||||
return normalized;
|
||||
}
|
||||
|
||||
function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined {
|
||||
const trimmed = mode?.trim();
|
||||
if (!trimmed) {
|
||||
return undefined;
|
||||
}
|
||||
const normalized = trimmed.toLowerCase();
|
||||
if (normalized === "auto" || normalized === "on" || normalized === "off") {
|
||||
return normalized;
|
||||
}
|
||||
throw new Error("applyTextNormalization must be one of: auto, on, off");
|
||||
}
|
||||
|
||||
function normalizeSeed(seed?: number): number | undefined {
|
||||
if (seed == null) {
|
||||
return undefined;
|
||||
}
|
||||
const next = Math.floor(seed);
|
||||
if (!Number.isFinite(next) || next < 0 || next > 4_294_967_295) {
|
||||
throw new Error("seed must be between 0 and 4294967295");
|
||||
}
|
||||
return next;
|
||||
}
|
||||
|
||||
function parseBooleanValue(value: string): boolean | undefined {
|
||||
const normalized = value.trim().toLowerCase();
|
||||
if (["true", "1", "yes", "on"].includes(normalized)) {
|
||||
return true;
|
||||
}
|
||||
if (["false", "0", "no", "off"].includes(normalized)) {
|
||||
return false;
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function parseNumberValue(value: string): number | undefined {
|
||||
const parsed = Number.parseFloat(value);
|
||||
return Number.isFinite(parsed) ? parsed : undefined;
|
||||
}
|
||||
|
||||
function parseTtsDirectives(
|
||||
text: string,
|
||||
policy: ResolvedTtsModelOverrides,
|
||||
): TtsDirectiveParseResult {
|
||||
if (!policy.enabled) {
|
||||
return { cleanedText: text, overrides: {}, warnings: [], hasDirective: false };
|
||||
}
|
||||
|
||||
const overrides: TtsDirectiveOverrides = {};
|
||||
const warnings: string[] = [];
|
||||
let cleanedText = text;
|
||||
let hasDirective = false;
|
||||
|
||||
const blockRegex = /\[\[tts:text\]\]([\s\S]*?)\[\[\/tts:text\]\]/gi;
|
||||
cleanedText = cleanedText.replace(blockRegex, (_match, inner: string) => {
|
||||
hasDirective = true;
|
||||
if (policy.allowText && overrides.ttsText == null) {
|
||||
overrides.ttsText = inner.trim();
|
||||
}
|
||||
return "";
|
||||
});
|
||||
|
||||
const directiveRegex = /\[\[tts:([^\]]+)\]\]/gi;
|
||||
cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => {
|
||||
hasDirective = true;
|
||||
const tokens = body.split(/\s+/).filter(Boolean);
|
||||
for (const token of tokens) {
|
||||
const eqIndex = token.indexOf("=");
|
||||
if (eqIndex === -1) {
|
||||
continue;
|
||||
}
|
||||
const rawKey = token.slice(0, eqIndex).trim();
|
||||
const rawValue = token.slice(eqIndex + 1).trim();
|
||||
if (!rawKey || !rawValue) {
|
||||
continue;
|
||||
}
|
||||
const key = rawKey.toLowerCase();
|
||||
try {
|
||||
switch (key) {
|
||||
case "provider":
|
||||
if (!policy.allowProvider) {
|
||||
break;
|
||||
}
|
||||
if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
|
||||
overrides.provider = rawValue;
|
||||
} else {
|
||||
warnings.push(`unsupported provider "${rawValue}"`);
|
||||
}
|
||||
break;
|
||||
case "voice":
|
||||
case "openai_voice":
|
||||
case "openaivoice":
|
||||
if (!policy.allowVoice) {
|
||||
break;
|
||||
}
|
||||
if (isValidOpenAIVoice(rawValue)) {
|
||||
overrides.openai = { ...overrides.openai, voice: rawValue };
|
||||
} else {
|
||||
warnings.push(`invalid OpenAI voice "${rawValue}"`);
|
||||
}
|
||||
break;
|
||||
case "voiceid":
|
||||
case "voice_id":
|
||||
case "elevenlabs_voice":
|
||||
case "elevenlabsvoice":
|
||||
if (!policy.allowVoice) {
|
||||
break;
|
||||
}
|
||||
if (isValidVoiceId(rawValue)) {
|
||||
overrides.elevenlabs = { ...overrides.elevenlabs, voiceId: rawValue };
|
||||
} else {
|
||||
warnings.push(`invalid ElevenLabs voiceId "${rawValue}"`);
|
||||
}
|
||||
break;
|
||||
case "model":
|
||||
case "modelid":
|
||||
case "model_id":
|
||||
case "elevenlabs_model":
|
||||
case "elevenlabsmodel":
|
||||
case "openai_model":
|
||||
case "openaimodel":
|
||||
if (!policy.allowModelId) {
|
||||
break;
|
||||
}
|
||||
if (isValidOpenAIModel(rawValue)) {
|
||||
overrides.openai = { ...overrides.openai, model: rawValue };
|
||||
} else {
|
||||
overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue };
|
||||
}
|
||||
break;
|
||||
case "stability":
|
||||
if (!policy.allowVoiceSettings) {
|
||||
break;
|
||||
}
|
||||
{
|
||||
const value = parseNumberValue(rawValue);
|
||||
if (value == null) {
|
||||
warnings.push("invalid stability value");
|
||||
break;
|
||||
}
|
||||
requireInRange(value, 0, 1, "stability");
|
||||
overrides.elevenlabs = {
|
||||
...overrides.elevenlabs,
|
||||
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, stability: value },
|
||||
};
|
||||
}
|
||||
break;
|
||||
case "similarity":
|
||||
case "similarityboost":
|
||||
case "similarity_boost":
|
||||
if (!policy.allowVoiceSettings) {
|
||||
break;
|
||||
}
|
||||
{
|
||||
const value = parseNumberValue(rawValue);
|
||||
if (value == null) {
|
||||
warnings.push("invalid similarityBoost value");
|
||||
break;
|
||||
}
|
||||
requireInRange(value, 0, 1, "similarityBoost");
|
||||
overrides.elevenlabs = {
|
||||
...overrides.elevenlabs,
|
||||
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, similarityBoost: value },
|
||||
};
|
||||
}
|
||||
break;
|
||||
case "style":
|
||||
if (!policy.allowVoiceSettings) {
|
||||
break;
|
||||
}
|
||||
{
|
||||
const value = parseNumberValue(rawValue);
|
||||
if (value == null) {
|
||||
warnings.push("invalid style value");
|
||||
break;
|
||||
}
|
||||
requireInRange(value, 0, 1, "style");
|
||||
overrides.elevenlabs = {
|
||||
...overrides.elevenlabs,
|
||||
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, style: value },
|
||||
};
|
||||
}
|
||||
break;
|
||||
case "speed":
|
||||
if (!policy.allowVoiceSettings) {
|
||||
break;
|
||||
}
|
||||
{
|
||||
const value = parseNumberValue(rawValue);
|
||||
if (value == null) {
|
||||
warnings.push("invalid speed value");
|
||||
break;
|
||||
}
|
||||
requireInRange(value, 0.5, 2, "speed");
|
||||
overrides.elevenlabs = {
|
||||
...overrides.elevenlabs,
|
||||
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, speed: value },
|
||||
};
|
||||
}
|
||||
break;
|
||||
case "speakerboost":
|
||||
case "speaker_boost":
|
||||
case "usespeakerboost":
|
||||
case "use_speaker_boost":
|
||||
if (!policy.allowVoiceSettings) {
|
||||
break;
|
||||
}
|
||||
{
|
||||
const value = parseBooleanValue(rawValue);
|
||||
if (value == null) {
|
||||
warnings.push("invalid useSpeakerBoost value");
|
||||
break;
|
||||
}
|
||||
overrides.elevenlabs = {
|
||||
...overrides.elevenlabs,
|
||||
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, useSpeakerBoost: value },
|
||||
};
|
||||
}
|
||||
break;
|
||||
case "normalize":
|
||||
case "applytextnormalization":
|
||||
case "apply_text_normalization":
|
||||
if (!policy.allowNormalization) {
|
||||
break;
|
||||
}
|
||||
overrides.elevenlabs = {
|
||||
...overrides.elevenlabs,
|
||||
applyTextNormalization: normalizeApplyTextNormalization(rawValue),
|
||||
};
|
||||
break;
|
||||
case "language":
|
||||
case "languagecode":
|
||||
case "language_code":
|
||||
if (!policy.allowNormalization) {
|
||||
break;
|
||||
}
|
||||
overrides.elevenlabs = {
|
||||
...overrides.elevenlabs,
|
||||
languageCode: normalizeLanguageCode(rawValue),
|
||||
};
|
||||
break;
|
||||
case "seed":
|
||||
if (!policy.allowSeed) {
|
||||
break;
|
||||
}
|
||||
overrides.elevenlabs = {
|
||||
...overrides.elevenlabs,
|
||||
seed: normalizeSeed(Number.parseInt(rawValue, 10)),
|
||||
};
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
} catch (err) {
|
||||
warnings.push((err as Error).message);
|
||||
}
|
||||
}
|
||||
return "";
|
||||
});
|
||||
|
||||
return {
|
||||
cleanedText,
|
||||
ttsText: overrides.ttsText,
|
||||
hasDirective,
|
||||
overrides,
|
||||
warnings,
|
||||
};
|
||||
}
|
||||
|
||||
/** Speech-synthesis model ids accepted when talking to the official OpenAI endpoint. */
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const;
|
||||
|
||||
/**
|
||||
* Custom OpenAI-compatible TTS endpoint.
|
||||
* When set, model/voice validation is relaxed to allow non-OpenAI models.
|
||||
* Example: OPENAI_TTS_BASE_URL=http://localhost:8880/v1
|
||||
*
|
||||
* Note: Read at runtime (not module load) to support config.env loading.
|
||||
*/
|
||||
function getOpenAITtsBaseUrl(): string {
|
||||
return (process.env.OPENAI_TTS_BASE_URL?.trim() || "https://api.openai.com/v1").replace(
|
||||
/\/+$/,
|
||||
"",
|
||||
);
|
||||
}
|
||||
|
||||
function isCustomOpenAIEndpoint(): boolean {
|
||||
return getOpenAITtsBaseUrl() !== "https://api.openai.com/v1";
|
||||
}
|
||||
/**
 * Voice names accepted for the official OpenAI TTS endpoint.
 * NOTE(review): list includes names beyond OpenAI's commonly documented set
 * (e.g. "juniper", "marin", "cedar") — presumably newer voices; confirm
 * against OpenAI's current voice list.
 */
export const OPENAI_TTS_VOICES = [
  "alloy",
  "ash",
  "ballad",
  "cedar",
  "coral",
  "echo",
  "fable",
  "juniper",
  "marin",
  "onyx",
  "nova",
  "sage",
  "shimmer",
  "verse",
] as const;
|
||||
|
||||
// Union of the literal voice names above ("alloy" | "ash" | …).
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
|
||||
|
||||
function isValidOpenAIModel(model: string): boolean {
|
||||
// Allow any model when using custom endpoint (e.g., Kokoro, LocalAI)
|
||||
if (isCustomOpenAIEndpoint()) {
|
||||
return true;
|
||||
}
|
||||
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
|
||||
}
|
||||
|
||||
function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice {
|
||||
// Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices)
|
||||
if (isCustomOpenAIEndpoint()) {
|
||||
return true;
|
||||
}
|
||||
return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
|
||||
}
|
||||
|
||||
// Outcome of a summarization call, with timing and size metrics.
type SummarizeResult = {
  summary: string; // model-produced summary text (whitespace-trimmed)
  latencyMs: number; // wall-clock duration of the completion call
  inputLength: number; // character count of the original text
  outputLength: number; // character count of the summary
};
|
||||
|
||||
// A resolved model reference plus its provenance:
// "summaryModel" when the config override resolved, "default" otherwise.
type SummaryModelSelection = {
  ref: ModelRef;
  source: "summaryModel" | "default";
};
|
||||
|
||||
function resolveSummaryModelRef(
|
||||
cfg: OpenClawConfig,
|
||||
config: ResolvedTtsConfig,
|
||||
): SummaryModelSelection {
|
||||
const defaultRef = resolveDefaultModelForAgent({ cfg });
|
||||
const override = config.summaryModel?.trim();
|
||||
if (!override) {
|
||||
return { ref: defaultRef, source: "default" };
|
||||
}
|
||||
|
||||
const aliasIndex = buildModelAliasIndex({ cfg, defaultProvider: defaultRef.provider });
|
||||
const resolved = resolveModelRefFromString({
|
||||
raw: override,
|
||||
defaultProvider: defaultRef.provider,
|
||||
aliasIndex,
|
||||
});
|
||||
if (!resolved) {
|
||||
return { ref: defaultRef, source: "default" };
|
||||
}
|
||||
return { ref: resolved.ref, source: "summaryModel" };
|
||||
}
|
||||
|
||||
function isTextContentBlock(block: { type: string }): block is TextContent {
|
||||
return block.type === "text";
|
||||
}
|
||||
|
||||
/**
 * Summarize `text` to roughly `targetLength` characters via an LLM completion.
 *
 * Uses the model chosen by `resolveSummaryModelRef` (config override or agent
 * default) and aborts the request after `timeoutMs`.
 *
 * @throws if `targetLength` is outside [100, 10_000], if the summary model
 *   cannot be resolved, if the model returns no text, or if the request
 *   times out ("Summarization timed out", with the abort error as `cause`).
 */
async function summarizeText(params: {
  text: string;
  targetLength: number;
  cfg: OpenClawConfig;
  config: ResolvedTtsConfig;
  timeoutMs: number;
}): Promise<SummarizeResult> {
  const { text, targetLength, cfg, config, timeoutMs } = params;
  // Guard against degenerate targets before spending an API call.
  if (targetLength < 100 || targetLength > 10_000) {
    throw new Error(`Invalid targetLength: ${targetLength}`);
  }

  const startTime = Date.now();
  const { ref } = resolveSummaryModelRef(cfg, config);
  const resolved = resolveModel(ref.provider, ref.model, undefined, cfg);
  if (!resolved.model) {
    throw new Error(resolved.error ?? `Unknown summary model: ${ref.provider}/${ref.model}`);
  }
  const apiKey = requireApiKey(
    await getApiKeyForModel({ model: resolved.model, cfg }),
    ref.provider,
  );

  try {
    // Abort the completion if it exceeds the deadline; the timer is always
    // cleared in the inner finally so no handle leaks.
    const controller = new AbortController();
    const timeout = setTimeout(controller.abort.bind(controller), timeoutMs);

    try {
      const res = await completeSimple(
        resolved.model,
        {
          messages: [
            {
              role: "user",
              content:
                `You are an assistant that summarizes texts concisely while keeping the most important information. ` +
                `Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style. ` +
                `Reply only with the summary, without additional explanations.\n\n` +
                `<text_to_summarize>\n${text}\n</text_to_summarize>`,
              timestamp: Date.now(),
            },
          ],
        },
        {
          apiKey,
          // ~2 characters per token heuristic — presumably; TODO confirm
          maxTokens: Math.ceil(targetLength / 2),
          temperature: 0.3,
          signal: controller.signal,
        },
      );

      // Keep only text blocks and join them into a single trimmed summary.
      const summary = res.content
        .filter(isTextContentBlock)
        .map((block) => block.text.trim())
        .filter(Boolean)
        .join(" ")
        .trim();

      if (!summary) {
        throw new Error("No summary returned");
      }

      return {
        summary,
        latencyMs: Date.now() - startTime,
        inputLength: text.length,
        outputLength: summary.length,
      };
    } finally {
      clearTimeout(timeout);
    }
  } catch (err) {
    const error = err as Error;
    // Translate the generic abort into a descriptive timeout error.
    if (error.name === "AbortError") {
      throw new Error("Summarization timed out", { cause: err });
    }
    throw err;
  }
}
|
||||
|
||||
function scheduleCleanup(tempDir: string, delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS): void {
|
||||
const timer = setTimeout(() => {
|
||||
try {
|
||||
rmSync(tempDir, { recursive: true, force: true });
|
||||
} catch {
|
||||
// ignore cleanup errors
|
||||
}
|
||||
}, delayMs);
|
||||
timer.unref();
|
||||
}
|
||||
|
||||
async function elevenLabsTTS(params: {
|
||||
text: string;
|
||||
apiKey: string;
|
||||
baseUrl: string;
|
||||
voiceId: string;
|
||||
modelId: string;
|
||||
outputFormat: string;
|
||||
seed?: number;
|
||||
applyTextNormalization?: "auto" | "on" | "off";
|
||||
languageCode?: string;
|
||||
voiceSettings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"];
|
||||
timeoutMs: number;
|
||||
}): Promise<Buffer> {
|
||||
const {
|
||||
text,
|
||||
apiKey,
|
||||
baseUrl,
|
||||
voiceId,
|
||||
modelId,
|
||||
outputFormat,
|
||||
seed,
|
||||
applyTextNormalization,
|
||||
languageCode,
|
||||
voiceSettings,
|
||||
timeoutMs,
|
||||
} = params;
|
||||
if (!isValidVoiceId(voiceId)) {
|
||||
throw new Error("Invalid voiceId format");
|
||||
}
|
||||
assertElevenLabsVoiceSettings(voiceSettings);
|
||||
const normalizedLanguage = normalizeLanguageCode(languageCode);
|
||||
const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization);
|
||||
const normalizedSeed = normalizeSeed(seed);
|
||||
|
||||
const controller = new AbortController();
|
||||
const timeout = setTimeout(controller.abort.bind(controller), timeoutMs);
|
||||
|
||||
try {
|
||||
const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`);
|
||||
if (outputFormat) {
|
||||
url.searchParams.set("output_format", outputFormat);
|
||||
}
|
||||
|
||||
const response = await fetch(url.toString(), {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"xi-api-key": apiKey,
|
||||
"Content-Type": "application/json",
|
||||
Accept: "audio/mpeg",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
text,
|
||||
model_id: modelId,
|
||||
seed: normalizedSeed,
|
||||
apply_text_normalization: normalizedNormalization,
|
||||
language_code: normalizedLanguage,
|
||||
voice_settings: {
|
||||
stability: voiceSettings.stability,
|
||||
similarity_boost: voiceSettings.similarityBoost,
|
||||
style: voiceSettings.style,
|
||||
use_speaker_boost: voiceSettings.useSpeakerBoost,
|
||||
speed: voiceSettings.speed,
|
||||
},
|
||||
}),
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`ElevenLabs API error (${response.status})`);
|
||||
}
|
||||
|
||||
return Buffer.from(await response.arrayBuffer());
|
||||
} finally {
|
||||
clearTimeout(timeout);
|
||||
}
|
||||
}
|
||||
|
||||
async function openaiTTS(params: {
|
||||
text: string;
|
||||
apiKey: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
responseFormat: "mp3" | "opus" | "pcm";
|
||||
timeoutMs: number;
|
||||
}): Promise<Buffer> {
|
||||
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
|
||||
|
||||
if (!isValidOpenAIModel(model)) {
|
||||
throw new Error(`Invalid model: ${model}`);
|
||||
}
|
||||
if (!isValidOpenAIVoice(voice)) {
|
||||
throw new Error(`Invalid voice: ${voice}`);
|
||||
}
|
||||
|
||||
const controller = new AbortController();
|
||||
const timeout = setTimeout(controller.abort.bind(controller), timeoutMs);
|
||||
|
||||
try {
|
||||
const response = await fetch(`${getOpenAITtsBaseUrl()}/audio/speech`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model,
|
||||
input: text,
|
||||
voice,
|
||||
response_format: responseFormat,
|
||||
}),
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`OpenAI TTS API error (${response.status})`);
|
||||
}
|
||||
|
||||
return Buffer.from(await response.arrayBuffer());
|
||||
} finally {
|
||||
clearTimeout(timeout);
|
||||
}
|
||||
}
|
||||
|
||||
function inferEdgeExtension(outputFormat: string): string {
|
||||
const normalized = outputFormat.toLowerCase();
|
||||
if (normalized.includes("webm")) {
|
||||
return ".webm";
|
||||
}
|
||||
if (normalized.includes("ogg")) {
|
||||
return ".ogg";
|
||||
}
|
||||
if (normalized.includes("opus")) {
|
||||
return ".opus";
|
||||
}
|
||||
if (normalized.includes("wav") || normalized.includes("riff") || normalized.includes("pcm")) {
|
||||
return ".wav";
|
||||
}
|
||||
return ".mp3";
|
||||
}
|
||||
|
||||
async function edgeTTS(params: {
|
||||
text: string;
|
||||
outputPath: string;
|
||||
config: ResolvedTtsConfig["edge"];
|
||||
timeoutMs: number;
|
||||
}): Promise<void> {
|
||||
const { text, outputPath, config, timeoutMs } = params;
|
||||
const tts = new EdgeTTS({
|
||||
voice: config.voice,
|
||||
lang: config.lang,
|
||||
outputFormat: config.outputFormat,
|
||||
saveSubtitles: config.saveSubtitles,
|
||||
proxy: config.proxy,
|
||||
rate: config.rate,
|
||||
pitch: config.pitch,
|
||||
volume: config.volume,
|
||||
timeout: config.timeoutMs ?? timeoutMs,
|
||||
});
|
||||
await tts.ttsPromise(text, outputPath);
|
||||
}
|
||||
|
||||
export async function textToSpeech(params: {
|
||||
text: string;
|
||||
cfg: OpenClawConfig;
|
||||
|
||||
Reference in New Issue
Block a user