Files
Moltbot/src/tts/tts.ts

1580 lines
46 KiB
TypeScript

import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
import { EdgeTTS } from "node-edge-tts";
import {
existsSync,
mkdirSync,
readFileSync,
writeFileSync,
mkdtempSync,
rmSync,
renameSync,
unlinkSync,
} from "node:fs";
import { tmpdir } from "node:os";
import path from "node:path";
import type { ReplyPayload } from "../auto-reply/types.js";
import type { ChannelId } from "../channels/plugins/types.js";
import type { OpenClawConfig } from "../config/config.js";
import type {
TtsConfig,
TtsAutoMode,
TtsMode,
TtsProvider,
TtsModelOverrideConfig,
} from "../config/types.tts.js";
import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js";
import {
buildModelAliasIndex,
resolveDefaultModelForAgent,
resolveModelRefFromString,
type ModelRef,
} from "../agents/model-selection.js";
import { resolveModel } from "../agents/pi-embedded-runner/model.js";
import { normalizeChannelId } from "../channels/plugins/index.js";
import { logVerbose } from "../globals.js";
import { isVoiceCompatibleAudio } from "../media/audio.js";
import { CONFIG_DIR, resolveUserPath } from "../utils.js";
// Timeout applied to provider requests when `messages.tts.timeoutMs` is not configured.
const DEFAULT_TIMEOUT_MS = 30_000;
// Default for the user preference `maxLength`; the system-prompt hint asks the model to keep
// spoken text at or below this many characters to avoid auto-summary.
const DEFAULT_TTS_MAX_LENGTH = 1500;
// Default for the user preference `summarize`.
const DEFAULT_TTS_SUMMARIZE = true;
// Default hard cap on input length; textToSpeech rejects longer text outright.
const DEFAULT_MAX_TEXT_LENGTH = 4096;
// Delay before a generated audio file's temp directory is removed (see scheduleCleanup).
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
// ElevenLabs defaults used when the config omits the corresponding field.
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2";
// OpenAI defaults used when the config omits the corresponding field.
const DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts";
const DEFAULT_OPENAI_VOICE = "alloy";
// Edge TTS defaults; the output format doubles as the retry format in textToSpeech when a
// user-configured Edge format fails.
const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
const DEFAULT_EDGE_LANG = "en-US";
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
// Per-field fallbacks for ElevenLabs voice settings. assertElevenLabsVoiceSettings enforces
// stability/similarityBoost/style in [0, 1] and speed in [0.5, 2].
const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
stability: 0.5,
similarityBoost: 0.75,
style: 0.0,
useSpeakerBoost: true,
speed: 1.0,
};
// Output profile selected by resolveOutputFormat when the channel id is "telegram".
const TELEGRAM_OUTPUT = {
openai: "opus" as const,
// ElevenLabs output formats use codec_sample_rate_bitrate naming.
// Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram.
elevenlabs: "opus_48000_64",
extension: ".opus",
voiceCompatible: true,
};
// Output profile used for every other channel (and when no channel is given).
const DEFAULT_OUTPUT = {
openai: "mp3" as const,
elevenlabs: "mp3_44100_128",
extension: ".mp3",
voiceCompatible: false,
};
// Raw PCM formats and their sample rates per provider.
// NOTE(review): presumably consumed by textToSpeechTelephony; its body is outside this view.
const TELEPHONY_OUTPUT = {
openai: { format: "pcm" as const, sampleRate: 24000 },
elevenlabs: { format: "pcm_22050", sampleRate: 22050 },
};
// Accepted auto-mode values; normalizeTtsAutoMode rejects anything not in this set.
const TTS_AUTO_MODES = new Set<TtsAutoMode>(["off", "always", "inbound", "tagged"]);
/**
 * TTS configuration after defaults have been applied (the return shape of resolveTtsConfig).
 * Fields typed as optional here are passed through from the raw config without a default.
 */
export type ResolvedTtsConfig = {
auto: TtsAutoMode;
mode: TtsMode;
provider: TtsProvider;
// "config" when the raw config named a provider, "default" when `provider` is the built-in fallback.
providerSource: "config" | "default";
// Optional model reference used for summarization instead of the agent default.
summaryModel?: string;
modelOverrides: ResolvedTtsModelOverrides;
elevenlabs: {
apiKey?: string;
baseUrl: string;
voiceId: string;
modelId: string;
seed?: number;
applyTextNormalization?: "auto" | "on" | "off";
languageCode?: string;
voiceSettings: {
stability: number;
similarityBoost: number;
style: number;
useSpeakerBoost: boolean;
speed: number;
};
};
openai: {
apiKey?: string;
model: string;
voice: string;
};
edge: {
enabled: boolean;
voice: string;
lang: string;
outputFormat: string;
// True only when the raw config supplied a non-empty `edge.outputFormat`.
outputFormatConfigured: boolean;
pitch?: string;
rate?: string;
volume?: string;
saveSubtitles: boolean;
proxy?: string;
// Edge-specific timeout; edgeTTS falls back to the general timeout when unset.
timeoutMs?: number;
};
prefsPath?: string;
maxTextLength: number;
timeoutMs: number;
};
/**
 * Shape of the JSON preferences file read by readPrefs and written by updatePrefs.
 */
type TtsUserPrefs = {
tts?: {
auto?: TtsAutoMode;
// Legacy boolean toggle: still honored when `auto` is absent, and removed by setTtsAutoMode.
enabled?: boolean;
provider?: TtsProvider;
maxLength?: number;
summarize?: boolean;
};
};
/**
 * Policy describing which inline `[[tts:...]]` directive overrides are honored.
 * When `enabled` is false, resolveModelOverridePolicy sets every `allow*` flag to false.
 */
type ResolvedTtsModelOverrides = {
enabled: boolean;
allowText: boolean;
allowProvider: boolean;
allowVoice: boolean;
allowModelId: boolean;
allowVoiceSettings: boolean;
// Also gates the language-code directive in parseTtsDirectives.
allowNormalization: boolean;
allowSeed: boolean;
};
/**
 * Per-request overrides extracted from inline directives; textToSpeech layers these over
 * the resolved config values.
 */
type TtsDirectiveOverrides = {
// Replacement text to speak, taken from a [[tts:text]]...[[/tts:text]] block.
ttsText?: string;
provider?: TtsProvider;
openai?: {
voice?: string;
model?: string;
};
elevenlabs?: {
voiceId?: string;
modelId?: string;
seed?: number;
applyTextNormalization?: "auto" | "on" | "off";
languageCode?: string;
voiceSettings?: Partial<ResolvedTtsConfig["elevenlabs"]["voiceSettings"]>;
};
};
/**
 * Result of parseTtsDirectives.
 */
type TtsDirectiveParseResult = {
// Input text with all recognized directive tags removed.
cleanedText: string;
// Same value as `overrides.ttsText`, surfaced for convenience.
ttsText?: string;
// True when at least one directive tag was found (even if it produced no override).
hasDirective: boolean;
overrides: TtsDirectiveOverrides;
// Human-readable notes about rejected or invalid directive values.
warnings: string[];
};
/**
 * Outcome of textToSpeech. On success `audioPath` points at a temp file that is removed
 * after a delay; on failure `error` carries the reason.
 */
export type TtsResult = {
success: boolean;
audioPath?: string;
error?: string;
latencyMs?: number;
provider?: string;
outputFormat?: string;
// Whether the produced audio can be sent as a voice message on the target channel.
voiceCompatible?: boolean;
};
/**
 * Outcome of textToSpeechTelephony: audio is returned in memory rather than as a file path.
 * NOTE(review): field semantics inferred from names; the producing function is outside this view.
 */
export type TtsTelephonyResult = {
success: boolean;
audioBuffer?: Buffer;
error?: string;
latencyMs?: number;
provider?: string;
outputFormat?: string;
sampleRate?: number;
};
/**
 * Record of a single TTS attempt, stored via setLastTtsAttempt and read via getLastTtsAttempt.
 */
type TtsStatusEntry = {
timestamp: number;
success: boolean;
textLength: number;
summarized: boolean;
provider?: string;
latencyMs?: number;
error?: string;
};
// Module-level record of the most recent TTS attempt; undefined until setLastTtsAttempt is called.
let lastTtsAttempt: TtsStatusEntry | undefined;
/**
 * Coerces an arbitrary value into a recognized TTS auto mode.
 *
 * @param value Anything (config value, prefs value, session override).
 * @returns The trimmed, lower-cased mode when it is one of the known modes; otherwise undefined
 *   (including for every non-string input).
 */
export function normalizeTtsAutoMode(value: unknown): TtsAutoMode | undefined {
  if (typeof value === "string") {
    const candidate = value.trim().toLowerCase() as TtsAutoMode;
    return TTS_AUTO_MODES.has(candidate) ? candidate : undefined;
  }
  return undefined;
}
/**
 * Expands the optional override config into a fully-populated policy.
 *
 * Overrides are enabled by default. While enabled, each `allow*` flag defaults to true
 * unless explicitly set; when the master switch is off every flag is forced to false.
 */
function resolveModelOverridePolicy(
  overrides: TtsModelOverrideConfig | undefined,
): ResolvedTtsModelOverrides {
  const masterSwitch = overrides?.enabled ?? true;
  // A flag is honored only while the master switch is on; unset flags count as allowed.
  const permit = (flag?: boolean): boolean => masterSwitch && (flag ?? true);
  return {
    enabled: masterSwitch,
    allowText: permit(overrides?.allowText),
    allowProvider: permit(overrides?.allowProvider),
    allowVoice: permit(overrides?.allowVoice),
    allowModelId: permit(overrides?.allowModelId),
    allowVoiceSettings: permit(overrides?.allowVoiceSettings),
    allowNormalization: permit(overrides?.allowNormalization),
    allowSeed: permit(overrides?.allowSeed),
  };
}
/**
 * Builds the effective TTS configuration from `cfg.messages.tts`, filling in defaults.
 *
 * - `auto` prefers a valid `auto` value and otherwise maps the legacy `enabled` boolean
 *   to "always"/"off".
 * - Most free-form string settings are trimmed; blank values fall back to their default
 *   (or to undefined for optional fields).
 * - `providerSource` records whether `provider` came from the config or is the built-in default.
 */
export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
  const raw: TtsConfig = cfg.messages?.tts ?? {};
  const elevenRaw = raw.elevenlabs;
  const voiceRaw = elevenRaw?.voiceSettings;
  const openaiRaw = raw.openai;
  const edgeRaw = raw.edge;
  const voiceDefaults = DEFAULT_ELEVENLABS_VOICE_SETTINGS;
  const trimmedOrUndefined = (value?: string): string | undefined => value?.trim() || undefined;
  const configuredEdgeFormat = edgeRaw?.outputFormat?.trim();
  const legacyAuto: TtsAutoMode = raw.enabled ? "always" : "off";

  return {
    auto: normalizeTtsAutoMode(raw.auto) ?? legacyAuto,
    mode: raw.mode ?? "final",
    provider: raw.provider ?? "edge",
    providerSource: raw.provider ? "config" : "default",
    summaryModel: trimmedOrUndefined(raw.summaryModel),
    modelOverrides: resolveModelOverridePolicy(raw.modelOverrides),
    elevenlabs: {
      apiKey: elevenRaw?.apiKey,
      baseUrl: trimmedOrUndefined(elevenRaw?.baseUrl) ?? DEFAULT_ELEVENLABS_BASE_URL,
      voiceId: elevenRaw?.voiceId ?? DEFAULT_ELEVENLABS_VOICE_ID,
      modelId: elevenRaw?.modelId ?? DEFAULT_ELEVENLABS_MODEL_ID,
      seed: elevenRaw?.seed,
      applyTextNormalization: elevenRaw?.applyTextNormalization,
      languageCode: elevenRaw?.languageCode,
      voiceSettings: {
        stability: voiceRaw?.stability ?? voiceDefaults.stability,
        similarityBoost: voiceRaw?.similarityBoost ?? voiceDefaults.similarityBoost,
        style: voiceRaw?.style ?? voiceDefaults.style,
        useSpeakerBoost: voiceRaw?.useSpeakerBoost ?? voiceDefaults.useSpeakerBoost,
        speed: voiceRaw?.speed ?? voiceDefaults.speed,
      },
    },
    openai: {
      apiKey: openaiRaw?.apiKey,
      model: openaiRaw?.model ?? DEFAULT_OPENAI_MODEL,
      voice: openaiRaw?.voice ?? DEFAULT_OPENAI_VOICE,
    },
    edge: {
      enabled: edgeRaw?.enabled ?? true,
      voice: trimmedOrUndefined(edgeRaw?.voice) ?? DEFAULT_EDGE_VOICE,
      lang: trimmedOrUndefined(edgeRaw?.lang) ?? DEFAULT_EDGE_LANG,
      outputFormat: configuredEdgeFormat || DEFAULT_EDGE_OUTPUT_FORMAT,
      outputFormatConfigured: Boolean(configuredEdgeFormat),
      pitch: trimmedOrUndefined(edgeRaw?.pitch),
      rate: trimmedOrUndefined(edgeRaw?.rate),
      volume: trimmedOrUndefined(edgeRaw?.volume),
      saveSubtitles: edgeRaw?.saveSubtitles ?? false,
      proxy: trimmedOrUndefined(edgeRaw?.proxy),
      timeoutMs: edgeRaw?.timeoutMs,
    },
    prefsPath: raw.prefsPath,
    maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
    timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS,
  };
}
/**
 * Determines where the TTS preferences file lives.
 *
 * Precedence: non-blank `config.prefsPath`, then non-blank `OPENCLAW_TTS_PREFS` environment
 * variable (both passed through resolveUserPath), then `<CONFIG_DIR>/settings/tts.json`.
 */
export function resolveTtsPrefsPath(config: ResolvedTtsConfig): string {
  const explicitPath = config.prefsPath?.trim() || process.env.OPENCLAW_TTS_PREFS?.trim();
  return explicitPath
    ? resolveUserPath(explicitPath)
    : path.join(CONFIG_DIR, "settings", "tts.json");
}
/**
 * Extracts the auto mode stored in user preferences, if any.
 *
 * A valid `auto` value wins; otherwise the legacy boolean `enabled` maps to "always"/"off".
 * Returns undefined when the preferences express no opinion.
 */
function resolveTtsAutoModeFromPrefs(prefs: TtsUserPrefs): TtsAutoMode | undefined {
  const section = prefs.tts;
  const explicitMode = normalizeTtsAutoMode(section?.auto);
  if (explicitMode) {
    return explicitMode;
  }
  const legacyToggle = section?.enabled;
  if (typeof legacyToggle !== "boolean") {
    return undefined;
  }
  return legacyToggle ? "always" : "off";
}
/**
 * Resolves the effective auto mode.
 *
 * Precedence: valid session override, then user preferences on disk, then the resolved config.
 * The preferences file is only read when the session override is absent or invalid.
 */
export function resolveTtsAutoMode(params: {
  config: ResolvedTtsConfig;
  prefsPath: string;
  sessionAuto?: string;
}): TtsAutoMode {
  const { config, prefsPath, sessionAuto } = params;
  return (
    normalizeTtsAutoMode(sessionAuto) ??
    resolveTtsAutoModeFromPrefs(readPrefs(prefsPath)) ??
    config.auto
  );
}
/**
 * Produces the system-prompt snippet that tells the model how TTS is configured.
 *
 * @returns undefined when auto mode is "off"; otherwise newline-joined guidance that
 *   includes a mode-specific hint for "inbound"/"tagged", the preferred maximum spoken
 *   length, and whether auto-summary is on.
 */
export function buildTtsSystemPromptHint(cfg: OpenClawConfig): string | undefined {
  const config = resolveTtsConfig(cfg);
  const prefsPath = resolveTtsPrefsPath(config);
  const autoMode = resolveTtsAutoMode({ config, prefsPath });
  if (autoMode === "off") {
    return undefined;
  }

  const maxLength = getTtsMaxLength(prefsPath);
  const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";

  const lines: string[] = ["Voice (TTS) is enabled."];
  if (autoMode === "inbound") {
    lines.push("Only use TTS when the user's last message includes audio/voice.");
  } else if (autoMode === "tagged") {
    lines.push("Only use TTS when you include [[tts]] or [[tts:text]] tags.");
  }
  lines.push(
    `Keep spoken text ≤${maxLength} chars to avoid auto-summary (summary ${summarize}).`,
    "Use [[tts:...]] and optional [[tts:text]]...[[/tts:text]] to control voice/expressiveness.",
  );
  return lines.join("\n");
}
/**
 * Loads the TTS preferences file.
 *
 * Never throws: a missing file, unreadable file, or invalid JSON all yield `{}`.
 * Valid JSON that is not a plain object (`null`, a primitive, an array) is also treated as
 * empty, so callers can safely dereference `prefs.tts` and updatePrefs never mutates or
 * re-serializes a non-object value.
 *
 * @param prefsPath Absolute path of the preferences JSON file.
 * @returns The parsed preferences object, or `{}` when nothing usable is stored.
 */
function readPrefs(prefsPath: string): TtsUserPrefs {
  try {
    if (!existsSync(prefsPath)) {
      return {};
    }
    const parsed: unknown = JSON.parse(readFileSync(prefsPath, "utf8"));
    // JSON.parse happily returns null/numbers/strings/arrays; none of those are prefs.
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
      return {};
    }
    return parsed as TtsUserPrefs;
  } catch {
    return {};
  }
}
/**
 * Writes `content` to `filePath` via a uniquely-named sibling temp file followed by a rename,
 * so readers never observe a half-written target.
 *
 * If either the temp write or the rename fails, the temp file is removed (best effort) and
 * the original error is rethrown. Previously only a failed rename triggered the cleanup, so
 * a failed or partial write could leave an orphaned `.tmp.*` file next to the target.
 *
 * @param filePath Destination path; its parent directory must already exist.
 * @param content  Text to write.
 * @throws Whatever `writeFileSync` / `renameSync` throws.
 */
function atomicWriteFileSync(filePath: string, content: string): void {
  const tmpPath = `${filePath}.tmp.${Date.now()}.${Math.random().toString(36).slice(2)}`;
  try {
    writeFileSync(tmpPath, content);
    renameSync(tmpPath, filePath);
  } catch (err) {
    try {
      unlinkSync(tmpPath);
    } catch {
      // ignore: the temp file may never have been created
    }
    throw err;
  }
}
/**
 * Read-modify-write helper for the preferences file.
 *
 * Loads the current preferences, lets `update` mutate them in place, ensures the parent
 * directory exists, then persists the result as pretty-printed JSON via an atomic write.
 */
function updatePrefs(prefsPath: string, update: (prefs: TtsUserPrefs) => void): void {
  const current = readPrefs(prefsPath);
  update(current);
  const parentDir = path.dirname(prefsPath);
  mkdirSync(parentDir, { recursive: true });
  atomicWriteFileSync(prefsPath, JSON.stringify(current, null, 2));
}
/**
 * Reports whether TTS is active, i.e. the effective auto mode is anything other than "off".
 */
export function isTtsEnabled(
  config: ResolvedTtsConfig,
  prefsPath: string,
  sessionAuto?: string,
): boolean {
  const effectiveMode = resolveTtsAutoMode({ config, prefsPath, sessionAuto });
  return effectiveMode !== "off";
}
/**
 * Persists the auto mode in user preferences and drops the legacy `enabled` flag so the
 * two settings can never disagree.
 */
export function setTtsAutoMode(prefsPath: string, mode: TtsAutoMode): void {
  updatePrefs(prefsPath, (prefs) => {
    const updated: NonNullable<TtsUserPrefs["tts"]> = Object.assign({}, prefs.tts, {
      auto: mode,
    });
    delete updated.enabled;
    prefs.tts = updated;
  });
}
/**
 * Boolean convenience wrapper around setTtsAutoMode: true stores "always", false stores "off".
 */
export function setTtsEnabled(prefsPath: string, enabled: boolean): void {
  const mode: TtsAutoMode = enabled ? "always" : "off";
  setTtsAutoMode(prefsPath, mode);
}
/**
 * Picks the primary TTS provider.
 *
 * Precedence: provider stored in user preferences, then a provider explicitly named in the
 * config, then the first of OpenAI / ElevenLabs that has an API key, and finally Edge.
 */
export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): TtsProvider {
  const preferred = readPrefs(prefsPath).tts?.provider;
  if (preferred) {
    return preferred;
  }
  if (config.providerSource === "config") {
    return config.provider;
  }
  // No explicit choice anywhere: auto-detect from available credentials.
  const keyedProviders = ["openai", "elevenlabs"] as const;
  const detected = keyedProviders.find((candidate) => resolveTtsApiKey(config, candidate));
  return detected ?? "edge";
}
/**
 * Stores the preferred provider in user preferences, leaving other TTS preferences intact.
 */
export function setTtsProvider(prefsPath: string, provider: TtsProvider): void {
  updatePrefs(prefsPath, (prefs) => {
    const section = prefs.tts;
    prefs.tts = { ...section, provider };
  });
}
/**
 * Returns the user's preferred maximum spoken-text length, or the built-in default when unset.
 */
export function getTtsMaxLength(prefsPath: string): number {
  const { tts } = readPrefs(prefsPath);
  return tts?.maxLength ?? DEFAULT_TTS_MAX_LENGTH;
}
/**
 * Stores the preferred maximum spoken-text length, leaving other TTS preferences intact.
 */
export function setTtsMaxLength(prefsPath: string, maxLength: number): void {
  updatePrefs(prefsPath, (prefs) => {
    const section = prefs.tts;
    prefs.tts = { ...section, maxLength };
  });
}
/**
 * Returns the user's auto-summary preference, or the built-in default when unset.
 */
export function isSummarizationEnabled(prefsPath: string): boolean {
  const { tts } = readPrefs(prefsPath);
  return tts?.summarize ?? DEFAULT_TTS_SUMMARIZE;
}
/**
 * Stores the auto-summary preference, leaving other TTS preferences intact.
 */
export function setSummarizationEnabled(prefsPath: string, enabled: boolean): void {
  updatePrefs(prefsPath, (prefs) => {
    const section = prefs.tts;
    prefs.tts = { ...section, summarize: enabled };
  });
}
/**
 * Returns the most recently recorded TTS attempt, or undefined if none has been recorded
 * (or it was cleared) in this process.
 */
export function getLastTtsAttempt(): TtsStatusEntry | undefined {
return lastTtsAttempt;
}
/**
 * Records (or, with undefined, clears) the module-level "last TTS attempt" entry.
 */
export function setLastTtsAttempt(entry: TtsStatusEntry | undefined): void {
lastTtsAttempt = entry;
}
/**
 * Chooses the audio output profile for a channel: the Telegram profile for "telegram",
 * the default profile for everything else (including no channel).
 */
function resolveOutputFormat(channelId?: string | null) {
  return channelId === "telegram" ? TELEGRAM_OUTPUT : DEFAULT_OUTPUT;
}
/**
 * Normalizes a channel name to a channel id; a missing or empty name yields null.
 */
function resolveChannelId(channel: string | undefined): ChannelId | null {
  if (!channel) {
    return null;
  }
  return normalizeChannelId(channel);
}
/**
 * Returns the Edge TTS output format from the resolved config.
 */
function resolveEdgeOutputFormat(config: ResolvedTtsConfig): string {
  const { outputFormat } = config.edge;
  return outputFormat;
}
/**
 * Looks up the API key for a provider.
 *
 * - elevenlabs: config key, then `ELEVENLABS_API_KEY`, then `XI_API_KEY`.
 * - openai: config key, then `OPENAI_API_KEY`.
 * - any other provider (e.g. edge): undefined, since no key is needed.
 *
 * Empty strings are treated as "not set" at every step.
 */
export function resolveTtsApiKey(
  config: ResolvedTtsConfig,
  provider: TtsProvider,
): string | undefined {
  switch (provider) {
    case "elevenlabs":
      return config.elevenlabs.apiKey || process.env.ELEVENLABS_API_KEY || process.env.XI_API_KEY;
    case "openai":
      return config.openai.apiKey || process.env.OPENAI_API_KEY;
    default:
      return undefined;
  }
}
/** All supported providers, in default fallback order. */
export const TTS_PROVIDERS = ["openai", "elevenlabs", "edge"] as const;

/**
 * Builds the provider attempt order: the primary provider first, followed by the remaining
 * providers in their default order.
 */
export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] {
  const fallbacks = TTS_PROVIDERS.filter((candidate) => candidate !== primary);
  return [primary, ...fallbacks];
}
/**
 * Reports whether a provider is usable: Edge needs only to be enabled, while the keyed
 * providers need a resolvable API key.
 */
export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: TtsProvider): boolean {
  return provider === "edge" ? config.edge.enabled : Boolean(resolveTtsApiKey(config, provider));
}
/**
 * Validates an ElevenLabs voice id: 10 to 40 ASCII letters or digits, nothing else.
 * The id is interpolated into the request URL by elevenLabsTTS.
 */
function isValidVoiceId(voiceId: string): boolean {
  const { length } = voiceId;
  if (length < 10 || length > 40) {
    return false;
  }
  return /^[a-zA-Z0-9]+$/.test(voiceId);
}
/**
 * Trims the ElevenLabs base URL and strips trailing slashes; a blank value falls back to
 * the default base URL.
 */
function normalizeElevenLabsBaseUrl(baseUrl: string): string {
  const candidate = baseUrl.trim();
  return candidate ? candidate.replace(/\/+$/, "") : DEFAULT_ELEVENLABS_BASE_URL;
}
/**
 * Asserts that `value` is a finite number within `[min, max]` (inclusive).
 *
 * @throws Error `"<label> must be between <min> and <max>"` when the check fails.
 */
function requireInRange(value: number, min: number, max: number, label: string): void {
  const withinBounds = Number.isFinite(value) && value >= min && value <= max;
  if (withinBounds) {
    return;
  }
  throw new Error(`${label} must be between ${min} and ${max}`);
}
/**
 * Validates ElevenLabs voice settings before a request is sent.
 * stability, similarityBoost and style must lie in [0, 1]; speed must lie in [0.5, 2].
 * Throws on the first setting that is out of range.
 */
function assertElevenLabsVoiceSettings(settings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]) {
  const bounds: Array<[value: number, min: number, max: number, label: string]> = [
    [settings.stability, 0, 1, "stability"],
    [settings.similarityBoost, 0, 1, "similarityBoost"],
    [settings.style, 0, 1, "style"],
    [settings.speed, 0.5, 2, "speed"],
  ];
  for (const [value, min, max, label] of bounds) {
    requireInRange(value, min, max, label);
  }
}
/**
 * Normalizes an optional language code to lower case.
 *
 * @returns undefined for a missing or blank value, otherwise the lower-cased two-letter code.
 * @throws Error when the value is not exactly two ASCII letters.
 */
function normalizeLanguageCode(code?: string): string | undefined {
  const lowered = code?.trim().toLowerCase();
  if (!lowered) {
    return undefined;
  }
  if (/^[a-z]{2}$/.test(lowered)) {
    return lowered;
  }
  throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)");
}
/**
 * Normalizes the ElevenLabs text-normalization mode.
 *
 * @returns undefined for a missing or blank value, otherwise "auto", "on" or "off"
 *   (matched case-insensitively after trimming).
 * @throws Error for any other value.
 */
function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined {
  const lowered = mode?.trim().toLowerCase();
  if (!lowered) {
    return undefined;
  }
  switch (lowered) {
    case "auto":
    case "on":
    case "off":
      return lowered;
    default:
      throw new Error("applyTextNormalization must be one of: auto, on, off");
  }
}
/**
 * Normalizes an optional seed to an integer in the unsigned 32-bit range.
 *
 * @returns undefined when no seed is given, otherwise the seed rounded down.
 * @throws Error when the rounded value is not finite or falls outside [0, 4294967295].
 */
function normalizeSeed(seed?: number): number | undefined {
  if (seed === undefined || seed === null) {
    return undefined;
  }
  const floored = Math.floor(seed);
  const acceptable = Number.isFinite(floored) && floored >= 0 && floored <= 4_294_967_295;
  if (!acceptable) {
    throw new Error("seed must be between 0 and 4294967295");
  }
  return floored;
}
/**
 * Parses a human-friendly boolean token (case-insensitive, surrounding whitespace ignored).
 *
 * @returns true for true/1/yes/on, false for false/0/no/off, undefined for anything else.
 */
function parseBooleanValue(value: string): boolean | undefined {
  switch (value.trim().toLowerCase()) {
    case "true":
    case "1":
    case "yes":
    case "on":
      return true;
    case "false":
    case "0":
    case "no":
    case "off":
      return false;
    default:
      return undefined;
  }
}
/**
 * Parses a decimal number with `Number.parseFloat` semantics (a leading numeric prefix is
 * accepted), returning undefined when the result is not a finite number.
 */
function parseNumberValue(value: string): number | undefined {
  const candidate = Number.parseFloat(value);
  if (Number.isFinite(candidate)) {
    return candidate;
  }
  return undefined;
}
/**
 * Extracts inline TTS directives from `text` and returns the text with them removed.
 *
 * Two tag forms are recognized (case-insensitive):
 * - `[[tts:text]]...[[/tts:text]]`: the first block's trimmed content becomes the text to
 *   speak (when `policy.allowText`); every such block is stripped from the cleaned text.
 * - `[[tts:key=value key=value ...]]`: whitespace-separated `key=value` tokens that set
 *   provider/voice/model/voice-setting overrides, each gated by the matching `policy.allow*`
 *   flag. Tokens without `=`, with an empty key or value, or with an unknown key are ignored.
 *
 * Invalid values never throw out of this function; they are reported in `warnings`.
 * When `policy.enabled` is false the text is returned untouched with no overrides.
 *
 * @param text   Raw text that may contain directive tags.
 * @param policy Which overrides directives are allowed to set.
 * @returns Cleaned text, collected overrides, warnings, and whether any tag was seen.
 */
function parseTtsDirectives(
text: string,
policy: ResolvedTtsModelOverrides,
): TtsDirectiveParseResult {
if (!policy.enabled) {
return { cleanedText: text, overrides: {}, warnings: [], hasDirective: false };
}
const overrides: TtsDirectiveOverrides = {};
const warnings: string[] = [];
let cleanedText = text;
let hasDirective = false;
// Pass 1: strip [[tts:text]] blocks first so their content is not scanned for key=value tags.
const blockRegex = /\[\[tts:text\]\]([\s\S]*?)\[\[\/tts:text\]\]/gi;
cleanedText = cleanedText.replace(blockRegex, (_match, inner: string) => {
hasDirective = true;
// Only the first block supplies the spoken text; later blocks are removed but ignored.
if (policy.allowText && overrides.ttsText == null) {
overrides.ttsText = inner.trim();
}
return "";
});
// Pass 2: strip and parse the remaining [[tts:...]] parameter tags.
const directiveRegex = /\[\[tts:([^\]]+)\]\]/gi;
cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => {
hasDirective = true;
const tokens = body.split(/\s+/).filter(Boolean);
for (const token of tokens) {
const eqIndex = token.indexOf("=");
if (eqIndex === -1) {
continue;
}
const rawKey = token.slice(0, eqIndex).trim();
const rawValue = token.slice(eqIndex + 1).trim();
if (!rawKey || !rawValue) {
continue;
}
// Keys are case-insensitive; values are used as written.
const key = rawKey.toLowerCase();
try {
switch (key) {
case "provider":
if (!policy.allowProvider) {
break;
}
if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
overrides.provider = rawValue;
} else {
warnings.push(`unsupported provider "${rawValue}"`);
}
break;
case "voice":
case "openai_voice":
case "openaivoice":
if (!policy.allowVoice) {
break;
}
if (isValidOpenAIVoice(rawValue)) {
overrides.openai = { ...overrides.openai, voice: rawValue };
} else {
warnings.push(`invalid OpenAI voice "${rawValue}"`);
}
break;
case "voiceid":
case "voice_id":
case "elevenlabs_voice":
case "elevenlabsvoice":
if (!policy.allowVoice) {
break;
}
if (isValidVoiceId(rawValue)) {
overrides.elevenlabs = { ...overrides.elevenlabs, voiceId: rawValue };
} else {
warnings.push(`invalid ElevenLabs voiceId "${rawValue}"`);
}
break;
case "model":
case "modelid":
case "model_id":
case "elevenlabs_model":
case "elevenlabsmodel":
case "openai_model":
case "openaimodel":
if (!policy.allowModelId) {
break;
}
// The key does not pick the provider: a value accepted by isValidOpenAIModel is stored
// as the OpenAI model, anything else as the ElevenLabs model id.
if (isValidOpenAIModel(rawValue)) {
overrides.openai = { ...overrides.openai, model: rawValue };
} else {
overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue };
}
break;
case "stability":
if (!policy.allowVoiceSettings) {
break;
}
{
const value = parseNumberValue(rawValue);
if (value == null) {
warnings.push("invalid stability value");
break;
}
// Out-of-range values throw and are turned into a warning by the catch below.
requireInRange(value, 0, 1, "stability");
overrides.elevenlabs = {
...overrides.elevenlabs,
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, stability: value },
};
}
break;
case "similarity":
case "similarityboost":
case "similarity_boost":
if (!policy.allowVoiceSettings) {
break;
}
{
const value = parseNumberValue(rawValue);
if (value == null) {
warnings.push("invalid similarityBoost value");
break;
}
requireInRange(value, 0, 1, "similarityBoost");
overrides.elevenlabs = {
...overrides.elevenlabs,
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, similarityBoost: value },
};
}
break;
case "style":
if (!policy.allowVoiceSettings) {
break;
}
{
const value = parseNumberValue(rawValue);
if (value == null) {
warnings.push("invalid style value");
break;
}
requireInRange(value, 0, 1, "style");
overrides.elevenlabs = {
...overrides.elevenlabs,
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, style: value },
};
}
break;
case "speed":
if (!policy.allowVoiceSettings) {
break;
}
{
const value = parseNumberValue(rawValue);
if (value == null) {
warnings.push("invalid speed value");
break;
}
requireInRange(value, 0.5, 2, "speed");
overrides.elevenlabs = {
...overrides.elevenlabs,
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, speed: value },
};
}
break;
case "speakerboost":
case "speaker_boost":
case "usespeakerboost":
case "use_speaker_boost":
if (!policy.allowVoiceSettings) {
break;
}
{
const value = parseBooleanValue(rawValue);
if (value == null) {
warnings.push("invalid useSpeakerBoost value");
break;
}
overrides.elevenlabs = {
...overrides.elevenlabs,
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, useSpeakerBoost: value },
};
}
break;
case "normalize":
case "applytextnormalization":
case "apply_text_normalization":
if (!policy.allowNormalization) {
break;
}
overrides.elevenlabs = {
...overrides.elevenlabs,
applyTextNormalization: normalizeApplyTextNormalization(rawValue),
};
break;
case "language":
case "languagecode":
case "language_code":
// Language overrides share the allowNormalization permission; there is no separate flag.
if (!policy.allowNormalization) {
break;
}
overrides.elevenlabs = {
...overrides.elevenlabs,
languageCode: normalizeLanguageCode(rawValue),
};
break;
case "seed":
if (!policy.allowSeed) {
break;
}
overrides.elevenlabs = {
...overrides.elevenlabs,
seed: normalizeSeed(Number.parseInt(rawValue, 10)),
};
break;
default:
break;
}
} catch (err) {
// Validation helpers throw on bad input; record the message and keep parsing other tokens.
warnings.push((err as Error).message);
}
}
return "";
});
return {
cleanedText,
ttsText: overrides.ttsText,
hasDirective,
overrides,
warnings,
};
}
/** Model ids accepted for the official OpenAI endpoint (see isValidOpenAIModel). */
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const;
/**
 * Resolves the base URL for OpenAI-compatible TTS requests.
 *
 * `OPENAI_TTS_BASE_URL` may point at a custom OpenAI-compatible endpoint
 * (example: OPENAI_TTS_BASE_URL=http://localhost:8880/v1); when it does, model/voice
 * validation is relaxed to allow non-OpenAI models. A blank or unset variable selects the
 * official endpoint. Trailing slashes are removed.
 *
 * Note: the variable is read on every call (not at module load) to support config.env loading.
 */
function getOpenAITtsBaseUrl(): string {
  const configured = process.env.OPENAI_TTS_BASE_URL?.trim();
  const baseUrl = configured || "https://api.openai.com/v1";
  return baseUrl.replace(/\/+$/, "");
}
/**
 * True when the resolved OpenAI TTS base URL differs from the official endpoint.
 */
function isCustomOpenAIEndpoint(): boolean {
  const officialBaseUrl = "https://api.openai.com/v1";
  return getOpenAITtsBaseUrl() !== officialBaseUrl;
}
/** Voice names accepted for the official OpenAI endpoint (see isValidOpenAIVoice). */
export const OPENAI_TTS_VOICES = [
"alloy",
"ash",
"coral",
"echo",
"fable",
"onyx",
"nova",
"sage",
"shimmer",
] as const;
// Union of the literal voice names listed above.
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
/**
 * Checks whether `model` may be sent to the OpenAI-compatible endpoint.
 * Any model is accepted on a custom endpoint (e.g., Kokoro, LocalAI); otherwise the model
 * must be one of the known OpenAI TTS models.
 */
function isValidOpenAIModel(model: string): boolean {
  const knownModels: readonly string[] = OPENAI_TTS_MODELS;
  return isCustomOpenAIEndpoint() || knownModels.includes(model);
}
/**
 * Checks whether `voice` may be sent to the OpenAI-compatible endpoint.
 * Any voice is accepted on a custom endpoint (e.g., Kokoro Chinese voices); otherwise the
 * voice must be one of the known OpenAI voices.
 *
 * NOTE(review): on a custom endpoint the type predicate can narrow an arbitrary string to
 * OpenAiTtsVoice; callers should not rely on the narrowed literal type.
 */
function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice {
  const knownVoices: readonly string[] = OPENAI_TTS_VOICES;
  return isCustomOpenAIEndpoint() || knownVoices.includes(voice);
}
/** Result of summarizeText: the summary plus timing and character-count bookkeeping. */
type SummarizeResult = {
summary: string;
latencyMs: number;
// Character length of the original text.
inputLength: number;
// Character length of the produced summary.
outputLength: number;
};
/** Model chosen for summarization and whether it came from `summaryModel` or the agent default. */
type SummaryModelSelection = {
ref: ModelRef;
source: "summaryModel" | "default";
};
/**
 * Selects the model used to summarize over-long text.
 *
 * Uses `config.summaryModel` when it is set and resolves to a model reference (aliases are
 * resolved against the default model's provider); otherwise falls back to the agent's
 * default model.
 */
function resolveSummaryModelRef(
  cfg: OpenClawConfig,
  config: ResolvedTtsConfig,
): SummaryModelSelection {
  const fallback: SummaryModelSelection = {
    ref: resolveDefaultModelForAgent({ cfg }),
    source: "default",
  };
  const requested = config.summaryModel?.trim();
  if (!requested) {
    return fallback;
  }
  const defaultProvider = fallback.ref.provider;
  const match = resolveModelRefFromString({
    raw: requested,
    defaultProvider,
    aliasIndex: buildModelAliasIndex({ cfg, defaultProvider }),
  });
  return match ? { ref: match.ref, source: "summaryModel" } : fallback;
}
/** Type guard that keeps only content blocks tagged as text. */
function isTextContentBlock(block: { type: string }): block is TextContent {
  const { type } = block;
  return type === "text";
}
/**
 * Asks an LLM to condense `text` to roughly `targetLength` characters.
 *
 * @param params.text         Text to summarize.
 * @param params.targetLength Desired summary length in characters; must be within [100, 10000].
 * @param params.cfg          Application config used to resolve the model and its API key.
 * @param params.config       Resolved TTS config (source of the optional summary model).
 * @param params.timeoutMs    Abort the completion request after this many milliseconds.
 * @returns The summary with latency and input/output lengths.
 * @throws Error for an invalid target length, an unknown model, a missing API key, an empty
 *   model response, or a timeout ("Summarization timed out", with the abort as `cause`).
 */
async function summarizeText(params: {
  text: string;
  targetLength: number;
  cfg: OpenClawConfig;
  config: ResolvedTtsConfig;
  timeoutMs: number;
}): Promise<SummarizeResult> {
  const { text, targetLength, cfg, config, timeoutMs } = params;
  if (targetLength < 100 || targetLength > 10_000) {
    throw new Error(`Invalid targetLength: ${targetLength}`);
  }

  const startTime = Date.now();
  const { ref } = resolveSummaryModelRef(cfg, config);
  const resolved = resolveModel(ref.provider, ref.model, undefined, cfg);
  if (!resolved.model) {
    throw new Error(resolved.error ?? `Unknown summary model: ${ref.provider}/${ref.model}`);
  }
  const apiKey = requireApiKey(
    await getApiKeyForModel({ model: resolved.model, cfg }),
    ref.provider,
  );

  const prompt =
    `You are an assistant that summarizes texts concisely while keeping the most important information. ` +
    `Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style. ` +
    `Reply only with the summary, without additional explanations.\n\n` +
    `<text_to_summarize>\n${text}\n</text_to_summarize>`;

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const response = await completeSimple(
      resolved.model,
      {
        messages: [
          {
            role: "user",
            content: prompt,
            timestamp: Date.now(),
          },
        ],
      },
      {
        apiKey,
        maxTokens: Math.ceil(targetLength / 2),
        temperature: 0.3,
        signal: controller.signal,
      },
    );

    // Join every non-empty text block of the reply into a single line.
    const pieces: string[] = [];
    for (const block of response.content) {
      if (!isTextContentBlock(block)) {
        continue;
      }
      const piece = block.text.trim();
      if (piece) {
        pieces.push(piece);
      }
    }
    const summary = pieces.join(" ").trim();
    if (!summary) {
      throw new Error("No summary returned");
    }
    return {
      summary,
      latencyMs: Date.now() - startTime,
      inputLength: text.length,
      outputLength: summary.length,
    };
  } catch (err) {
    // Surface our own timeout as a descriptive error; pass everything else through.
    if ((err as Error).name === "AbortError") {
      throw new Error("Summarization timed out", { cause: err });
    }
    throw err;
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Schedules best-effort recursive removal of a temp directory after `delayMs`.
 * The timer is unref'd so a pending cleanup does not keep the process alive; removal
 * errors are ignored.
 */
function scheduleCleanup(tempDir: string, delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS): void {
  const removeDirectory = (): void => {
    try {
      rmSync(tempDir, { recursive: true, force: true });
    } catch {
      // ignore cleanup errors
    }
  };
  setTimeout(removeDirectory, delayMs).unref();
}
/**
 * Synthesizes speech through the ElevenLabs text-to-speech HTTP API.
 *
 * All inputs are validated before any network activity: the voice id format, the voice
 * setting ranges, the language code, the text-normalization mode and the seed.
 *
 * @returns The raw audio bytes of the response body.
 * @throws Error on invalid input, on a non-2xx response (`ElevenLabs API error (<status>)`),
 *   or an abort error when `timeoutMs` elapses.
 */
async function elevenLabsTTS(params: {
  text: string;
  apiKey: string;
  baseUrl: string;
  voiceId: string;
  modelId: string;
  outputFormat: string;
  seed?: number;
  applyTextNormalization?: "auto" | "on" | "off";
  languageCode?: string;
  voiceSettings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"];
  timeoutMs: number;
}): Promise<Buffer> {
  if (!isValidVoiceId(params.voiceId)) {
    throw new Error("Invalid voiceId format");
  }
  assertElevenLabsVoiceSettings(params.voiceSettings);
  const languageCode = normalizeLanguageCode(params.languageCode);
  const textNormalization = normalizeApplyTextNormalization(params.applyTextNormalization);
  const seed = normalizeSeed(params.seed);

  const abort = new AbortController();
  const timer = setTimeout(() => abort.abort(), params.timeoutMs);
  try {
    const endpoint = new URL(
      `${normalizeElevenLabsBaseUrl(params.baseUrl)}/v1/text-to-speech/${params.voiceId}`,
    );
    if (params.outputFormat) {
      endpoint.searchParams.set("output_format", params.outputFormat);
    }

    // Undefined optional fields are dropped by JSON.stringify, so they are simply not sent.
    const payload = {
      text: params.text,
      model_id: params.modelId,
      seed,
      apply_text_normalization: textNormalization,
      language_code: languageCode,
      voice_settings: {
        stability: params.voiceSettings.stability,
        similarity_boost: params.voiceSettings.similarityBoost,
        style: params.voiceSettings.style,
        use_speaker_boost: params.voiceSettings.useSpeakerBoost,
        speed: params.voiceSettings.speed,
      },
    };

    const response = await fetch(endpoint.toString(), {
      method: "POST",
      headers: {
        "xi-api-key": params.apiKey,
        "Content-Type": "application/json",
        Accept: "audio/mpeg",
      },
      body: JSON.stringify(payload),
      signal: abort.signal,
    });
    if (!response.ok) {
      throw new Error(`ElevenLabs API error (${response.status})`);
    }
    const bytes = await response.arrayBuffer();
    return Buffer.from(bytes);
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Synthesizes speech through an OpenAI-compatible `/audio/speech` endpoint.
 *
 * The model and voice are validated first; both checks are relaxed when a custom base URL
 * is configured.
 *
 * @returns The raw audio bytes of the response body.
 * @throws Error for an invalid model/voice, on a non-2xx response
 *   (`OpenAI TTS API error (<status>)`), or an abort error when `timeoutMs` elapses.
 */
async function openaiTTS(params: {
  text: string;
  apiKey: string;
  model: string;
  voice: string;
  responseFormat: "mp3" | "opus" | "pcm";
  timeoutMs: number;
}): Promise<Buffer> {
  if (!isValidOpenAIModel(params.model)) {
    throw new Error(`Invalid model: ${params.model}`);
  }
  if (!isValidOpenAIVoice(params.voice)) {
    throw new Error(`Invalid voice: ${params.voice}`);
  }

  const abort = new AbortController();
  const timer = setTimeout(() => abort.abort(), params.timeoutMs);
  try {
    const payload = {
      model: params.model,
      input: params.text,
      voice: params.voice,
      response_format: params.responseFormat,
    };
    const response = await fetch(`${getOpenAITtsBaseUrl()}/audio/speech`, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${params.apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify(payload),
      signal: abort.signal,
    });
    if (!response.ok) {
      throw new Error(`OpenAI TTS API error (${response.status})`);
    }
    const bytes = await response.arrayBuffer();
    return Buffer.from(bytes);
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Derives a file extension from an Edge TTS output-format name.
 *
 * Rules are checked in order against the lower-cased format, so a container name wins over
 * a codec name (e.g. "webm-...-opus" maps to ".webm"). Anything unrecognized maps to ".mp3".
 */
function inferEdgeExtension(outputFormat: string): string {
  const rules: ReadonlyArray<readonly [keywords: readonly string[], extension: string]> = [
    [["webm"], ".webm"],
    [["ogg"], ".ogg"],
    [["opus"], ".opus"],
    [["wav", "riff", "pcm"], ".wav"],
  ];
  const lowered = outputFormat.toLowerCase();
  const matched = rules.find(([keywords]) => keywords.some((word) => lowered.includes(word)));
  return matched ? matched[1] : ".mp3";
}
/**
 * Synthesizes speech with Edge TTS and writes the audio to `outputPath`.
 * The Edge-specific timeout is used when configured; otherwise the general `timeoutMs`.
 */
async function edgeTTS(params: {
  text: string;
  outputPath: string;
  config: ResolvedTtsConfig["edge"];
  timeoutMs: number;
}): Promise<void> {
  const edge = params.config;
  const effectiveTimeout = edge.timeoutMs ?? params.timeoutMs;
  const synthesizer = new EdgeTTS({
    voice: edge.voice,
    lang: edge.lang,
    outputFormat: edge.outputFormat,
    saveSubtitles: edge.saveSubtitles,
    proxy: edge.proxy,
    rate: edge.rate,
    pitch: edge.pitch,
    volume: edge.volume,
    timeout: effectiveTimeout,
  });
  await synthesizer.ttsPromise(params.text, params.outputPath);
}
/**
 * Converts text to an audio file, trying providers in priority order until one succeeds.
 *
 * The primary provider is the directive override if present, otherwise the user/config
 * selection; the remaining providers act as fallbacks. A provider that is disabled, has no
 * API key, or fails is skipped, and its error is remembered in case every provider fails.
 *
 * On success the audio lives in a fresh temp directory that is removed automatically after
 * a delay. If audio cannot be produced or written, the temp directory is removed right away
 * instead of being left behind.
 *
 * @param params.text      Text to speak; rejected when longer than `maxTextLength`.
 * @param params.cfg       Application config.
 * @param params.prefsPath Optional preferences file path (defaults to the resolved location).
 * @param params.channel   Optional channel name; selects the audio output profile.
 * @param params.overrides Optional per-request overrides from inline directives.
 * @returns A result object; this function reports failures via `success: false` rather than
 *   throwing for provider errors.
 */
export async function textToSpeech(params: {
  text: string;
  cfg: OpenClawConfig;
  prefsPath?: string;
  channel?: string;
  overrides?: TtsDirectiveOverrides;
}): Promise<TtsResult> {
  const config = resolveTtsConfig(params.cfg);
  const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
  const channelId = resolveChannelId(params.channel);
  const output = resolveOutputFormat(channelId);
  if (params.text.length > config.maxTextLength) {
    return {
      success: false,
      error: `Text too long (${params.text.length} chars, max ${config.maxTextLength})`,
    };
  }

  // Best-effort removal of a temp directory whose audio will never reach the caller.
  const discardTempDir = (dir: string): void => {
    try {
      rmSync(dir, { recursive: true, force: true });
    } catch {
      // ignore cleanup errors
    }
  };

  const primaryProvider = params.overrides?.provider ?? getTtsProvider(config, prefsPath);
  let lastError: string | undefined;

  for (const provider of resolveTtsProviderOrder(primaryProvider)) {
    const providerStart = Date.now();
    try {
      if (provider === "edge") {
        if (!config.edge.enabled) {
          lastError = "edge: disabled";
          continue;
        }
        const tempDir = mkdtempSync(path.join(tmpdir(), "tts-"));
        const primaryFormat = resolveEdgeOutputFormat(config);
        // A custom format may be unsupported by the service; retry once with the default.
        const fallbackFormat =
          primaryFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
        const attemptEdgeTts = async (outputFormat: string) => {
          const extension = inferEdgeExtension(outputFormat);
          const audioPath = path.join(tempDir, `voice-${Date.now()}${extension}`);
          await edgeTTS({
            text: params.text,
            outputPath: audioPath,
            config: {
              ...config.edge,
              outputFormat,
            },
            timeoutMs: config.timeoutMs,
          });
          return { audioPath, outputFormat };
        };
        const synthesizeWithFallback = async () => {
          try {
            return await attemptEdgeTts(primaryFormat);
          } catch (err) {
            if (!fallbackFormat) {
              throw err;
            }
            logVerbose(
              `TTS: Edge output ${primaryFormat} failed; retrying with ${fallbackFormat}.`,
            );
            return await attemptEdgeTts(fallbackFormat);
          }
        };

        let edgeResult: { audioPath: string; outputFormat: string };
        try {
          edgeResult = await synthesizeWithFallback();
        } catch (err) {
          discardTempDir(tempDir);
          throw err;
        }
        scheduleCleanup(tempDir);
        const voiceCompatible = isVoiceCompatibleAudio({ fileName: edgeResult.audioPath });
        return {
          success: true,
          audioPath: edgeResult.audioPath,
          latencyMs: Date.now() - providerStart,
          provider,
          outputFormat: edgeResult.outputFormat,
          voiceCompatible,
        };
      }

      const apiKey = resolveTtsApiKey(config, provider);
      if (!apiKey) {
        lastError = `No API key for ${provider}`;
        continue;
      }

      let audioBuffer: Buffer;
      if (provider === "elevenlabs") {
        const elevenOverrides = params.overrides?.elevenlabs;
        audioBuffer = await elevenLabsTTS({
          text: params.text,
          apiKey,
          baseUrl: config.elevenlabs.baseUrl,
          voiceId: elevenOverrides?.voiceId ?? config.elevenlabs.voiceId,
          modelId: elevenOverrides?.modelId ?? config.elevenlabs.modelId,
          outputFormat: output.elevenlabs,
          seed: elevenOverrides?.seed ?? config.elevenlabs.seed,
          applyTextNormalization:
            elevenOverrides?.applyTextNormalization ?? config.elevenlabs.applyTextNormalization,
          languageCode: elevenOverrides?.languageCode ?? config.elevenlabs.languageCode,
          // Directive settings are layered over the configured ones field by field.
          voiceSettings: {
            ...config.elevenlabs.voiceSettings,
            ...elevenOverrides?.voiceSettings,
          },
          timeoutMs: config.timeoutMs,
        });
      } else {
        const openaiOverrides = params.overrides?.openai;
        audioBuffer = await openaiTTS({
          text: params.text,
          apiKey,
          model: openaiOverrides?.model ?? config.openai.model,
          voice: openaiOverrides?.voice ?? config.openai.voice,
          responseFormat: output.openai,
          timeoutMs: config.timeoutMs,
        });
      }

      // Latency covers synthesis only, not the local file write.
      const latencyMs = Date.now() - providerStart;
      const tempDir = mkdtempSync(path.join(tmpdir(), "tts-"));
      const audioPath = path.join(tempDir, `voice-${Date.now()}${output.extension}`);
      try {
        writeFileSync(audioPath, audioBuffer);
      } catch (err) {
        // Without this the directory would leak: no cleanup has been scheduled yet.
        discardTempDir(tempDir);
        throw err;
      }
      scheduleCleanup(tempDir);
      return {
        success: true,
        audioPath,
        latencyMs,
        provider,
        outputFormat: provider === "openai" ? output.openai : output.elevenlabs,
        voiceCompatible: output.voiceCompatible,
      };
    } catch (err) {
      // Providers may reject with non-Error values (e.g. a plain string); keep the message
      // readable instead of reporting "undefined".
      const error = err instanceof Error ? err : new Error(String(err));
      lastError =
        error.name === "AbortError"
          ? `${provider}: request timed out`
          : `${provider}: ${error.message}`;
    }
  }

  return {
    success: false,
    error: `TTS conversion failed: ${lastError || "no providers available"}`,
  };
}
/**
 * Synthesizes speech for telephony use and returns the audio as an in-memory buffer.
 *
 * Providers are tried in the order returned by `resolveTtsProviderOrder`; the first provider
 * that produces audio wins. Edge is reported as unsupported for telephony and providers
 * without an API key are skipped. Voice/model settings come from the resolved config only.
 *
 * @param params.text - Text to synthesize; rejected when longer than `config.maxTextLength`.
 * @param params.cfg - Application config from which the TTS config is resolved.
 * @param params.prefsPath - Optional prefs path; defaults to `resolveTtsPrefsPath(config)`.
 * @returns A success result with `audioBuffer`, `outputFormat` and `sampleRate`, or a failure
 *   result whose `error` carries the last provider failure. Errors raised while calling a
 *   provider are captured into that string rather than thrown.
 */
export async function textToSpeechTelephony(params: {
  text: string;
  cfg: OpenClawConfig;
  prefsPath?: string;
}): Promise<TtsTelephonyResult> {
  const config = resolveTtsConfig(params.cfg);
  const prefsPath = params.prefsPath ?? resolveTtsPrefsPath(config);
  if (params.text.length > config.maxTextLength) {
    return {
      success: false,
      error: `Text too long (${params.text.length} chars, max ${config.maxTextLength})`,
    };
  }
  const userProvider = getTtsProvider(config, prefsPath);
  const providers = resolveTtsProviderOrder(userProvider);
  let lastError: string | undefined;
  for (const provider of providers) {
    const providerStart = Date.now();
    try {
      if (provider === "edge") {
        lastError = "edge: unsupported for telephony";
        continue;
      }
      const apiKey = resolveTtsApiKey(config, provider);
      if (!apiKey) {
        lastError = `No API key for ${provider}`;
        continue;
      }
      if (provider === "elevenlabs") {
        const output = TELEPHONY_OUTPUT.elevenlabs;
        const audioBuffer = await elevenLabsTTS({
          text: params.text,
          apiKey,
          baseUrl: config.elevenlabs.baseUrl,
          voiceId: config.elevenlabs.voiceId,
          modelId: config.elevenlabs.modelId,
          outputFormat: output.format,
          seed: config.elevenlabs.seed,
          applyTextNormalization: config.elevenlabs.applyTextNormalization,
          languageCode: config.elevenlabs.languageCode,
          voiceSettings: config.elevenlabs.voiceSettings,
          timeoutMs: config.timeoutMs,
        });
        return {
          success: true,
          audioBuffer,
          latencyMs: Date.now() - providerStart,
          provider,
          outputFormat: output.format,
          sampleRate: output.sampleRate,
        };
      }
      const output = TELEPHONY_OUTPUT.openai;
      const audioBuffer = await openaiTTS({
        text: params.text,
        apiKey,
        model: config.openai.model,
        voice: config.openai.voice,
        responseFormat: output.format,
        timeoutMs: config.timeoutMs,
      });
      return {
        success: true,
        audioBuffer,
        latencyMs: Date.now() - providerStart,
        provider,
        outputFormat: output.format,
        sampleRate: output.sampleRate,
      };
    } catch (err: unknown) {
      // Anything can be thrown (including null/undefined or a plain string), so read
      // `name`/`message` defensively: the handler must never throw itself, otherwise the
      // remaining providers would not be tried.
      const thrown = err as { name?: unknown; message?: unknown } | null | undefined;
      if (thrown?.name === "AbortError") {
        lastError = `${provider}: request timed out`;
      } else {
        const detail = typeof thrown?.message === "string" ? thrown.message : String(err);
        lastError = `${provider}: ${detail}`;
      }
    }
  }
  return {
    success: false,
    error: `TTS conversion failed: ${lastError || "no providers available"}`,
  };
}
/**
 * Optionally attaches synthesized speech to a reply payload.
 *
 * TTS directives are parsed out of the payload text first; the returned payload carries the
 * cleaned text whenever it differs from the original. Audio is only generated when the
 * resolved auto mode, reply kind, and content checks all allow it. Text longer than the
 * preferred max length is summarized (when enabled) or truncated before synthesis.
 *
 * @param params.payload - Reply payload to inspect and possibly extend with audio.
 * @param params.cfg - Application config from which the TTS config is resolved.
 * @param params.channel - Channel name; a voice-compatible result on "telegram" is flagged
 *   as a voice message.
 * @param params.kind - Reply kind; in "final" mode only `"final"` (or unspecified) replies
 *   are spoken.
 * @param params.inboundAudio - Whether the inbound message was audio (used by "inbound" mode).
 * @param params.ttsAuto - Session-level auto-mode override passed to `resolveTtsAutoMode`.
 * @returns The original payload (auto mode "off"), the cleaned payload (TTS skipped or
 *   failed), or the cleaned payload with `mediaUrl`/`audioAsVoice` set on success.
 */
export async function maybeApplyTtsToPayload(params: {
  payload: ReplyPayload;
  cfg: OpenClawConfig;
  channel?: string;
  kind?: "tool" | "block" | "final";
  inboundAudio?: boolean;
  ttsAuto?: string;
}): Promise<ReplyPayload> {
  const config = resolveTtsConfig(params.cfg);
  const prefsPath = resolveTtsPrefsPath(config);
  const autoMode = resolveTtsAutoMode({
    config,
    prefsPath,
    sessionAuto: params.ttsAuto,
  });
  if (autoMode === "off") {
    return params.payload;
  }

  const text = params.payload.text ?? "";
  const directives = parseTtsDirectives(text, config.modelOverrides);
  if (directives.warnings.length > 0) {
    logVerbose(`TTS: ignored directive overrides (${directives.warnings.join("; ")})`);
  }

  // Both values are trimmed here, so no further trimming is needed below.
  const visibleText = directives.cleanedText.trim();
  const ttsText = directives.ttsText?.trim() || visibleText;

  // Only build a new payload when stripping directives actually changed the visible text.
  const nextPayload =
    visibleText === text.trim()
      ? params.payload
      : {
          ...params.payload,
          text: visibleText.length > 0 ? visibleText : undefined,
        };

  if (autoMode === "tagged" && !directives.hasDirective) {
    return nextPayload;
  }
  if (autoMode === "inbound" && params.inboundAudio !== true) {
    return nextPayload;
  }
  const mode = config.mode ?? "final";
  if (mode === "final" && params.kind && params.kind !== "final") {
    return nextPayload;
  }
  if (!ttsText) {
    return nextPayload;
  }
  // Payloads that already carry media are left alone.
  if (params.payload.mediaUrl || (params.payload.mediaUrls?.length ?? 0) > 0) {
    return nextPayload;
  }
  if (text.includes("MEDIA:")) {
    return nextPayload;
  }
  // Very short replies are not worth synthesizing.
  if (ttsText.length < 10) {
    return nextPayload;
  }

  // Cuts `value` so that, with the trailing ellipsis, it is `limit` characters long.
  const truncate = (value: string, limit: number): string => `${value.slice(0, limit - 3)}...`;

  const maxLength = getTtsMaxLength(prefsPath);
  let textForAudio = ttsText;
  let wasSummarized = false;
  if (textForAudio.length > maxLength) {
    if (!isSummarizationEnabled(prefsPath)) {
      // Truncate text when summarization is disabled
      logVerbose(
        `TTS: truncating long text (${textForAudio.length} > ${maxLength}), summarization disabled.`,
      );
      textForAudio = truncate(textForAudio, maxLength);
    } else {
      // Summarize text when enabled
      try {
        const summary = await summarizeText({
          text: textForAudio,
          targetLength: maxLength,
          cfg: params.cfg,
          config,
          timeoutMs: config.timeoutMs,
        });
        textForAudio = summary.summary;
        wasSummarized = true;
        if (textForAudio.length > config.maxTextLength) {
          logVerbose(
            `TTS: summary exceeded hard limit (${textForAudio.length} > ${config.maxTextLength}); truncating.`,
          );
          textForAudio = truncate(textForAudio, config.maxTextLength);
        }
      } catch (err: unknown) {
        // The thrown value may not be an Error (or may be null/undefined); read `message`
        // defensively so this fallback path can never throw on its own.
        const thrown = err as { message?: unknown } | null | undefined;
        const detail = typeof thrown?.message === "string" ? thrown.message : String(err);
        logVerbose(`TTS: summarization failed, truncating instead: ${detail}`);
        textForAudio = truncate(textForAudio, maxLength);
      }
    }
  }

  const ttsStart = Date.now();
  const result = await textToSpeech({
    text: textForAudio,
    cfg: params.cfg,
    prefsPath,
    channel: params.channel,
    overrides: directives.overrides,
  });

  if (result.success && result.audioPath) {
    lastTtsAttempt = {
      timestamp: Date.now(),
      success: true,
      textLength: text.length,
      summarized: wasSummarized,
      provider: result.provider,
      latencyMs: result.latencyMs,
    };
    const channelId = resolveChannelId(params.channel);
    const shouldVoice = channelId === "telegram" && result.voiceCompatible === true;
    const finalPayload = {
      ...nextPayload,
      mediaUrl: result.audioPath,
      audioAsVoice: shouldVoice || params.payload.audioAsVoice,
    };
    return finalPayload;
  }

  lastTtsAttempt = {
    timestamp: Date.now(),
    success: false,
    textLength: text.length,
    summarized: wasSummarized,
    error: result.error,
  };
  const latency = Date.now() - ttsStart;
  logVerbose(`TTS: conversion failed after ${latency}ms (${result.error ?? "unknown"}).`);
  return nextPayload;
}
/**
 * Module-internal validators, constants, and helpers gathered into a single exported object.
 * NOTE(review): the `_test` name suggests this exists so tests can reach symbols that are
 * otherwise not exported — confirm; it should not be treated as a stable public API.
 */
export const _test = {
isValidVoiceId,
isValidOpenAIVoice,
isValidOpenAIModel,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
parseTtsDirectives,
resolveModelOverridePolicy,
summarizeText,
resolveOutputFormat,
resolveEdgeOutputFormat,
};