refactor(tts): extract directives and provider core
This commit is contained in:
673
src/tts/tts-core.ts
Normal file
673
src/tts/tts-core.ts
Normal file
@@ -0,0 +1,673 @@
|
||||
import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
|
||||
import { EdgeTTS } from "node-edge-tts";
|
||||
import { rmSync } from "node:fs";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import type {
|
||||
ResolvedTtsConfig,
|
||||
ResolvedTtsModelOverrides,
|
||||
TtsDirectiveOverrides,
|
||||
TtsDirectiveParseResult,
|
||||
} from "./tts.js";
|
||||
import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js";
|
||||
import {
|
||||
buildModelAliasIndex,
|
||||
resolveDefaultModelForAgent,
|
||||
resolveModelRefFromString,
|
||||
type ModelRef,
|
||||
} from "../agents/model-selection.js";
|
||||
import { resolveModel } from "../agents/pi-embedded-runner/model.js";
|
||||
|
||||
// Default ElevenLabs API host, used when the configured baseUrl is empty.
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
// Delay before scheduleCleanup() removes a temp audio directory.
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
|
||||
|
||||
export function isValidVoiceId(voiceId: string): boolean {
|
||||
return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
|
||||
}
|
||||
|
||||
function normalizeElevenLabsBaseUrl(baseUrl: string): string {
|
||||
const trimmed = baseUrl.trim();
|
||||
if (!trimmed) {
|
||||
return DEFAULT_ELEVENLABS_BASE_URL;
|
||||
}
|
||||
return trimmed.replace(/\/+$/, "");
|
||||
}
|
||||
|
||||
function requireInRange(value: number, min: number, max: number, label: string): void {
|
||||
if (!Number.isFinite(value) || value < min || value > max) {
|
||||
throw new Error(`${label} must be between ${min} and ${max}`);
|
||||
}
|
||||
}
|
||||
|
||||
function assertElevenLabsVoiceSettings(settings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]) {
|
||||
requireInRange(settings.stability, 0, 1, "stability");
|
||||
requireInRange(settings.similarityBoost, 0, 1, "similarityBoost");
|
||||
requireInRange(settings.style, 0, 1, "style");
|
||||
requireInRange(settings.speed, 0.5, 2, "speed");
|
||||
}
|
||||
|
||||
function normalizeLanguageCode(code?: string): string | undefined {
|
||||
const trimmed = code?.trim();
|
||||
if (!trimmed) {
|
||||
return undefined;
|
||||
}
|
||||
const normalized = trimmed.toLowerCase();
|
||||
if (!/^[a-z]{2}$/.test(normalized)) {
|
||||
throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)");
|
||||
}
|
||||
return normalized;
|
||||
}
|
||||
|
||||
function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined {
|
||||
const trimmed = mode?.trim();
|
||||
if (!trimmed) {
|
||||
return undefined;
|
||||
}
|
||||
const normalized = trimmed.toLowerCase();
|
||||
if (normalized === "auto" || normalized === "on" || normalized === "off") {
|
||||
return normalized;
|
||||
}
|
||||
throw new Error("applyTextNormalization must be one of: auto, on, off");
|
||||
}
|
||||
|
||||
function normalizeSeed(seed?: number): number | undefined {
|
||||
if (seed == null) {
|
||||
return undefined;
|
||||
}
|
||||
const next = Math.floor(seed);
|
||||
if (!Number.isFinite(next) || next < 0 || next > 4_294_967_295) {
|
||||
throw new Error("seed must be between 0 and 4294967295");
|
||||
}
|
||||
return next;
|
||||
}
|
||||
|
||||
function parseBooleanValue(value: string): boolean | undefined {
|
||||
const normalized = value.trim().toLowerCase();
|
||||
if (["true", "1", "yes", "on"].includes(normalized)) {
|
||||
return true;
|
||||
}
|
||||
if (["false", "0", "no", "off"].includes(normalized)) {
|
||||
return false;
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function parseNumberValue(value: string): number | undefined {
|
||||
const parsed = Number.parseFloat(value);
|
||||
return Number.isFinite(parsed) ? parsed : undefined;
|
||||
}
|
||||
|
||||
/**
 * Extract inline `[[tts:...]]` directives from a message.
 *
 * Two forms are recognized:
 *  - `[[tts:text]]...[[/tts:text]]` blocks, whose inner text becomes the
 *    spoken-text override (first block wins) when `policy.allowText` is set;
 *  - `[[tts:key=value key2=value2]]` option directives, parsed token by token.
 *
 * All directives are stripped from the returned `cleanedText`, even when the
 * policy disallows them. Keys blocked by policy are silently dropped; invalid
 * values are reported via `warnings` instead of throwing.
 */
export function parseTtsDirectives(
  text: string,
  policy: ResolvedTtsModelOverrides,
): TtsDirectiveParseResult {
  // When directive handling is disabled, leave the text untouched.
  if (!policy.enabled) {
    return { cleanedText: text, overrides: {}, warnings: [], hasDirective: false };
  }

  const overrides: TtsDirectiveOverrides = {};
  const warnings: string[] = [];
  let cleanedText = text;
  let hasDirective = false;

  // Pass 1: strip [[tts:text]]...[[/tts:text]] blocks; keep the first inner text.
  const blockRegex = /\[\[tts:text\]\]([\s\S]*?)\[\[\/tts:text\]\]/gi;
  cleanedText = cleanedText.replace(blockRegex, (_match, inner: string) => {
    hasDirective = true;
    if (policy.allowText && overrides.ttsText == null) {
      overrides.ttsText = inner.trim();
    }
    return "";
  });

  // Pass 2: strip [[tts:key=value ...]] directives and apply allowed options.
  const directiveRegex = /\[\[tts:([^\]]+)\]\]/gi;
  cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => {
    hasDirective = true;
    // Tokens are whitespace-separated key=value pairs; bare words are ignored.
    const tokens = body.split(/\s+/).filter(Boolean);
    for (const token of tokens) {
      const eqIndex = token.indexOf("=");
      if (eqIndex === -1) {
        continue;
      }
      const rawKey = token.slice(0, eqIndex).trim();
      const rawValue = token.slice(eqIndex + 1).trim();
      if (!rawKey || !rawValue) {
        continue;
      }
      const key = rawKey.toLowerCase();
      // Validators below may throw; convert those into warnings per token.
      try {
        switch (key) {
          case "provider":
            if (!policy.allowProvider) {
              break;
            }
            if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
              overrides.provider = rawValue;
            } else {
              warnings.push(`unsupported provider "${rawValue}"`);
            }
            break;
          case "voice":
          case "openai_voice":
          case "openaivoice":
            if (!policy.allowVoice) {
              break;
            }
            if (isValidOpenAIVoice(rawValue)) {
              overrides.openai = { ...overrides.openai, voice: rawValue };
            } else {
              warnings.push(`invalid OpenAI voice "${rawValue}"`);
            }
            break;
          case "voiceid":
          case "voice_id":
          case "elevenlabs_voice":
          case "elevenlabsvoice":
            if (!policy.allowVoice) {
              break;
            }
            if (isValidVoiceId(rawValue)) {
              overrides.elevenlabs = { ...overrides.elevenlabs, voiceId: rawValue };
            } else {
              warnings.push(`invalid ElevenLabs voiceId "${rawValue}"`);
            }
            break;
          case "model":
          case "modelid":
          case "model_id":
          case "elevenlabs_model":
          case "elevenlabsmodel":
          case "openai_model":
          case "openaimodel":
            if (!policy.allowModelId) {
              break;
            }
            // A recognized OpenAI model ID targets OpenAI; anything else is
            // treated as an ElevenLabs model ID (no validation applied).
            if (isValidOpenAIModel(rawValue)) {
              overrides.openai = { ...overrides.openai, model: rawValue };
            } else {
              overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue };
            }
            break;
          case "stability":
            if (!policy.allowVoiceSettings) {
              break;
            }
            {
              const value = parseNumberValue(rawValue);
              if (value == null) {
                warnings.push("invalid stability value");
                break;
              }
              requireInRange(value, 0, 1, "stability");
              overrides.elevenlabs = {
                ...overrides.elevenlabs,
                voiceSettings: { ...overrides.elevenlabs?.voiceSettings, stability: value },
              };
            }
            break;
          case "similarity":
          case "similarityboost":
          case "similarity_boost":
            if (!policy.allowVoiceSettings) {
              break;
            }
            {
              const value = parseNumberValue(rawValue);
              if (value == null) {
                warnings.push("invalid similarityBoost value");
                break;
              }
              requireInRange(value, 0, 1, "similarityBoost");
              overrides.elevenlabs = {
                ...overrides.elevenlabs,
                voiceSettings: { ...overrides.elevenlabs?.voiceSettings, similarityBoost: value },
              };
            }
            break;
          case "style":
            if (!policy.allowVoiceSettings) {
              break;
            }
            {
              const value = parseNumberValue(rawValue);
              if (value == null) {
                warnings.push("invalid style value");
                break;
              }
              requireInRange(value, 0, 1, "style");
              overrides.elevenlabs = {
                ...overrides.elevenlabs,
                voiceSettings: { ...overrides.elevenlabs?.voiceSettings, style: value },
              };
            }
            break;
          case "speed":
            if (!policy.allowVoiceSettings) {
              break;
            }
            {
              const value = parseNumberValue(rawValue);
              if (value == null) {
                warnings.push("invalid speed value");
                break;
              }
              requireInRange(value, 0.5, 2, "speed");
              overrides.elevenlabs = {
                ...overrides.elevenlabs,
                voiceSettings: { ...overrides.elevenlabs?.voiceSettings, speed: value },
              };
            }
            break;
          case "speakerboost":
          case "speaker_boost":
          case "usespeakerboost":
          case "use_speaker_boost":
            if (!policy.allowVoiceSettings) {
              break;
            }
            {
              const value = parseBooleanValue(rawValue);
              if (value == null) {
                warnings.push("invalid useSpeakerBoost value");
                break;
              }
              overrides.elevenlabs = {
                ...overrides.elevenlabs,
                voiceSettings: { ...overrides.elevenlabs?.voiceSettings, useSpeakerBoost: value },
              };
            }
            break;
          case "normalize":
          case "applytextnormalization":
          case "apply_text_normalization":
            if (!policy.allowNormalization) {
              break;
            }
            overrides.elevenlabs = {
              ...overrides.elevenlabs,
              applyTextNormalization: normalizeApplyTextNormalization(rawValue),
            };
            break;
          case "language":
          case "languagecode":
          case "language_code":
            // NOTE(review): language is gated by allowNormalization, not a
            // dedicated flag — confirm that is intentional.
            if (!policy.allowNormalization) {
              break;
            }
            overrides.elevenlabs = {
              ...overrides.elevenlabs,
              languageCode: normalizeLanguageCode(rawValue),
            };
            break;
          case "seed":
            if (!policy.allowSeed) {
              break;
            }
            overrides.elevenlabs = {
              ...overrides.elevenlabs,
              seed: normalizeSeed(Number.parseInt(rawValue, 10)),
            };
            break;
          default:
            // Unknown keys are dropped silently.
            break;
        }
      } catch (err) {
        // Range/format validators throw; surface the message as a warning.
        warnings.push((err as Error).message);
      }
    }
    return "";
  });

  return {
    cleanedText,
    ttsText: overrides.ttsText,
    hasDirective,
    overrides,
    warnings,
  };
}
|
||||
|
||||
// Model IDs accepted by the official OpenAI TTS endpoint (validation is
// relaxed when a custom OPENAI_TTS_BASE_URL is configured).
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const;
|
||||
|
||||
/**
|
||||
* Custom OpenAI-compatible TTS endpoint.
|
||||
* When set, model/voice validation is relaxed to allow non-OpenAI models.
|
||||
* Example: OPENAI_TTS_BASE_URL=http://localhost:8880/v1
|
||||
*
|
||||
* Note: Read at runtime (not module load) to support config.env loading.
|
||||
*/
|
||||
function getOpenAITtsBaseUrl(): string {
|
||||
return (process.env.OPENAI_TTS_BASE_URL?.trim() || "https://api.openai.com/v1").replace(
|
||||
/\/+$/,
|
||||
"",
|
||||
);
|
||||
}
|
||||
|
||||
function isCustomOpenAIEndpoint(): boolean {
|
||||
return getOpenAITtsBaseUrl() !== "https://api.openai.com/v1";
|
||||
}
|
||||
// Voice names accepted by the official OpenAI TTS endpoint (validation is
// relaxed when a custom OPENAI_TTS_BASE_URL is configured).
export const OPENAI_TTS_VOICES = [
  "alloy",
  "ash",
  "ballad",
  "cedar",
  "coral",
  "echo",
  "fable",
  "juniper",
  "marin",
  "onyx",
  "nova",
  "sage",
  "shimmer",
  "verse",
] as const;

// Union of the known OpenAI voice names above.
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
|
||||
|
||||
export function isValidOpenAIModel(model: string): boolean {
|
||||
// Allow any model when using custom endpoint (e.g., Kokoro, LocalAI)
|
||||
if (isCustomOpenAIEndpoint()) {
|
||||
return true;
|
||||
}
|
||||
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
|
||||
}
|
||||
|
||||
export function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice {
|
||||
// Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices)
|
||||
if (isCustomOpenAIEndpoint()) {
|
||||
return true;
|
||||
}
|
||||
return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
|
||||
}
|
||||
|
||||
// Outcome of a summarization call, with simple telemetry fields.
type SummarizeResult = {
  summary: string; // condensed text returned by the model
  latencyMs: number; // wall-clock duration of the model call
  inputLength: number; // character count of the original text
  outputLength: number; // character count of the summary
};

// Which model will summarize, and where that choice came from.
type SummaryModelSelection = {
  ref: ModelRef;
  source: "summaryModel" | "default"; // explicit config override vs. agent default
};
|
||||
|
||||
function resolveSummaryModelRef(
|
||||
cfg: OpenClawConfig,
|
||||
config: ResolvedTtsConfig,
|
||||
): SummaryModelSelection {
|
||||
const defaultRef = resolveDefaultModelForAgent({ cfg });
|
||||
const override = config.summaryModel?.trim();
|
||||
if (!override) {
|
||||
return { ref: defaultRef, source: "default" };
|
||||
}
|
||||
|
||||
const aliasIndex = buildModelAliasIndex({ cfg, defaultProvider: defaultRef.provider });
|
||||
const resolved = resolveModelRefFromString({
|
||||
raw: override,
|
||||
defaultProvider: defaultRef.provider,
|
||||
aliasIndex,
|
||||
});
|
||||
if (!resolved) {
|
||||
return { ref: defaultRef, source: "default" };
|
||||
}
|
||||
return { ref: resolved.ref, source: "summaryModel" };
|
||||
}
|
||||
|
||||
function isTextContentBlock(block: { type: string }): block is TextContent {
|
||||
return block.type === "text";
|
||||
}
|
||||
|
||||
/**
 * Summarize `text` to roughly `targetLength` characters with an LLM.
 *
 * Model selection comes from `resolveSummaryModelRef` (config override or
 * agent default); the API key is resolved per model. The call is aborted
 * after `timeoutMs` via AbortController.
 *
 * @returns the summary plus latency and input/output character counts.
 * @throws Error when `targetLength` is outside [100, 10000], the model or
 *   API key cannot be resolved, the model returns no text, or the request
 *   times out ("Summarization timed out", original error attached as cause).
 */
export async function summarizeText(params: {
  text: string;
  targetLength: number;
  cfg: OpenClawConfig;
  config: ResolvedTtsConfig;
  timeoutMs: number;
}): Promise<SummarizeResult> {
  const { text, targetLength, cfg, config, timeoutMs } = params;
  if (targetLength < 100 || targetLength > 10_000) {
    throw new Error(`Invalid targetLength: ${targetLength}`);
  }

  const startTime = Date.now();
  const { ref } = resolveSummaryModelRef(cfg, config);
  const resolved = resolveModel(ref.provider, ref.model, undefined, cfg);
  if (!resolved.model) {
    throw new Error(resolved.error ?? `Unknown summary model: ${ref.provider}/${ref.model}`);
  }
  const apiKey = requireApiKey(
    await getApiKeyForModel({ model: resolved.model, cfg }),
    ref.provider,
  );

  try {
    // Abort the completion if it runs past timeoutMs.
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), timeoutMs);

    try {
      const res = await completeSimple(
        resolved.model,
        {
          messages: [
            {
              role: "user",
              content:
                `You are an assistant that summarizes texts concisely while keeping the most important information. ` +
                `Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style. ` +
                `Reply only with the summary, without additional explanations.\n\n` +
                `<text_to_summarize>\n${text}\n</text_to_summarize>`,
              timestamp: Date.now(),
            },
          ],
        },
        {
          apiKey,
          // Rough character→token budget for the requested summary length.
          maxTokens: Math.ceil(targetLength / 2),
          temperature: 0.3,
          signal: controller.signal,
        },
      );

      // Join all text blocks of the response into a single summary string.
      const summary = res.content
        .filter(isTextContentBlock)
        .map((block) => block.text.trim())
        .filter(Boolean)
        .join(" ")
        .trim();

      if (!summary) {
        throw new Error("No summary returned");
      }

      return {
        summary,
        latencyMs: Date.now() - startTime,
        inputLength: text.length,
        outputLength: summary.length,
      };
    } finally {
      clearTimeout(timeout);
    }
  } catch (err) {
    const error = err as Error;
    // Translate the abort into a clearer timeout error for callers.
    if (error.name === "AbortError") {
      throw new Error("Summarization timed out", { cause: err });
    }
    throw err;
  }
}
|
||||
|
||||
export function scheduleCleanup(
|
||||
tempDir: string,
|
||||
delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS,
|
||||
): void {
|
||||
const timer = setTimeout(() => {
|
||||
try {
|
||||
rmSync(tempDir, { recursive: true, force: true });
|
||||
} catch {
|
||||
// ignore cleanup errors
|
||||
}
|
||||
}, delayMs);
|
||||
timer.unref();
|
||||
}
|
||||
|
||||
/**
 * Synthesize speech via the ElevenLabs text-to-speech REST API.
 *
 * Validates the voice ID format, the voice-settings ranges, and the optional
 * language/normalization/seed inputs before issuing the request; the request
 * is aborted after `timeoutMs`.
 *
 * @returns the raw audio response body as a Buffer.
 * @throws Error on invalid inputs or a non-2xx API response.
 */
export async function elevenLabsTTS(params: {
  text: string;
  apiKey: string;
  baseUrl: string;
  voiceId: string;
  modelId: string;
  outputFormat: string;
  seed?: number;
  applyTextNormalization?: "auto" | "on" | "off";
  languageCode?: string;
  voiceSettings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"];
  timeoutMs: number;
}): Promise<Buffer> {
  const {
    text,
    apiKey,
    baseUrl,
    voiceId,
    modelId,
    outputFormat,
    seed,
    applyTextNormalization,
    languageCode,
    voiceSettings,
    timeoutMs,
  } = params;
  // Reject malformed voice IDs before they reach the URL path.
  if (!isValidVoiceId(voiceId)) {
    throw new Error("Invalid voiceId format");
  }
  assertElevenLabsVoiceSettings(voiceSettings);
  const normalizedLanguage = normalizeLanguageCode(languageCode);
  const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization);
  const normalizedSeed = normalizeSeed(seed);

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`);
    if (outputFormat) {
      url.searchParams.set("output_format", outputFormat);
    }

    const response = await fetch(url.toString(), {
      method: "POST",
      headers: {
        "xi-api-key": apiKey,
        "Content-Type": "application/json",
        Accept: "audio/mpeg",
      },
      // Optional fields serialize as undefined and are omitted from the JSON body.
      body: JSON.stringify({
        text,
        model_id: modelId,
        seed: normalizedSeed,
        apply_text_normalization: normalizedNormalization,
        language_code: normalizedLanguage,
        voice_settings: {
          stability: voiceSettings.stability,
          similarity_boost: voiceSettings.similarityBoost,
          style: voiceSettings.style,
          use_speaker_boost: voiceSettings.useSpeakerBoost,
          speed: voiceSettings.speed,
        },
      }),
      signal: controller.signal,
    });

    if (!response.ok) {
      throw new Error(`ElevenLabs API error (${response.status})`);
    }

    return Buffer.from(await response.arrayBuffer());
  } finally {
    clearTimeout(timeout);
  }
}
|
||||
|
||||
/**
 * Synthesize speech via the OpenAI (or OpenAI-compatible) /audio/speech API.
 *
 * Model and voice are validated (relaxed for custom endpoints — see
 * isValidOpenAIModel/isValidOpenAIVoice); the request is aborted after
 * `timeoutMs`.
 *
 * @returns the raw audio response body as a Buffer.
 * @throws Error on invalid model/voice or a non-2xx API response.
 */
export async function openaiTTS(params: {
  text: string;
  apiKey: string;
  model: string;
  voice: string;
  responseFormat: "mp3" | "opus" | "pcm";
  timeoutMs: number;
}): Promise<Buffer> {
  const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;

  if (!isValidOpenAIModel(model)) {
    throw new Error(`Invalid model: ${model}`);
  }
  if (!isValidOpenAIVoice(voice)) {
    throw new Error(`Invalid voice: ${voice}`);
  }

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const response = await fetch(`${getOpenAITtsBaseUrl()}/audio/speech`, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model,
        input: text,
        voice,
        response_format: responseFormat,
      }),
      signal: controller.signal,
    });

    if (!response.ok) {
      throw new Error(`OpenAI TTS API error (${response.status})`);
    }

    return Buffer.from(await response.arrayBuffer());
  } finally {
    clearTimeout(timeout);
  }
}
|
||||
|
||||
export function inferEdgeExtension(outputFormat: string): string {
|
||||
const normalized = outputFormat.toLowerCase();
|
||||
if (normalized.includes("webm")) {
|
||||
return ".webm";
|
||||
}
|
||||
if (normalized.includes("ogg")) {
|
||||
return ".ogg";
|
||||
}
|
||||
if (normalized.includes("opus")) {
|
||||
return ".opus";
|
||||
}
|
||||
if (normalized.includes("wav") || normalized.includes("riff") || normalized.includes("pcm")) {
|
||||
return ".wav";
|
||||
}
|
||||
return ".mp3";
|
||||
}
|
||||
|
||||
/**
 * Synthesize speech with Microsoft Edge TTS (node-edge-tts) and write the
 * audio to `outputPath`.
 *
 * All synthesis options come from the resolved edge config.
 * NOTE(review): `config.timeoutMs` takes precedence over the `timeoutMs`
 * parameter — confirm callers expect the config value to win.
 */
export async function edgeTTS(params: {
  text: string;
  outputPath: string;
  config: ResolvedTtsConfig["edge"];
  timeoutMs: number;
}): Promise<void> {
  const { text, outputPath, config, timeoutMs } = params;
  const tts = new EdgeTTS({
    voice: config.voice,
    lang: config.lang,
    outputFormat: config.outputFormat,
    saveSubtitles: config.saveSubtitles,
    proxy: config.proxy,
    rate: config.rate,
    pitch: config.pitch,
    volume: config.volume,
    timeout: config.timeoutMs ?? timeoutMs,
  });
  await tts.ttsPromise(text, outputPath);
}
|
||||
681
src/tts/tts.ts
681
src/tts/tts.ts
@@ -1,5 +1,3 @@
|
||||
import { completeSimple, type TextContent } from "@mariozechner/pi-ai";
|
||||
import { EdgeTTS } from "node-edge-tts";
|
||||
import {
|
||||
existsSync,
|
||||
mkdirSync,
|
||||
@@ -22,25 +20,31 @@ import type {
|
||||
TtsProvider,
|
||||
TtsModelOverrideConfig,
|
||||
} from "../config/types.tts.js";
|
||||
import { getApiKeyForModel, requireApiKey } from "../agents/model-auth.js";
|
||||
import {
|
||||
buildModelAliasIndex,
|
||||
resolveDefaultModelForAgent,
|
||||
resolveModelRefFromString,
|
||||
type ModelRef,
|
||||
} from "../agents/model-selection.js";
|
||||
import { resolveModel } from "../agents/pi-embedded-runner/model.js";
|
||||
import { normalizeChannelId } from "../channels/plugins/index.js";
|
||||
import { logVerbose } from "../globals.js";
|
||||
import { stripMarkdown } from "../line/markdown-to-line.js";
|
||||
import { isVoiceCompatibleAudio } from "../media/audio.js";
|
||||
import { CONFIG_DIR, resolveUserPath } from "../utils.js";
|
||||
import {
|
||||
edgeTTS,
|
||||
elevenLabsTTS,
|
||||
inferEdgeExtension,
|
||||
isValidOpenAIModel,
|
||||
isValidOpenAIVoice,
|
||||
isValidVoiceId,
|
||||
OPENAI_TTS_MODELS,
|
||||
OPENAI_TTS_VOICES,
|
||||
openaiTTS,
|
||||
parseTtsDirectives,
|
||||
scheduleCleanup,
|
||||
summarizeText,
|
||||
} from "./tts-core.js";
|
||||
export { OPENAI_TTS_MODELS, OPENAI_TTS_VOICES } from "./tts-core.js";
|
||||
|
||||
const DEFAULT_TIMEOUT_MS = 30_000;
|
||||
const DEFAULT_TTS_MAX_LENGTH = 1500;
|
||||
const DEFAULT_TTS_SUMMARIZE = true;
|
||||
const DEFAULT_MAX_TEXT_LENGTH = 4096;
|
||||
const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes
|
||||
|
||||
const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io";
|
||||
const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
|
||||
@@ -138,7 +142,7 @@ type TtsUserPrefs = {
|
||||
};
|
||||
};
|
||||
|
||||
type ResolvedTtsModelOverrides = {
|
||||
export type ResolvedTtsModelOverrides = {
|
||||
enabled: boolean;
|
||||
allowText: boolean;
|
||||
allowProvider: boolean;
|
||||
@@ -149,7 +153,7 @@ type ResolvedTtsModelOverrides = {
|
||||
allowSeed: boolean;
|
||||
};
|
||||
|
||||
type TtsDirectiveOverrides = {
|
||||
export type TtsDirectiveOverrides = {
|
||||
ttsText?: string;
|
||||
provider?: TtsProvider;
|
||||
openai?: {
|
||||
@@ -166,7 +170,7 @@ type TtsDirectiveOverrides = {
|
||||
};
|
||||
};
|
||||
|
||||
type TtsDirectiveParseResult = {
|
||||
export type TtsDirectiveParseResult = {
|
||||
cleanedText: string;
|
||||
ttsText?: string;
|
||||
hasDirective: boolean;
|
||||
@@ -515,655 +519,6 @@ export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: Tts
|
||||
return Boolean(resolveTtsApiKey(config, provider));
|
||||
}
|
||||
|
||||
function isValidVoiceId(voiceId: string): boolean {
|
||||
return /^[a-zA-Z0-9]{10,40}$/.test(voiceId);
|
||||
}
|
||||
|
||||
function normalizeElevenLabsBaseUrl(baseUrl: string): string {
|
||||
const trimmed = baseUrl.trim();
|
||||
if (!trimmed) {
|
||||
return DEFAULT_ELEVENLABS_BASE_URL;
|
||||
}
|
||||
return trimmed.replace(/\/+$/, "");
|
||||
}
|
||||
|
||||
function requireInRange(value: number, min: number, max: number, label: string): void {
|
||||
if (!Number.isFinite(value) || value < min || value > max) {
|
||||
throw new Error(`${label} must be between ${min} and ${max}`);
|
||||
}
|
||||
}
|
||||
|
||||
function assertElevenLabsVoiceSettings(settings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"]) {
|
||||
requireInRange(settings.stability, 0, 1, "stability");
|
||||
requireInRange(settings.similarityBoost, 0, 1, "similarityBoost");
|
||||
requireInRange(settings.style, 0, 1, "style");
|
||||
requireInRange(settings.speed, 0.5, 2, "speed");
|
||||
}
|
||||
|
||||
function normalizeLanguageCode(code?: string): string | undefined {
|
||||
const trimmed = code?.trim();
|
||||
if (!trimmed) {
|
||||
return undefined;
|
||||
}
|
||||
const normalized = trimmed.toLowerCase();
|
||||
if (!/^[a-z]{2}$/.test(normalized)) {
|
||||
throw new Error("languageCode must be a 2-letter ISO 639-1 code (e.g. en, de, fr)");
|
||||
}
|
||||
return normalized;
|
||||
}
|
||||
|
||||
function normalizeApplyTextNormalization(mode?: string): "auto" | "on" | "off" | undefined {
|
||||
const trimmed = mode?.trim();
|
||||
if (!trimmed) {
|
||||
return undefined;
|
||||
}
|
||||
const normalized = trimmed.toLowerCase();
|
||||
if (normalized === "auto" || normalized === "on" || normalized === "off") {
|
||||
return normalized;
|
||||
}
|
||||
throw new Error("applyTextNormalization must be one of: auto, on, off");
|
||||
}
|
||||
|
||||
function normalizeSeed(seed?: number): number | undefined {
|
||||
if (seed == null) {
|
||||
return undefined;
|
||||
}
|
||||
const next = Math.floor(seed);
|
||||
if (!Number.isFinite(next) || next < 0 || next > 4_294_967_295) {
|
||||
throw new Error("seed must be between 0 and 4294967295");
|
||||
}
|
||||
return next;
|
||||
}
|
||||
|
||||
function parseBooleanValue(value: string): boolean | undefined {
|
||||
const normalized = value.trim().toLowerCase();
|
||||
if (["true", "1", "yes", "on"].includes(normalized)) {
|
||||
return true;
|
||||
}
|
||||
if (["false", "0", "no", "off"].includes(normalized)) {
|
||||
return false;
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function parseNumberValue(value: string): number | undefined {
|
||||
const parsed = Number.parseFloat(value);
|
||||
return Number.isFinite(parsed) ? parsed : undefined;
|
||||
}
|
||||
|
||||
function parseTtsDirectives(
|
||||
text: string,
|
||||
policy: ResolvedTtsModelOverrides,
|
||||
): TtsDirectiveParseResult {
|
||||
if (!policy.enabled) {
|
||||
return { cleanedText: text, overrides: {}, warnings: [], hasDirective: false };
|
||||
}
|
||||
|
||||
const overrides: TtsDirectiveOverrides = {};
|
||||
const warnings: string[] = [];
|
||||
let cleanedText = text;
|
||||
let hasDirective = false;
|
||||
|
||||
const blockRegex = /\[\[tts:text\]\]([\s\S]*?)\[\[\/tts:text\]\]/gi;
|
||||
cleanedText = cleanedText.replace(blockRegex, (_match, inner: string) => {
|
||||
hasDirective = true;
|
||||
if (policy.allowText && overrides.ttsText == null) {
|
||||
overrides.ttsText = inner.trim();
|
||||
}
|
||||
return "";
|
||||
});
|
||||
|
||||
const directiveRegex = /\[\[tts:([^\]]+)\]\]/gi;
|
||||
cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => {
|
||||
hasDirective = true;
|
||||
const tokens = body.split(/\s+/).filter(Boolean);
|
||||
for (const token of tokens) {
|
||||
const eqIndex = token.indexOf("=");
|
||||
if (eqIndex === -1) {
|
||||
continue;
|
||||
}
|
||||
const rawKey = token.slice(0, eqIndex).trim();
|
||||
const rawValue = token.slice(eqIndex + 1).trim();
|
||||
if (!rawKey || !rawValue) {
|
||||
continue;
|
||||
}
|
||||
const key = rawKey.toLowerCase();
|
||||
try {
|
||||
switch (key) {
|
||||
case "provider":
|
||||
if (!policy.allowProvider) {
|
||||
break;
|
||||
}
|
||||
if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
|
||||
overrides.provider = rawValue;
|
||||
} else {
|
||||
warnings.push(`unsupported provider "${rawValue}"`);
|
||||
}
|
||||
break;
|
||||
case "voice":
|
||||
case "openai_voice":
|
||||
case "openaivoice":
|
||||
if (!policy.allowVoice) {
|
||||
break;
|
||||
}
|
||||
if (isValidOpenAIVoice(rawValue)) {
|
||||
overrides.openai = { ...overrides.openai, voice: rawValue };
|
||||
} else {
|
||||
warnings.push(`invalid OpenAI voice "${rawValue}"`);
|
||||
}
|
||||
break;
|
||||
case "voiceid":
|
||||
case "voice_id":
|
||||
case "elevenlabs_voice":
|
||||
case "elevenlabsvoice":
|
||||
if (!policy.allowVoice) {
|
||||
break;
|
||||
}
|
||||
if (isValidVoiceId(rawValue)) {
|
||||
overrides.elevenlabs = { ...overrides.elevenlabs, voiceId: rawValue };
|
||||
} else {
|
||||
warnings.push(`invalid ElevenLabs voiceId "${rawValue}"`);
|
||||
}
|
||||
break;
|
||||
case "model":
|
||||
case "modelid":
|
||||
case "model_id":
|
||||
case "elevenlabs_model":
|
||||
case "elevenlabsmodel":
|
||||
case "openai_model":
|
||||
case "openaimodel":
|
||||
if (!policy.allowModelId) {
|
||||
break;
|
||||
}
|
||||
if (isValidOpenAIModel(rawValue)) {
|
||||
overrides.openai = { ...overrides.openai, model: rawValue };
|
||||
} else {
|
||||
overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue };
|
||||
}
|
||||
break;
|
||||
case "stability":
|
||||
if (!policy.allowVoiceSettings) {
|
||||
break;
|
||||
}
|
||||
{
|
||||
const value = parseNumberValue(rawValue);
|
||||
if (value == null) {
|
||||
warnings.push("invalid stability value");
|
||||
break;
|
||||
}
|
||||
requireInRange(value, 0, 1, "stability");
|
||||
overrides.elevenlabs = {
|
||||
...overrides.elevenlabs,
|
||||
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, stability: value },
|
||||
};
|
||||
}
|
||||
break;
|
||||
case "similarity":
|
||||
case "similarityboost":
|
||||
case "similarity_boost":
|
||||
if (!policy.allowVoiceSettings) {
|
||||
break;
|
||||
}
|
||||
{
|
||||
const value = parseNumberValue(rawValue);
|
||||
if (value == null) {
|
||||
warnings.push("invalid similarityBoost value");
|
||||
break;
|
||||
}
|
||||
requireInRange(value, 0, 1, "similarityBoost");
|
||||
overrides.elevenlabs = {
|
||||
...overrides.elevenlabs,
|
||||
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, similarityBoost: value },
|
||||
};
|
||||
}
|
||||
break;
|
||||
case "style":
|
||||
if (!policy.allowVoiceSettings) {
|
||||
break;
|
||||
}
|
||||
{
|
||||
const value = parseNumberValue(rawValue);
|
||||
if (value == null) {
|
||||
warnings.push("invalid style value");
|
||||
break;
|
||||
}
|
||||
requireInRange(value, 0, 1, "style");
|
||||
overrides.elevenlabs = {
|
||||
...overrides.elevenlabs,
|
||||
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, style: value },
|
||||
};
|
||||
}
|
||||
break;
|
||||
case "speed":
|
||||
if (!policy.allowVoiceSettings) {
|
||||
break;
|
||||
}
|
||||
{
|
||||
const value = parseNumberValue(rawValue);
|
||||
if (value == null) {
|
||||
warnings.push("invalid speed value");
|
||||
break;
|
||||
}
|
||||
requireInRange(value, 0.5, 2, "speed");
|
||||
overrides.elevenlabs = {
|
||||
...overrides.elevenlabs,
|
||||
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, speed: value },
|
||||
};
|
||||
}
|
||||
break;
|
||||
case "speakerboost":
|
||||
case "speaker_boost":
|
||||
case "usespeakerboost":
|
||||
case "use_speaker_boost":
|
||||
if (!policy.allowVoiceSettings) {
|
||||
break;
|
||||
}
|
||||
{
|
||||
const value = parseBooleanValue(rawValue);
|
||||
if (value == null) {
|
||||
warnings.push("invalid useSpeakerBoost value");
|
||||
break;
|
||||
}
|
||||
overrides.elevenlabs = {
|
||||
...overrides.elevenlabs,
|
||||
voiceSettings: { ...overrides.elevenlabs?.voiceSettings, useSpeakerBoost: value },
|
||||
};
|
||||
}
|
||||
break;
|
||||
case "normalize":
|
||||
case "applytextnormalization":
|
||||
case "apply_text_normalization":
|
||||
if (!policy.allowNormalization) {
|
||||
break;
|
||||
}
|
||||
overrides.elevenlabs = {
|
||||
...overrides.elevenlabs,
|
||||
applyTextNormalization: normalizeApplyTextNormalization(rawValue),
|
||||
};
|
||||
break;
|
||||
case "language":
|
||||
case "languagecode":
|
||||
case "language_code":
|
||||
if (!policy.allowNormalization) {
|
||||
break;
|
||||
}
|
||||
overrides.elevenlabs = {
|
||||
...overrides.elevenlabs,
|
||||
languageCode: normalizeLanguageCode(rawValue),
|
||||
};
|
||||
break;
|
||||
case "seed":
|
||||
if (!policy.allowSeed) {
|
||||
break;
|
||||
}
|
||||
overrides.elevenlabs = {
|
||||
...overrides.elevenlabs,
|
||||
seed: normalizeSeed(Number.parseInt(rawValue, 10)),
|
||||
};
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
} catch (err) {
|
||||
warnings.push((err as Error).message);
|
||||
}
|
||||
}
|
||||
return "";
|
||||
});
|
||||
|
||||
return {
|
||||
cleanedText,
|
||||
ttsText: overrides.ttsText,
|
||||
hasDirective,
|
||||
overrides,
|
||||
warnings,
|
||||
};
|
||||
}
|
||||
|
||||
/** Speech-synthesis model ids accepted when talking to the official OpenAI endpoint. */
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const;
|
||||
|
||||
/**
|
||||
* Custom OpenAI-compatible TTS endpoint.
|
||||
* When set, model/voice validation is relaxed to allow non-OpenAI models.
|
||||
* Example: OPENAI_TTS_BASE_URL=http://localhost:8880/v1
|
||||
*
|
||||
* Note: Read at runtime (not module load) to support config.env loading.
|
||||
*/
|
||||
function getOpenAITtsBaseUrl(): string {
|
||||
return (process.env.OPENAI_TTS_BASE_URL?.trim() || "https://api.openai.com/v1").replace(
|
||||
/\/+$/,
|
||||
"",
|
||||
);
|
||||
}
|
||||
|
||||
function isCustomOpenAIEndpoint(): boolean {
|
||||
return getOpenAITtsBaseUrl() !== "https://api.openai.com/v1";
|
||||
}
|
||||
/**
 * Voice names accepted for the official OpenAI TTS endpoint.
 * NOTE(review): list includes names beyond OpenAI's commonly documented set
 * (e.g. "juniper", "marin", "cedar") — presumably newer voices; confirm
 * against OpenAI's current voice list.
 */
export const OPENAI_TTS_VOICES = [
  "alloy",
  "ash",
  "ballad",
  "cedar",
  "coral",
  "echo",
  "fable",
  "juniper",
  "marin",
  "onyx",
  "nova",
  "sage",
  "shimmer",
  "verse",
] as const;
|
||||
|
||||
// Union of the literal voice names above ("alloy" | "ash" | …).
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
|
||||
|
||||
function isValidOpenAIModel(model: string): boolean {
|
||||
// Allow any model when using custom endpoint (e.g., Kokoro, LocalAI)
|
||||
if (isCustomOpenAIEndpoint()) {
|
||||
return true;
|
||||
}
|
||||
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
|
||||
}
|
||||
|
||||
function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice {
|
||||
// Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices)
|
||||
if (isCustomOpenAIEndpoint()) {
|
||||
return true;
|
||||
}
|
||||
return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
|
||||
}
|
||||
|
||||
// Outcome of a summarization call, with timing and size metrics.
type SummarizeResult = {
  summary: string; // model-produced summary text (whitespace-trimmed)
  latencyMs: number; // wall-clock duration of the completion call
  inputLength: number; // character count of the original text
  outputLength: number; // character count of the summary
};
|
||||
|
||||
// A resolved model reference plus its provenance:
// "summaryModel" when the config override resolved, "default" otherwise.
type SummaryModelSelection = {
  ref: ModelRef;
  source: "summaryModel" | "default";
};
|
||||
|
||||
function resolveSummaryModelRef(
|
||||
cfg: OpenClawConfig,
|
||||
config: ResolvedTtsConfig,
|
||||
): SummaryModelSelection {
|
||||
const defaultRef = resolveDefaultModelForAgent({ cfg });
|
||||
const override = config.summaryModel?.trim();
|
||||
if (!override) {
|
||||
return { ref: defaultRef, source: "default" };
|
||||
}
|
||||
|
||||
const aliasIndex = buildModelAliasIndex({ cfg, defaultProvider: defaultRef.provider });
|
||||
const resolved = resolveModelRefFromString({
|
||||
raw: override,
|
||||
defaultProvider: defaultRef.provider,
|
||||
aliasIndex,
|
||||
});
|
||||
if (!resolved) {
|
||||
return { ref: defaultRef, source: "default" };
|
||||
}
|
||||
return { ref: resolved.ref, source: "summaryModel" };
|
||||
}
|
||||
|
||||
function isTextContentBlock(block: { type: string }): block is TextContent {
|
||||
return block.type === "text";
|
||||
}
|
||||
|
||||
/**
 * Summarize `text` to roughly `targetLength` characters via an LLM completion.
 *
 * Uses the model chosen by `resolveSummaryModelRef` (config override or agent
 * default) and aborts the request after `timeoutMs`.
 *
 * @throws if `targetLength` is outside [100, 10_000], if the summary model
 *   cannot be resolved, if the model returns no text, or if the request
 *   times out ("Summarization timed out", with the abort error as `cause`).
 */
async function summarizeText(params: {
  text: string;
  targetLength: number;
  cfg: OpenClawConfig;
  config: ResolvedTtsConfig;
  timeoutMs: number;
}): Promise<SummarizeResult> {
  const { text, targetLength, cfg, config, timeoutMs } = params;
  // Guard against degenerate targets before spending an API call.
  if (targetLength < 100 || targetLength > 10_000) {
    throw new Error(`Invalid targetLength: ${targetLength}`);
  }

  const startTime = Date.now();
  const { ref } = resolveSummaryModelRef(cfg, config);
  const resolved = resolveModel(ref.provider, ref.model, undefined, cfg);
  if (!resolved.model) {
    throw new Error(resolved.error ?? `Unknown summary model: ${ref.provider}/${ref.model}`);
  }
  const apiKey = requireApiKey(
    await getApiKeyForModel({ model: resolved.model, cfg }),
    ref.provider,
  );

  try {
    // Abort the completion if it exceeds the deadline; the timer is always
    // cleared in the inner finally so no handle leaks.
    const controller = new AbortController();
    const timeout = setTimeout(controller.abort.bind(controller), timeoutMs);

    try {
      const res = await completeSimple(
        resolved.model,
        {
          messages: [
            {
              role: "user",
              content:
                `You are an assistant that summarizes texts concisely while keeping the most important information. ` +
                `Summarize the text to approximately ${targetLength} characters. Maintain the original tone and style. ` +
                `Reply only with the summary, without additional explanations.\n\n` +
                `<text_to_summarize>\n${text}\n</text_to_summarize>`,
              timestamp: Date.now(),
            },
          ],
        },
        {
          apiKey,
          // ~2 characters per token heuristic — presumably; TODO confirm
          maxTokens: Math.ceil(targetLength / 2),
          temperature: 0.3,
          signal: controller.signal,
        },
      );

      // Keep only text blocks and join them into a single trimmed summary.
      const summary = res.content
        .filter(isTextContentBlock)
        .map((block) => block.text.trim())
        .filter(Boolean)
        .join(" ")
        .trim();

      if (!summary) {
        throw new Error("No summary returned");
      }

      return {
        summary,
        latencyMs: Date.now() - startTime,
        inputLength: text.length,
        outputLength: summary.length,
      };
    } finally {
      clearTimeout(timeout);
    }
  } catch (err) {
    const error = err as Error;
    // Translate the generic abort into a descriptive timeout error.
    if (error.name === "AbortError") {
      throw new Error("Summarization timed out", { cause: err });
    }
    throw err;
  }
}
|
||||
|
||||
function scheduleCleanup(tempDir: string, delayMs: number = TEMP_FILE_CLEANUP_DELAY_MS): void {
|
||||
const timer = setTimeout(() => {
|
||||
try {
|
||||
rmSync(tempDir, { recursive: true, force: true });
|
||||
} catch {
|
||||
// ignore cleanup errors
|
||||
}
|
||||
}, delayMs);
|
||||
timer.unref();
|
||||
}
|
||||
|
||||
async function elevenLabsTTS(params: {
|
||||
text: string;
|
||||
apiKey: string;
|
||||
baseUrl: string;
|
||||
voiceId: string;
|
||||
modelId: string;
|
||||
outputFormat: string;
|
||||
seed?: number;
|
||||
applyTextNormalization?: "auto" | "on" | "off";
|
||||
languageCode?: string;
|
||||
voiceSettings: ResolvedTtsConfig["elevenlabs"]["voiceSettings"];
|
||||
timeoutMs: number;
|
||||
}): Promise<Buffer> {
|
||||
const {
|
||||
text,
|
||||
apiKey,
|
||||
baseUrl,
|
||||
voiceId,
|
||||
modelId,
|
||||
outputFormat,
|
||||
seed,
|
||||
applyTextNormalization,
|
||||
languageCode,
|
||||
voiceSettings,
|
||||
timeoutMs,
|
||||
} = params;
|
||||
if (!isValidVoiceId(voiceId)) {
|
||||
throw new Error("Invalid voiceId format");
|
||||
}
|
||||
assertElevenLabsVoiceSettings(voiceSettings);
|
||||
const normalizedLanguage = normalizeLanguageCode(languageCode);
|
||||
const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization);
|
||||
const normalizedSeed = normalizeSeed(seed);
|
||||
|
||||
const controller = new AbortController();
|
||||
const timeout = setTimeout(controller.abort.bind(controller), timeoutMs);
|
||||
|
||||
try {
|
||||
const url = new URL(`${normalizeElevenLabsBaseUrl(baseUrl)}/v1/text-to-speech/${voiceId}`);
|
||||
if (outputFormat) {
|
||||
url.searchParams.set("output_format", outputFormat);
|
||||
}
|
||||
|
||||
const response = await fetch(url.toString(), {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"xi-api-key": apiKey,
|
||||
"Content-Type": "application/json",
|
||||
Accept: "audio/mpeg",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
text,
|
||||
model_id: modelId,
|
||||
seed: normalizedSeed,
|
||||
apply_text_normalization: normalizedNormalization,
|
||||
language_code: normalizedLanguage,
|
||||
voice_settings: {
|
||||
stability: voiceSettings.stability,
|
||||
similarity_boost: voiceSettings.similarityBoost,
|
||||
style: voiceSettings.style,
|
||||
use_speaker_boost: voiceSettings.useSpeakerBoost,
|
||||
speed: voiceSettings.speed,
|
||||
},
|
||||
}),
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`ElevenLabs API error (${response.status})`);
|
||||
}
|
||||
|
||||
return Buffer.from(await response.arrayBuffer());
|
||||
} finally {
|
||||
clearTimeout(timeout);
|
||||
}
|
||||
}
|
||||
|
||||
async function openaiTTS(params: {
|
||||
text: string;
|
||||
apiKey: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
responseFormat: "mp3" | "opus" | "pcm";
|
||||
timeoutMs: number;
|
||||
}): Promise<Buffer> {
|
||||
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
|
||||
|
||||
if (!isValidOpenAIModel(model)) {
|
||||
throw new Error(`Invalid model: ${model}`);
|
||||
}
|
||||
if (!isValidOpenAIVoice(voice)) {
|
||||
throw new Error(`Invalid voice: ${voice}`);
|
||||
}
|
||||
|
||||
const controller = new AbortController();
|
||||
const timeout = setTimeout(controller.abort.bind(controller), timeoutMs);
|
||||
|
||||
try {
|
||||
const response = await fetch(`${getOpenAITtsBaseUrl()}/audio/speech`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model,
|
||||
input: text,
|
||||
voice,
|
||||
response_format: responseFormat,
|
||||
}),
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`OpenAI TTS API error (${response.status})`);
|
||||
}
|
||||
|
||||
return Buffer.from(await response.arrayBuffer());
|
||||
} finally {
|
||||
clearTimeout(timeout);
|
||||
}
|
||||
}
|
||||
|
||||
function inferEdgeExtension(outputFormat: string): string {
|
||||
const normalized = outputFormat.toLowerCase();
|
||||
if (normalized.includes("webm")) {
|
||||
return ".webm";
|
||||
}
|
||||
if (normalized.includes("ogg")) {
|
||||
return ".ogg";
|
||||
}
|
||||
if (normalized.includes("opus")) {
|
||||
return ".opus";
|
||||
}
|
||||
if (normalized.includes("wav") || normalized.includes("riff") || normalized.includes("pcm")) {
|
||||
return ".wav";
|
||||
}
|
||||
return ".mp3";
|
||||
}
|
||||
|
||||
async function edgeTTS(params: {
|
||||
text: string;
|
||||
outputPath: string;
|
||||
config: ResolvedTtsConfig["edge"];
|
||||
timeoutMs: number;
|
||||
}): Promise<void> {
|
||||
const { text, outputPath, config, timeoutMs } = params;
|
||||
const tts = new EdgeTTS({
|
||||
voice: config.voice,
|
||||
lang: config.lang,
|
||||
outputFormat: config.outputFormat,
|
||||
saveSubtitles: config.saveSubtitles,
|
||||
proxy: config.proxy,
|
||||
rate: config.rate,
|
||||
pitch: config.pitch,
|
||||
volume: config.volume,
|
||||
timeout: config.timeoutMs ?? timeoutMs,
|
||||
});
|
||||
await tts.ttsPromise(text, outputPath);
|
||||
}
|
||||
|
||||
export async function textToSpeech(params: {
|
||||
text: string;
|
||||
cfg: OpenClawConfig;
|
||||
|
||||
Reference in New Issue
Block a user