talk: add configurable silence timeout

This commit is contained in:
dano does design
2026-03-08 17:58:15 +11:00
committed by Peter Steinberger
parent 097c588a6b
commit 6ff7e8f42e
18 changed files with 162 additions and 9 deletions

View File

@@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai
- TUI: infer the active agent from the current workspace when launched inside a configured agent workspace, while preserving explicit `agent:` session targets. (#39591) thanks @arceus77-7.
- Tools/Brave web search: add opt-in `tools.web.search.brave.mode: "llm-context"` so `web_search` can call Brave's LLM Context endpoint and return extracted grounding snippets with source metadata, plus config/docs/test coverage. (#33383) Thanks @thirumaleshp.
- Talk mode: add top-level `talk.silenceTimeoutMs` config so Talk waits a configurable amount of silence before auto-sending the current transcript, while keeping each platform's existing default pause window when unset. (#39607) Thanks @danodoesdesign. Fixes #17147.
### Fixes

View File

@@ -59,8 +59,8 @@ class TalkModeManager(
private const val tag = "TalkMode"
private const val defaultModelIdFallback = "eleven_v3"
private const val defaultOutputFormatFallback = "pcm_24000"
private const val defaultTalkProvider = "elevenlabs"
private const val silenceWindowMs = 500L
private const val defaultTalkProvider = "elevenlabs"
private const val defaultSilenceTimeoutMs = 700L
private const val listenWatchdogMs = 12_000L
private const val chatFinalWaitWithSubscribeMs = 45_000L
private const val chatFinalWaitWithoutSubscribeMs = 6_000L
@@ -105,6 +105,14 @@ private const val defaultTalkProvider = "elevenlabs"
normalizedPayload = false,
)
}
// Resolves `talk.silenceTimeoutMs` from the raw Talk config JSON, falling back to
// `defaultSilenceTimeoutMs` (700 ms) when the key is missing or invalid.
// A value is invalid when it is non-numeric, <= 0, not a whole number, or larger
// than Long.MAX_VALUE; NaN and Infinity also fall through these checks to the default.
internal fun resolvedSilenceTimeoutMs(talk: JsonObject?): Long {
    // NOTE(review): `.asDoubleOrNull()` is called without `?.` on a nullable receiver,
    // while sibling call sites use `?.asStringOrNull()` — presumably the extension is
    // declared on a nullable JsonElement; confirm against its declaration.
    val timeout = talk?.get("silenceTimeoutMs").asDoubleOrNull() ?: return defaultSilenceTimeoutMs
    if (timeout <= 0 || timeout % 1.0 != 0.0 || timeout > Long.MAX_VALUE.toDouble()) {
        return defaultSilenceTimeoutMs
    }
    // Safe narrowing: value proven to be a positive whole number within Long range.
    return timeout.toLong()
}
}
private val mainHandler = Handler(Looper.getMainLooper())
@@ -134,7 +142,7 @@ private const val defaultTalkProvider = "elevenlabs"
private var listeningMode = false
private var silenceJob: Job? = null
private val silenceWindowMs = 700L
private var silenceWindowMs = defaultSilenceTimeoutMs
private var lastTranscript: String = ""
private var lastHeardAtMs: Long? = null
private var lastSpokenText: String? = null
@@ -1411,6 +1419,7 @@ private const val defaultTalkProvider = "elevenlabs"
activeConfig?.get("outputFormat")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
val key = activeConfig?.get("apiKey")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
val interrupt = talk?.get("interruptOnSpeech")?.asBooleanOrNull()
val silenceTimeoutMs = resolvedSilenceTimeoutMs(talk)
if (!isCanonicalMainSessionKey(mainSessionKey)) {
mainSessionKey = mainKey
@@ -1427,7 +1436,11 @@ private const val defaultTalkProvider = "elevenlabs"
if (!modelOverrideActive) currentModelId = defaultModelId
defaultOutputFormat = outputFormat ?: defaultOutputFormatFallback
apiKey = key ?: envKey?.takeIf { it.isNotEmpty() }
Log.d(tag, "reloadConfig apiKey=${if (apiKey != null) "set" else "null"} voiceId=$defaultVoiceId")
silenceWindowMs = silenceTimeoutMs
Log.d(
tag,
"reloadConfig apiKey=${if (apiKey != null) "set" else "null"} voiceId=$defaultVoiceId silenceTimeoutMs=$silenceTimeoutMs",
)
if (interrupt != null) interruptOnSpeech = interrupt
activeProviderIsElevenLabs = activeProvider == defaultTalkProvider
if (!activeProviderIsElevenLabs) {
@@ -1441,6 +1454,7 @@ private const val defaultTalkProvider = "elevenlabs"
}
configLoaded = true
} catch (_: Throwable) {
silenceWindowMs = defaultSilenceTimeoutMs
defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
defaultModelId = defaultModelIdFallback
if (!modelOverrideActive) currentModelId = defaultModelId

View File

@@ -54,4 +54,23 @@ class TalkModeConfigParsingTest {
assertEquals("voice-legacy", selection?.config?.get("voiceId")?.jsonPrimitive?.content)
assertEquals("legacy-key", selection?.config?.get("apiKey")?.jsonPrimitive?.content)
}
// A positive integer `silenceTimeoutMs` in the talk config is returned as-is.
@Test
fun readsConfiguredSilenceTimeoutMs() {
    val talk = buildJsonObject { put("silenceTimeoutMs", 1500) }
    assertEquals(1500L, TalkModeManager.resolvedSilenceTimeoutMs(talk))
}

// A null/absent talk config yields the Android default of 700 ms.
@Test
fun defaultsSilenceTimeoutMsWhenMissing() {
    assertEquals(700L, TalkModeManager.resolvedSilenceTimeoutMs(null))
}

// Non-positive values are rejected and replaced by the 700 ms default.
@Test
fun defaultsSilenceTimeoutMsWhenInvalid() {
    val talk = buildJsonObject { put("silenceTimeoutMs", 0) }
    assertEquals(700L, TalkModeManager.resolvedSilenceTimeoutMs(talk))
}
}

View File

@@ -34,6 +34,7 @@ final class TalkModeManager: NSObject {
private typealias SpeechRequest = SFSpeechAudioBufferRecognitionRequest
private static let defaultModelIdFallback = "eleven_v3"
private static let defaultTalkProvider = "elevenlabs"
private static let defaultSilenceTimeoutMs = 900
private static let redactedConfigSentinel = "__OPENCLAW_REDACTED__"
var isEnabled: Bool = false
var isListening: Bool = false
@@ -97,7 +98,7 @@ final class TalkModeManager: NSObject {
private var gateway: GatewayNodeSession?
private var gatewayConnected = false
private let silenceWindow: TimeInterval = 0.9
private var silenceWindow: TimeInterval = TimeInterval(Self.defaultSilenceTimeoutMs) / 1000
private var lastAudioActivity: Date?
private var noiseFloorSamples: [Double] = []
private var noiseFloor: Double?
@@ -2001,6 +2002,24 @@ extension TalkModeManager {
config: normalizedProviders[providerID] ?? [:])
}
/// Resolves `talk.silenceTimeoutMs` (milliseconds) from the raw config dictionary,
/// falling back to `defaultSilenceTimeoutMs` (900 ms) when missing or invalid.
/// Invalid means non-numeric, <= 0, non-integral, or outside `Int` range.
static func resolvedSilenceTimeoutMs(_ talk: [String: Any]?) -> Int {
    switch talk?["silenceTimeoutMs"] {
    case let timeout as Int where timeout > 0:
        // Already a positive Int — use as-is.
        return timeout
    case let timeout as Double
        where timeout > 0 && timeout.rounded(.towardZero) == timeout && timeout <= Double(Int.max):
        // Whole-number Double within Int range (JSON decoding often yields Double).
        return Int(timeout)
    case let timeout as NSNumber:
        // Other Foundation number bridgings; apply the same positivity/integrality checks.
        // NOTE(review): a Bool stored under this key bridges to NSNumber and would pass
        // as a 1 ms timeout — confirm schema validation upstream rejects booleans.
        let value = timeout.doubleValue
        if value > 0 && value.rounded(.towardZero) == value && value <= Double(Int.max) {
            return Int(value)
        }
        return Self.defaultSilenceTimeoutMs
    default:
        // Missing key or non-numeric value.
        return Self.defaultSilenceTimeoutMs
    }
}
func reloadConfig() async {
guard let gateway else { return }
self.pcmFormatUnavailable = false
@@ -2020,6 +2039,7 @@ extension TalkModeManager {
}
let activeProvider = selection?.provider ?? Self.defaultTalkProvider
let activeConfig = selection?.config
let silenceTimeoutMs = Self.resolvedSilenceTimeoutMs(talk)
self.defaultVoiceId = (activeConfig?["voiceId"] as? String)?
.trimmingCharacters(in: .whitespacesAndNewlines)
if let aliases = activeConfig?["voiceAliases"] as? [String: Any] {
@@ -2067,8 +2087,9 @@ extension TalkModeManager {
if let interrupt = talk?["interruptOnSpeech"] as? Bool {
self.interruptOnSpeech = interrupt
}
self.silenceWindow = TimeInterval(silenceTimeoutMs) / 1000
if selection != nil {
GatewayDiagnostics.log("talk config provider=\(activeProvider)")
GatewayDiagnostics.log("talk config provider=\(activeProvider) silenceTimeoutMs=\(silenceTimeoutMs)")
}
} catch {
self.defaultModelId = Self.defaultModelIdFallback
@@ -2079,6 +2100,7 @@ extension TalkModeManager {
self.gatewayTalkDefaultModelId = nil
self.gatewayTalkApiKeyConfigured = false
self.gatewayTalkConfigLoaded = false
self.silenceWindow = TimeInterval(Self.defaultSilenceTimeoutMs) / 1000
}
}

View File

@@ -47,4 +47,24 @@ import Testing
userInfo: [NSLocalizedDescriptionKey: "queue enqueue failed"])
#expect(TalkModeManager._test_isPCMFormatRejectedByAPI(error) == false)
}
// A positive integer `silenceTimeoutMs` is returned unchanged.
@Test func readsConfiguredSilenceTimeoutMs() {
    let talk: [String: Any] = [
        "silenceTimeoutMs": 1500,
    ]
    #expect(TalkModeManager.resolvedSilenceTimeoutMs(talk) == 1500)
}

// A nil talk config yields the iOS default of 900 ms.
@Test func defaultsSilenceTimeoutMsWhenMissing() {
    #expect(TalkModeManager.resolvedSilenceTimeoutMs(nil) == 900)
}

// Non-positive values are rejected and replaced by the 900 ms default.
@Test func defaultsSilenceTimeoutMsWhenInvalid() {
    let talk: [String: Any] = [
        "silenceTimeoutMs": 0,
    ]
    #expect(TalkModeManager.resolvedSilenceTimeoutMs(talk) == 900)
}
}

View File

@@ -12,6 +12,7 @@ actor TalkModeRuntime {
private let ttsLogger = Logger(subsystem: "ai.openclaw", category: "talk.tts")
private static let defaultModelIdFallback = "eleven_v3"
private static let defaultTalkProvider = "elevenlabs"
private static let defaultSilenceTimeoutMs = 700
private final class RMSMeter: @unchecked Sendable {
private let lock = NSLock()
@@ -66,7 +67,7 @@ actor TalkModeRuntime {
private var fallbackVoiceId: String?
private var lastPlaybackWasPCM: Bool = false
private let silenceWindow: TimeInterval = 0.7
private var silenceWindow: TimeInterval = TimeInterval(TalkModeRuntime.defaultSilenceTimeoutMs) / 1000
private let minSpeechRMS: Double = 1e-3
private let speechBoostFactor: Double = 6.0
@@ -783,6 +784,7 @@ extension TalkModeRuntime {
}
self.defaultOutputFormat = cfg.outputFormat
self.interruptOnSpeech = cfg.interruptOnSpeech
self.silenceWindow = TimeInterval(cfg.silenceTimeoutMs) / 1000
self.apiKey = cfg.apiKey
let hasApiKey = (cfg.apiKey?.isEmpty == false)
let voiceLabel = (cfg.voiceId?.isEmpty == false) ? cfg.voiceId! : "none"
@@ -792,7 +794,8 @@ extension TalkModeRuntime {
"talk config voiceId=\(voiceLabel, privacy: .public) " +
"modelId=\(modelLabel, privacy: .public) " +
"apiKey=\(hasApiKey, privacy: .public) " +
"interrupt=\(cfg.interruptOnSpeech, privacy: .public)")
"interrupt=\(cfg.interruptOnSpeech, privacy: .public) " +
"silenceTimeoutMs=\(cfg.silenceTimeoutMs, privacy: .public)")
}
private struct TalkRuntimeConfig {
@@ -801,6 +804,7 @@ extension TalkModeRuntime {
let modelId: String?
let outputFormat: String?
let interruptOnSpeech: Bool
let silenceTimeoutMs: Int
let apiKey: String?
}
@@ -880,6 +884,21 @@ extension TalkModeRuntime {
normalizedPayload: false)
}
/// Resolves `talk.silenceTimeoutMs` from the decoded config, defaulting to
/// `defaultSilenceTimeoutMs` (700 ms) when missing or invalid.
static func resolvedSilenceTimeoutMs(_ talk: [String: AnyCodable]?) -> Int {
    // Fast path: value decoded as a positive Int.
    if let timeout = talk?["silenceTimeoutMs"]?.intValue, timeout > 0 {
        return timeout
    }
    // Whole-number Double within Int range (JSON numbers may decode as Double).
    if
        let timeout = talk?["silenceTimeoutMs"]?.doubleValue,
        timeout > 0,
        timeout.rounded(.towardZero) == timeout,
        timeout <= Double(Int.max)
    {
        return Int(timeout)
    }
    // Missing, non-numeric, non-positive, or non-integral — fall back to the default.
    return Self.defaultSilenceTimeoutMs
}
private func fetchTalkConfig() async -> TalkRuntimeConfig {
let env = ProcessInfo.processInfo.environment
let envVoice = env["ELEVENLABS_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines)
@@ -895,6 +914,7 @@ extension TalkModeRuntime {
let selection = Self.selectTalkProviderConfig(talk)
let activeProvider = selection?.provider ?? Self.defaultTalkProvider
let activeConfig = selection?.config
let silenceTimeoutMs = Self.resolvedSilenceTimeoutMs(talk)
let ui = snap.config?["ui"]?.dictionaryValue
let rawSeam = ui?["seamColor"]?.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
await MainActor.run {
@@ -939,6 +959,7 @@ extension TalkModeRuntime {
modelId: resolvedModel,
outputFormat: outputFormat,
interruptOnSpeech: interrupt ?? true,
silenceTimeoutMs: silenceTimeoutMs,
apiKey: resolvedApiKey)
} catch {
let resolvedVoice =
@@ -951,6 +972,7 @@ extension TalkModeRuntime {
modelId: Self.defaultModelIdFallback,
outputFormat: nil,
interruptOnSpeech: true,
silenceTimeoutMs: Self.defaultSilenceTimeoutMs,
apiKey: resolvedApiKey)
}
}

View File

@@ -32,4 +32,24 @@ struct TalkModeConfigParsingTests {
#expect(selection?.config["voiceId"]?.stringValue == "voice-legacy")
#expect(selection?.config["apiKey"]?.stringValue == "legacy-key")
}
// A positive integer `silenceTimeoutMs` is returned unchanged.
@Test func readsConfiguredSilenceTimeoutMs() {
    let talk: [String: AnyCodable] = [
        "silenceTimeoutMs": AnyCodable(1500),
    ]
    #expect(TalkModeRuntime.resolvedSilenceTimeoutMs(talk) == 1500)
}

// A nil talk config yields the macOS runtime default of 700 ms.
@Test func defaultsSilenceTimeoutMsWhenMissing() {
    #expect(TalkModeRuntime.resolvedSilenceTimeoutMs(nil) == 700)
}

// Non-positive values are rejected and replaced by the 700 ms default.
@Test func defaultsSilenceTimeoutMsWhenInvalid() {
    let talk: [String: AnyCodable] = [
        "silenceTimeoutMs": AnyCodable(0),
    ]
    #expect(TalkModeRuntime.resolvedSilenceTimeoutMs(talk) == 700)
}
}

View File

@@ -1659,6 +1659,7 @@ Defaults for Talk mode (macOS/iOS/Android).
modelId: "eleven_v3",
outputFormat: "mp3_44100_128",
apiKey: "elevenlabs_api_key",
silenceTimeoutMs: 1500,
interruptOnSpeech: true,
},
}
@@ -1668,6 +1669,7 @@ Defaults for Talk mode (macOS/iOS/Android).
- `apiKey` and `providers.*.apiKey` accept plaintext strings or SecretRef objects.
- `ELEVENLABS_API_KEY` fallback applies only when no Talk API key is configured.
- `voiceAliases` lets Talk directives use friendly names.
- `silenceTimeoutMs` controls how long Talk mode waits after user silence before it sends the transcript. Unset keeps the platform default pause window (`700` ms on macOS and Android, `900` ms on iOS).
---

View File

@@ -56,6 +56,7 @@ Supported keys:
modelId: "eleven_v3",
outputFormat: "mp3_44100_128",
apiKey: "elevenlabs_api_key",
silenceTimeoutMs: 1500,
interruptOnSpeech: true,
},
}
@@ -64,6 +65,7 @@ Supported keys:
Defaults:
- `interruptOnSpeech`: true
- `silenceTimeoutMs`: when unset, Talk keeps the platform default pause window before sending the transcript (`700` ms on macOS and Android, `900` ms on iOS)
- `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` (or first ElevenLabs voice when API key is available)
- `modelId`: defaults to `eleven_v3` when unset
- `apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available)

View File

@@ -305,6 +305,7 @@ const TARGET_KEYS = [
"talk.modelId",
"talk.outputFormat",
"talk.interruptOnSpeech",
"talk.silenceTimeoutMs",
"meta",
"env",
"env.shellEnv",

View File

@@ -163,6 +163,8 @@ export const FIELD_HELP: Record<string, string> = {
"Use this legacy ElevenLabs API key for Talk mode only during migration, and keep secrets in env-backed storage. Prefer talk.providers.elevenlabs.apiKey (fallback: ELEVENLABS_API_KEY).",
"talk.interruptOnSpeech":
"If true (default), stop assistant speech when the user starts speaking in Talk mode. Keep enabled for conversational turn-taking.",
"talk.silenceTimeoutMs":
"Milliseconds of user silence before Talk mode finalizes and sends the current transcript. Leave unset to keep the platform default pause window (700 ms on macOS and Android, 900 ms on iOS).",
acp: "ACP runtime controls for enabling dispatch, selecting backends, constraining allowed agent targets, and tuning streamed turn projection behavior.",
"acp.enabled":
"Global ACP feature gate. Keep disabled unless ACP runtime + policy are configured.",

View File

@@ -651,6 +651,7 @@ export const FIELD_LABELS: Record<string, string> = {
"talk.modelId": "Talk Model ID",
"talk.outputFormat": "Talk Output Format",
"talk.interruptOnSpeech": "Talk Interrupt on Speech",
"talk.silenceTimeoutMs": "Talk Silence Timeout (ms)",
messages: "Messages",
"messages.messagePrefix": "Inbound Message Prefix",
"messages.responsePrefix": "Outbound Response Prefix",

View File

@@ -32,6 +32,7 @@ describe("talk normalization", () => {
outputFormat: "pcm_44100",
apiKey: "secret-key", // pragma: allowlist secret
interruptOnSpeech: false,
silenceTimeoutMs: 1500,
});
expect(normalized).toEqual({
@@ -51,6 +52,7 @@ describe("talk normalization", () => {
outputFormat: "pcm_44100",
apiKey: "secret-key", // pragma: allowlist secret
interruptOnSpeech: false,
silenceTimeoutMs: 1500,
});
});

View File

@@ -47,6 +47,13 @@ function normalizeTalkSecretInput(value: unknown): TalkProviderConfig["apiKey"]
return coerceSecretRef(value) ?? undefined;
}
/**
 * Normalizes a raw `silenceTimeoutMs` config value.
 * Only positive integers are accepted; any other value (non-number, zero,
 * negative, fractional, NaN) is treated as unset and returns `undefined`.
 */
function normalizeSilenceTimeoutMs(value: unknown): number | undefined {
  return typeof value === "number" && Number.isInteger(value) && value > 0
    ? value
    : undefined;
}
function normalizeTalkProviderConfig(value: unknown): TalkProviderConfig | undefined {
if (!isPlainObject(value)) {
return undefined;
@@ -125,6 +132,10 @@ function normalizedLegacyTalkFields(source: Record<string, unknown>): Partial<Ta
if (apiKey !== undefined) {
legacy.apiKey = apiKey;
}
const silenceTimeoutMs = normalizeSilenceTimeoutMs(source.silenceTimeoutMs);
if (silenceTimeoutMs !== undefined) {
legacy.silenceTimeoutMs = silenceTimeoutMs;
}
return legacy;
}
@@ -267,6 +278,9 @@ export function buildTalkConfigResponse(value: unknown): TalkConfig | undefined
if (typeof normalized.interruptOnSpeech === "boolean") {
payload.interruptOnSpeech = normalized.interruptOnSpeech;
}
if (typeof normalized.silenceTimeoutMs === "number") {
payload.silenceTimeoutMs = normalized.silenceTimeoutMs;
}
if (normalized.providers && Object.keys(normalized.providers).length > 0) {
payload.providers = normalized.providers;
}

View File

@@ -70,6 +70,8 @@ export type TalkConfig = {
providers?: Record<string, TalkProviderConfig>;
/** Stop speaking when user starts talking (default: true). */
interruptOnSpeech?: boolean;
/** Milliseconds of user silence before Talk mode sends the transcript after a pause. */
silenceTimeoutMs?: number;
/**
* Legacy ElevenLabs compatibility fields.

View File

@@ -595,6 +595,7 @@ export const OpenClawSchema = z
outputFormat: z.string().optional(),
apiKey: SecretInputSchema.optional().register(sensitive),
interruptOnSpeech: z.boolean().optional(),
silenceTimeoutMs: z.number().int().positive().optional(),
})
.strict()
.optional(),

View File

@@ -42,6 +42,7 @@ export const TalkConfigResultSchema = Type.Object(
outputFormat: Type.Optional(Type.String()),
apiKey: Type.Optional(Type.String()),
interruptOnSpeech: Type.Optional(Type.Boolean()),
silenceTimeoutMs: Type.Optional(Type.Integer({ minimum: 1 })),
},
{ additionalProperties: false },
),

View File

@@ -56,7 +56,11 @@ async function connectOperator(ws: GatewaySocket, scopes: string[]) {
});
}
async function writeTalkConfig(config: { apiKey?: string; voiceId?: string }) {
async function writeTalkConfig(config: {
apiKey?: string;
voiceId?: string;
silenceTimeoutMs?: number;
}) {
const { writeConfigFile } = await import("../config/config.js");
await writeConfigFile({ talk: config });
}
@@ -68,6 +72,7 @@ describe("gateway talk.config", () => {
talk: {
voiceId: "voice-123",
apiKey: "secret-key-abc", // pragma: allowlist secret
silenceTimeoutMs: 1500,
},
session: {
mainKey: "main-test",
@@ -88,6 +93,7 @@ describe("gateway talk.config", () => {
};
apiKey?: string;
voiceId?: string;
silenceTimeoutMs?: number;
};
};
}>(ws, "talk.config", {});
@@ -99,6 +105,7 @@ describe("gateway talk.config", () => {
);
expect(res.payload?.config?.talk?.voiceId).toBe("voice-123");
expect(res.payload?.config?.talk?.apiKey).toBe("__OPENCLAW_REDACTED__");
expect(res.payload?.config?.talk?.silenceTimeoutMs).toBe(1500);
});
});