From ebfe55f909a21c498fe565868f55923cdf516629 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 17:13:31 +0100 Subject: [PATCH 001/100] fix: enable canvas webview scrolling on mobile nodes --- AGENTS.md | 2 ++ CHANGELOG.md | 1 + .../com/steipete/clawdis/node/ui/RootScreen.kt | 4 ++++ apps/ios/Sources/Screen/ScreenController.swift | 14 +++++++++++--- apps/ios/Tests/ScreenControllerTests.swift | 9 +++++++++ src/canvas-host/a2ui/.bundle.hash | 2 +- 6 files changed, 28 insertions(+), 4 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index a8d574840..0abed7948 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -41,6 +41,8 @@ - Also read the shared guardrails at `~/Projects/oracle/AGENTS.md` and `~/Projects/agent-scripts/AGENTS.MD` before making changes; align with any cross-repo rules noted there. - SwiftUI state management (iOS/macOS): prefer the `Observation` framework (`@Observable`, `@Bindable`) over `ObservableObject`/`@StateObject`; don’t introduce new `ObservableObject` unless required for compatibility, and migrate existing usages when touching related code. - **Restart apps:** “restart iOS/Android apps” means rebuild (recompile/install) and relaunch, not just kill/launch. +- iOS Team ID lookup: `security find-identity -p codesigning -v` → use Apple Development (…) TEAMID. Fallback: `defaults read com.apple.dt.Xcode IDEProvisioningTeamIdentifiers`. +- A2UI bundle hash: `src/canvas-host/a2ui/.bundle.hash` is auto-generated; regenerate via `pnpm canvas:a2ui:bundle` (or `scripts/bundle-a2ui.sh`) instead of manual conflict resolution. - Notary key file lives at `~/Library/CloudStorage/Dropbox/Backup/AppStore/AuthKey_NJF3NFGTS3.p8` (Sparkle keys live under `~/Library/CloudStorage/Dropbox/Backup/Sparkle`). - **Multi-agent safety:** do **not** create/apply/drop `git stash` entries unless Peter explicitly asks (this includes `git pull --rebase --autostash`). 
Assume other agents may be working; keep unrelated WIP untouched and avoid cross-cutting state changes. - **Multi-agent safety:** when Peter says "push", you may `git pull --rebase` to integrate latest changes (never discard other agents' work). When Peter says "commit", scope to your changes only. When Peter says "commit all", commit everything in grouped chunks. diff --git a/CHANGELOG.md b/CHANGELOG.md index 27a8716b5..ab4509c82 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Fixes - macOS: Voice Wake now fully tears down the Speech pipeline when disabled (cancel pending restarts, drop stale callbacks) to avoid high CPU in the background. +- iOS/Android nodes: enable scrolling for loaded web pages in the Canvas WebView (default scaffold stays touch-first). ## 2.0.0-beta4 — 2025-12-27 diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt index 4ee3afa1a..49bbee928 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt @@ -163,6 +163,10 @@ private fun CanvasView(viewModel: MainViewModel, modifier: Modifier = Modifier) // Some embedded web UIs (incl. the "background website") use localStorage/sessionStorage. 
settings.domStorageEnabled = true settings.mixedContentMode = WebSettings.MIXED_CONTENT_COMPATIBILITY_MODE + isScrollContainer = true + overScrollMode = View.OVER_SCROLL_IF_CONTENT_SCROLLS + isVerticalScrollBarEnabled = true + isHorizontalScrollBarEnabled = true webViewClient = object : WebViewClient() { override fun onReceivedError( diff --git a/apps/ios/Sources/Screen/ScreenController.swift b/apps/ios/Sources/Screen/ScreenController.swift index 6b3003360..76a17d55a 100644 --- a/apps/ios/Sources/Screen/ScreenController.swift +++ b/apps/ios/Sources/Screen/ScreenController.swift @@ -43,9 +43,7 @@ final class ScreenController { self.webView.scrollView.contentInset = .zero self.webView.scrollView.scrollIndicatorInsets = .zero self.webView.scrollView.automaticallyAdjustsScrollIndicatorInsets = false - // Disable scroll to allow touch events to pass through to canvas - self.webView.scrollView.isScrollEnabled = false - self.webView.scrollView.bounces = false + self.applyScrollBehavior() self.webView.navigationDelegate = self.navigationDelegate self.navigationDelegate.controller = self a2uiActionHandler.controller = self @@ -60,6 +58,7 @@ final class ScreenController { func reload() { let trimmed = self.urlString.trimmingCharacters(in: .whitespacesAndNewlines) + self.applyScrollBehavior() if trimmed.isEmpty { guard let url = Self.canvasScaffoldURL else { return } self.errorText = nil @@ -250,6 +249,15 @@ final class ScreenController { return false } + private func applyScrollBehavior() { + let trimmed = self.urlString.trimmingCharacters(in: .whitespacesAndNewlines) + let allowScroll = !trimmed.isEmpty + let scrollView = self.webView.scrollView + // Default canvas needs raw touch events; external pages should scroll. + scrollView.isScrollEnabled = allowScroll + scrollView.bounces = allowScroll + } + private static func jsValue(_ value: String?) -> String { guard let value else { return "null" } if let data = try? 
JSONSerialization.data(withJSONObject: [value]), diff --git a/apps/ios/Tests/ScreenControllerTests.swift b/apps/ios/Tests/ScreenControllerTests.swift index 028a0eae6..835c0081f 100644 --- a/apps/ios/Tests/ScreenControllerTests.swift +++ b/apps/ios/Tests/ScreenControllerTests.swift @@ -16,6 +16,15 @@ import WebKit #expect(scrollView.bounces == false) } + @Test @MainActor func navigateEnablesScrollForWebPages() { + let screen = ScreenController() + screen.navigate(to: "https://example.com") + + let scrollView = screen.webView.scrollView + #expect(scrollView.isScrollEnabled == true) + #expect(scrollView.bounces == true) + } + @Test @MainActor func navigateSlashShowsDefaultCanvas() { let screen = ScreenController() screen.navigate(to: "/") diff --git a/src/canvas-host/a2ui/.bundle.hash b/src/canvas-host/a2ui/.bundle.hash index eca4af8df..23e15ecec 100644 --- a/src/canvas-host/a2ui/.bundle.hash +++ b/src/canvas-host/a2ui/.bundle.hash @@ -1 +1 @@ -8c6030afb0b9f264b0bb9dcfb738b67d361fc5acac7967b4e056169a44f95184 +401ee2e7aa55e8abfd5e8ee94810a75629253b88371dc70f5b04c4830cdcce8d From 510e2a1d17a9c6f95737cc592b4b1257a9ac17ae Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 17:31:23 +0100 Subject: [PATCH 002/100] fix: menu devices list --- CHANGELOG.md | 1 + .../Clawdis/MenuSessionsInjector.swift | 163 +++--------------- apps/macos/Sources/Clawdis/NodesMenu.swift | 61 ++++--- apps/macos/Sources/Clawdis/NodesStore.swift | 84 +++++++++ docs/mac/menu-bar.md | 1 + 5 files changed, 142 insertions(+), 168 deletions(-) create mode 100644 apps/macos/Sources/Clawdis/NodesStore.swift diff --git a/CHANGELOG.md b/CHANGELOG.md index ab4509c82..7cf8b81e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Fixes - macOS: Voice Wake now fully tears down the Speech pipeline when disabled (cancel pending restarts, drop stale callbacks) to avoid high CPU in the background. 
- iOS/Android nodes: enable scrolling for loaded web pages in the Canvas WebView (default scaffold stays touch-first). +- macOS menu: device list now uses `node.list` (devices only; no agent/tool presence entries). ## 2.0.0-beta4 — 2025-12-27 diff --git a/apps/macos/Sources/Clawdis/MenuSessionsInjector.swift b/apps/macos/Sources/Clawdis/MenuSessionsInjector.swift index a66b1a38d..fb066d303 100644 --- a/apps/macos/Sources/Clawdis/MenuSessionsInjector.swift +++ b/apps/macos/Sources/Clawdis/MenuSessionsInjector.swift @@ -22,8 +22,7 @@ final class MenuSessionsInjector: NSObject, NSMenuDelegate { private var cachedErrorText: String? private var cacheUpdatedAt: Date? private let refreshIntervalSeconds: TimeInterval = 12 - private let nodesStore = InstancesStore.shared - private let gatewayDiscovery = GatewayDiscoveryModel() + private let nodesStore = NodesStore.shared #if DEBUG private var testControlChannelConnected: Bool? #endif @@ -43,7 +42,6 @@ final class MenuSessionsInjector: NSObject, NSMenuDelegate { } self.nodesStore.start() - self.gatewayDiscovery.start() } func menuWillOpen(_ menu: NSMenu) { @@ -218,7 +216,7 @@ final class MenuSessionsInjector: NSObject, NSMenuDelegate { } if entries.isEmpty { - let title = self.nodesStore.isLoading ? "Loading nodes..." : "No nodes yet" + let title = self.nodesStore.isLoading ? "Loading devices..." : "No devices yet" menu.insertItem(self.makeMessageItem(text: title, symbolName: "circle.dashed", width: width), at: cursor) cursor += 1 } else { @@ -239,7 +237,7 @@ final class MenuSessionsInjector: NSObject, NSMenuDelegate { if entries.count > 8 { let moreItem = NSMenuItem() moreItem.tag = self.nodesTag - moreItem.title = "More Nodes..." + moreItem.title = "More Devices..." 
moreItem.image = NSImage(systemSymbolName: "ellipsis.circle", accessibilityDescription: nil) let overflow = Array(entries.dropFirst(8)) moreItem.submenu = self.buildNodesOverflowMenu(entries: overflow, width: width) @@ -436,7 +434,7 @@ final class MenuSessionsInjector: NSObject, NSMenuDelegate { return menu } - private func buildNodesOverflowMenu(entries: [InstanceInfo], width: CGFloat) -> NSMenu { + private func buildNodesOverflowMenu(entries: [NodeInfo], width: CGFloat) -> NSMenu { let menu = NSMenu() for entry in entries { let item = NSMenuItem() @@ -452,21 +450,21 @@ final class MenuSessionsInjector: NSObject, NSMenuDelegate { return menu } - private func buildNodeSubmenu(entry: InstanceInfo) -> NSMenu { + private func buildNodeSubmenu(entry: NodeInfo) -> NSMenu { let menu = NSMenu() menu.autoenablesItems = false - menu.addItem(self.makeNodeCopyItem(label: "ID", value: entry.id)) + menu.addItem(self.makeNodeCopyItem(label: "Node ID", value: entry.nodeId)) - if let host = entry.host?.nonEmpty { - menu.addItem(self.makeNodeCopyItem(label: "Host", value: host)) + if let name = entry.displayName?.nonEmpty { + menu.addItem(self.makeNodeCopyItem(label: "Name", value: name)) } - if let ip = entry.ip?.nonEmpty { + if let ip = entry.remoteIp?.nonEmpty { menu.addItem(self.makeNodeCopyItem(label: "IP", value: ip)) } - menu.addItem(self.makeNodeCopyItem(label: "Role", value: NodeMenuEntryFormatter.roleText(entry))) + menu.addItem(self.makeNodeCopyItem(label: "Status", value: NodeMenuEntryFormatter.roleText(entry))) if let platform = NodeMenuEntryFormatter.platformText(entry) { menu.addItem(self.makeNodeCopyItem(label: "Platform", value: platform)) @@ -476,19 +474,17 @@ final class MenuSessionsInjector: NSObject, NSMenuDelegate { menu.addItem(self.makeNodeCopyItem(label: "Version", value: self.formatVersionLabel(version))) } - menu.addItem(self.makeNodeDetailItem(label: "Last seen", value: entry.ageDescription)) + menu.addItem(self.makeNodeDetailItem(label: "Connected", 
value: entry.isConnected ? "Yes" : "No")) + menu.addItem(self.makeNodeDetailItem(label: "Paired", value: entry.isPaired ? "Yes" : "No")) - if entry.lastInputSeconds != nil { - menu.addItem(self.makeNodeDetailItem(label: "Last input", value: entry.lastInputDescription)) + if let caps = entry.caps?.filter({ !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }), + !caps.isEmpty { + menu.addItem(self.makeNodeCopyItem(label: "Caps", value: caps.joined(separator: ", "))) } - if let reason = entry.reason?.nonEmpty { - menu.addItem(self.makeNodeDetailItem(label: "Reason", value: reason)) - } - - if let sshURL = self.sshURL(for: entry) { - menu.addItem(.separator()) - menu.addItem(self.makeNodeActionItem(title: "Open SSH", url: sshURL)) + if let commands = entry.commands?.filter({ !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }), + !commands.isEmpty { + menu.addItem(self.makeNodeCopyItem(label: "Commands", value: commands.joined(separator: ", "))) } return menu @@ -507,12 +503,6 @@ final class MenuSessionsInjector: NSObject, NSMenuDelegate { return item } - private func makeNodeActionItem(title: String, url: URL) -> NSMenuItem { - let item = NSMenuItem(title: title, action: #selector(self.openNodeSSH(_:)), keyEquivalent: "") - item.target = self - item.representedObject = url - return item - } private func formatVersionLabel(_ version: String) -> String { let trimmed = version.trimmingCharacters(in: .whitespacesAndNewlines) guard !trimmed.isEmpty else { return version } @@ -638,104 +628,6 @@ final class MenuSessionsInjector: NSObject, NSMenuDelegate { NSPasteboard.general.setString(value, forType: .string) } - @objc - private func openNodeSSH(_ sender: NSMenuItem) { - guard let url = sender.representedObject as? 
URL else { return } - - if let appURL = self.preferredTerminalAppURL() { - NSWorkspace.shared.open( - [url], - withApplicationAt: appURL, - configuration: NSWorkspace.OpenConfiguration(), - completionHandler: nil) - } else { - NSWorkspace.shared.open(url) - } - } - - private func preferredTerminalAppURL() -> URL? { - if let ghosty = self.ghostyAppURL() { return ghosty } - return NSWorkspace.shared.urlForApplication(withBundleIdentifier: "com.apple.Terminal") - } - - private func ghostyAppURL() -> URL? { - let candidates = [ - "/Applications/Ghosty.app", - ("~/Applications/Ghosty.app" as NSString).expandingTildeInPath, - ] - for path in candidates where FileManager.default.fileExists(atPath: path) { - return URL(fileURLWithPath: path) - } - return nil - } - - private func sshURL(for entry: InstanceInfo) -> URL? { - guard NodeMenuEntryFormatter.isGateway(entry) else { return nil } - guard let gateway = self.matchingGateway(for: entry) else { return nil } - guard let host = self.sanitizedTailnetHost(gateway.tailnetDns) ?? gateway.lanHost else { return nil } - let user = NSUserName() - return self.buildSSHURL(user: user, host: host, port: gateway.sshPort) - } - - private func matchingGateway(for entry: InstanceInfo) -> GatewayDiscoveryModel.DiscoveredGateway? { - let candidates = self.entryHostCandidates(entry) - guard !candidates.isEmpty else { return nil } - return self.gatewayDiscovery.gateways.first { gateway in - let gatewayTokens = self.gatewayHostTokens(gateway) - return candidates.contains { gatewayTokens.contains($0) } - } - } - - private func entryHostCandidates(_ entry: InstanceInfo) -> [String] { - let raw: [String?] = [ - entry.host, - entry.ip, - NodeMenuEntryFormatter.primaryName(entry), - ] - return raw.compactMap(self.normalizedHostToken(_:)) - } - - private func gatewayHostTokens(_ gateway: GatewayDiscoveryModel.DiscoveredGateway) -> [String] { - let raw: [String?] 
= [ - gateway.displayName, - gateway.lanHost, - gateway.tailnetDns, - ] - return raw.compactMap(self.normalizedHostToken(_:)) - } - - private func normalizedHostToken(_ value: String?) -> String? { - guard let value else { return nil } - let trimmed = value.trimmingCharacters(in: .whitespacesAndNewlines) - if trimmed.isEmpty { return nil } - let lower = trimmed.lowercased().trimmingCharacters(in: CharacterSet(charactersIn: ".")) - if lower.hasSuffix(".localdomain") { - return lower.replacingOccurrences(of: ".localdomain", with: ".local") - } - return lower - } - - private func sanitizedTailnetHost(_ host: String?) -> String? { - guard let host else { return nil } - let trimmed = host.trimmingCharacters(in: .whitespacesAndNewlines) - if trimmed.isEmpty { return nil } - if trimmed.hasSuffix(".internal.") || trimmed.hasSuffix(".internal") { - return nil - } - return trimmed - } - - private func buildSSHURL(user: String, host: String, port: Int) -> URL? { - var components = URLComponents() - components.scheme = "ssh" - components.user = user - components.host = host - if port != 22 { - components.port = port - } - return components.url - } - // MARK: - Width + placement private func findInsertIndex(in menu: NSMenu) -> Int? 
{ @@ -790,23 +682,14 @@ final class MenuSessionsInjector: NSObject, NSMenuDelegate { return width } - private func sortedNodeEntries() -> [InstanceInfo] { - let entries = self.nodesStore.instances.filter { entry in - let mode = entry.mode?.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() - return mode != "health" - } + private func sortedNodeEntries() -> [NodeInfo] { + let entries = self.nodesStore.nodes return entries.sorted { lhs, rhs in - let lhsGateway = NodeMenuEntryFormatter.isGateway(lhs) - let rhsGateway = NodeMenuEntryFormatter.isGateway(rhs) - if lhsGateway != rhsGateway { return lhsGateway } - - let lhsLocal = NodeMenuEntryFormatter.isLocal(lhs) - let rhsLocal = NodeMenuEntryFormatter.isLocal(rhs) - if lhsLocal != rhsLocal { return lhsLocal } - + if lhs.isConnected != rhs.isConnected { return lhs.isConnected } + if lhs.isPaired != rhs.isPaired { return lhs.isPaired } let lhsName = NodeMenuEntryFormatter.primaryName(lhs).lowercased() let rhsName = NodeMenuEntryFormatter.primaryName(rhs).lowercased() - if lhsName == rhsName { return lhs.ts > rhs.ts } + if lhsName == rhsName { return lhs.nodeId < rhs.nodeId } return lhsName < rhsName } } diff --git a/apps/macos/Sources/Clawdis/NodesMenu.swift b/apps/macos/Sources/Clawdis/NodesMenu.swift index ec068ad8b..45e1b0c44 100644 --- a/apps/macos/Sources/Clawdis/NodesMenu.swift +++ b/apps/macos/Sources/Clawdis/NodesMenu.swift @@ -2,40 +2,44 @@ import AppKit import SwiftUI struct NodeMenuEntryFormatter { - static func isGateway(_ entry: InstanceInfo) -> Bool { - entry.mode?.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() == "gateway" + static func isConnected(_ entry: NodeInfo) -> Bool { + entry.isConnected } - static func isLocal(_ entry: InstanceInfo) -> Bool { - entry.mode?.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() == "local" + static func primaryName(_ entry: NodeInfo) -> String { + entry.displayName?.nonEmpty ?? 
entry.nodeId } - static func primaryName(_ entry: InstanceInfo) -> String { - if self.isGateway(entry) { - let host = entry.host?.nonEmpty - if let host, host.lowercased() != "gateway" { return host } - return "Gateway" + static func summaryText(_ entry: NodeInfo) -> String { + let name = self.primaryName(entry) + var prefix = "Node: \(name)" + if let ip = entry.remoteIp?.nonEmpty { + prefix += " (\(ip))" } - return entry.host?.nonEmpty ?? entry.id + var parts = [prefix] + if let platform = self.platformText(entry) { + parts.append("platform \(platform)") + } + if let version = entry.version?.nonEmpty { + parts.append("app \(self.compactVersion(version))") + } + parts.append("status \(self.roleText(entry))") + return parts.joined(separator: " · ") } - static func summaryText(_ entry: InstanceInfo) -> String { - entry.text.nonEmpty ?? self.primaryName(entry) + static func roleText(_ entry: NodeInfo) -> String { + if entry.isConnected { return "connected" } + if entry.isPaired { return "paired" } + return "unpaired" } - static func roleText(_ entry: InstanceInfo) -> String { - if self.isGateway(entry) { return "gateway" } - if let mode = entry.mode?.nonEmpty { return mode } - return "node" - } - - static func detailLeft(_ entry: InstanceInfo) -> String { + static func detailLeft(_ entry: NodeInfo) -> String { let role = self.roleText(entry) - if let ip = entry.ip?.nonEmpty { return "\(ip) · \(role)" } + if let ip = entry.remoteIp?.nonEmpty { return "\(ip) · \(role)" } return role } - static func detailRight(_ entry: InstanceInfo) -> String? { + static func detailRight(_ entry: NodeInfo) -> String? { var parts: [String] = [] if let platform = self.platformText(entry) { parts.append(platform) } if let version = entry.version?.nonEmpty { @@ -46,7 +50,7 @@ struct NodeMenuEntryFormatter { return parts.joined(separator: " · ") } - static func platformText(_ entry: InstanceInfo) -> String? { + static func platformText(_ entry: NodeInfo) -> String? 
{ if let raw = entry.platform?.nonEmpty { return self.prettyPlatform(raw) ?? raw } @@ -99,8 +103,7 @@ struct NodeMenuEntryFormatter { return trimmed } - static func leadingSymbol(_ entry: InstanceInfo) -> String { - if self.isGateway(entry) { return self.safeSystemSymbol("dot.radiowaves.left.and.right", fallback: "network") } + static func leadingSymbol(_ entry: NodeInfo) -> String { if let family = entry.deviceFamily?.lowercased() { if family.contains("mac") { return self.safeSystemSymbol("laptopcomputer", fallback: "laptopcomputer") @@ -116,9 +119,11 @@ struct NodeMenuEntryFormatter { return "cpu" } - static func isAndroid(_ entry: InstanceInfo) -> Bool { + static func isAndroid(_ entry: NodeInfo) -> Bool { let family = entry.deviceFamily?.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() - return family == "android" + if family == "android" { return true } + let platform = entry.platform?.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + return platform?.contains("android") == true } private static func safeSystemSymbol(_ preferred: String, fallback: String) -> String { @@ -128,7 +133,7 @@ struct NodeMenuEntryFormatter { } struct NodeMenuRowView: View { - let entry: InstanceInfo + let entry: NodeInfo let width: CGFloat @Environment(\.menuItemHighlighted) private var isHighlighted @@ -147,7 +152,7 @@ struct NodeMenuRowView: View { VStack(alignment: .leading, spacing: 2) { Text(NodeMenuEntryFormatter.primaryName(self.entry)) - .font(.callout.weight(NodeMenuEntryFormatter.isGateway(self.entry) ? .semibold : .regular)) + .font(.callout.weight(NodeMenuEntryFormatter.isConnected(self.entry) ? 
.semibold : .regular)) .foregroundStyle(self.primaryColor) .lineLimit(1) .truncationMode(.middle) diff --git a/apps/macos/Sources/Clawdis/NodesStore.swift b/apps/macos/Sources/Clawdis/NodesStore.swift new file mode 100644 index 000000000..2c00e15f7 --- /dev/null +++ b/apps/macos/Sources/Clawdis/NodesStore.swift @@ -0,0 +1,84 @@ +import Foundation +import Observation +import OSLog + +struct NodeInfo: Identifiable, Codable { + let nodeId: String + let displayName: String? + let platform: String? + let version: String? + let deviceFamily: String? + let modelIdentifier: String? + let remoteIp: String? + let caps: [String]? + let commands: [String]? + let permissions: [String: Bool]? + let paired: Bool? + let connected: Bool? + + var id: String { self.nodeId } + var isConnected: Bool { self.connected ?? false } + var isPaired: Bool { self.paired ?? false } +} + +private struct NodeListResponse: Codable { + let ts: Double? + let nodes: [NodeInfo] +} + +@MainActor +@Observable +final class NodesStore { + static let shared = NodesStore() + + var nodes: [NodeInfo] = [] + var lastError: String? + var statusMessage: String? + var isLoading = false + + private let logger = Logger(subsystem: "com.steipete.clawdis", category: "nodes") + private var task: Task? + private let interval: TimeInterval = 30 + private var startCount = 0 + + func start() { + self.startCount += 1 + guard self.startCount == 1 else { return } + guard self.task == nil else { return } + self.task = Task.detached { [weak self] in + guard let self else { return } + await self.refresh() + while !Task.isCancelled { + try? 
await Task.sleep(nanoseconds: UInt64(self.interval * 1_000_000_000)) + await self.refresh() + } + } + } + + func stop() { + guard self.startCount > 0 else { return } + self.startCount -= 1 + guard self.startCount == 0 else { return } + self.task?.cancel() + self.task = nil + } + + func refresh() async { + if self.isLoading { return } + self.statusMessage = nil + self.isLoading = true + defer { self.isLoading = false } + do { + let data = try await GatewayConnection.shared.requestRaw(method: "node.list", params: nil, timeoutMs: 8000) + let decoded = try JSONDecoder().decode(NodeListResponse.self, from: data) + self.nodes = decoded.nodes + self.lastError = nil + self.statusMessage = nil + } catch { + self.logger.error("node.list failed \(error.localizedDescription, privacy: .public)") + self.nodes = [] + self.lastError = error.localizedDescription + self.statusMessage = nil + } + } +} diff --git a/docs/mac/menu-bar.md b/docs/mac/menu-bar.md index b4a672629..bfe9a8c36 100644 --- a/docs/mac/menu-bar.md +++ b/docs/mac/menu-bar.md @@ -8,6 +8,7 @@ read_when: ## What is shown - We surface the current agent work state in the menu bar icon and in the first status row of the menu. - Health status is hidden while work is active; it returns when all sessions are idle. +- The “Nodes” block in the menu lists **devices** only (gateway bridge nodes via `node.list`), not client/presence entries. ## State model - Sessions: events arrive with `runId` (session key). The “main” session is the key `main`; if absent, we fall back to the most recently updated session. 
From aa2700ffa78a47a59b38c3e766dfa704e8aef6aa Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 17:38:21 +0100 Subject: [PATCH 003/100] chore: set ios signing team for device builds --- apps/ios/project.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/ios/project.yml b/apps/ios/project.yml index 033d2f68f..e8dac7a20 100644 --- a/apps/ios/project.yml +++ b/apps/ios/project.yml @@ -62,7 +62,11 @@ targets: swiftlint lint --config "$SRCROOT/.swiftlint.yml" --use-script-input-file-lists settings: base: + CODE_SIGN_IDENTITY: "Apple Development" + CODE_SIGN_STYLE: Manual + DEVELOPMENT_TEAM: Y5PE65HELJ PRODUCT_BUNDLE_IDENTIFIER: com.steipete.clawdis.ios + PROVISIONING_PROFILE_SPECIFIER: "com.steipete.clawdis.ios Development" SWIFT_VERSION: "6.0" info: path: Sources/Info.plist From 09ef991e1a55988aa623cf7c89a969c70a2457a1 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 18:09:27 +0100 Subject: [PATCH 004/100] chore: harden restart script --- scripts/restart-mac.sh | 61 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/scripts/restart-mac.sh b/scripts/restart-mac.sh index a6bb20b8a..35886e71b 100755 --- a/scripts/restart-mac.sh +++ b/scripts/restart-mac.sh @@ -10,6 +10,11 @@ DEBUG_PROCESS_PATTERN="${ROOT_DIR}/apps/macos/.build/debug/Clawdis" LOCAL_PROCESS_PATTERN="${ROOT_DIR}/apps/macos/.build-local/debug/Clawdis" RELEASE_PROCESS_PATTERN="${ROOT_DIR}/apps/macos/.build/release/Clawdis" LAUNCH_AGENT="${HOME}/Library/LaunchAgents/com.steipete.clawdis.plist" +LOCK_KEY="$(printf '%s' "${ROOT_DIR}" | shasum -a 256 | cut -c1-8)" +LOCK_DIR="${TMPDIR:-/tmp}/clawdis-restart-${LOCK_KEY}" +LOCK_PID_FILE="${LOCK_DIR}/pid" +WAIT_FOR_LOCK=0 +LOG_PATH="${CLAWDIS_RESTART_LOG:-/tmp/clawdis-restart.log}" log() { printf '%s\n' "$*"; } fail() { printf 'ERROR: %s\n' "$*" >&2; exit 1; } @@ -25,6 +30,60 @@ run_step() { fi } +cleanup() { + if [[ -d "${LOCK_DIR}" ]]; then + rm -rf 
"${LOCK_DIR}" + fi +} + +acquire_lock() { + while true; do + if mkdir "${LOCK_DIR}" 2>/dev/null; then + echo "$$" > "${LOCK_PID_FILE}" + return 0 + fi + + local existing_pid="" + if [[ -f "${LOCK_PID_FILE}" ]]; then + existing_pid="$(cat "${LOCK_PID_FILE}" 2>/dev/null || true)" + fi + + if [[ -n "${existing_pid}" ]] && kill -0 "${existing_pid}" 2>/dev/null; then + if [[ "${WAIT_FOR_LOCK}" == "1" ]]; then + log "==> Another restart is running (pid ${existing_pid}); waiting..." + while kill -0 "${existing_pid}" 2>/dev/null; do + sleep 1 + done + continue + fi + log "==> Another restart is running (pid ${existing_pid}); re-run with --wait." + exit 0 + fi + + rm -rf "${LOCK_DIR}" + done +} + +trap cleanup EXIT INT TERM + +for arg in "$@"; do + case "${arg}" in + --wait|-w) WAIT_FOR_LOCK=1 ;; + --help|-h) + log "Usage: $(basename "$0") [--wait]" + exit 0 + ;; + *) ;; + esac +done + +mkdir -p "$(dirname "$LOG_PATH")" +rm -f "$LOG_PATH" +exec > >(tee "$LOG_PATH") 2>&1 +log "==> Log: ${LOG_PATH}" + +acquire_lock + kill_all_clawdis() { for _ in {1..10}; do pkill -f "${APP_PROCESS_PATTERN}" 2>/dev/null || true @@ -102,5 +161,5 @@ sleep 1.5 if pgrep -f "${APP_PROCESS_PATTERN}" >/dev/null 2>&1; then log "OK: Clawdis is running." else - fail "App exited immediately. Check /tmp/clawdis.log or Console.app (User Reports)." + fail "App exited immediately. Check ${LOG_PATH} or Console.app (User Reports)." 
fi From 653932e50dde71078a178bf3210b094ae8c1a2dc Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 18:35:52 +0100 Subject: [PATCH 005/100] fix: show connected nodes only --- CHANGELOG.md | 1 + apps/macos/Sources/Clawdis/MenuSessionsInjector.swift | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cf8b81e5..5d0f56cf0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - macOS: Voice Wake now fully tears down the Speech pipeline when disabled (cancel pending restarts, drop stale callbacks) to avoid high CPU in the background. - iOS/Android nodes: enable scrolling for loaded web pages in the Canvas WebView (default scaffold stays touch-first). - macOS menu: device list now uses `node.list` (devices only; no agent/tool presence entries). +- macOS menu: device list now shows connected nodes only. ## 2.0.0-beta4 — 2025-12-27 diff --git a/apps/macos/Sources/Clawdis/MenuSessionsInjector.swift b/apps/macos/Sources/Clawdis/MenuSessionsInjector.swift index fb066d303..f9a0c141f 100644 --- a/apps/macos/Sources/Clawdis/MenuSessionsInjector.swift +++ b/apps/macos/Sources/Clawdis/MenuSessionsInjector.swift @@ -683,7 +683,7 @@ final class MenuSessionsInjector: NSObject, NSMenuDelegate { } private func sortedNodeEntries() -> [NodeInfo] { - let entries = self.nodesStore.nodes + let entries = self.nodesStore.nodes.filter { $0.isConnected } return entries.sorted { lhs, rhs in if lhs.isConnected != rhs.isConnected { return lhs.isConnected } if lhs.isPaired != rhs.isPaired { return lhs.isPaired } From 41be9232fea19e6d6e5cd5aeb8bf634231812c6f Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 20:10:36 +0100 Subject: [PATCH 006/100] fix: prevent iOS screen capture crash --- CHANGELOG.md | 1 + .../Sources/Screen/ScreenRecordService.swift | 86 +++++++++++++------ 2 files changed, 59 insertions(+), 28 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d0f56cf0..224f37ab4 100644 
--- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - iOS/Android nodes: enable scrolling for loaded web pages in the Canvas WebView (default scaffold stays touch-first). - macOS menu: device list now uses `node.list` (devices only; no agent/tool presence entries). - macOS menu: device list now shows connected nodes only. +- iOS node: fix ReplayKit screen recording crash caused by queue isolation assertions during capture. ## 2.0.0-beta4 — 2025-12-27 diff --git a/apps/ios/Sources/Screen/ScreenRecordService.swift b/apps/ios/Sources/Screen/ScreenRecordService.swift index d1e575868..d9a0a4d26 100644 --- a/apps/ios/Sources/Screen/ScreenRecordService.swift +++ b/apps/ios/Sources/Screen/ScreenRecordService.swift @@ -1,7 +1,6 @@ import AVFoundation import ReplayKit -@MainActor final class ScreenRecordService { private struct UncheckedSendableBox: @unchecked Sendable { let value: T @@ -52,7 +51,9 @@ final class ScreenRecordService { try? FileManager.default.removeItem(at: outURL) let recorder = RPScreenRecorder.shared() - recorder.isMicrophoneEnabled = includeAudio + await MainActor.run { + recorder.isMicrophoneEnabled = includeAudio + } var writer: AVAssetWriter? var videoInput: AVAssetWriterInput? @@ -61,16 +62,23 @@ final class ScreenRecordService { var sawVideo = false var lastVideoTime: CMTime? var handlerError: Error? 
- let lock = NSLock() + let stateLock = NSLock() + + func withStateLock(_ body: () -> T) -> T { + stateLock.lock() + defer { stateLock.unlock() } + return body() + } func setHandlerError(_ error: Error) { - lock.lock() - defer { lock.unlock() } + withStateLock { if handlerError == nil { handlerError = error } + } } try await withCheckedThrowingContinuation { (cont: CheckedContinuation) in - recorder.startCapture(handler: { sample, type, error in + Task { @MainActor in + recorder.startCapture(handler: { sample, type, error in if let error { setHandlerError(error) return @@ -80,12 +88,16 @@ final class ScreenRecordService { switch type { case .video: let pts = CMSampleBufferGetPresentationTimeStamp(sample) - if let lastVideoTime { - let delta = CMTimeSubtract(pts, lastVideoTime) - if delta.seconds < (1.0 / fpsValue) { return } + let shouldSkip = withStateLock { + if let lastVideoTime { + let delta = CMTimeSubtract(pts, lastVideoTime) + return delta.seconds < (1.0 / fpsValue) + } + return false } + if shouldSkip { return } - if writer == nil { + if withStateLock({ writer == nil }) { guard let imageBuffer = CMSampleBufferGetImageBuffer(sample) else { setHandlerError(ScreenRecordError.captureFailed("Missing image buffer")) return @@ -111,7 +123,9 @@ final class ScreenRecordService { aInput.expectsMediaDataInRealTime = true if w.canAdd(aInput) { w.add(aInput) - audioInput = aInput + withStateLock { + audioInput = aInput + } } } @@ -120,29 +134,37 @@ final class ScreenRecordService { .writeFailed(w.error?.localizedDescription ?? 
"Failed to start writer") } w.startSession(atSourceTime: pts) - writer = w - videoInput = vInput - started = true + withStateLock { + writer = w + videoInput = vInput + started = true + } } catch { setHandlerError(error) return } } - guard let vInput = videoInput, started else { return } + let vInput = withStateLock { videoInput } + let isStarted = withStateLock { started } + guard let vInput, isStarted else { return } if vInput.isReadyForMoreMediaData { if vInput.append(sample) { - sawVideo = true - lastVideoTime = pts + withStateLock { + sawVideo = true + lastVideoTime = pts + } } else { - if let err = writer?.error { + if let err = withStateLock({ writer?.error }) { setHandlerError(ScreenRecordError.writeFailed(err.localizedDescription)) } } } case .audioApp, .audioMic: - guard includeAudio, let aInput = audioInput, started else { return } + let aInput = withStateLock { audioInput } + let isStarted = withStateLock { started } + guard includeAudio, let aInput, isStarted else { return } if aInput.isReadyForMoreMediaData { _ = aInput.append(sample) } @@ -150,27 +172,35 @@ final class ScreenRecordService { @unknown default: break } - }, completionHandler: { error in + }, completionHandler: { error in if let error { cont.resume(throwing: error) } else { cont.resume() } - }) + }) + } } try await Task.sleep(nanoseconds: UInt64(durationMs) * 1_000_000) - let stopError = await withCheckedContinuation { cont in - recorder.stopCapture { error in cont.resume(returning: error) } + let stopError = await MainActor.run { + await withCheckedContinuation { cont in + recorder.stopCapture { error in cont.resume(returning: error) } + } } if let stopError { throw stopError } - if let handlerError { throw handlerError } - guard let writer, let videoInput, sawVideo else { + let handlerErrorSnapshot = withStateLock { handlerError } + if let handlerErrorSnapshot { throw handlerErrorSnapshot } + let writerSnapshot = withStateLock { writer } + let videoInputSnapshot = withStateLock { 
videoInput } + let audioInputSnapshot = withStateLock { audioInput } + let sawVideoSnapshot = withStateLock { sawVideo } + guard let writerSnapshot, let videoInputSnapshot, sawVideoSnapshot else { throw ScreenRecordError.captureFailed("No frames captured") } - videoInput.markAsFinished() - audioInput?.markAsFinished() + videoInputSnapshot.markAsFinished() + audioInputSnapshot?.markAsFinished() - let writerBox = UncheckedSendableBox(value: writer) + let writerBox = UncheckedSendableBox(value: writerSnapshot) try await withCheckedThrowingContinuation { (cont: CheckedContinuation) in writerBox.value.finishWriting { let writer = writerBox.value From 65478a6ff371891949a914903553e8a8aac89326 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 20:20:14 +0100 Subject: [PATCH 007/100] fix: avoid main-actor stopCapture error --- .../Sources/Screen/ScreenRecordService.swift | 172 +++++++++--------- 1 file changed, 86 insertions(+), 86 deletions(-) diff --git a/apps/ios/Sources/Screen/ScreenRecordService.swift b/apps/ios/Sources/Screen/ScreenRecordService.swift index d9a0a4d26..3201770ee 100644 --- a/apps/ios/Sources/Screen/ScreenRecordService.swift +++ b/apps/ios/Sources/Screen/ScreenRecordService.swift @@ -72,116 +72,116 @@ final class ScreenRecordService { func setHandlerError(_ error: Error) { withStateLock { - if handlerError == nil { handlerError = error } + if handlerError == nil { handlerError = error } } } try await withCheckedThrowingContinuation { (cont: CheckedContinuation) in Task { @MainActor in recorder.startCapture(handler: { sample, type, error in - if let error { - setHandlerError(error) - return - } - guard CMSampleBufferDataIsReady(sample) else { return } - - switch type { - case .video: - let pts = CMSampleBufferGetPresentationTimeStamp(sample) - let shouldSkip = withStateLock { - if let lastVideoTime { - let delta = CMTimeSubtract(pts, lastVideoTime) - return delta.seconds < (1.0 / fpsValue) - } - return false + if let error { + 
setHandlerError(error) + return } - if shouldSkip { return } + guard CMSampleBufferDataIsReady(sample) else { return } - if withStateLock({ writer == nil }) { - guard let imageBuffer = CMSampleBufferGetImageBuffer(sample) else { - setHandlerError(ScreenRecordError.captureFailed("Missing image buffer")) - return - } - let width = CVPixelBufferGetWidth(imageBuffer) - let height = CVPixelBufferGetHeight(imageBuffer) - do { - let w = try AVAssetWriter(outputURL: outURL, fileType: .mp4) - let settings: [String: Any] = [ - AVVideoCodecKey: AVVideoCodecType.h264, - AVVideoWidthKey: width, - AVVideoHeightKey: height, - ] - let vInput = AVAssetWriterInput(mediaType: .video, outputSettings: settings) - vInput.expectsMediaDataInRealTime = true - guard w.canAdd(vInput) else { - throw ScreenRecordError.writeFailed("Cannot add video input") + switch type { + case .video: + let pts = CMSampleBufferGetPresentationTimeStamp(sample) + let shouldSkip = withStateLock { + if let lastVideoTime { + let delta = CMTimeSubtract(pts, lastVideoTime) + return delta.seconds < (1.0 / fpsValue) } - w.add(vInput) + return false + } + if shouldSkip { return } - if includeAudio { - let aInput = AVAssetWriterInput(mediaType: .audio, outputSettings: nil) - aInput.expectsMediaDataInRealTime = true - if w.canAdd(aInput) { - w.add(aInput) - withStateLock { - audioInput = aInput + if withStateLock({ writer == nil }) { + guard let imageBuffer = CMSampleBufferGetImageBuffer(sample) else { + setHandlerError(ScreenRecordError.captureFailed("Missing image buffer")) + return + } + let width = CVPixelBufferGetWidth(imageBuffer) + let height = CVPixelBufferGetHeight(imageBuffer) + do { + let w = try AVAssetWriter(outputURL: outURL, fileType: .mp4) + let settings: [String: Any] = [ + AVVideoCodecKey: AVVideoCodecType.h264, + AVVideoWidthKey: width, + AVVideoHeightKey: height, + ] + let vInput = AVAssetWriterInput(mediaType: .video, outputSettings: settings) + vInput.expectsMediaDataInRealTime = true + guard 
w.canAdd(vInput) else { + throw ScreenRecordError.writeFailed("Cannot add video input") + } + w.add(vInput) + + if includeAudio { + let aInput = AVAssetWriterInput(mediaType: .audio, outputSettings: nil) + aInput.expectsMediaDataInRealTime = true + if w.canAdd(aInput) { + w.add(aInput) + withStateLock { + audioInput = aInput + } } } - } - guard w.startWriting() else { - throw ScreenRecordError - .writeFailed(w.error?.localizedDescription ?? "Failed to start writer") - } - w.startSession(atSourceTime: pts) - withStateLock { - writer = w - videoInput = vInput - started = true - } - } catch { - setHandlerError(error) - return - } - } - - let vInput = withStateLock { videoInput } - let isStarted = withStateLock { started } - guard let vInput, isStarted else { return } - if vInput.isReadyForMoreMediaData { - if vInput.append(sample) { - withStateLock { - sawVideo = true - lastVideoTime = pts - } - } else { - if let err = withStateLock({ writer?.error }) { - setHandlerError(ScreenRecordError.writeFailed(err.localizedDescription)) + guard w.startWriting() else { + throw ScreenRecordError + .writeFailed(w.error?.localizedDescription ?? 
"Failed to start writer") + } + w.startSession(atSourceTime: pts) + withStateLock { + writer = w + videoInput = vInput + started = true + } + } catch { + setHandlerError(error) + return } } - } - case .audioApp, .audioMic: - let aInput = withStateLock { audioInput } - let isStarted = withStateLock { started } - guard includeAudio, let aInput, isStarted else { return } - if aInput.isReadyForMoreMediaData { - _ = aInput.append(sample) - } + let vInput = withStateLock { videoInput } + let isStarted = withStateLock { started } + guard let vInput, isStarted else { return } + if vInput.isReadyForMoreMediaData { + if vInput.append(sample) { + withStateLock { + sawVideo = true + lastVideoTime = pts + } + } else { + if let err = withStateLock({ writer?.error }) { + setHandlerError(ScreenRecordError.writeFailed(err.localizedDescription)) + } + } + } - @unknown default: - break - } + case .audioApp, .audioMic: + let aInput = withStateLock { audioInput } + let isStarted = withStateLock { started } + guard includeAudio, let aInput, isStarted else { return } + if aInput.isReadyForMoreMediaData { + _ = aInput.append(sample) + } + + @unknown default: + break + } }, completionHandler: { error in - if let error { cont.resume(throwing: error) } else { cont.resume() } + if let error { cont.resume(throwing: error) } else { cont.resume() } }) } } try await Task.sleep(nanoseconds: UInt64(durationMs) * 1_000_000) - let stopError = await MainActor.run { - await withCheckedContinuation { cont in + let stopError = await withCheckedContinuation { cont in + Task { @MainActor in recorder.stopCapture { error in cont.resume(returning: error) } } } From c14d738d37e9bf4fc286e88fabe421623e66f659 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 20:22:26 +0100 Subject: [PATCH 008/100] fix: avoid screen recorder data races --- .../Sources/Screen/ScreenRecordService.swift | 184 +++++++++--------- 1 file changed, 93 insertions(+), 91 deletions(-) diff --git 
a/apps/ios/Sources/Screen/ScreenRecordService.swift b/apps/ios/Sources/Screen/ScreenRecordService.swift index 3201770ee..79151488c 100644 --- a/apps/ios/Sources/Screen/ScreenRecordService.swift +++ b/apps/ios/Sources/Screen/ScreenRecordService.swift @@ -50,11 +50,6 @@ final class ScreenRecordService { }() try? FileManager.default.removeItem(at: outURL) - let recorder = RPScreenRecorder.shared() - await MainActor.run { - recorder.isMicrophoneEnabled = includeAudio - } - var writer: AVAssetWriter? var videoInput: AVAssetWriterInput? var audioInput: AVAssetWriterInput? @@ -77,104 +72,110 @@ final class ScreenRecordService { } try await withCheckedThrowingContinuation { (cont: CheckedContinuation) in - Task { @MainActor in - recorder.startCapture(handler: { sample, type, error in - if let error { - setHandlerError(error) - return - } - guard CMSampleBufferDataIsReady(sample) else { return } + let handler: @Sendable (CMSampleBuffer, RPSampleBufferType, Error?) -> Void = { sample, type, error in + if let error { + setHandlerError(error) + return + } + guard CMSampleBufferDataIsReady(sample) else { return } - switch type { - case .video: - let pts = CMSampleBufferGetPresentationTimeStamp(sample) - let shouldSkip = withStateLock { - if let lastVideoTime { - let delta = CMTimeSubtract(pts, lastVideoTime) - return delta.seconds < (1.0 / fpsValue) - } - return false + switch type { + case .video: + let pts = CMSampleBufferGetPresentationTimeStamp(sample) + let shouldSkip = withStateLock { + if let lastVideoTime { + let delta = CMTimeSubtract(pts, lastVideoTime) + return delta.seconds < (1.0 / fpsValue) } - if shouldSkip { return } + return false + } + if shouldSkip { return } - if withStateLock({ writer == nil }) { - guard let imageBuffer = CMSampleBufferGetImageBuffer(sample) else { - setHandlerError(ScreenRecordError.captureFailed("Missing image buffer")) - return + if withStateLock({ writer == nil }) { + guard let imageBuffer = CMSampleBufferGetImageBuffer(sample) else { + 
setHandlerError(ScreenRecordError.captureFailed("Missing image buffer")) + return + } + let width = CVPixelBufferGetWidth(imageBuffer) + let height = CVPixelBufferGetHeight(imageBuffer) + do { + let w = try AVAssetWriter(outputURL: outURL, fileType: .mp4) + let settings: [String: Any] = [ + AVVideoCodecKey: AVVideoCodecType.h264, + AVVideoWidthKey: width, + AVVideoHeightKey: height, + ] + let vInput = AVAssetWriterInput(mediaType: .video, outputSettings: settings) + vInput.expectsMediaDataInRealTime = true + guard w.canAdd(vInput) else { + throw ScreenRecordError.writeFailed("Cannot add video input") } - let width = CVPixelBufferGetWidth(imageBuffer) - let height = CVPixelBufferGetHeight(imageBuffer) - do { - let w = try AVAssetWriter(outputURL: outURL, fileType: .mp4) - let settings: [String: Any] = [ - AVVideoCodecKey: AVVideoCodecType.h264, - AVVideoWidthKey: width, - AVVideoHeightKey: height, - ] - let vInput = AVAssetWriterInput(mediaType: .video, outputSettings: settings) - vInput.expectsMediaDataInRealTime = true - guard w.canAdd(vInput) else { - throw ScreenRecordError.writeFailed("Cannot add video input") - } - w.add(vInput) + w.add(vInput) - if includeAudio { - let aInput = AVAssetWriterInput(mediaType: .audio, outputSettings: nil) - aInput.expectsMediaDataInRealTime = true - if w.canAdd(aInput) { - w.add(aInput) - withStateLock { - audioInput = aInput - } + if includeAudio { + let aInput = AVAssetWriterInput(mediaType: .audio, outputSettings: nil) + aInput.expectsMediaDataInRealTime = true + if w.canAdd(aInput) { + w.add(aInput) + withStateLock { + audioInput = aInput } } - - guard w.startWriting() else { - throw ScreenRecordError - .writeFailed(w.error?.localizedDescription ?? 
"Failed to start writer") - } - w.startSession(atSourceTime: pts) - withStateLock { - writer = w - videoInput = vInput - started = true - } - } catch { - setHandlerError(error) - return } - } - let vInput = withStateLock { videoInput } - let isStarted = withStateLock { started } - guard let vInput, isStarted else { return } - if vInput.isReadyForMoreMediaData { - if vInput.append(sample) { - withStateLock { - sawVideo = true - lastVideoTime = pts - } - } else { - if let err = withStateLock({ writer?.error }) { - setHandlerError(ScreenRecordError.writeFailed(err.localizedDescription)) - } + guard w.startWriting() else { + throw ScreenRecordError + .writeFailed(w.error?.localizedDescription ?? "Failed to start writer") } + w.startSession(atSourceTime: pts) + withStateLock { + writer = w + videoInput = vInput + started = true + } + } catch { + setHandlerError(error) + return } - - case .audioApp, .audioMic: - let aInput = withStateLock { audioInput } - let isStarted = withStateLock { started } - guard includeAudio, let aInput, isStarted else { return } - if aInput.isReadyForMoreMediaData { - _ = aInput.append(sample) - } - - @unknown default: - break } - }, completionHandler: { error in - if let error { cont.resume(throwing: error) } else { cont.resume() } - }) + + let vInput = withStateLock { videoInput } + let isStarted = withStateLock { started } + guard let vInput, isStarted else { return } + if vInput.isReadyForMoreMediaData { + if vInput.append(sample) { + withStateLock { + sawVideo = true + lastVideoTime = pts + } + } else { + if let err = withStateLock({ writer?.error }) { + setHandlerError(ScreenRecordError.writeFailed(err.localizedDescription)) + } + } + } + + case .audioApp, .audioMic: + let aInput = withStateLock { audioInput } + let isStarted = withStateLock { started } + guard includeAudio, let aInput, isStarted else { return } + if aInput.isReadyForMoreMediaData { + _ = aInput.append(sample) + } + + @unknown default: + break + } + } + + let completion: 
@Sendable (Error?) -> Void = { error in + if let error { cont.resume(throwing: error) } else { cont.resume() } + } + + Task { @MainActor in + let recorder = RPScreenRecorder.shared() + recorder.isMicrophoneEnabled = includeAudio + recorder.startCapture(handler: handler, completionHandler: completion) } } @@ -182,6 +183,7 @@ final class ScreenRecordService { let stopError = await withCheckedContinuation { cont in Task { @MainActor in + let recorder = RPScreenRecorder.shared() recorder.stopCapture { error in cont.resume(returning: error) } } } From 7a849ab7d114900a960c662f3a713ec58c2825b4 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 20:24:34 +0100 Subject: [PATCH 009/100] fix: isolate ReplayKit capture state --- .../Sources/Screen/ScreenRecordService.swift | 129 +++++++++++------- 1 file changed, 78 insertions(+), 51 deletions(-) diff --git a/apps/ios/Sources/Screen/ScreenRecordService.swift b/apps/ios/Sources/Screen/ScreenRecordService.swift index 79151488c..829f29cd7 100644 --- a/apps/ios/Sources/Screen/ScreenRecordService.swift +++ b/apps/ios/Sources/Screen/ScreenRecordService.swift @@ -6,6 +6,23 @@ final class ScreenRecordService { let value: T } + private final class CaptureState: @unchecked Sendable { + private let lock = NSLock() + var writer: AVAssetWriter? + var videoInput: AVAssetWriterInput? + var audioInput: AVAssetWriterInput? + var started = false + var sawVideo = false + var lastVideoTime: CMTime? + var handlerError: Error? + + func withLock(_ body: (CaptureState) -> T) -> T { + self.lock.lock() + defer { lock.unlock() } + return body(self) + } + } + enum ScreenRecordError: LocalizedError { case invalidScreenIndex(Int) case captureFailed(String) @@ -50,31 +67,14 @@ final class ScreenRecordService { }() try? FileManager.default.removeItem(at: outURL) - var writer: AVAssetWriter? - var videoInput: AVAssetWriterInput? - var audioInput: AVAssetWriterInput? - var started = false - var sawVideo = false - var lastVideoTime: CMTime? 
- var handlerError: Error? - let stateLock = NSLock() - - func withStateLock(_ body: () -> T) -> T { - stateLock.lock() - defer { stateLock.unlock() } - return body() - } - - func setHandlerError(_ error: Error) { - withStateLock { - if handlerError == nil { handlerError = error } - } - } + let state = CaptureState() try await withCheckedThrowingContinuation { (cont: CheckedContinuation) in let handler: @Sendable (CMSampleBuffer, RPSampleBufferType, Error?) -> Void = { sample, type, error in if let error { - setHandlerError(error) + state.withLock { state in + if state.handlerError == nil { state.handlerError = error } + } return } guard CMSampleBufferDataIsReady(sample) else { return } @@ -82,8 +82,8 @@ final class ScreenRecordService { switch type { case .video: let pts = CMSampleBufferGetPresentationTimeStamp(sample) - let shouldSkip = withStateLock { - if let lastVideoTime { + let shouldSkip = state.withLock { state in + if let lastVideoTime = state.lastVideoTime { let delta = CMTimeSubtract(pts, lastVideoTime) return delta.seconds < (1.0 / fpsValue) } @@ -91,9 +91,13 @@ final class ScreenRecordService { } if shouldSkip { return } - if withStateLock({ writer == nil }) { + if state.withLock({ $0.writer == nil }) { guard let imageBuffer = CMSampleBufferGetImageBuffer(sample) else { - setHandlerError(ScreenRecordError.captureFailed("Missing image buffer")) + state.withLock { state in + if state.handlerError == nil { + state.handlerError = ScreenRecordError.captureFailed("Missing image buffer") + } + } return } let width = CVPixelBufferGetWidth(imageBuffer) @@ -117,8 +121,8 @@ final class ScreenRecordService { aInput.expectsMediaDataInRealTime = true if w.canAdd(aInput) { w.add(aInput) - withStateLock { - audioInput = aInput + state.withLock { state in + state.audioInput = aInput } } } @@ -128,36 +132,43 @@ final class ScreenRecordService { .writeFailed(w.error?.localizedDescription ?? 
"Failed to start writer") } w.startSession(atSourceTime: pts) - withStateLock { - writer = w - videoInput = vInput - started = true + state.withLock { state in + state.writer = w + state.videoInput = vInput + state.started = true } } catch { - setHandlerError(error) + state.withLock { state in + if state.handlerError == nil { state.handlerError = error } + } return } } - let vInput = withStateLock { videoInput } - let isStarted = withStateLock { started } + let vInput = state.withLock { $0.videoInput } + let isStarted = state.withLock { $0.started } guard let vInput, isStarted else { return } if vInput.isReadyForMoreMediaData { if vInput.append(sample) { - withStateLock { - sawVideo = true - lastVideoTime = pts + state.withLock { state in + state.sawVideo = true + state.lastVideoTime = pts } } else { - if let err = withStateLock({ writer?.error }) { - setHandlerError(ScreenRecordError.writeFailed(err.localizedDescription)) + let err = state.withLock { $0.writer?.error } + if let err { + state.withLock { state in + if state.handlerError == nil { + state.handlerError = ScreenRecordError.writeFailed(err.localizedDescription) + } + } } } } case .audioApp, .audioMic: - let aInput = withStateLock { audioInput } - let isStarted = withStateLock { started } + let aInput = state.withLock { $0.audioInput } + let isStarted = state.withLock { $0.started } guard includeAudio, let aInput, isStarted else { return } if aInput.isReadyForMoreMediaData { _ = aInput.append(sample) @@ -173,9 +184,10 @@ final class ScreenRecordService { } Task { @MainActor in - let recorder = RPScreenRecorder.shared() - recorder.isMicrophoneEnabled = includeAudio - recorder.startCapture(handler: handler, completionHandler: completion) + self.startCapture( + includeAudio: includeAudio, + handler: handler, + completion: completion) } } @@ -183,18 +195,17 @@ final class ScreenRecordService { let stopError = await withCheckedContinuation { cont in Task { @MainActor in - let recorder = 
RPScreenRecorder.shared() - recorder.stopCapture { error in cont.resume(returning: error) } + self.stopCapture { error in cont.resume(returning: error) } } } if let stopError { throw stopError } - let handlerErrorSnapshot = withStateLock { handlerError } + let handlerErrorSnapshot = state.withLock { $0.handlerError } if let handlerErrorSnapshot { throw handlerErrorSnapshot } - let writerSnapshot = withStateLock { writer } - let videoInputSnapshot = withStateLock { videoInput } - let audioInputSnapshot = withStateLock { audioInput } - let sawVideoSnapshot = withStateLock { sawVideo } + let writerSnapshot = state.withLock { $0.writer } + let videoInputSnapshot = state.withLock { $0.videoInput } + let audioInputSnapshot = state.withLock { $0.audioInput } + let sawVideoSnapshot = state.withLock { $0.sawVideo } guard let writerSnapshot, let videoInputSnapshot, sawVideoSnapshot else { throw ScreenRecordError.captureFailed("No frames captured") } @@ -219,6 +230,22 @@ final class ScreenRecordService { return outURL.path } + @MainActor + private func startCapture( + includeAudio: Bool, + handler: @escaping (CMSampleBuffer, RPSampleBufferType, Error?) -> Void, + completion: @escaping (Error?) -> Void) + { + let recorder = RPScreenRecorder.shared() + recorder.isMicrophoneEnabled = includeAudio + recorder.startCapture(handler: handler, completionHandler: completion) + } + + @MainActor + private func stopCapture(_ completion: @escaping (Error?) -> Void) { + RPScreenRecorder.shared().stopCapture(completionHandler: completion) + } + private nonisolated static func clampDurationMs(_ ms: Int?) -> Int { let v = ms ?? 
10000 return min(60000, max(250, v)) From a8c9b2810b9e74319412a4e949d0300c2367499c Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 20:25:44 +0100 Subject: [PATCH 010/100] fix: align ReplayKit stopCapture call --- apps/ios/Sources/Screen/ScreenRecordService.swift | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/ios/Sources/Screen/ScreenRecordService.swift b/apps/ios/Sources/Screen/ScreenRecordService.swift index 829f29cd7..5cc1221b9 100644 --- a/apps/ios/Sources/Screen/ScreenRecordService.swift +++ b/apps/ios/Sources/Screen/ScreenRecordService.swift @@ -233,8 +233,8 @@ final class ScreenRecordService { @MainActor private func startCapture( includeAudio: Bool, - handler: @escaping (CMSampleBuffer, RPSampleBufferType, Error?) -> Void, - completion: @escaping (Error?) -> Void) + handler: @escaping @Sendable (CMSampleBuffer, RPSampleBufferType, Error?) -> Void, + completion: @escaping @Sendable (Error?) -> Void) { let recorder = RPScreenRecorder.shared() recorder.isMicrophoneEnabled = includeAudio @@ -242,8 +242,8 @@ final class ScreenRecordService { } @MainActor - private func stopCapture(_ completion: @escaping (Error?) -> Void) { - RPScreenRecorder.shared().stopCapture(completionHandler: completion) + private func stopCapture(_ completion: @escaping @Sendable (Error?) -> Void) { + RPScreenRecorder.shared().stopCapture { error in completion(error) } } private nonisolated static func clampDurationMs(_ ms: Int?) 
-> Int { From c11e2d9e5e402033b2d6007ed5383076b5b70275 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 20:26:49 +0100 Subject: [PATCH 011/100] fix: avoid self capture in ReplayKit start --- .../Sources/Screen/ScreenRecordService.swift | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/apps/ios/Sources/Screen/ScreenRecordService.swift b/apps/ios/Sources/Screen/ScreenRecordService.swift index 5cc1221b9..cada5bbbd 100644 --- a/apps/ios/Sources/Screen/ScreenRecordService.swift +++ b/apps/ios/Sources/Screen/ScreenRecordService.swift @@ -184,7 +184,7 @@ final class ScreenRecordService { } Task { @MainActor in - self.startCapture( + startReplayKitCapture( includeAudio: includeAudio, handler: handler, completion: completion) @@ -195,7 +195,7 @@ final class ScreenRecordService { let stopError = await withCheckedContinuation { cont in Task { @MainActor in - self.stopCapture { error in cont.resume(returning: error) } + stopReplayKitCapture { error in cont.resume(returning: error) } } } if let stopError { throw stopError } @@ -230,22 +230,6 @@ final class ScreenRecordService { return outURL.path } - @MainActor - private func startCapture( - includeAudio: Bool, - handler: @escaping @Sendable (CMSampleBuffer, RPSampleBufferType, Error?) -> Void, - completion: @escaping @Sendable (Error?) -> Void) - { - let recorder = RPScreenRecorder.shared() - recorder.isMicrophoneEnabled = includeAudio - recorder.startCapture(handler: handler, completionHandler: completion) - } - - @MainActor - private func stopCapture(_ completion: @escaping @Sendable (Error?) -> Void) { - RPScreenRecorder.shared().stopCapture { error in completion(error) } - } - private nonisolated static func clampDurationMs(_ ms: Int?) -> Int { let v = ms ?? 
10000 return min(60000, max(250, v)) @@ -258,6 +242,22 @@ final class ScreenRecordService { } } +@MainActor +private func startReplayKitCapture( + includeAudio: Bool, + handler: @escaping @Sendable (CMSampleBuffer, RPSampleBufferType, Error?) -> Void, + completion: @escaping @Sendable (Error?) -> Void) +{ + let recorder = RPScreenRecorder.shared() + recorder.isMicrophoneEnabled = includeAudio + recorder.startCapture(handler: handler, completionHandler: completion) +} + +@MainActor +private func stopReplayKitCapture(_ completion: @escaping @Sendable (Error?) -> Void) { + RPScreenRecorder.shared().stopCapture { error in completion(error) } +} + #if DEBUG extension ScreenRecordService { nonisolated static func _test_clampDurationMs(_ ms: Int?) -> Int { From 24151a2028f31a54aef89ab4c948bcab6d0ba714 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 20:28:06 +0100 Subject: [PATCH 012/100] fix: mark screen recorder sendable --- apps/ios/Sources/Screen/ScreenRecordService.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/ios/Sources/Screen/ScreenRecordService.swift b/apps/ios/Sources/Screen/ScreenRecordService.swift index cada5bbbd..861704310 100644 --- a/apps/ios/Sources/Screen/ScreenRecordService.swift +++ b/apps/ios/Sources/Screen/ScreenRecordService.swift @@ -1,7 +1,7 @@ import AVFoundation import ReplayKit -final class ScreenRecordService { +final class ScreenRecordService: @unchecked Sendable { private struct UncheckedSendableBox: @unchecked Sendable { let value: T } From 52263bd5a3b754181344a7907bb098e28e83750d Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 20:45:50 +0100 Subject: [PATCH 013/100] fix: avoid cli gateway close race --- CHANGELOG.md | 1 + src/gateway/call.ts | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 224f37ab4..10460860c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ - macOS menu: device list now uses 
`node.list` (devices only; no agent/tool presence entries). - macOS menu: device list now shows connected nodes only. - iOS node: fix ReplayKit screen recording crash caused by queue isolation assertions during capture. +- CLI: avoid spurious gateway close errors after successful request/response cycles. ## 2.0.0-beta4 — 2025-12-27 diff --git a/src/gateway/call.ts b/src/gateway/call.ts index 3ebdcd0dd..c2b14623b 100644 --- a/src/gateway/call.ts +++ b/src/gateway/call.ts @@ -25,6 +25,7 @@ export async function callGateway( const timeoutMs = opts.timeoutMs ?? 10_000; return await new Promise((resolve, reject) => { let settled = false; + let ignoreClose = false; const stop = (err?: Error, value?: T) => { if (settled) return; settled = true; @@ -49,19 +50,23 @@ export async function callGateway( const result = await client.request(opts.method, opts.params, { expectFinal: opts.expectFinal, }); - client.stop(); + ignoreClose = true; stop(undefined, result); + client.stop(); } catch (err) { + ignoreClose = true; client.stop(); stop(err as Error); } }, onClose: (code, reason) => { + if (settled || ignoreClose) return; stop(new Error(`gateway closed (${code}): ${reason}`)); }, }); const timer = setTimeout(() => { + ignoreClose = true; client.stop(); stop(new Error("gateway timeout")); }, timeoutMs); From cf42fabfd86294547e21b004410e2f43abb8697d Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 21:10:44 +0100 Subject: [PATCH 014/100] test: add ios swift testing + android kotest --- CHANGELOG.md | 1 + apps/android/app/build.gradle.kts | 7 +++++++ .../node/bridge/BridgeEndpointKotestTest.kt | 14 ++++++++++++++ apps/ios/Tests/CameraControllerErrorTests.swift | 13 +++++++++++++ 4 files changed, 35 insertions(+) create mode 100644 apps/android/app/src/test/java/com/steipete/clawdis/node/bridge/BridgeEndpointKotestTest.kt create mode 100644 apps/ios/Tests/CameraControllerErrorTests.swift diff --git a/CHANGELOG.md b/CHANGELOG.md index 10460860c..31ef2cad9 
100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ - macOS menu: device list now shows connected nodes only. - iOS node: fix ReplayKit screen recording crash caused by queue isolation assertions during capture. - CLI: avoid spurious gateway close errors after successful request/response cycles. +- Tests: add Swift Testing coverage for camera errors and Kotest coverage for Android bridge endpoints. ## 2.0.0-beta4 — 2025-12-27 diff --git a/apps/android/app/build.gradle.kts b/apps/android/app/build.gradle.kts index c2f1cd817..1b353d83f 100644 --- a/apps/android/app/build.gradle.kts +++ b/apps/android/app/build.gradle.kts @@ -93,4 +93,11 @@ dependencies { testImplementation("junit:junit:4.13.2") testImplementation("org.jetbrains.kotlinx:kotlinx-coroutines-test:1.10.2") + testImplementation("io.kotest:kotest-runner-junit5-jvm:6.0.7") + testImplementation("io.kotest:kotest-assertions-core-jvm:6.0.7") + testRuntimeOnly("org.junit.vintage:junit-vintage-engine:5.13.3") +} + +tasks.withType().configureEach { + useJUnitPlatform() } diff --git a/apps/android/app/src/test/java/com/steipete/clawdis/node/bridge/BridgeEndpointKotestTest.kt b/apps/android/app/src/test/java/com/steipete/clawdis/node/bridge/BridgeEndpointKotestTest.kt new file mode 100644 index 000000000..5e1b09490 --- /dev/null +++ b/apps/android/app/src/test/java/com/steipete/clawdis/node/bridge/BridgeEndpointKotestTest.kt @@ -0,0 +1,14 @@ +package com.steipete.clawdis.node.bridge + +import io.kotest.core.spec.style.StringSpec +import io.kotest.matchers.shouldBe + +class BridgeEndpointKotestTest : StringSpec({ + "manual endpoint builds stable id + name" { + val endpoint = BridgeEndpoint.manual("10.0.0.5", 18790) + endpoint.stableId shouldBe "manual|10.0.0.5|18790" + endpoint.name shouldBe "10.0.0.5:18790" + endpoint.host shouldBe "10.0.0.5" + endpoint.port shouldBe 18790 + } +}) diff --git a/apps/ios/Tests/CameraControllerErrorTests.swift b/apps/ios/Tests/CameraControllerErrorTests.swift new file mode 
100644 index 000000000..3b3c94281 --- /dev/null +++ b/apps/ios/Tests/CameraControllerErrorTests.swift @@ -0,0 +1,13 @@ +import Testing +@testable import Clawdis + +@Suite struct CameraControllerErrorTests { + @Test func errorDescriptionsAreStable() { + #expect(CameraController.CameraError.cameraUnavailable.errorDescription == "Camera unavailable") + #expect(CameraController.CameraError.microphoneUnavailable.errorDescription == "Microphone unavailable") + #expect(CameraController.CameraError.permissionDenied(kind: "Camera").errorDescription == "Camera permission denied") + #expect(CameraController.CameraError.invalidParams("bad").errorDescription == "bad") + #expect(CameraController.CameraError.captureFailed("nope").errorDescription == "nope") + #expect(CameraController.CameraError.exportFailed("export").errorDescription == "export") + } +} From b0396e196f46f6b6ed8259bd2fa92edc6ac0da62 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 22:11:12 +0100 Subject: [PATCH 015/100] fix: refresh bridge tokens and enrich node settings --- CHANGELOG.md | 1 + .../clawdis/node/bridge/BridgeDiscovery.kt | 68 ++++++++++++++---- .../clawdis/node/bridge/BridgeEndpoint.kt | 6 +- .../steipete/clawdis/node/ui/SettingsSheet.kt | 40 ++++++++++- .../Bridge/BridgeConnectionController.swift | 69 +++++++++++++++++-- .../Sources/Bridge/BridgeDiscoveryModel.swift | 27 +++++++- apps/ios/Sources/Settings/SettingsTab.swift | 31 +++++++++ .../BridgeConnectionControllerTests.swift | 68 ++++++++++++++++++ 8 files changed, 286 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 31ef2cad9..4edcae2c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ - macOS menu: device list now uses `node.list` (devices only; no agent/tool presence entries). - macOS menu: device list now shows connected nodes only. - iOS node: fix ReplayKit screen recording crash caused by queue isolation assertions during capture. 
+- iOS/Android nodes: bridge auto-connect refreshes stale tokens and settings now show richer bridge/device details. - CLI: avoid spurious gateway close errors after successful request/response cycles. - Tests: add Swift Testing coverage for camera errors and Kotest coverage for Android bridge endpoints. diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/bridge/BridgeDiscovery.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/bridge/BridgeDiscovery.kt index 17e9120c1..b33261ccb 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/bridge/BridgeDiscovery.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/bridge/BridgeDiscovery.kt @@ -130,20 +130,36 @@ class BridgeDiscovery( object : NsdManager.ResolveListener { override fun onResolveFailed(serviceInfo: NsdServiceInfo, errorCode: Int) {} - override fun onServiceResolved(resolved: NsdServiceInfo) { - val host = resolved.host?.hostAddress ?: return - val port = resolved.port - if (port <= 0) return + override fun onServiceResolved(resolved: NsdServiceInfo) { + val host = resolved.host?.hostAddress ?: return + val port = resolved.port + if (port <= 0) return - val rawServiceName = resolved.serviceName - val serviceName = BonjourEscapes.decode(rawServiceName) - val displayName = BonjourEscapes.decode(txt(resolved, "displayName") ?: serviceName) - val id = stableId(serviceName, "local.") - localById[id] = BridgeEndpoint(stableId = id, name = displayName, host = host, port = port) - publish() - } - }, - ) + val rawServiceName = resolved.serviceName + val serviceName = BonjourEscapes.decode(rawServiceName) + val displayName = BonjourEscapes.decode(txt(resolved, "displayName") ?: serviceName) + val lanHost = txt(resolved, "lanHost") + val tailnetDns = txt(resolved, "tailnetDns") + val gatewayPort = txtInt(resolved, "gatewayPort") + val bridgePort = txtInt(resolved, "bridgePort") + val canvasPort = txtInt(resolved, "canvasPort") + val id = stableId(serviceName, 
"local.") + localById[id] = + BridgeEndpoint( + stableId = id, + name = displayName, + host = host, + port = port, + lanHost = lanHost, + tailnetDns = tailnetDns, + gatewayPort = gatewayPort, + bridgePort = bridgePort, + canvasPort = canvasPort, + ) + publish() + } + }, + ) } private fun publish() { @@ -189,6 +205,10 @@ class BridgeDiscovery( } } + private fun txtInt(info: NsdServiceInfo, key: String): Int? { + return txt(info, key)?.toIntOrNull() + } + private suspend fun refreshUnicast(domain: String) { val ptrName = "${serviceType}${domain}" val ptrMsg = lookupUnicastMessage(ptrName, Type.PTR) ?: return @@ -227,8 +247,24 @@ class BridgeDiscovery( } val instanceName = BonjourEscapes.decode(decodeInstanceName(instanceFqdn, domain)) val displayName = BonjourEscapes.decode(txtValue(txt, "displayName") ?: instanceName) + val lanHost = txtValue(txt, "lanHost") + val tailnetDns = txtValue(txt, "tailnetDns") + val gatewayPort = txtIntValue(txt, "gatewayPort") + val bridgePort = txtIntValue(txt, "bridgePort") + val canvasPort = txtIntValue(txt, "canvasPort") val id = stableId(instanceName, domain) - next[id] = BridgeEndpoint(stableId = id, name = displayName, host = host, port = port) + next[id] = + BridgeEndpoint( + stableId = id, + name = displayName, + host = host, + port = port, + lanHost = lanHost, + tailnetDns = tailnetDns, + gatewayPort = gatewayPort, + bridgePort = bridgePort, + canvasPort = canvasPort, + ) } unicastById.clear() @@ -434,6 +470,10 @@ class BridgeDiscovery( return null } + private fun txtIntValue(records: List, key: String): Int? { + return txtValue(records, key)?.toIntOrNull() + } + private fun decodeDnsTxtString(raw: String): String { // dnsjava treats TXT as opaque bytes and decodes as ISO-8859-1 to preserve bytes. // Our TXT payload is UTF-8 (written by the gateway), so re-decode when possible. 
diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/bridge/BridgeEndpoint.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/bridge/BridgeEndpoint.kt index bd359e470..41c415c4b 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/bridge/BridgeEndpoint.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/bridge/BridgeEndpoint.kt @@ -5,6 +5,11 @@ data class BridgeEndpoint( val name: String, val host: String, val port: Int, + val lanHost: String? = null, + val tailnetDns: String? = null, + val gatewayPort: Int? = null, + val bridgePort: Int? = null, + val canvasPort: Int? = null, ) { companion object { fun manual(host: String, port: Int): BridgeEndpoint = @@ -16,4 +21,3 @@ data class BridgeEndpoint( ) } } - diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/SettingsSheet.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/SettingsSheet.kt index 038ef9faf..c7d011892 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/SettingsSheet.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/SettingsSheet.kt @@ -2,6 +2,7 @@ package com.steipete.clawdis.node.ui import android.Manifest import android.content.pm.PackageManager +import android.os.Build import androidx.activity.compose.rememberLauncherForActivityResult import androidx.activity.result.contract.ActivityResultContracts import androidx.compose.animation.AnimatedVisibility @@ -46,6 +47,7 @@ import androidx.compose.ui.platform.LocalContext import androidx.compose.ui.text.style.TextAlign import androidx.compose.ui.unit.dp import androidx.core.content.ContextCompat +import com.steipete.clawdis.node.BuildConfig import com.steipete.clawdis.node.MainViewModel import com.steipete.clawdis.node.NodeForegroundService import com.steipete.clawdis.node.VoiceWakeMode @@ -74,6 +76,22 @@ fun SettingsSheet(viewModel: MainViewModel) { val listState = rememberLazyListState() val (wakeWordsText, 
setWakeWordsText) = remember { mutableStateOf("") } val (advancedExpanded, setAdvancedExpanded) = remember { mutableStateOf(false) } + val deviceModel = + remember { + listOfNotNull(Build.MANUFACTURER, Build.MODEL) + .joinToString(" ") + .trim() + .ifEmpty { "Android" } + } + val appVersion = + remember { + val versionName = BuildConfig.VERSION_NAME.trim().ifEmpty { "dev" } + if (BuildConfig.DEBUG && !versionName.contains("dev", ignoreCase = true)) { + "$versionName-dev" + } else { + versionName + } + } LaunchedEffect(wakeWords) { setWakeWordsText(wakeWords.joinToString(", ")) } @@ -142,6 +160,8 @@ fun SettingsSheet(viewModel: MainViewModel) { ) } item { Text("Instance ID: $instanceId", color = MaterialTheme.colorScheme.onSurfaceVariant) } + item { Text("Device: $deviceModel", color = MaterialTheme.colorScheme.onSurfaceVariant) } + item { Text("Version: $appVersion", color = MaterialTheme.colorScheme.onSurfaceVariant) } item { HorizontalDivider() } @@ -181,9 +201,27 @@ fun SettingsSheet(viewModel: MainViewModel) { item { Text("No bridges found yet.", color = MaterialTheme.colorScheme.onSurfaceVariant) } } else { items(items = visibleBridges, key = { it.stableId }) { bridge -> + val detailLines = + buildList { + add("IP: ${bridge.host}:${bridge.port}") + bridge.lanHost?.let { add("LAN: $it") } + bridge.tailnetDns?.let { add("Tailnet: $it") } + if (bridge.gatewayPort != null || bridge.bridgePort != null || bridge.canvasPort != null) { + val gw = bridge.gatewayPort?.toString() ?: "—" + val br = (bridge.bridgePort ?: bridge.port).toString() + val canvas = bridge.canvasPort?.toString() ?: "—" + add("Ports: gw $gw · bridge $br · canvas $canvas") + } + } ListItem( headlineContent = { Text(bridge.name) }, - supportingContent = { Text("${bridge.host}:${bridge.port}") }, + supportingContent = { + Column(verticalArrangement = Arrangement.spacedBy(2.dp)) { + detailLines.forEach { line -> + Text(line, color = MaterialTheme.colorScheme.onSurfaceVariant) + } + } + }, 
trailingContent = { Button( onClick = { diff --git a/apps/ios/Sources/Bridge/BridgeConnectionController.swift b/apps/ios/Sources/Bridge/BridgeConnectionController.swift index 162e13858..256417319 100644 --- a/apps/ios/Sources/Bridge/BridgeConnectionController.swift +++ b/apps/ios/Sources/Bridge/BridgeConnectionController.swift @@ -6,6 +6,15 @@ import Observation import SwiftUI import UIKit +protocol BridgePairingClient: Sendable { + func pairAndHello( + endpoint: NWEndpoint, + hello: BridgeHello, + onStatus: (@Sendable (String) -> Void)?) async throws -> String +} + +extension BridgeClient: BridgePairingClient {} + @MainActor @Observable final class BridgeConnectionController { @@ -18,8 +27,15 @@ final class BridgeConnectionController { private var didAutoConnect = false private var seenStableIDs = Set() - init(appModel: NodeAppModel, startDiscovery: Bool = true) { + private let bridgeClientFactory: @Sendable () -> any BridgePairingClient + + init( + appModel: NodeAppModel, + startDiscovery: Bool = true, + bridgeClientFactory: @escaping @Sendable () -> any BridgePairingClient = { BridgeClient() }) + { self.appModel = appModel + self.bridgeClientFactory = bridgeClientFactory BridgeSettingsStore.bootstrapPersistence() let defaults = UserDefaults.standard @@ -85,7 +101,7 @@ final class BridgeConnectionController { let token = KeychainStore.loadString( service: "com.steipete.clawdis.bridge", - account: "bridge-token.\(instanceId)")? + account: self.keychainAccount(instanceId: instanceId))? .trimmingCharacters(in: .whitespacesAndNewlines) ?? 
"" guard !token.isEmpty else { return } @@ -99,9 +115,8 @@ final class BridgeConnectionController { guard let port = NWEndpoint.Port(rawValue: UInt16(resolvedPort)) else { return } self.didAutoConnect = true - appModel.connectToBridge( - endpoint: .hostPort(host: NWEndpoint.Host(manualHost), port: port), - hello: self.makeHello(token: token)) + let endpoint = NWEndpoint.hostPort(host: NWEndpoint.Host(manualHost), port: port) + self.startAutoConnect(endpoint: endpoint, token: token, instanceId: instanceId) return } @@ -112,7 +127,7 @@ final class BridgeConnectionController { guard let target = self.bridges.first(where: { $0.stableID == targetStableID }) else { return } self.didAutoConnect = true - appModel.connectToBridge(endpoint: target.endpoint, hello: self.makeHello(token: token)) + self.startAutoConnect(endpoint: target.endpoint, token: token, instanceId: instanceId) } private func updateLastDiscoveredBridge(from bridges: [BridgeDiscoveryModel.DiscoveredBridge]) { @@ -140,6 +155,40 @@ final class BridgeConnectionController { commands: self.currentCommands()) } + private func keychainAccount(instanceId: String) -> String { + "bridge-token.\(instanceId)" + } + + private func startAutoConnect(endpoint: NWEndpoint, token: String, instanceId: String) { + guard let appModel else { return } + Task { [weak self] in + guard let self else { return } + do { + let hello = self.makeHello(token: token) + let refreshed = try await self.bridgeClientFactory().pairAndHello( + endpoint: endpoint, + hello: hello, + onStatus: { status in + Task { @MainActor in + appModel.bridgeStatusText = status + } + }) + let resolvedToken = refreshed.isEmpty ? 
token : refreshed + if !refreshed.isEmpty, refreshed != token { + _ = KeychainStore.saveString( + refreshed, + service: "com.steipete.clawdis.bridge", + account: self.keychainAccount(instanceId: instanceId)) + } + appModel.connectToBridge(endpoint: endpoint, hello: self.makeHello(token: resolvedToken)) + } catch { + await MainActor.run { + appModel.bridgeStatusText = "Bridge error: \(error.localizedDescription)" + } + } + } + } + private func resolvedDisplayName(defaults: UserDefaults) -> String { let key = "node.displayName" let existing = defaults.string(forKey: key)?.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" @@ -265,5 +314,13 @@ extension BridgeConnectionController { func _test_appVersion() -> String { self.appVersion() } + + func _test_setBridges(_ bridges: [BridgeDiscoveryModel.DiscoveredBridge]) { + self.bridges = bridges + } + + func _test_triggerAutoConnect() { + self.maybeAutoConnect() + } } #endif diff --git a/apps/ios/Sources/Bridge/BridgeDiscoveryModel.swift b/apps/ios/Sources/Bridge/BridgeDiscoveryModel.swift index 2555de680..45df2a887 100644 --- a/apps/ios/Sources/Bridge/BridgeDiscoveryModel.swift +++ b/apps/ios/Sources/Bridge/BridgeDiscoveryModel.swift @@ -18,6 +18,12 @@ final class BridgeDiscoveryModel { var endpoint: NWEndpoint var stableID: String var debugID: String + var lanHost: String? + var tailnetDns: String? + var gatewayPort: Int? + var bridgePort: Int? + var canvasPort: Int? + var cliPath: String? } var bridges: [DiscoveredBridge] = [] @@ -68,7 +74,8 @@ final class BridgeDiscoveryModel { switch result.endpoint { case let .service(name, _, _, _): let decodedName = BonjourEscapes.decode(name) - let advertisedName = result.endpoint.txtRecord?.dictionary["displayName"] + let txt = result.endpoint.txtRecord?.dictionary ?? [:] + let advertisedName = txt["displayName"] let prettyAdvertised = advertisedName .map(Self.prettifyInstanceName) .flatMap { $0.isEmpty ? 
nil : $0 } @@ -77,7 +84,13 @@ final class BridgeDiscoveryModel { name: prettyName, endpoint: result.endpoint, stableID: BridgeEndpointID.stableID(result.endpoint), - debugID: BridgeEndpointID.prettyDescription(result.endpoint)) + debugID: BridgeEndpointID.prettyDescription(result.endpoint), + lanHost: Self.txtValue(txt, key: "lanHost"), + tailnetDns: Self.txtValue(txt, key: "tailnetDns"), + gatewayPort: Self.txtIntValue(txt, key: "gatewayPort"), + bridgePort: Self.txtIntValue(txt, key: "bridgePort"), + canvasPort: Self.txtIntValue(txt, key: "canvasPort"), + cliPath: Self.txtValue(txt, key: "cliPath")) default: return nil } @@ -191,4 +204,14 @@ final class BridgeDiscoveryModel { .replacingOccurrences(of: #"\s+\(\d+\)$"#, with: "", options: .regularExpression) return stripped.trimmingCharacters(in: .whitespacesAndNewlines) } + + private static func txtValue(_ dict: [String: String], key: String) -> String? { + let raw = dict[key]?.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" + return raw.isEmpty ? nil : raw + } + + private static func txtIntValue(_ dict: [String: String], key: String) -> Int? 
{ + guard let raw = self.txtValue(dict, key: key) else { return nil } + return Int(raw) + } } diff --git a/apps/ios/Sources/Settings/SettingsTab.swift b/apps/ios/Sources/Settings/SettingsTab.swift index 48b5e0aac..34feee23a 100644 --- a/apps/ios/Sources/Settings/SettingsTab.swift +++ b/apps/ios/Sources/Settings/SettingsTab.swift @@ -51,6 +51,9 @@ struct SettingsTab: View { } } } + LabeledContent("Platform", value: self.platformString()) + LabeledContent("Version", value: self.appVersion()) + LabeledContent("Model", value: self.modelIdentifier()) } Section("Bridge") { @@ -227,6 +230,12 @@ struct SettingsTab: View { HStack { VStack(alignment: .leading, spacing: 2) { Text(bridge.name) + let detailLines = self.bridgeDetailLines(bridge) + ForEach(detailLines, id: \.self) { line in + Text(line) + .font(.footnote) + .foregroundStyle(.secondary) + } } Spacer() @@ -504,4 +513,26 @@ struct SettingsTab: View { private static func httpURLString(host: String?, port: Int?, fallback: String) -> String { SettingsNetworkingHelpers.httpURLString(host: host, port: port, fallback: fallback) } + + private func bridgeDetailLines(_ bridge: BridgeDiscoveryModel.DiscoveredBridge) -> [String] { + var lines: [String] = [] + if let lanHost = bridge.lanHost { lines.append("LAN: \(lanHost)") } + if let tailnet = bridge.tailnetDns { lines.append("Tailnet: \(tailnet)") } + + let gatewayPort = bridge.gatewayPort + let bridgePort = bridge.bridgePort + let canvasPort = bridge.canvasPort + if gatewayPort != nil || bridgePort != nil || canvasPort != nil { + let gw = gatewayPort.map(String.init) ?? "—" + let br = bridgePort.map(String.init) ?? "—" + let canvas = canvasPort.map(String.init) ?? 
"—" + lines.append("Ports: gw \(gw) · bridge \(br) · canvas \(canvas)") + } + + if lines.isEmpty { + lines.append(bridge.debugID) + } + + return lines + } } diff --git a/apps/ios/Tests/BridgeConnectionControllerTests.swift b/apps/ios/Tests/BridgeConnectionControllerTests.swift index 4ff359616..51e22ec5d 100644 --- a/apps/ios/Tests/BridgeConnectionControllerTests.swift +++ b/apps/ios/Tests/BridgeConnectionControllerTests.swift @@ -1,5 +1,6 @@ import ClawdisKit import Foundation +import Network import Testing import UIKit @testable import Clawdis @@ -15,6 +16,25 @@ private let instanceIdEntry = KeychainEntry(service: nodeService, account: "inst private let preferredBridgeEntry = KeychainEntry(service: bridgeService, account: "preferredStableID") private let lastBridgeEntry = KeychainEntry(service: bridgeService, account: "lastDiscoveredStableID") +private actor MockBridgePairingClient: BridgePairingClient { + private(set) var lastToken: String? + private let resultToken: String + + init(resultToken: String) { + self.resultToken = resultToken + } + + func pairAndHello( + endpoint: NWEndpoint, + hello: BridgeHello, + onStatus: (@Sendable (String) -> Void)?) async throws -> String + { + self.lastToken = hello.token + onStatus?("Testing…") + return self.resultToken + } +} + private func withUserDefaults(_ updates: [String: Any?], _ body: () throws -> T) rethrows -> T { let defaults = UserDefaults.standard var snapshot: [String: Any?] 
= [:] @@ -156,4 +176,52 @@ private func withKeychainValues(_ updates: [KeychainEntry: String?], _ body: } } } + + @Test @MainActor func autoConnectRefreshesTokenOnUnauthorized() async { + let bridge = BridgeDiscoveryModel.DiscoveredBridge( + name: "Gateway", + endpoint: .hostPort(host: NWEndpoint.Host("127.0.0.1"), port: 18790), + stableID: "bridge-1", + debugID: "bridge-debug", + lanHost: "Mac.local", + tailnetDns: nil, + gatewayPort: 18789, + bridgePort: 18790, + canvasPort: 18793, + cliPath: nil) + let mock = MockBridgePairingClient(resultToken: "new-token") + let account = "bridge-token.ios-test" + + withKeychainValues([ + instanceIdEntry: nil, + preferredBridgeEntry: nil, + lastBridgeEntry: nil, + KeychainEntry(service: bridgeService, account: account): "old-token", + ]) { + withUserDefaults([ + "node.instanceId": "ios-test", + "bridge.lastDiscoveredStableID": "bridge-1", + "bridge.manual.enabled": false, + ]) { + let appModel = NodeAppModel() + let controller = BridgeConnectionController( + appModel: appModel, + startDiscovery: false, + bridgeClientFactory: { mock }) + controller._test_setBridges([bridge]) + controller._test_triggerAutoConnect() + + for _ in 0..<20 { + if appModel.connectedBridgeID == bridge.stableID { break } + try? 
await Task.sleep(nanoseconds: 50_000_000) + } + + #expect(appModel.connectedBridgeID == bridge.stableID) + let stored = KeychainStore.loadString(service: bridgeService, account: account) + #expect(stored == "new-token") + let lastToken = await mock.lastToken + #expect(lastToken == "old-token") + } + } + } } From f41ade9417e3db606bec5726635b50d8472015d5 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 22:51:42 +0100 Subject: [PATCH 016/100] feat(skills): add obsidian skill --- skills/obsidian/SKILL.md | 55 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 skills/obsidian/SKILL.md diff --git a/skills/obsidian/SKILL.md b/skills/obsidian/SKILL.md new file mode 100644 index 000000000..d5026be7b --- /dev/null +++ b/skills/obsidian/SKILL.md @@ -0,0 +1,55 @@ +--- +name: obsidian +description: Work with Obsidian vaults (plain Markdown notes) and automate via obsidian-cli. +homepage: https://help.obsidian.md +metadata: {"clawdis":{"emoji":"💎","requires":{"bins":["obsidian-cli"]},"install":[{"id":"brew","kind":"brew","formula":"yakitrak/yakitrak/obsidian-cli","bins":["obsidian-cli"],"label":"Install obsidian-cli (brew)"}]}} +--- + +# Obsidian + +Obsidian vault = a normal folder on disk. + +Vault structure (typical) +- Notes: `*.md` (plain text Markdown; edit with any editor) +- Config: `.obsidian/` (workspace + plugin settings; usually don’t touch from scripts) +- Canvases: `*.canvas` (JSON) +- Attachments: whatever folder you chose in Obsidian settings (images/PDFs/etc.) + +## Find the active vault(s) + +Obsidian desktop tracks vaults here (source of truth): +- `~/Library/Application Support/obsidian/obsidian.json` + +`obsidian-cli` resolves vaults from that file; vault name is typically the **folder name** (path suffix). 
+ +Fast “what vault is active / where are the notes?” +- If you’ve already set a default: `obsidian-cli print-default --path-only` +- Otherwise, read `~/Library/Application Support/obsidian/obsidian.json` and use the vault entry with `"open": true`. + +Notes +- Multiple vaults common (iCloud vs `~/Documents`, work/personal, etc.). Don’t guess; read config. +- Avoid writing hardcoded vault paths into scripts; prefer reading the config or using `print-default`. + +## obsidian-cli quick start + +Pick a default vault (once): +- `obsidian-cli set-default ""` +- `obsidian-cli print-default` / `obsidian-cli print-default --path-only` + +Search +- `obsidian-cli search "query"` (note names) +- `obsidian-cli search-content "query"` (inside notes; shows snippets + lines) + +Create +- `obsidian-cli create "Folder/New note" --content "..." --open` +- Requires Obsidian URI handler (`obsidian://…`) working (Obsidian installed). +- Avoid creating notes under “hidden” dot-folders (e.g. `.something/...`) via URI; Obsidian may refuse. + +Move/rename (safe refactor) +- `obsidian-cli move "old/path/note" "new/path/note"` +- Updates `[[wikilinks]]` and common Markdown links across the vault (this is the main win vs `mv`). + +Delete +- `obsidian-cli delete "path/note"` + +Prefer direct edits when appropriate: open the `.md` file and change it; Obsidian will pick it up. 
From a61b7056d5004ad1252ae1f2b256d062a80bf29a Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 23:12:03 +0100 Subject: [PATCH 017/100] feat: surface camera activity in status pill --- .../clawdis/node/ui/CameraHudOverlay.kt | 85 +---------------- .../steipete/clawdis/node/ui/RootScreen.kt | 43 ++++++++- .../steipete/clawdis/node/ui/StatusPill.kt | 49 +++++++--- apps/ios/Sources/RootCanvas.swift | 92 ++++--------------- apps/ios/Sources/RootTabs.swift | 1 + apps/ios/Sources/Status/StatusPill.swift | 39 +++++++- 6 files changed, 134 insertions(+), 175 deletions(-) diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/CameraHudOverlay.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/CameraHudOverlay.kt index b205929cd..2e1fec0d9 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/CameraHudOverlay.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/CameraHudOverlay.kt @@ -1,64 +1,26 @@ package com.steipete.clawdis.node.ui -import androidx.compose.animation.AnimatedVisibility -import androidx.compose.animation.fadeIn -import androidx.compose.animation.fadeOut -import androidx.compose.animation.slideInVertically -import androidx.compose.animation.slideOutVertically import androidx.compose.foundation.background import androidx.compose.foundation.layout.Box -import androidx.compose.foundation.layout.Row -import androidx.compose.foundation.layout.Spacer import androidx.compose.foundation.layout.fillMaxSize -import androidx.compose.foundation.layout.padding -import androidx.compose.foundation.layout.size -import androidx.compose.foundation.layout.statusBarsPadding -import androidx.compose.foundation.shape.RoundedCornerShape -import androidx.compose.material.icons.Icons -import androidx.compose.material.icons.filled.CheckCircle -import androidx.compose.material.icons.filled.Error -import androidx.compose.material.icons.filled.FiberManualRecord -import 
androidx.compose.material.icons.filled.PhotoCamera -import androidx.compose.material3.CircularProgressIndicator -import androidx.compose.material3.Icon -import androidx.compose.material3.MaterialTheme -import androidx.compose.material3.Surface -import androidx.compose.material3.Text import androidx.compose.runtime.Composable import androidx.compose.runtime.LaunchedEffect import androidx.compose.runtime.getValue import androidx.compose.runtime.mutableFloatStateOf import androidx.compose.runtime.remember import androidx.compose.runtime.setValue -import androidx.compose.ui.Alignment import androidx.compose.ui.Modifier import androidx.compose.ui.draw.alpha import androidx.compose.ui.graphics.Color -import androidx.compose.ui.text.style.TextOverflow -import androidx.compose.ui.unit.dp -import com.steipete.clawdis.node.CameraHudKind -import com.steipete.clawdis.node.CameraHudState import kotlinx.coroutines.delay @Composable -fun CameraHudOverlay( - hud: CameraHudState?, - flashToken: Long, +fun CameraFlashOverlay( + token: Long, modifier: Modifier = Modifier, ) { Box(modifier = modifier.fillMaxSize()) { - CameraFlash(token = flashToken) - - AnimatedVisibility( - visible = hud != null, - enter = slideInVertically(initialOffsetY = { -it / 2 }) + fadeIn(), - exit = slideOutVertically(targetOffsetY = { -it / 2 }) + fadeOut(), - modifier = Modifier.align(Alignment.TopStart).statusBarsPadding().padding(start = 12.dp, top = 58.dp), - ) { - if (hud != null) { - Toast(hud = hud) - } - } + CameraFlash(token = token) } } @@ -80,44 +42,3 @@ private fun CameraFlash(token: Long) { .background(Color.White), ) } - -@Composable -private fun Toast(hud: CameraHudState) { - Surface( - shape = RoundedCornerShape(14.dp), - color = MaterialTheme.colorScheme.surface.copy(alpha = 0.85f), - tonalElevation = 2.dp, - shadowElevation = 8.dp, - ) { - Row( - modifier = Modifier.padding(vertical = 10.dp, horizontal = 12.dp), - verticalAlignment = Alignment.CenterVertically, - ) { - when (hud.kind) { - 
CameraHudKind.Photo -> { - Icon(Icons.Default.PhotoCamera, contentDescription = null) - Spacer(Modifier.size(10.dp)) - CircularProgressIndicator(modifier = Modifier.size(14.dp), strokeWidth = 2.dp) - } - CameraHudKind.Recording -> { - Icon(Icons.Default.FiberManualRecord, contentDescription = null, tint = Color.Red) - } - CameraHudKind.Success -> { - Icon(Icons.Default.CheckCircle, contentDescription = null) - } - CameraHudKind.Error -> { - Icon(Icons.Default.Error, contentDescription = null) - } - } - - Spacer(Modifier.size(10.dp)) - Text( - text = hud.message, - style = MaterialTheme.typography.bodyMedium, - maxLines = 1, - overflow = TextOverflow.Ellipsis, - ) - } - } -} - diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt index 49bbee928..f3cfb4b67 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt @@ -32,6 +32,10 @@ import androidx.compose.material3.ModalBottomSheet import androidx.compose.material3.rememberModalBottomSheetState import androidx.compose.material.icons.Icons import androidx.compose.material.icons.filled.ChatBubble +import androidx.compose.material.icons.filled.CheckCircle +import androidx.compose.material.icons.filled.Error +import androidx.compose.material.icons.filled.FiberManualRecord +import androidx.compose.material.icons.filled.PhotoCamera import androidx.compose.material.icons.filled.Settings import androidx.compose.runtime.Composable import androidx.compose.runtime.collectAsState @@ -47,6 +51,7 @@ import androidx.compose.ui.viewinterop.AndroidView import androidx.compose.ui.window.Popup import androidx.compose.ui.window.PopupProperties import androidx.core.content.ContextCompat +import com.steipete.clawdis.node.CameraHudKind import com.steipete.clawdis.node.MainViewModel @OptIn(ExperimentalMaterial3Api::class) @@ 
-60,6 +65,39 @@ fun RootScreen(viewModel: MainViewModel) { val statusText by viewModel.statusText.collectAsState() val cameraHud by viewModel.cameraHud.collectAsState() val cameraFlashToken by viewModel.cameraFlashToken.collectAsState() + val activity = + remember(cameraHud) { + cameraHud?.let { hud -> + when (hud.kind) { + CameraHudKind.Photo -> + StatusActivity( + title = hud.message, + icon = Icons.Default.PhotoCamera, + contentDescription = "Taking photo", + ) + CameraHudKind.Recording -> + StatusActivity( + title = hud.message, + icon = Icons.Default.FiberManualRecord, + contentDescription = "Recording", + tint = androidx.compose.ui.graphics.Color.Red, + ) + CameraHudKind.Success -> + StatusActivity( + title = hud.message, + icon = Icons.Default.CheckCircle, + contentDescription = "Capture finished", + ) + CameraHudKind.Error -> + StatusActivity( + title = hud.message, + icon = Icons.Default.Error, + contentDescription = "Capture failed", + tint = androidx.compose.ui.graphics.Color.Red, + ) + } + } + } val bridgeState = remember(serverName, statusText) { @@ -80,9 +118,9 @@ fun RootScreen(viewModel: MainViewModel) { CanvasView(viewModel = viewModel, modifier = Modifier.fillMaxSize()) } - // Camera HUD (flash + toast) must be in a Popup to render above the WebView. + // Camera flash must be in a Popup to render above the WebView. Popup(alignment = Alignment.Center, properties = PopupProperties(focusable = false)) { - CameraHudOverlay(hud = cameraHud, flashToken = cameraFlashToken, modifier = Modifier.fillMaxSize()) + CameraFlashOverlay(token = cameraFlashToken, modifier = Modifier.fillMaxSize()) } // Keep the overlay buttons above the WebView canvas (AndroidView), otherwise they may not receive touches. 
@@ -90,6 +128,7 @@ fun RootScreen(viewModel: MainViewModel) { StatusPill( bridge = bridgeState, voiceEnabled = voiceEnabled, + activity = activity, onClick = { sheet = Sheet.Settings }, modifier = Modifier.windowInsetsPadding(safeOverlayInsets).padding(start = 12.dp, top = 12.dp), ) diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/StatusPill.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/StatusPill.kt index 87a500265..2efcccae7 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/StatusPill.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/StatusPill.kt @@ -28,6 +28,7 @@ import androidx.compose.ui.unit.dp fun StatusPill( bridge: BridgeState, voiceEnabled: Boolean, + activity: StatusActivity? = null, onClick: () -> Unit, modifier: Modifier = Modifier, ) { @@ -62,23 +63,49 @@ fun StatusPill( color = MaterialTheme.colorScheme.onSurfaceVariant, ) - Icon( - imageVector = if (voiceEnabled) Icons.Default.Mic else Icons.Default.MicOff, - contentDescription = if (voiceEnabled) "Voice enabled" else "Voice disabled", - tint = - if (voiceEnabled) { - overlayIconColor() - } else { - MaterialTheme.colorScheme.onSurfaceVariant - }, - modifier = Modifier.size(18.dp), - ) + if (activity != null) { + Row( + horizontalArrangement = Arrangement.spacedBy(6.dp), + verticalAlignment = Alignment.CenterVertically, + ) { + Icon( + imageVector = activity.icon, + contentDescription = activity.contentDescription, + tint = activity.tint ?: overlayIconColor(), + modifier = Modifier.size(18.dp), + ) + Text( + text = activity.title, + style = MaterialTheme.typography.labelLarge, + maxLines = 1, + ) + } + } else { + Icon( + imageVector = if (voiceEnabled) Icons.Default.Mic else Icons.Default.MicOff, + contentDescription = if (voiceEnabled) "Voice enabled" else "Voice disabled", + tint = + if (voiceEnabled) { + overlayIconColor() + } else { + MaterialTheme.colorScheme.onSurfaceVariant + }, + modifier = 
Modifier.size(18.dp), + ) + } Spacer(modifier = Modifier.width(2.dp)) } } } +data class StatusActivity( + val title: String, + val icon: androidx.compose.ui.graphics.vector.ImageVector, + val contentDescription: String, + val tint: Color? = null, +) + enum class BridgeState(val title: String, val color: Color) { Connected("Connected", Color(0xFF2ECC71)), Connecting("Connecting…", Color(0xFFF1C40F)), diff --git a/apps/ios/Sources/RootCanvas.swift b/apps/ios/Sources/RootCanvas.swift index 9a5fb0b76..4d552618e 100644 --- a/apps/ios/Sources/RootCanvas.swift +++ b/apps/ios/Sources/RootCanvas.swift @@ -152,6 +152,7 @@ private struct CanvasContent: View { StatusPill( bridge: self.bridgeStatus, voiceWakeEnabled: self.voiceWakeEnabled, + activity: self.statusActivity, brighten: self.brightenButtons, onTap: { self.openSettings() @@ -169,33 +170,30 @@ private struct CanvasContent: View { .transition(.move(edge: .top).combined(with: .opacity)) } } - .overlay(alignment: .topLeading) { - if let cameraHUDText, !cameraHUDText.isEmpty, let cameraHUDKind { - CameraCaptureToast( - text: cameraHUDText, - kind: self.mapCameraKind(cameraHUDKind), - brighten: self.brightenButtons) - .padding(SwiftUI.Edge.Set.leading, 10) - .safeAreaPadding(SwiftUI.Edge.Set.top, 106) - .transition( - AnyTransition.move(edge: SwiftUI.Edge.top) - .combined(with: AnyTransition.opacity)) - } - } } - private func mapCameraKind(_ kind: NodeAppModel.CameraHUDKind) -> CameraCaptureToast.Kind { - switch kind { + private var statusActivity: StatusPill.Activity? { + guard let cameraHUDText, !cameraHUDText.isEmpty, let cameraHUDKind else { return nil } + let systemImage: String + let tint: Color? 
+ switch cameraHUDKind { case .photo: - .photo + systemImage = "camera.fill" + tint = nil case .recording: - .recording + systemImage = "video.fill" + tint = .red case .success: - .success + systemImage = "checkmark.circle.fill" + tint = .green case .error: - .error + systemImage = "exclamationmark.triangle.fill" + tint = .red } + + return StatusPill.Activity(title: cameraHUDText, systemImage: systemImage, tint: tint) } + } private struct OverlayButton: View { @@ -261,59 +259,3 @@ private struct CameraFlashOverlay: View { } } } - -private struct CameraCaptureToast: View { - enum Kind { - case photo - case recording - case success - case error - } - - var text: String - var kind: Kind - var brighten: Bool = false - - var body: some View { - HStack(spacing: 10) { - self.icon - .font(.system(size: 14, weight: .semibold)) - .foregroundStyle(.primary) - - Text(self.text) - .font(.system(size: 14, weight: .semibold)) - .foregroundStyle(.primary) - .lineLimit(1) - .truncationMode(.tail) - } - .padding(.vertical, 10) - .padding(.horizontal, 12) - .background { - RoundedRectangle(cornerRadius: 14, style: .continuous) - .fill(.ultraThinMaterial) - .overlay { - RoundedRectangle(cornerRadius: 14, style: .continuous) - .strokeBorder(.white.opacity(self.brighten ? 
0.24 : 0.18), lineWidth: 0.5) - } - .shadow(color: .black.opacity(0.25), radius: 12, y: 6) - } - .accessibilityLabel("Camera") - .accessibilityValue(self.text) - } - - @ViewBuilder - private var icon: some View { - switch self.kind { - case .photo: - Image(systemName: "camera.fill") - case .recording: - Image(systemName: "record.circle.fill") - .symbolRenderingMode(.palette) - .foregroundStyle(.red, .primary) - case .success: - Image(systemName: "checkmark.circle.fill") - case .error: - Image(systemName: "exclamationmark.triangle.fill") - } - } -} diff --git a/apps/ios/Sources/RootTabs.swift b/apps/ios/Sources/RootTabs.swift index dc2508895..913073d4a 100644 --- a/apps/ios/Sources/RootTabs.swift +++ b/apps/ios/Sources/RootTabs.swift @@ -26,6 +26,7 @@ struct RootTabs: View { StatusPill( bridge: self.bridgeStatus, voiceWakeEnabled: self.voiceWakeEnabled, + activity: nil, onTap: { self.selectedTab = 2 }) .padding(.leading, 10) .safeAreaPadding(.top, 10) diff --git a/apps/ios/Sources/Status/StatusPill.swift b/apps/ios/Sources/Status/StatusPill.swift index 9d3c6f6d6..f5df8e7df 100644 --- a/apps/ios/Sources/Status/StatusPill.swift +++ b/apps/ios/Sources/Status/StatusPill.swift @@ -28,8 +28,15 @@ struct StatusPill: View { } } + struct Activity: Equatable { + var title: String + var systemImage: String + var tint: Color? = nil + } + var bridge: BridgeState var voiceWakeEnabled: Bool + var activity: Activity? = nil var brighten: Bool = false var onTap: () -> Void @@ -54,10 +61,24 @@ struct StatusPill: View { .frame(height: 14) .opacity(0.35) - Image(systemName: self.voiceWakeEnabled ? "mic.fill" : "mic.slash") - .font(.system(size: 13, weight: .semibold)) - .foregroundStyle(self.voiceWakeEnabled ? .primary : .secondary) - .accessibilityLabel(self.voiceWakeEnabled ? "Voice Wake enabled" : "Voice Wake disabled") + if let activity { + HStack(spacing: 6) { + Image(systemName: activity.systemImage) + .font(.system(size: 13, weight: .semibold)) + .foregroundStyle(activity.tint ?? 
.primary) + Text(activity.title) + .font(.system(size: 13, weight: .semibold)) + .foregroundStyle(.primary) + .lineLimit(1) + } + .transition(.opacity.combined(with: .move(edge: .top))) + } else { + Image(systemName: self.voiceWakeEnabled ? "mic.fill" : "mic.slash") + .font(.system(size: 13, weight: .semibold)) + .foregroundStyle(self.voiceWakeEnabled ? .primary : .secondary) + .accessibilityLabel(self.voiceWakeEnabled ? "Voice Wake enabled" : "Voice Wake disabled") + .transition(.opacity.combined(with: .move(edge: .top))) + } } .padding(.vertical, 8) .padding(.horizontal, 12) @@ -73,7 +94,7 @@ struct StatusPill: View { } .buttonStyle(.plain) .accessibilityLabel("Status") - .accessibilityValue("\(self.bridge.title), Voice Wake \(self.voiceWakeEnabled ? "enabled" : "disabled")") + .accessibilityValue(self.accessibilityValue) .onAppear { self.updatePulse(for: self.bridge, scenePhase: self.scenePhase) } .onDisappear { self.pulse = false } .onChange(of: self.bridge) { _, newValue in @@ -82,6 +103,14 @@ struct StatusPill: View { .onChange(of: self.scenePhase) { _, newValue in self.updatePulse(for: self.bridge, scenePhase: newValue) } + .animation(.easeInOut(duration: 0.18), value: self.activity?.title) + } + + private var accessibilityValue: String { + if let activity { + return "\(self.bridge.title), \(activity.title)" + } + return "\(self.bridge.title), Voice Wake \(self.voiceWakeEnabled ? 
"enabled" : "disabled")" } private func updatePulse(for bridge: BridgeState, scenePhase: ScenePhase) { From 8f0c8a656128f4b35475646bffa83fc6808df25a Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 23:12:20 +0100 Subject: [PATCH 018/100] fix: cap camera snap payload size --- CHANGELOG.md | 2 + .../clawdis/node/node/CameraCaptureManager.kt | 36 ++++++-- .../clawdis/node/node/JpegSizeLimiter.kt | 61 ++++++++++++++ .../clawdis/node/node/JpegSizeLimiterTest.kt | 47 +++++++++++ .../ios/Sources/Camera/CameraController.swift | 6 +- .../Sources/ClawdisKit/JPEGTranscoder.swift | 82 ++++++++++++++----- .../ClawdisKitTests/JPEGTranscoderTests.swift | 57 ++++++++++++- docs/camera.md | 5 ++ 8 files changed, 267 insertions(+), 29 deletions(-) create mode 100644 apps/android/app/src/main/java/com/steipete/clawdis/node/node/JpegSizeLimiter.kt create mode 100644 apps/android/app/src/test/java/com/steipete/clawdis/node/node/JpegSizeLimiterTest.kt diff --git a/CHANGELOG.md b/CHANGELOG.md index 4edcae2c0..f73a670ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ - macOS menu: device list now shows connected nodes only. - iOS node: fix ReplayKit screen recording crash caused by queue isolation assertions during capture. - iOS/Android nodes: bridge auto-connect refreshes stale tokens and settings now show richer bridge/device details. +- iOS/Android nodes: status pill now surfaces camera activity instead of overlay toasts. +- iOS/Android nodes: camera snaps recompress to keep base64 payloads under 5 MB. - CLI: avoid spurious gateway close errors after successful request/response cycles. - Tests: add Swift Testing coverage for camera errors and Kotest coverage for Android bridge endpoints. 
diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/node/CameraCaptureManager.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/node/CameraCaptureManager.kt index 4f1501340..b25b95ea4 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/node/CameraCaptureManager.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/node/CameraCaptureManager.kt @@ -28,6 +28,7 @@ import kotlinx.coroutines.withContext import java.io.ByteArrayOutputStream import java.io.File import java.util.concurrent.Executor +import kotlin.math.roundToInt import kotlin.coroutines.resume import kotlin.coroutines.resumeWithException @@ -99,14 +100,35 @@ class CameraCaptureManager(private val context: Context) { decoded } - val out = ByteArrayOutputStream() - val jpegQuality = (quality * 100.0).toInt().coerceIn(10, 100) - if (!scaled.compress(Bitmap.CompressFormat.JPEG, jpegQuality, out)) { - throw IllegalStateException("UNAVAILABLE: failed to encode JPEG") - } - val base64 = Base64.encodeToString(out.toByteArray(), Base64.NO_WRAP) + val maxPayloadBytes = 5 * 1024 * 1024 + val maxEncodedBytes = (maxPayloadBytes / 4) * 3 + val result = + JpegSizeLimiter.compressToLimit( + initialWidth = scaled.width, + initialHeight = scaled.height, + startQuality = (quality * 100.0).roundToInt().coerceIn(10, 100), + maxBytes = maxEncodedBytes, + encode = { width, height, q -> + val bitmap = + if (width == scaled.width && height == scaled.height) { + scaled + } else { + scaled.scale(width, height) + } + val out = ByteArrayOutputStream() + if (!bitmap.compress(Bitmap.CompressFormat.JPEG, q, out)) { + if (bitmap !== scaled) bitmap.recycle() + throw IllegalStateException("UNAVAILABLE: failed to encode JPEG") + } + if (bitmap !== scaled) { + bitmap.recycle() + } + out.toByteArray() + }, + ) + val base64 = Base64.encodeToString(result.bytes, Base64.NO_WRAP) Payload( - """{"format":"jpg","base64":"$base64","width":${scaled.width},"height":${scaled.height}}""", + 
"""{"format":"jpg","base64":"$base64","width":${result.width},"height":${result.height}}""", ) } diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/node/JpegSizeLimiter.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/node/JpegSizeLimiter.kt new file mode 100644 index 000000000..bb9377231 --- /dev/null +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/node/JpegSizeLimiter.kt @@ -0,0 +1,61 @@ +package com.steipete.clawdis.node.node + +import kotlin.math.max +import kotlin.math.min +import kotlin.math.roundToInt + +internal data class JpegSizeLimiterResult( + val bytes: ByteArray, + val width: Int, + val height: Int, + val quality: Int, +) + +internal object JpegSizeLimiter { + fun compressToLimit( + initialWidth: Int, + initialHeight: Int, + startQuality: Int, + maxBytes: Int, + minQuality: Int = 20, + minSize: Int = 256, + scaleStep: Double = 0.85, + maxScaleAttempts: Int = 6, + maxQualityAttempts: Int = 6, + encode: (width: Int, height: Int, quality: Int) -> ByteArray, + ): JpegSizeLimiterResult { + require(initialWidth > 0 && initialHeight > 0) { "Invalid image size" } + require(maxBytes > 0) { "Invalid maxBytes" } + + var width = initialWidth + var height = initialHeight + val clampedStartQuality = startQuality.coerceIn(minQuality, 100) + var best = JpegSizeLimiterResult(bytes = encode(width, height, clampedStartQuality), width = width, height = height, quality = clampedStartQuality) + if (best.bytes.size <= maxBytes) return best + + repeat(maxScaleAttempts) { + var quality = clampedStartQuality + repeat(maxQualityAttempts) { + val bytes = encode(width, height, quality) + best = JpegSizeLimiterResult(bytes = bytes, width = width, height = height, quality = quality) + if (bytes.size <= maxBytes) return best + if (quality <= minQuality) return@repeat + quality = max(minQuality, (quality * 0.75).roundToInt()) + } + + val minScale = (minSize.toDouble() / min(width, height).toDouble()).coerceAtMost(1.0) + val nextScale = 
max(scaleStep, minScale) + val nextWidth = max(minSize, (width * nextScale).roundToInt()) + val nextHeight = max(minSize, (height * nextScale).roundToInt()) + if (nextWidth == width && nextHeight == height) return@repeat + width = min(nextWidth, width) + height = min(nextHeight, height) + } + + if (best.bytes.size > maxBytes) { + throw IllegalStateException("CAMERA_TOO_LARGE: ${best.bytes.size} bytes > $maxBytes bytes") + } + + return best + } +} diff --git a/apps/android/app/src/test/java/com/steipete/clawdis/node/node/JpegSizeLimiterTest.kt b/apps/android/app/src/test/java/com/steipete/clawdis/node/node/JpegSizeLimiterTest.kt new file mode 100644 index 000000000..457bd189d --- /dev/null +++ b/apps/android/app/src/test/java/com/steipete/clawdis/node/node/JpegSizeLimiterTest.kt @@ -0,0 +1,47 @@ +package com.steipete.clawdis.node.node + +import org.junit.Assert.assertEquals +import org.junit.Assert.assertTrue +import org.junit.Test +import kotlin.math.min + +class JpegSizeLimiterTest { + @Test + fun compressesLargePayloadsUnderLimit() { + val maxBytes = 5 * 1024 * 1024 + val result = + JpegSizeLimiter.compressToLimit( + initialWidth = 4000, + initialHeight = 3000, + startQuality = 95, + maxBytes = maxBytes, + encode = { width, height, quality -> + val estimated = (width.toLong() * height.toLong() * quality.toLong()) / 100 + val size = min(maxBytes.toLong() * 2, estimated).toInt() + ByteArray(size) + }, + ) + + assertTrue(result.bytes.size <= maxBytes) + assertTrue(result.width <= 4000) + assertTrue(result.height <= 3000) + assertTrue(result.quality <= 95) + } + + @Test + fun keepsSmallPayloadsAsIs() { + val maxBytes = 5 * 1024 * 1024 + val result = + JpegSizeLimiter.compressToLimit( + initialWidth = 800, + initialHeight = 600, + startQuality = 90, + maxBytes = maxBytes, + encode = { _, _, _ -> ByteArray(120_000) }, + ) + + assertEquals(800, result.width) + assertEquals(600, result.height) + assertEquals(90, result.quality) + } +} diff --git 
a/apps/ios/Sources/Camera/CameraController.swift b/apps/ios/Sources/Camera/CameraController.swift index a57769d31..cf8c6ce50 100644 --- a/apps/ios/Sources/Camera/CameraController.swift +++ b/apps/ios/Sources/Camera/CameraController.swift @@ -84,10 +84,14 @@ actor CameraController { } withExtendedLifetime(delegate) {} + let maxPayloadBytes = 5 * 1024 * 1024 + // Base64 inflates payloads by ~4/3, so cap encoded bytes to keep payload <= 5MB. + let maxEncodedBytes = (maxPayloadBytes / 4) * 3 let res = try JPEGTranscoder.transcodeToJPEG( imageData: rawData, maxWidthPx: maxWidth, - quality: quality) + quality: quality, + maxBytes: maxEncodedBytes) return ( format: format.rawValue, diff --git a/apps/shared/ClawdisKit/Sources/ClawdisKit/JPEGTranscoder.swift b/apps/shared/ClawdisKit/Sources/ClawdisKit/JPEGTranscoder.swift index 39761f131..f4b1cb951 100644 --- a/apps/shared/ClawdisKit/Sources/ClawdisKit/JPEGTranscoder.swift +++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/JPEGTranscoder.swift @@ -7,6 +7,7 @@ public enum JPEGTranscodeError: LocalizedError, Sendable { case decodeFailed case propertiesMissing case encodeFailed + case sizeLimitExceeded(maxBytes: Int, actualBytes: Int) public var errorDescription: String? { switch self { @@ -16,6 +17,8 @@ public enum JPEGTranscodeError: LocalizedError, Sendable { "Failed to read image properties" case .encodeFailed: "Failed to encode JPEG" + case let .sizeLimitExceeded(maxBytes, actualBytes): + "JPEG exceeds size limit (\(actualBytes) bytes > \(maxBytes) bytes)" } } } @@ -32,7 +35,8 @@ public struct JPEGTranscoder: Sendable { public static func transcodeToJPEG( imageData: Data, maxWidthPx: Int?, - quality: Double) throws -> (data: Data, widthPx: Int, heightPx: Int) + quality: Double, + maxBytes: Int? 
= nil) throws -> (data: Data, widthPx: Int, heightPx: Int) { guard let src = CGImageSourceCreateWithData(imageData as CFData, nil) else { throw JPEGTranscodeError.decodeFailed @@ -58,7 +62,7 @@ public struct JPEGTranscoder: Sendable { let orientedHeight = rotates90 ? pixelWidth : pixelHeight let maxDim = max(orientedWidth, orientedHeight) - let targetMaxPixelSize: Int = { + var targetMaxPixelSize: Int = { guard let maxWidthPx, maxWidthPx > 0 else { return maxDim } guard orientedWidth > maxWidthPx else { return maxDim } // never upscale @@ -66,28 +70,66 @@ public struct JPEGTranscoder: Sendable { return max(1, Int((Double(maxDim) * scale).rounded(.toNearestOrAwayFromZero))) }() - let thumbOpts: [CFString: Any] = [ - kCGImageSourceCreateThumbnailFromImageAlways: true, - kCGImageSourceCreateThumbnailWithTransform: true, - kCGImageSourceThumbnailMaxPixelSize: targetMaxPixelSize, - kCGImageSourceShouldCacheImmediately: true, - ] + func encode(maxPixelSize: Int, quality: Double) throws -> (data: Data, widthPx: Int, heightPx: Int) { + let thumbOpts: [CFString: Any] = [ + kCGImageSourceCreateThumbnailFromImageAlways: true, + kCGImageSourceCreateThumbnailWithTransform: true, + kCGImageSourceThumbnailMaxPixelSize: maxPixelSize, + kCGImageSourceShouldCacheImmediately: true, + ] - guard let img = CGImageSourceCreateThumbnailAtIndex(src, 0, thumbOpts as CFDictionary) else { - throw JPEGTranscodeError.decodeFailed + guard let img = CGImageSourceCreateThumbnailAtIndex(src, 0, thumbOpts as CFDictionary) else { + throw JPEGTranscodeError.decodeFailed + } + + let out = NSMutableData() + guard let dest = CGImageDestinationCreateWithData(out, UTType.jpeg.identifier as CFString, 1, nil) else { + throw JPEGTranscodeError.encodeFailed + } + let q = self.clampQuality(quality) + let encodeProps = [kCGImageDestinationLossyCompressionQuality: q] as CFDictionary + CGImageDestinationAddImage(dest, img, encodeProps) + guard CGImageDestinationFinalize(dest) else { + throw 
JPEGTranscodeError.encodeFailed + } + + return (out as Data, img.width, img.height) } - let out = NSMutableData() - guard let dest = CGImageDestinationCreateWithData(out, UTType.jpeg.identifier as CFString, 1, nil) else { - throw JPEGTranscodeError.encodeFailed - } - let q = self.clampQuality(quality) - let encodeProps = [kCGImageDestinationLossyCompressionQuality: q] as CFDictionary - CGImageDestinationAddImage(dest, img, encodeProps) - guard CGImageDestinationFinalize(dest) else { - throw JPEGTranscodeError.encodeFailed + guard let maxBytes, maxBytes > 0 else { + return try encode(maxPixelSize: targetMaxPixelSize, quality: quality) } - return (out as Data, img.width, img.height) + let minQuality = max(0.2, self.clampQuality(quality) * 0.35) + let minPixelSize = 256 + var best = try encode(maxPixelSize: targetMaxPixelSize, quality: quality) + if best.data.count <= maxBytes { + return best + } + + for _ in 0..<6 { + var q = self.clampQuality(quality) + for _ in 0..<6 { + let candidate = try encode(maxPixelSize: targetMaxPixelSize, quality: q) + best = candidate + if candidate.data.count <= maxBytes { + return candidate + } + if q <= minQuality { break } + q = max(minQuality, q * 0.75) + } + + let nextPixelSize = max(Int(Double(targetMaxPixelSize) * 0.85), minPixelSize) + if nextPixelSize == targetMaxPixelSize { + break + } + targetMaxPixelSize = nextPixelSize + } + + if best.data.count > maxBytes { + throw JPEGTranscodeError.sizeLimitExceeded(maxBytes: maxBytes, actualBytes: best.data.count) + } + + return best } } diff --git a/apps/shared/ClawdisKit/Tests/ClawdisKitTests/JPEGTranscoderTests.swift b/apps/shared/ClawdisKit/Tests/ClawdisKitTests/JPEGTranscoderTests.swift index 9c8fcfbda..b12587a4c 100644 --- a/apps/shared/ClawdisKit/Tests/ClawdisKitTests/JPEGTranscoderTests.swift +++ b/apps/shared/ClawdisKit/Tests/ClawdisKitTests/JPEGTranscoderTests.swift @@ -47,6 +47,52 @@ import UniformTypeIdentifiers return out as Data } + private func makeNoiseJPEG(width: Int, 
height: Int) throws -> Data { + let bytesPerPixel = 4 + let byteCount = width * height * bytesPerPixel + var data = Data(count: byteCount) + let cs = CGColorSpaceCreateDeviceRGB() + let bitmapInfo = CGImageAlphaInfo.premultipliedLast.rawValue + + let out = try data.withUnsafeMutableBytes { rawBuffer -> Data in + guard let base = rawBuffer.baseAddress?.assumingMemoryBound(to: UInt8.self) else { + throw NSError(domain: "JPEGTranscoderTests", code: 6) + } + for idx in 0.."` - `width`, `height` + - Payload guard: photos are recompressed to keep the base64 payload under 5 MB. - `camera.clip` - Params: @@ -90,6 +91,10 @@ If permissions are missing, the app will prompt when possible; if denied, `camer Like `canvas.*`, the Android node only allows `camera.*` commands in the **foreground**. Background invocations return `NODE_BACKGROUND_UNAVAILABLE`. +### Payload guard + +Photos are recompressed to keep the base64 payload under 5 MB. + ## macOS app ### User setting (default off) From 6e83f95c83fe60d22d18f5084ab006156157ffb7 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 22:13:39 +0000 Subject: [PATCH 019/100] fix: clamp tool images to 5MB --- CHANGELOG.md | 1 + src/agents/tool-images.test.ts | 35 ++++++++++++ src/agents/tool-images.ts | 98 +++++++++++++++++++--------------- 3 files changed, 90 insertions(+), 44 deletions(-) create mode 100644 src/agents/tool-images.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index f73a670ad..662af8051 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ - iOS/Android nodes: status pill now surfaces camera activity instead of overlay toasts. - iOS/Android nodes: camera snaps recompress to keep base64 payloads under 5 MB. - CLI: avoid spurious gateway close errors after successful request/response cycles. +- Agent runtime: clamp tool-result images to the 5MB Anthropic limit to avoid hard request rejections. 
- Tests: add Swift Testing coverage for camera errors and Kotest coverage for Android bridge endpoints. ## 2.0.0-beta4 — 2025-12-27 diff --git a/src/agents/tool-images.test.ts b/src/agents/tool-images.test.ts new file mode 100644 index 000000000..c6a5baffd --- /dev/null +++ b/src/agents/tool-images.test.ts @@ -0,0 +1,35 @@ +import sharp from "sharp"; +import { describe, expect, it } from "vitest"; + +import { sanitizeContentBlocksImages } from "./tool-images.js"; + +describe("tool image sanitizing", () => { + it("shrinks oversized images to <=5MB", async () => { + const width = 2800; + const height = 2800; + const raw = Buffer.alloc(width * height * 3, 0xff); + const bigPng = await sharp(raw, { + raw: { width, height, channels: 3 }, + }) + .png({ compressionLevel: 0 }) + .toBuffer(); + expect(bigPng.byteLength).toBeGreaterThan(5 * 1024 * 1024); + + const blocks = [ + { + type: "image" as const, + data: bigPng.toString("base64"), + mimeType: "image/png", + }, + ]; + + const out = await sanitizeContentBlocksImages(blocks, "test"); + const image = out.find((b) => b.type === "image"); + if (!image || image.type !== "image") { + throw new Error("expected image block"); + } + const size = Buffer.from(image.data, "base64").byteLength; + expect(size).toBeLessThanOrEqual(5 * 1024 * 1024); + expect(image.mimeType).toBe("image/jpeg"); + }, 20_000); +}); diff --git a/src/agents/tool-images.ts b/src/agents/tool-images.ts index a4357b504..167470ad7 100644 --- a/src/agents/tool-images.ts +++ b/src/agents/tool-images.ts @@ -1,19 +1,19 @@ import type { AgentToolResult } from "@mariozechner/pi-ai"; import { getImageMetadata, resizeToJpeg } from "../media/image-ops.js"; -import { detectMime } from "../media/mime.js"; type ToolContentBlock = AgentToolResult["content"][number]; type ImageContentBlock = Extract; type TextContentBlock = Extract; -// Anthropic Messages API limitation (observed in Clawdis sessions): -// When sending many images in a single request (e.g. 
via session history + tool results), -// Anthropic rejects any image where *either* dimension exceeds 2000px. +// Anthropic Messages API limitations (observed in Clawdis sessions): +// - Images over ~2000px per side can fail in multi-image requests. +// - Images over 5MB are rejected by the API. // // To keep sessions resilient (and avoid "silent" WhatsApp non-replies), we auto-downscale -// all base64 image blocks above this limit while preserving aspect ratio. +// and recompress base64 image blocks when they exceed these limits. const MAX_IMAGE_DIMENSION_PX = 2000; +const MAX_IMAGE_BYTES = 5 * 1024 * 1024; function isImageBlock(block: unknown): block is ImageContentBlock { if (!block || typeof block !== "object") return false; @@ -35,66 +35,75 @@ async function resizeImageBase64IfNeeded(params: { base64: string; mimeType: string; maxDimensionPx: number; + maxBytes: number; }): Promise<{ base64: string; mimeType: string; resized: boolean }> { const buf = Buffer.from(params.base64, "base64"); const meta = await getImageMetadata(buf); const width = meta?.width; const height = meta?.height; - if ( - typeof width !== "number" || - typeof height !== "number" || - (width <= params.maxDimensionPx && height <= params.maxDimensionPx) + const overBytes = buf.byteLength > params.maxBytes; + const maxDim = Math.max(width ?? 0, height ?? 0); + if (typeof width !== "number" || typeof height !== "number") { + if (!overBytes) { + return { + base64: params.base64, + mimeType: params.mimeType, + resized: false, + }; + } + } else if ( + !overBytes && + width <= params.maxDimensionPx && + height <= params.maxDimensionPx ) { return { base64: params.base64, mimeType: params.mimeType, resized: false }; } - const mime = params.mimeType.toLowerCase(); - let out: Buffer; - try { - const mod = (await import("sharp")) as unknown as { - default?: typeof import("sharp"); - }; - const sharp = mod.default ?? 
(mod as unknown as typeof import("sharp")); - const img = sharp(buf, { failOnError: false }).resize({ - width: params.maxDimensionPx, - height: params.maxDimensionPx, - fit: "inside", - withoutEnlargement: true, - }); - if (mime === "image/jpeg" || mime === "image/jpg") { - out = await img.jpeg({ quality: 85 }).toBuffer(); - } else if (mime === "image/webp") { - out = await img.webp({ quality: 85 }).toBuffer(); - } else if (mime === "image/png") { - out = await img.png().toBuffer(); - } else { - out = await img.png().toBuffer(); + const qualities = [85, 75, 65, 55, 45, 35]; + const sideStart = maxDim > 0 ? Math.min(params.maxDimensionPx, maxDim) : params.maxDimensionPx; + const sideGrid = [sideStart, 1800, 1600, 1400, 1200, 1000, 800] + .map((v) => Math.min(params.maxDimensionPx, v)) + .filter((v, i, arr) => v > 0 && arr.indexOf(v) === i) + .sort((a, b) => b - a); + + let smallest: { buffer: Buffer; size: number } | null = null; + for (const side of sideGrid) { + for (const quality of qualities) { + const out = await resizeToJpeg({ + buffer: buf, + maxSide: side, + quality, + withoutEnlargement: true, + }); + if (!smallest || out.byteLength < smallest.size) { + smallest = { buffer: out, size: out.byteLength }; + } + if (out.byteLength <= params.maxBytes) { + return { + base64: out.toString("base64"), + mimeType: "image/jpeg", + resized: true, + }; + } } - } catch { - // Bun can't load sharp native addons. Fall back to a JPEG conversion. - out = await resizeToJpeg({ - buffer: buf, - maxSide: params.maxDimensionPx, - quality: 85, - withoutEnlargement: true, - }); } - const sniffed = await detectMime({ buffer: out.slice(0, 256) }); - const nextMime = sniffed?.startsWith("image/") ? sniffed : params.mimeType; - - return { base64: out.toString("base64"), mimeType: nextMime, resized: true }; + const best = smallest?.buffer ?? 
buf; + const maxMb = (params.maxBytes / (1024 * 1024)).toFixed(0); + const gotMb = (best.byteLength / (1024 * 1024)).toFixed(2); + throw new Error(`Image could not be reduced below ${maxMb}MB (got ${gotMb}MB)`); } export async function sanitizeContentBlocksImages( blocks: ToolContentBlock[], label: string, - opts: { maxDimensionPx?: number } = {}, + opts: { maxDimensionPx?: number; maxBytes?: number } = {}, ): Promise { const maxDimensionPx = Math.max( opts.maxDimensionPx ?? MAX_IMAGE_DIMENSION_PX, 1, ); + const maxBytes = Math.max(opts.maxBytes ?? MAX_IMAGE_BYTES, 1); const out: ToolContentBlock[] = []; for (const block of blocks) { @@ -117,6 +126,7 @@ export async function sanitizeContentBlocksImages( base64: data, mimeType: block.mimeType, maxDimensionPx, + maxBytes, }); out.push({ ...block, data: resized.base64, mimeType: resized.mimeType }); } catch (err) { @@ -133,7 +143,7 @@ export async function sanitizeContentBlocksImages( export async function sanitizeToolResultImages( result: AgentToolResult, label: string, - opts: { maxDimensionPx?: number } = {}, + opts: { maxDimensionPx?: number; maxBytes?: number } = {}, ): Promise> { const content = Array.isArray(result.content) ? 
result.content : []; if (!content.some((b) => isImageBlock(b) || isTextBlock(b))) return result; From 6927b0fb8d990ccac9b09a435610286314117740 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 23:20:35 +0100 Subject: [PATCH 020/100] fix: align camera payload caps --- CHANGELOG.md | 5 ++++- .../steipete/clawdis/node/node/CameraCaptureManager.kt | 1 + .../main/java/com/steipete/clawdis/node/ui/RootScreen.kt | 1 + apps/ios/Sources/Camera/CameraController.swift | 2 +- apps/ios/Sources/RootCanvas.swift | 1 + apps/macos/Sources/Clawdis/CameraCaptureService.swift | 9 ++++++++- docs/camera.md | 1 + 7 files changed, 17 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 662af8051..c17709774 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ ## 2.0.0-beta5 — Unreleased +### Features +- Talk mode: continuous speech conversations (macOS/iOS/Android) with ElevenLabs TTS, reply directives, and optional interrupt-on-speech. + ### Fixes - macOS: Voice Wake now fully tears down the Speech pipeline when disabled (cancel pending restarts, drop stale callbacks) to avoid high CPU in the background. - iOS/Android nodes: enable scrolling for loaded web pages in the Canvas WebView (default scaffold stays touch-first). @@ -10,7 +13,7 @@ - iOS node: fix ReplayKit screen recording crash caused by queue isolation assertions during capture. - iOS/Android nodes: bridge auto-connect refreshes stale tokens and settings now show richer bridge/device details. - iOS/Android nodes: status pill now surfaces camera activity instead of overlay toasts. -- iOS/Android nodes: camera snaps recompress to keep base64 payloads under 5 MB. +- iOS/Android/macOS nodes: camera snaps recompress to keep base64 payloads under 5 MB. - CLI: avoid spurious gateway close errors after successful request/response cycles. - Agent runtime: clamp tool-result images to the 5MB Anthropic limit to avoid hard request rejections. 
- Tests: add Swift Testing coverage for camera errors and Kotest coverage for Android bridge endpoints. diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/node/CameraCaptureManager.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/node/CameraCaptureManager.kt index b25b95ea4..416690766 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/node/CameraCaptureManager.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/node/CameraCaptureManager.kt @@ -101,6 +101,7 @@ class CameraCaptureManager(private val context: Context) { } val maxPayloadBytes = 5 * 1024 * 1024 + // Base64 inflates payloads by ~4/3; cap encoded bytes so the payload stays under 5MB (API limit). val maxEncodedBytes = (maxPayloadBytes / 4) * 3 val result = JpegSizeLimiter.compressToLimit( diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt index f3cfb4b67..2594449b8 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt @@ -67,6 +67,7 @@ fun RootScreen(viewModel: MainViewModel) { val cameraFlashToken by viewModel.cameraFlashToken.collectAsState() val activity = remember(cameraHud) { + // Status pill owns transient capture state so it doesn't overlap the connection indicator. cameraHud?.let { hud -> when (hud.kind) { CameraHudKind.Photo -> diff --git a/apps/ios/Sources/Camera/CameraController.swift b/apps/ios/Sources/Camera/CameraController.swift index cf8c6ce50..00d633bd9 100644 --- a/apps/ios/Sources/Camera/CameraController.swift +++ b/apps/ios/Sources/Camera/CameraController.swift @@ -85,7 +85,7 @@ actor CameraController { withExtendedLifetime(delegate) {} let maxPayloadBytes = 5 * 1024 * 1024 - // Base64 inflates payloads by ~4/3, so cap encoded bytes to keep payload <= 5MB. 
+ // Base64 inflates payloads by ~4/3; cap encoded bytes so the payload stays under 5MB (API limit). let maxEncodedBytes = (maxPayloadBytes / 4) * 3 let res = try JPEGTranscoder.transcodeToJPEG( imageData: rawData, diff --git a/apps/ios/Sources/RootCanvas.swift b/apps/ios/Sources/RootCanvas.swift index 4d552618e..c02eceb69 100644 --- a/apps/ios/Sources/RootCanvas.swift +++ b/apps/ios/Sources/RootCanvas.swift @@ -173,6 +173,7 @@ private struct CanvasContent: View { } private var statusActivity: StatusPill.Activity? { + // Status pill owns transient capture state so it doesn't overlap the connection indicator. guard let cameraHUDText, !cameraHUDText.isEmpty, let cameraHUDKind else { return nil } let systemImage: String let tint: Color? diff --git a/apps/macos/Sources/Clawdis/CameraCaptureService.swift b/apps/macos/Sources/Clawdis/CameraCaptureService.swift index c087c8fd3..3c9d9c357 100644 --- a/apps/macos/Sources/Clawdis/CameraCaptureService.swift +++ b/apps/macos/Sources/Clawdis/CameraCaptureService.swift @@ -79,7 +79,14 @@ actor CameraCaptureService { } withExtendedLifetime(delegate) {} - let res = try JPEGTranscoder.transcodeToJPEG(imageData: rawData, maxWidthPx: maxWidth, quality: quality) + let maxPayloadBytes = 5 * 1024 * 1024 + // Base64 inflates payloads by ~4/3; cap encoded bytes so the payload stays under 5MB (API limit). + let maxEncodedBytes = (maxPayloadBytes / 4) * 3 + let res = try JPEGTranscoder.transcodeToJPEG( + imageData: rawData, + maxWidthPx: maxWidth, + quality: quality, + maxBytes: maxEncodedBytes) return (data: res.data, size: CGSize(width: res.widthPx, height: res.heightPx)) } diff --git a/docs/camera.md b/docs/camera.md index 0353d1567..aba3b5268 100644 --- a/docs/camera.md +++ b/docs/camera.md @@ -121,6 +121,7 @@ clawdis nodes camera clip --node --no-audio Notes: - `clawdis nodes camera snap` defaults to `maxWidth=1600` unless overridden. +- Photo payloads are recompressed to keep base64 under 5 MB. 
## Safety + practical limits From 20d7882033a00c44529e2b257e12b004ce2823c1 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 23:21:05 +0100 Subject: [PATCH 021/100] feat: add talk mode across nodes --- .../steipete/clawdis/node/MainViewModel.kt | 8 + .../com/steipete/clawdis/node/NodeRuntime.kt | 26 + .../com/steipete/clawdis/node/SecurePrefs.kt | 8 + .../steipete/clawdis/node/ui/SettingsSheet.kt | 24 + .../clawdis/node/voice/TalkDirectiveParser.kt | 194 +++++ .../clawdis/node/voice/TalkModeManager.kt | 713 ++++++++++++++++++ .../node/voice/TalkDirectiveParserTest.kt | 55 ++ apps/ios/Sources/Model/NodeAppModel.swift | 8 + apps/ios/Sources/Settings/SettingsTab.swift | 5 + apps/ios/Sources/Voice/TalkModeManager.swift | 518 +++++++++++++ apps/ios/Sources/Voice/VoiceTab.swift | 5 + apps/macos/Sources/Clawdis/AppState.swift | 32 + .../Sources/Clawdis/ConfigSettings.swift | 68 ++ apps/macos/Sources/Clawdis/Constants.swift | 1 + .../Sources/Clawdis/MenuContentView.swift | 13 + .../Sources/Clawdis/TalkAudioPlayer.swift | 54 ++ .../Sources/Clawdis/TalkModeController.swift | 42 ++ .../Sources/Clawdis/TalkModeRuntime.swift | 684 +++++++++++++++++ .../macos/Sources/Clawdis/TalkModeTypes.swift | 8 + apps/macos/Sources/Clawdis/TalkOverlay.swift | 119 +++ .../Sources/Clawdis/TalkOverlayView.swift | 139 ++++ .../Sources/ClawdisKit/TalkDirective.swift | 194 +++++ .../ClawdisKitTests/TalkDirectiveTests.swift | 62 ++ docs/configuration.md | 15 + docs/talk.md | 72 ++ src/config/config.ts | 20 + 26 files changed, 3087 insertions(+) create mode 100644 apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkDirectiveParser.kt create mode 100644 apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt create mode 100644 apps/android/app/src/test/java/com/steipete/clawdis/node/voice/TalkDirectiveParserTest.kt create mode 100644 apps/ios/Sources/Voice/TalkModeManager.swift create mode 100644 
apps/macos/Sources/Clawdis/TalkAudioPlayer.swift create mode 100644 apps/macos/Sources/Clawdis/TalkModeController.swift create mode 100644 apps/macos/Sources/Clawdis/TalkModeRuntime.swift create mode 100644 apps/macos/Sources/Clawdis/TalkModeTypes.swift create mode 100644 apps/macos/Sources/Clawdis/TalkOverlay.swift create mode 100644 apps/macos/Sources/Clawdis/TalkOverlayView.swift create mode 100644 apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift create mode 100644 apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkDirectiveTests.swift create mode 100644 docs/talk.md diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/MainViewModel.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/MainViewModel.kt index 28d702975..ee1c83c9b 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/MainViewModel.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/MainViewModel.kt @@ -35,6 +35,10 @@ class MainViewModel(app: Application) : AndroidViewModel(app) { val voiceWakeMode: StateFlow = runtime.voiceWakeMode val voiceWakeStatusText: StateFlow = runtime.voiceWakeStatusText val voiceWakeIsListening: StateFlow = runtime.voiceWakeIsListening + val talkEnabled: StateFlow = runtime.talkEnabled + val talkStatusText: StateFlow = runtime.talkStatusText + val talkIsListening: StateFlow = runtime.talkIsListening + val talkIsSpeaking: StateFlow = runtime.talkIsSpeaking val manualEnabled: StateFlow = runtime.manualEnabled val manualHost: StateFlow = runtime.manualHost val manualPort: StateFlow = runtime.manualPort @@ -95,6 +99,10 @@ class MainViewModel(app: Application) : AndroidViewModel(app) { runtime.setVoiceWakeMode(mode) } + fun setTalkEnabled(enabled: Boolean) { + runtime.setTalkEnabled(enabled) + } + fun connect(endpoint: BridgeEndpoint) { runtime.connect(endpoint) } diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt 
b/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt index 0ade08e3b..4984f7e0f 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt @@ -25,6 +25,7 @@ import com.steipete.clawdis.node.protocol.ClawdisCanvasA2UIAction import com.steipete.clawdis.node.protocol.ClawdisCanvasA2UICommand import com.steipete.clawdis.node.protocol.ClawdisCanvasCommand import com.steipete.clawdis.node.protocol.ClawdisScreenCommand +import com.steipete.clawdis.node.voice.TalkModeManager import com.steipete.clawdis.node.voice.VoiceWakeManager import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.Dispatchers @@ -84,6 +85,15 @@ class NodeRuntime(context: Context) { val voiceWakeStatusText: StateFlow get() = voiceWake.statusText + val talkStatusText: StateFlow + get() = talkMode.statusText + + val talkIsListening: StateFlow + get() = talkMode.isListening + + val talkIsSpeaking: StateFlow + get() = talkMode.isSpeaking + private val discovery = BridgeDiscovery(appContext, scope = scope) val bridges: StateFlow> = discovery.bridges val discoveryStatusText: StateFlow = discovery.statusText @@ -133,6 +143,9 @@ class NodeRuntime(context: Context) { ) private val chat = ChatController(scope = scope, session = session, json = json) + private val talkMode: TalkModeManager by lazy { + TalkModeManager(context = appContext, scope = scope).also { it.attachSession(session) } + } private fun handleSessionDisconnected(message: String) { _statusText.value = message @@ -163,6 +176,7 @@ class NodeRuntime(context: Context) { val preventSleep: StateFlow = prefs.preventSleep val wakeWords: StateFlow> = prefs.wakeWords val voiceWakeMode: StateFlow = prefs.voiceWakeMode + val talkEnabled: StateFlow = prefs.talkEnabled val manualEnabled: StateFlow = prefs.manualEnabled val manualHost: StateFlow = prefs.manualHost val manualPort: StateFlow = prefs.manualPort @@ -218,6 +232,13 @@ 
class NodeRuntime(context: Context) { } } + scope.launch { + talkEnabled.collect { enabled -> + talkMode.setEnabled(enabled) + externalAudioCaptureActive.value = enabled + } + } + scope.launch(Dispatchers.Default) { bridges.collect { list -> if (list.isNotEmpty()) { @@ -311,6 +332,10 @@ class NodeRuntime(context: Context) { prefs.setVoiceWakeMode(mode) } + fun setTalkEnabled(value: Boolean) { + prefs.setTalkEnabled(value) + } + fun connect(endpoint: BridgeEndpoint) { scope.launch { _statusText.value = "Connecting…" @@ -548,6 +573,7 @@ class NodeRuntime(context: Context) { return } + talkMode.handleBridgeEvent(event, payloadJson) chat.handleBridgeEvent(event, payloadJson) } diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/SecurePrefs.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/SecurePrefs.kt index 8d7ceb0a2..b288ef29e 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/SecurePrefs.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/SecurePrefs.kt @@ -73,6 +73,9 @@ class SecurePrefs(context: Context) { private val _voiceWakeMode = MutableStateFlow(loadVoiceWakeMode()) val voiceWakeMode: StateFlow = _voiceWakeMode + private val _talkEnabled = MutableStateFlow(prefs.getBoolean("talk.enabled", false)) + val talkEnabled: StateFlow = _talkEnabled + fun setLastDiscoveredStableId(value: String) { val trimmed = value.trim() prefs.edit { putString("bridge.lastDiscoveredStableId", trimmed) } @@ -158,6 +161,11 @@ class SecurePrefs(context: Context) { _voiceWakeMode.value = mode } + fun setTalkEnabled(value: Boolean) { + prefs.edit { putBoolean("talk.enabled", value) } + _talkEnabled.value = value + } + private fun loadVoiceWakeMode(): VoiceWakeMode { val raw = prefs.getString(voiceWakeModeKey, null) val resolved = VoiceWakeMode.fromRawValue(raw) diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/SettingsSheet.kt 
b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/SettingsSheet.kt index c7d011892..2ec4a7119 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/SettingsSheet.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/SettingsSheet.kt @@ -62,6 +62,8 @@ fun SettingsSheet(viewModel: MainViewModel) { val wakeWords by viewModel.wakeWords.collectAsState() val voiceWakeMode by viewModel.voiceWakeMode.collectAsState() val voiceWakeStatusText by viewModel.voiceWakeStatusText.collectAsState() + val talkEnabled by viewModel.talkEnabled.collectAsState() + val talkStatusText by viewModel.talkStatusText.collectAsState() val isConnected by viewModel.isConnected.collectAsState() val manualEnabled by viewModel.manualEnabled.collectAsState() val manualHost by viewModel.manualHost.collectAsState() @@ -307,6 +309,28 @@ fun SettingsSheet(viewModel: MainViewModel) { // Voice item { Text("Voice", style = MaterialTheme.typography.titleSmall) } + item { + ListItem( + headlineContent = { Text("Talk Mode") }, + supportingContent = { Text(talkStatusText) }, + trailingContent = { + Switch( + checked = talkEnabled, + onCheckedChange = { on -> + if (on) { + val micOk = + ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) == + PackageManager.PERMISSION_GRANTED + if (!micOk) audioPermissionLauncher.launch(Manifest.permission.RECORD_AUDIO) + viewModel.setTalkEnabled(true) + } else { + viewModel.setTalkEnabled(false) + } + }, + ) + }, + ) + } item { val enabled = voiceWakeMode != VoiceWakeMode.Off ListItem( diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkDirectiveParser.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkDirectiveParser.kt new file mode 100644 index 000000000..539f556ff --- /dev/null +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkDirectiveParser.kt @@ -0,0 +1,194 @@ +package com.steipete.clawdis.node.voice + +import 
kotlinx.serialization.json.Json +import kotlinx.serialization.json.JsonElement +import kotlinx.serialization.json.JsonObject +import kotlinx.serialization.json.JsonPrimitive + +private val directiveJson = Json { ignoreUnknownKeys = true } + +data class TalkDirective( + val voiceId: String? = null, + val modelId: String? = null, + val speed: Double? = null, + val rateWpm: Int? = null, + val stability: Double? = null, + val similarity: Double? = null, + val style: Double? = null, + val speakerBoost: Boolean? = null, + val seed: Long? = null, + val normalize: String? = null, + val language: String? = null, + val outputFormat: String? = null, + val latencyTier: Int? = null, + val once: Boolean? = null, +) + +data class TalkDirectiveParseResult( + val directive: TalkDirective?, + val stripped: String, + val unknownKeys: List, +) + +object TalkDirectiveParser { + fun parse(text: String): TalkDirectiveParseResult { + val normalized = text.replace("\r\n", "\n") + val lines = normalized.split("\n").toMutableList() + if (lines.isEmpty()) return TalkDirectiveParseResult(null, text, emptyList()) + + val firstNonEmpty = lines.indexOfFirst { it.trim().isNotEmpty() } + if (firstNonEmpty == -1) return TalkDirectiveParseResult(null, text, emptyList()) + + val head = lines[firstNonEmpty].trim() + if (!head.startsWith("{") || !head.endsWith("}")) { + return TalkDirectiveParseResult(null, text, emptyList()) + } + + val obj = parseJsonObject(head) ?: return TalkDirectiveParseResult(null, text, emptyList()) + + val speakerBoost = + boolValue(obj, listOf("speaker_boost", "speakerBoost")) + ?: boolValue(obj, listOf("no_speaker_boost", "noSpeakerBoost"))?.not() + + val directive = TalkDirective( + voiceId = stringValue(obj, listOf("voice", "voice_id", "voiceId")), + modelId = stringValue(obj, listOf("model", "model_id", "modelId")), + speed = doubleValue(obj, listOf("speed")), + rateWpm = intValue(obj, listOf("rate", "wpm")), + stability = doubleValue(obj, listOf("stability")), + 
similarity = doubleValue(obj, listOf("similarity", "similarity_boost", "similarityBoost")), + style = doubleValue(obj, listOf("style")), + speakerBoost = speakerBoost, + seed = longValue(obj, listOf("seed")), + normalize = stringValue(obj, listOf("normalize", "apply_text_normalization")), + language = stringValue(obj, listOf("lang", "language_code", "language")), + outputFormat = stringValue(obj, listOf("output_format", "format")), + latencyTier = intValue(obj, listOf("latency", "latency_tier", "latencyTier")), + once = boolValue(obj, listOf("once")), + ) + + val hasDirective = listOf( + directive.voiceId, + directive.modelId, + directive.speed, + directive.rateWpm, + directive.stability, + directive.similarity, + directive.style, + directive.speakerBoost, + directive.seed, + directive.normalize, + directive.language, + directive.outputFormat, + directive.latencyTier, + directive.once, + ).any { it != null } + + if (!hasDirective) return TalkDirectiveParseResult(null, text, emptyList()) + + val knownKeys = setOf( + "voice", "voice_id", "voiceid", + "model", "model_id", "modelid", + "speed", "rate", "wpm", + "stability", "similarity", "similarity_boost", "similarityboost", + "style", + "speaker_boost", "speakerboost", + "no_speaker_boost", "nospeakerboost", + "seed", + "normalize", "apply_text_normalization", + "lang", "language_code", "language", + "output_format", "format", + "latency", "latency_tier", "latencytier", + "once", + ) + val unknownKeys = obj.keys.filter { !knownKeys.contains(it.lowercase()) }.sorted() + + lines.removeAt(firstNonEmpty) + if (firstNonEmpty < lines.size) { + if (lines[firstNonEmpty].trim().isEmpty()) { + lines.removeAt(firstNonEmpty) + } + } + + return TalkDirectiveParseResult(directive, lines.joinToString("\n"), unknownKeys) + } + + private fun parseJsonObject(line: String): JsonObject? { + return try { + directiveJson.parseToJsonElement(line) as? 
JsonObject + } catch (_: Throwable) { + null + } + } + + private fun stringValue(obj: JsonObject, keys: List): String? { + for (key in keys) { + val value = obj[key].asStringOrNull()?.trim() + if (!value.isNullOrEmpty()) return value + } + return null + } + + private fun doubleValue(obj: JsonObject, keys: List): Double? { + for (key in keys) { + val value = obj[key].asDoubleOrNull() + if (value != null) return value + } + return null + } + + private fun intValue(obj: JsonObject, keys: List): Int? { + for (key in keys) { + val value = obj[key].asIntOrNull() + if (value != null) return value + } + return null + } + + private fun longValue(obj: JsonObject, keys: List): Long? { + for (key in keys) { + val value = obj[key].asLongOrNull() + if (value != null) return value + } + return null + } + + private fun boolValue(obj: JsonObject, keys: List): Boolean? { + for (key in keys) { + val value = obj[key].asBooleanOrNull() + if (value != null) return value + } + return null + } +} + +private fun JsonElement?.asStringOrNull(): String? = (this as? JsonPrimitive)?.contentOrNull + +private fun JsonElement?.asDoubleOrNull(): Double? { + val primitive = this as? JsonPrimitive ?: return null + if (primitive.isString) return primitive.content.toDoubleOrNull() + return primitive.doubleOrNull +} + +private fun JsonElement?.asIntOrNull(): Int? { + val primitive = this as? JsonPrimitive ?: return null + if (primitive.isString) return primitive.content.toIntOrNull() + return primitive.intOrNull +} + +private fun JsonElement?.asLongOrNull(): Long? { + val primitive = this as? JsonPrimitive ?: return null + if (primitive.isString) return primitive.content.toLongOrNull() + return primitive.longOrNull +} + +private fun JsonElement?.asBooleanOrNull(): Boolean? { + val primitive = this as? 
JsonPrimitive ?: return null + if (primitive.booleanOrNull != null) return primitive.booleanOrNull + val content = primitive.contentOrNull?.trim()?.lowercase() ?: return null + return when (content) { + "true", "yes", "1" -> true + "false", "no", "0" -> false + else -> null + } +} diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt new file mode 100644 index 000000000..ecbc51869 --- /dev/null +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt @@ -0,0 +1,713 @@ +package com.steipete.clawdis.node.voice + +import android.Manifest +import android.content.Context +import android.content.Intent +import android.content.pm.PackageManager +import android.media.AudioAttributes +import android.media.MediaPlayer +import android.os.Bundle +import android.os.Handler +import android.os.Looper +import android.os.SystemClock +import android.speech.RecognitionListener +import android.speech.RecognizerIntent +import android.speech.SpeechRecognizer +import android.util.Log +import androidx.core.content.ContextCompat +import com.steipete.clawdis.node.bridge.BridgeSession +import java.io.File +import java.net.HttpURLConnection +import java.net.URL +import java.util.UUID +import kotlinx.coroutines.CompletableDeferred +import kotlinx.coroutines.CoroutineScope +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.Job +import kotlinx.coroutines.delay +import kotlinx.coroutines.flow.MutableStateFlow +import kotlinx.coroutines.flow.StateFlow +import kotlinx.coroutines.launch +import kotlinx.coroutines.withContext +import kotlinx.serialization.json.Json +import kotlinx.serialization.json.JsonArray +import kotlinx.serialization.json.JsonElement +import kotlinx.serialization.json.JsonObject +import kotlinx.serialization.json.JsonPrimitive +import kotlinx.serialization.json.buildJsonObject + +class TalkModeManager( + 
private val context: Context, + private val scope: CoroutineScope, +) { + companion object { + private const val tag = "TalkMode" + } + + private val mainHandler = Handler(Looper.getMainLooper()) + private val json = Json { ignoreUnknownKeys = true } + + private val _isEnabled = MutableStateFlow(false) + val isEnabled: StateFlow = _isEnabled + + private val _isListening = MutableStateFlow(false) + val isListening: StateFlow = _isListening + + private val _isSpeaking = MutableStateFlow(false) + val isSpeaking: StateFlow = _isSpeaking + + private val _statusText = MutableStateFlow("Off") + val statusText: StateFlow = _statusText + + private var recognizer: SpeechRecognizer? = null + private var restartJob: Job? = null + private var stopRequested = false + private var listeningMode = false + + private var silenceJob: Job? = null + private val silenceWindowMs = 700L + private var lastTranscript: String = "" + private var lastHeardAtMs: Long? = null + private var lastSpokenText: String? = null + private var lastInterruptedAtSeconds: Double? = null + + private var defaultVoiceId: String? = null + private var currentVoiceId: String? = null + private var defaultModelId: String? = null + private var currentModelId: String? = null + private var defaultOutputFormat: String? = null + private var interruptOnSpeech: Boolean = true + private var voiceOverrideActive = false + private var modelOverrideActive = false + + private var session: BridgeSession? = null + private var pendingRunId: String? = null + private var pendingFinal: CompletableDeferred? = null + + private var player: MediaPlayer? = null + private var currentAudioFile: File? = null + + fun attachSession(session: BridgeSession) { + this.session = session + } + + fun setEnabled(enabled: Boolean) { + if (_isEnabled.value == enabled) return + _isEnabled.value = enabled + if (enabled) { + start() + } else { + stop() + } + } + + fun handleBridgeEvent(event: String, payloadJson: String?) 
{ + if (event != "chat") return + if (payloadJson.isNullOrBlank()) return + val pending = pendingRunId ?: return + val obj = + try { + json.parseToJsonElement(payloadJson).asObjectOrNull() + } catch (_: Throwable) { + null + } ?: return + val runId = obj["runId"].asStringOrNull() ?: return + if (runId != pending) return + val state = obj["state"].asStringOrNull() ?: return + if (state == "final") { + pendingFinal?.complete(true) + pendingFinal = null + pendingRunId = null + } + } + + private fun start() { + mainHandler.post { + if (_isListening.value) return@post + stopRequested = false + listeningMode = true + + if (!SpeechRecognizer.isRecognitionAvailable(context)) { + _statusText.value = "Speech recognizer unavailable" + return@post + } + + val micOk = + ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) == + PackageManager.PERMISSION_GRANTED + if (!micOk) { + _statusText.value = "Microphone permission required" + return@post + } + + try { + recognizer?.destroy() + recognizer = SpeechRecognizer.createSpeechRecognizer(context).also { it.setRecognitionListener(listener) } + startListeningInternal(markListening = true) + startSilenceMonitor() + } catch (err: Throwable) { + _statusText.value = "Start failed: ${err.message ?: err::class.simpleName}" + } + } + } + + private fun stop() { + stopRequested = true + listeningMode = false + restartJob?.cancel() + restartJob = null + silenceJob?.cancel() + silenceJob = null + lastTranscript = "" + lastHeardAtMs = null + _isListening.value = false + _statusText.value = "Off" + stopSpeaking() + + mainHandler.post { + recognizer?.cancel() + recognizer?.destroy() + recognizer = null + } + } + + private fun startListeningInternal(markListening: Boolean) { + val r = recognizer ?: return + val intent = + Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH).apply { + putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL, RecognizerIntent.LANGUAGE_MODEL_FREE_FORM) + putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, 
true) + putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3) + putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName) + } + + if (markListening) { + _statusText.value = "Listening" + _isListening.value = true + } + r.startListening(intent) + } + + private fun scheduleRestart(delayMs: Long = 350) { + if (stopRequested) return + restartJob?.cancel() + restartJob = + scope.launch { + delay(delayMs) + mainHandler.post { + if (stopRequested) return@post + try { + recognizer?.cancel() + val shouldListen = listeningMode + val shouldInterrupt = _isSpeaking.value && interruptOnSpeech + if (!shouldListen && !shouldInterrupt) return@post + startListeningInternal(markListening = shouldListen) + } catch (_: Throwable) { + // handled by onError + } + } + } + } + + private fun handleTranscript(text: String, isFinal: Boolean) { + val trimmed = text.trim() + if (_isSpeaking.value && interruptOnSpeech) { + if (shouldInterrupt(trimmed)) { + stopSpeaking() + } + return + } + + if (!_isListening.value) return + + if (trimmed.isNotEmpty()) { + lastTranscript = trimmed + lastHeardAtMs = SystemClock.elapsedRealtime() + } + + if (isFinal) { + lastTranscript = trimmed + } + } + + private fun startSilenceMonitor() { + silenceJob?.cancel() + silenceJob = + scope.launch { + while (_isEnabled.value) { + delay(200) + checkSilence() + } + } + } + + private fun checkSilence() { + if (!_isListening.value) return + val transcript = lastTranscript.trim() + if (transcript.isEmpty()) return + val lastHeard = lastHeardAtMs ?: return + val elapsed = SystemClock.elapsedRealtime() - lastHeard + if (elapsed < silenceWindowMs) return + scope.launch { finalizeTranscript(transcript) } + } + + private suspend fun finalizeTranscript(transcript: String) { + listeningMode = false + _isListening.value = false + _statusText.value = "Thinking…" + lastTranscript = "" + lastHeardAtMs = null + + reloadConfig() + val prompt = buildPrompt(transcript) + val bridge = session + if (bridge == null) { + 
_statusText.value = "Bridge not connected" + start() + return + } + + try { + val runId = sendChat(prompt, bridge) + val ok = waitForChatFinal(runId) + if (!ok) { + _statusText.value = "No reply" + start() + return + } + val assistant = fetchLatestAssistantText(bridge) + if (assistant.isNullOrBlank()) { + _statusText.value = "No reply" + start() + return + } + playAssistant(assistant) + } catch (err: Throwable) { + _statusText.value = "Talk failed: ${err.message ?: err::class.simpleName}" + } + + if (_isEnabled.value) { + start() + } + } + + private fun buildPrompt(transcript: String): String { + val lines = mutableListOf( + "Talk Mode active. Reply in a concise, spoken tone.", + "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"\",\"once\":true}.", + ) + lastInterruptedAtSeconds?.let { + lines.add("Assistant speech interrupted at ${"%.1f".format(it)}s.") + lastInterruptedAtSeconds = null + } + lines.add("") + lines.add(transcript) + return lines.joinToString("\n") + } + + private suspend fun sendChat(message: String, bridge: BridgeSession): String { + val runId = UUID.randomUUID().toString() + val params = + buildJsonObject { + put("sessionKey", JsonPrimitive("main")) + put("message", JsonPrimitive(message)) + put("thinking", JsonPrimitive("low")) + put("timeoutMs", JsonPrimitive(30_000)) + put("idempotencyKey", JsonPrimitive(runId)) + } + val res = bridge.request("chat.send", params.toString()) + val parsed = parseRunId(res) ?: runId + if (parsed != runId) { + pendingRunId = parsed + } + return parsed + } + + private suspend fun waitForChatFinal(runId: String): Boolean { + pendingFinal?.cancel() + val deferred = CompletableDeferred() + pendingRunId = runId + pendingFinal = deferred + + val result = + withContext(Dispatchers.IO) { + try { + kotlinx.coroutines.withTimeout(120_000) { deferred.await() } + } catch (_: Throwable) { + false + } + } + + if (!result) { + pendingFinal = null + pendingRunId = null + } 
+ return result + } + + private suspend fun fetchLatestAssistantText(bridge: BridgeSession): String? { + val res = bridge.request("chat.history", "{\"sessionKey\":\"main\"}") + val root = json.parseToJsonElement(res).asObjectOrNull() ?: return null + val messages = root["messages"] as? JsonArray ?: return null + for (item in messages.reversed()) { + val obj = item.asObjectOrNull() ?: continue + if (obj["role"].asStringOrNull() != "assistant") continue + val content = obj["content"] as? JsonArray ?: continue + val text = + content.mapNotNull { entry -> + entry.asObjectOrNull()?.get("text")?.asStringOrNull()?.trim() + }.filter { it.isNotEmpty() } + if (text.isNotEmpty()) return text.joinToString("\n") + } + return null + } + + private suspend fun playAssistant(text: String) { + val parsed = TalkDirectiveParser.parse(text) + if (parsed.unknownKeys.isNotEmpty()) { + Log.w(tag, "Unknown talk directive keys: ${parsed.unknownKeys}") + } + val directive = parsed.directive + val cleaned = parsed.stripped.trim() + if (cleaned.isEmpty()) return + + if (directive?.voiceId != null) { + if (directive.once != true) { + currentVoiceId = directive.voiceId + voiceOverrideActive = true + } + } + if (directive?.modelId != null) { + if (directive.once != true) { + currentModelId = directive.modelId + modelOverrideActive = true + } + } + + val voiceId = directive?.voiceId ?: currentVoiceId ?: defaultVoiceId + if (voiceId.isNullOrBlank()) { + _statusText.value = "Missing voice ID" + return + } + + val apiKey = System.getenv("ELEVENLABS_API_KEY")?.trim() + if (apiKey.isNullOrEmpty()) { + _statusText.value = "Missing ELEVENLABS_API_KEY" + return + } + + _statusText.value = "Speaking…" + _isSpeaking.value = true + lastSpokenText = cleaned + ensureInterruptListener() + + try { + val request = + ElevenLabsRequest( + text = cleaned, + modelId = directive?.modelId ?: currentModelId ?: defaultModelId, + outputFormat = directive?.outputFormat ?: defaultOutputFormat, + speed = 
TalkModeRuntime.resolveSpeed(directive?.speed, directive?.rateWpm), + stability = TalkModeRuntime.validatedUnit(directive?.stability), + similarity = TalkModeRuntime.validatedUnit(directive?.similarity), + style = TalkModeRuntime.validatedUnit(directive?.style), + speakerBoost = directive?.speakerBoost, + seed = TalkModeRuntime.validatedSeed(directive?.seed), + normalize = TalkModeRuntime.validatedNormalize(directive?.normalize), + language = TalkModeRuntime.validatedLanguage(directive?.language), + ) + val audio = synthesize(voiceId = voiceId, apiKey = apiKey, request = request) + playAudio(audio) + } catch (err: Throwable) { + _statusText.value = "Speak failed: ${err.message ?: err::class.simpleName}" + } + + _isSpeaking.value = false + } + + private suspend fun playAudio(data: ByteArray) { + stopSpeaking(resetInterrupt = false) + val file = File.createTempFile("talk-", ".mp3", context.cacheDir) + file.writeBytes(data) + currentAudioFile = file + + val player = MediaPlayer() + this.player = player + + val finished = CompletableDeferred() + player.setAudioAttributes( + AudioAttributes.Builder() + .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) + .setUsage(AudioAttributes.USAGE_ASSISTANT) + .build(), + ) + player.setOnCompletionListener { + finished.complete(Unit) + } + player.setOnErrorListener { _, _, _ -> + finished.completeExceptionally(IllegalStateException("MediaPlayer error")) + true + } + + player.setDataSource(file.absolutePath) + withContext(Dispatchers.Main) { + player.setOnPreparedListener { it.start() } + player.prepareAsync() + } + + try { + finished.await() + } finally { + cleanupPlayer() + } + } + + private fun stopSpeaking(resetInterrupt: Boolean = true) { + if (!_isSpeaking.value) { + cleanupPlayer() + return + } + if (resetInterrupt) { + val currentMs = player?.currentPosition?.toDouble() ?: 0.0 + lastInterruptedAtSeconds = currentMs / 1000.0 + } + cleanupPlayer() + _isSpeaking.value = false + } + + private fun cleanupPlayer() { + 
player?.stop() + player?.release() + player = null + currentAudioFile?.delete() + currentAudioFile = null + } + + private fun shouldInterrupt(transcript: String): Boolean { + val trimmed = transcript.trim() + if (trimmed.length < 3) return false + val spoken = lastSpokenText?.lowercase() + if (spoken != null && spoken.contains(trimmed.lowercase())) return false + return true + } + + private suspend fun reloadConfig() { + val bridge = session ?: return + val envVoice = System.getenv("ELEVENLABS_VOICE_ID")?.trim() + val sagVoice = System.getenv("SAG_VOICE_ID")?.trim() + try { + val res = bridge.request("config.get", "{}") + val root = json.parseToJsonElement(res).asObjectOrNull() + val config = root?.get("config").asObjectOrNull() + val talk = config?.get("talk").asObjectOrNull() + val voice = talk?.get("voiceId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } + val model = talk?.get("modelId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } + val outputFormat = talk?.get("outputFormat")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } + val interrupt = talk?.get("interruptOnSpeech")?.asBooleanOrNull() + + defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } + if (!voiceOverrideActive) currentVoiceId = defaultVoiceId + defaultModelId = model + if (!modelOverrideActive) currentModelId = defaultModelId + defaultOutputFormat = outputFormat + if (interrupt != null) interruptOnSpeech = interrupt + } catch (_: Throwable) { + defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } + } + } + + private fun parseRunId(jsonString: String): String? 
{ + val obj = json.parseToJsonElement(jsonString).asObjectOrNull() ?: return null + return obj["runId"].asStringOrNull() + } + + private suspend fun synthesize(voiceId: String, apiKey: String, request: ElevenLabsRequest): ByteArray { + return withContext(Dispatchers.IO) { + val url = URL("https://api.elevenlabs.io/v1/text-to-speech/$voiceId") + val conn = url.openConnection() as HttpURLConnection + conn.requestMethod = "POST" + conn.setRequestProperty("Content-Type", "application/json") + conn.setRequestProperty("Accept", "audio/mpeg") + conn.setRequestProperty("xi-api-key", apiKey) + conn.doOutput = true + + val payload = buildRequestPayload(request) + conn.outputStream.use { it.write(payload.toByteArray()) } + + val code = conn.responseCode + val stream = if (code >= 400) conn.errorStream else conn.inputStream + val data = stream.readBytes() + if (code >= 400) { + val message = String(data) + throw IllegalStateException("ElevenLabs failed: $code $message") + } + data + } + } + + private fun buildRequestPayload(request: ElevenLabsRequest): String { + val voiceSettingsEntries = + buildJsonObject { + request.speed?.let { put("speed", JsonPrimitive(it)) } + request.stability?.let { put("stability", JsonPrimitive(it)) } + request.similarity?.let { put("similarity_boost", JsonPrimitive(it)) } + request.style?.let { put("style", JsonPrimitive(it)) } + request.speakerBoost?.let { put("use_speaker_boost", JsonPrimitive(it)) } + } + + val payload = + buildJsonObject { + put("text", JsonPrimitive(request.text)) + request.modelId?.takeIf { it.isNotEmpty() }?.let { put("model_id", JsonPrimitive(it)) } + request.outputFormat?.takeIf { it.isNotEmpty() }?.let { put("output_format", JsonPrimitive(it)) } + request.seed?.let { put("seed", JsonPrimitive(it)) } + request.normalize?.let { put("apply_text_normalization", JsonPrimitive(it)) } + request.language?.let { put("language_code", JsonPrimitive(it)) } + if (voiceSettingsEntries.isNotEmpty()) { + put("voice_settings", 
voiceSettingsEntries) + } + } + + return payload.toString() + } + + private data class ElevenLabsRequest( + val text: String, + val modelId: String?, + val outputFormat: String?, + val speed: Double?, + val stability: Double?, + val similarity: Double?, + val style: Double?, + val speakerBoost: Boolean?, + val seed: Long?, + val normalize: String?, + val language: String?, + ) + + private object TalkModeRuntime { + fun resolveSpeed(speed: Double?, rateWpm: Int?): Double? { + if (rateWpm != null && rateWpm > 0) { + val resolved = rateWpm.toDouble() / 175.0 + if (resolved <= 0.5 || resolved >= 2.0) return null + return resolved + } + if (speed != null) { + if (speed <= 0.5 || speed >= 2.0) return null + return speed + } + return null + } + + fun validatedUnit(value: Double?): Double? { + if (value == null) return null + if (value < 0 || value > 1) return null + return value + } + + fun validatedSeed(value: Long?): Long? { + if (value == null) return null + if (value < 0 || value > 4294967295L) return null + return value + } + + fun validatedNormalize(value: String?): String? { + val normalized = value?.trim()?.lowercase() ?: return null + return if (normalized in listOf("auto", "on", "off")) normalized else null + } + + fun validatedLanguage(value: String?): String? 
{ + val normalized = value?.trim()?.lowercase() ?: return null + if (normalized.length != 2) return null + if (!normalized.all { it in 'a'..'z' }) return null + return normalized + } + } + + private fun ensureInterruptListener() { + if (!interruptOnSpeech || !_isEnabled.value) return + mainHandler.post { + if (stopRequested) return@post + if (!SpeechRecognizer.isRecognitionAvailable(context)) return@post + try { + if (recognizer == null) { + recognizer = SpeechRecognizer.createSpeechRecognizer(context).also { it.setRecognitionListener(listener) } + } + recognizer?.cancel() + startListeningInternal(markListening = false) + } catch (_: Throwable) { + // ignore + } + } + } + + private val listener = + object : RecognitionListener { + override fun onReadyForSpeech(params: Bundle?) { + if (_isEnabled.value) { + _statusText.value = if (_isListening.value) "Listening" else _statusText.value + } + } + + override fun onBeginningOfSpeech() {} + + override fun onRmsChanged(rmsdB: Float) {} + + override fun onBufferReceived(buffer: ByteArray?) {} + + override fun onEndOfSpeech() { + scheduleRestart() + } + + override fun onError(error: Int) { + if (stopRequested) return + _isListening.value = false + if (error == SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS) { + _statusText.value = "Microphone permission required" + return + } + + _statusText.value = + when (error) { + SpeechRecognizer.ERROR_AUDIO -> "Audio error" + SpeechRecognizer.ERROR_CLIENT -> "Client error" + SpeechRecognizer.ERROR_NETWORK -> "Network error" + SpeechRecognizer.ERROR_NETWORK_TIMEOUT -> "Network timeout" + SpeechRecognizer.ERROR_NO_MATCH -> "Listening" + SpeechRecognizer.ERROR_RECOGNIZER_BUSY -> "Recognizer busy" + SpeechRecognizer.ERROR_SERVER -> "Server error" + SpeechRecognizer.ERROR_SPEECH_TIMEOUT -> "Listening" + else -> "Speech error ($error)" + } + scheduleRestart(delayMs = 600) + } + + override fun onResults(results: Bundle?) 
{ + val list = results?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION).orEmpty() + list.firstOrNull()?.let { handleTranscript(it, isFinal = true) } + scheduleRestart() + } + + override fun onPartialResults(partialResults: Bundle?) { + val list = partialResults?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION).orEmpty() + list.firstOrNull()?.let { handleTranscript(it, isFinal = false) } + } + + override fun onEvent(eventType: Int, params: Bundle?) {} + } +} + +private fun JsonElement?.asObjectOrNull(): JsonObject? = this as? JsonObject + +private fun JsonElement?.asStringOrNull(): String? = (this as? JsonPrimitive)?.contentOrNull + +private fun JsonElement?.asBooleanOrNull(): Boolean? { + val primitive = this as? JsonPrimitive ?: return null + if (primitive.booleanOrNull != null) return primitive.booleanOrNull + val content = primitive.contentOrNull?.trim()?.lowercase() ?: return null + return when (content) { + "true", "yes", "1" -> true + "false", "no", "0" -> false + else -> null + } +} diff --git a/apps/android/app/src/test/java/com/steipete/clawdis/node/voice/TalkDirectiveParserTest.kt b/apps/android/app/src/test/java/com/steipete/clawdis/node/voice/TalkDirectiveParserTest.kt new file mode 100644 index 000000000..d69d2008f --- /dev/null +++ b/apps/android/app/src/test/java/com/steipete/clawdis/node/voice/TalkDirectiveParserTest.kt @@ -0,0 +1,55 @@ +package com.steipete.clawdis.node.voice + +import org.junit.Assert.assertEquals +import org.junit.Assert.assertNull +import org.junit.Assert.assertTrue +import org.junit.Test + +class TalkDirectiveParserTest { + @Test + fun parsesDirectiveAndStripsHeader() { + val input = """ + {"voice":"voice-123","once":true} + Hello from talk mode. 
+ """.trimIndent() + val result = TalkDirectiveParser.parse(input) + assertEquals("voice-123", result.directive?.voiceId) + assertEquals(true, result.directive?.once) + assertEquals("Hello from talk mode.", result.stripped.trim()) + } + + @Test + fun ignoresUnknownKeysButReportsThem() { + val input = """ + {"voice":"abc","foo":1,"bar":"baz"} + Hi there. + """.trimIndent() + val result = TalkDirectiveParser.parse(input) + assertEquals("abc", result.directive?.voiceId) + assertTrue(result.unknownKeys.containsAll(listOf("bar", "foo"))) + } + + @Test + fun parsesAlternateKeys() { + val input = """ + {"model_id":"eleven_v3","similarity_boost":0.4,"no_speaker_boost":true,"rate":200} + Speak. + """.trimIndent() + val result = TalkDirectiveParser.parse(input) + assertEquals("eleven_v3", result.directive?.modelId) + assertEquals(0.4, result.directive?.similarity) + assertEquals(false, result.directive?.speakerBoost) + assertEquals(200, result.directive?.rateWpm) + } + + @Test + fun returnsNullWhenNoDirectivePresent() { + val input = """ + {} + Hello. + """.trimIndent() + val result = TalkDirectiveParser.parse(input) + assertNull(result.directive) + assertEquals(input, result.stripped) + } +} diff --git a/apps/ios/Sources/Model/NodeAppModel.swift b/apps/ios/Sources/Model/NodeAppModel.swift index 36b9345e1..4c491ea55 100644 --- a/apps/ios/Sources/Model/NodeAppModel.swift +++ b/apps/ios/Sources/Model/NodeAppModel.swift @@ -28,6 +28,7 @@ final class NodeAppModel { private var voiceWakeSyncTask: Task? @ObservationIgnored private var cameraHUDDismissTask: Task? let voiceWake = VoiceWakeManager() + let talkMode = TalkModeManager() private var lastAutoA2uiURL: String? 
var bridgeSession: BridgeSession { self.bridge } @@ -49,6 +50,9 @@ final class NodeAppModel { let enabled = UserDefaults.standard.bool(forKey: "voiceWake.enabled") self.voiceWake.setEnabled(enabled) + self.talkMode.attachBridge(self.bridge) + let talkEnabled = UserDefaults.standard.bool(forKey: "talk.enabled") + self.talkMode.setEnabled(talkEnabled) // Wire up deep links from canvas taps self.screen.onDeepLink = { [weak self] url in @@ -177,6 +181,10 @@ final class NodeAppModel { self.voiceWake.setEnabled(enabled) } + func setTalkEnabled(_ enabled: Bool) { + self.talkMode.setEnabled(enabled) + } + func connectToBridge( endpoint: NWEndpoint, hello: BridgeHello) diff --git a/apps/ios/Sources/Settings/SettingsTab.swift b/apps/ios/Sources/Settings/SettingsTab.swift index 34feee23a..265b7069c 100644 --- a/apps/ios/Sources/Settings/SettingsTab.swift +++ b/apps/ios/Sources/Settings/SettingsTab.swift @@ -20,6 +20,7 @@ struct SettingsTab: View { @AppStorage("node.displayName") private var displayName: String = "iOS Node" @AppStorage("node.instanceId") private var instanceId: String = UUID().uuidString @AppStorage("voiceWake.enabled") private var voiceWakeEnabled: Bool = false + @AppStorage("talk.enabled") private var talkEnabled: Bool = false @AppStorage("camera.enabled") private var cameraEnabled: Bool = true @AppStorage("screen.preventSleep") private var preventSleep: Bool = true @AppStorage("bridge.preferredStableID") private var preferredBridgeStableID: String = "" @@ -156,6 +157,10 @@ struct SettingsTab: View { .onChange(of: self.voiceWakeEnabled) { _, newValue in self.appModel.setVoiceWakeEnabled(newValue) } + Toggle("Talk Mode", isOn: self.$talkEnabled) + .onChange(of: self.talkEnabled) { _, newValue in + self.appModel.setTalkEnabled(newValue) + } NavigationLink { VoiceWakeWordsSettingsView() diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift new file mode 100644 index 000000000..649eaa03a --- /dev/null +++ 
b/apps/ios/Sources/Voice/TalkModeManager.swift @@ -0,0 +1,518 @@ +import AVFAudio +import ClawdisKit +import Foundation +import Observation +import Speech + +@MainActor +@Observable +final class TalkModeManager: NSObject { + var isEnabled: Bool = false + var isListening: Bool = false + var isSpeaking: Bool = false + var statusText: String = "Off" + + private let audioEngine = AVAudioEngine() + private var speechRecognizer: SFSpeechRecognizer? + private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest? + private var recognitionTask: SFSpeechRecognitionTask? + private var silenceTask: Task<Void, Never>? + + private var lastHeard: Date? + private var lastTranscript: String = "" + private var lastSpokenText: String? + private var lastInterruptedAtSeconds: Double? + + private var defaultVoiceId: String? + private var currentVoiceId: String? + private var defaultModelId: String? + private var currentModelId: String? + private var defaultOutputFormat: String? + private var interruptOnSpeech: Bool = true + + private var bridge: BridgeSession? + private let silenceWindow: TimeInterval = 0.7 + + private var player: AVAudioPlayer? 
+ + func attachBridge(_ bridge: BridgeSession) { + self.bridge = bridge + } + + func setEnabled(_ enabled: Bool) { + self.isEnabled = enabled + if enabled { + Task { await self.start() } + } else { + self.stop() + } + } + + func start() async { + guard self.isEnabled else { return } + if self.isListening { return } + + self.statusText = "Requesting permissions…" + let micOk = await Self.requestMicrophonePermission() + guard micOk else { + self.statusText = "Microphone permission denied" + return + } + let speechOk = await Self.requestSpeechPermission() + guard speechOk else { + self.statusText = "Speech recognition permission denied" + return + } + + await self.reloadConfig() + do { + try Self.configureAudioSession() + try self.startRecognition() + self.isListening = true + self.statusText = "Listening" + self.startSilenceMonitor() + } catch { + self.isListening = false + self.statusText = "Start failed: \(error.localizedDescription)" + } + } + + func stop() { + self.isEnabled = false + self.isListening = false + self.statusText = "Off" + self.lastTranscript = "" + self.lastHeard = nil + self.silenceTask?.cancel() + self.silenceTask = nil + self.stopRecognition() + self.stopSpeaking() + } + + private func startRecognition() throws { + self.speechRecognizer = SFSpeechRecognizer() + guard let recognizer = self.speechRecognizer else { + throw NSError(domain: "TalkMode", code: 1, userInfo: [ + NSLocalizedDescriptionKey: "Speech recognizer unavailable", + ]) + } + + self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest() + self.recognitionRequest?.shouldReportPartialResults = true + guard let request = self.recognitionRequest else { return } + + let input = self.audioEngine.inputNode + let format = input.outputFormat(forBus: 0) + input.removeTap(onBus: 0) + input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak request] buffer, _ in + request?.append(buffer) + } + + self.audioEngine.prepare() + try self.audioEngine.start() + + 
self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in + guard let self else { return } + if let error { + self.statusText = "Speech error: \(error.localizedDescription)" + } + guard let result else { return } + let transcript = result.bestTranscription.formattedString + Task { @MainActor in + await self.handleTranscript(transcript: transcript, isFinal: result.isFinal) + } + } + } + + private func stopRecognition() { + self.recognitionTask?.cancel() + self.recognitionTask = nil + self.recognitionRequest?.endAudio() + self.recognitionRequest = nil + self.audioEngine.inputNode.removeTap(onBus: 0) + self.audioEngine.stop() + self.speechRecognizer = nil + } + + private func handleTranscript(transcript: String, isFinal: Bool) async { + let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines) + if self.isSpeaking, self.interruptOnSpeech { + if self.shouldInterrupt(with: trimmed) { + self.stopSpeaking() + } + return + } + + guard self.isListening else { return } + if !trimmed.isEmpty { + self.lastTranscript = trimmed + self.lastHeard = Date() + } + if isFinal { + self.lastTranscript = trimmed + } + } + + private func startSilenceMonitor() { + self.silenceTask?.cancel() + self.silenceTask = Task { [weak self] in + guard let self else { return } + while self.isEnabled { + try? 
await Task.sleep(nanoseconds: 200_000_000) + await self.checkSilence() + } + } + } + + private func checkSilence() async { + guard self.isListening else { return } + let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines) + guard !transcript.isEmpty else { return } + guard let lastHeard else { return } + if Date().timeIntervalSince(lastHeard) < self.silenceWindow { return } + await self.finalizeTranscript(transcript) + } + + private func finalizeTranscript(_ transcript: String) async { + self.isListening = false + self.statusText = "Thinking…" + self.lastTranscript = "" + self.lastHeard = nil + self.stopRecognition() + + await self.reloadConfig() + let prompt = self.buildPrompt(transcript: transcript) + guard let bridge else { + self.statusText = "Bridge not connected" + await self.start() + return + } + + do { + let runId = try await self.sendChat(prompt, bridge: bridge) + let ok = await self.waitForChatFinal(runId: runId, bridge: bridge) + if !ok { + self.statusText = "No reply" + await self.start() + return + } + + guard let assistantText = try await self.fetchLatestAssistantText(bridge: bridge) else { + self.statusText = "No reply" + await self.start() + return + } + await self.playAssistant(text: assistantText) + } catch { + self.statusText = "Talk failed: \(error.localizedDescription)" + } + + await self.start() + } + + private func buildPrompt(transcript: String) -> String { + var lines: [String] = [ + "Talk Mode active. Reply in a concise, spoken tone.", + "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. 
{\"voice\":\"\",\"once\":true}.", + ] + + if let interrupted = self.lastInterruptedAtSeconds { + let formatted = String(format: "%.1f", interrupted) + lines.append("Assistant speech interrupted at \(formatted)s.") + self.lastInterruptedAtSeconds = nil + } + + lines.append("") + lines.append(transcript) + return lines.joined(separator: "\n") + } + + private func sendChat(_ message: String, bridge: BridgeSession) async throws -> String { + struct SendResponse: Decodable { let runId: String } + let payload: [String: Any] = [ + "sessionKey": "main", + "message": message, + "thinking": "low", + "timeoutMs": 30_000, + "idempotencyKey": UUID().uuidString, + ] + let data = try JSONSerialization.data(withJSONObject: payload) + let json = String(decoding: data, as: UTF8.self) + let res = try await bridge.request(method: "chat.send", paramsJSON: json, timeoutSeconds: 30) + let decoded = try JSONDecoder().decode(SendResponse.self, from: res) + return decoded.runId + } + + private func waitForChatFinal(runId: String, bridge: BridgeSession) async -> Bool { + let stream = await bridge.subscribeServerEvents(bufferingNewest: 200) + let timeout = Date().addingTimeInterval(120) + for await evt in stream { + if Date() > timeout { return false } + guard evt.event == "chat", let payload = evt.payloadJSON else { continue } + guard let data = payload.data(using: .utf8) else { continue } + guard let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else { continue } + if (json["runId"] as? String) != runId { continue } + if let state = json["state"] as? String, state == "final" { + return true + } + } + return false + } + + private func fetchLatestAssistantText(bridge: BridgeSession) async throws -> String? { + let res = try await bridge.request(method: "chat.history", paramsJSON: "{\"sessionKey\":\"main\"}", timeoutSeconds: 15) + guard let json = try JSONSerialization.jsonObject(with: res) as? 
[String: Any] else { return nil } + guard let messages = json["messages"] as? [[String: Any]] else { return nil } + for msg in messages.reversed() { + guard (msg["role"] as? String) == "assistant" else { continue } + guard let content = msg["content"] as? [[String: Any]] else { continue } + let text = content.compactMap { $0["text"] as? String }.joined(separator: "\n") + let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + if !trimmed.isEmpty { return trimmed } + } + return nil + } + + private func playAssistant(text: String) async { + let parsed = TalkDirectiveParser.parse(text) + let directive = parsed.directive + let cleaned = parsed.stripped.trimmingCharacters(in: .whitespacesAndNewlines) + guard !cleaned.isEmpty else { return } + + if let voice = directive?.voiceId { + if directive?.once != true { + self.currentVoiceId = voice + } + } + if let model = directive?.modelId { + if directive?.once != true { + self.currentModelId = model + } + } + + let voiceId = directive?.voiceId ?? self.currentVoiceId ?? self.defaultVoiceId + guard let voiceId, !voiceId.isEmpty else { + self.statusText = "Missing voice ID" + return + } + + guard let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"], !apiKey.isEmpty else { + self.statusText = "Missing ELEVENLABS_API_KEY" + return + } + + self.statusText = "Speaking…" + self.isSpeaking = true + self.lastSpokenText = cleaned + + do { + let request = ElevenLabsRequest( + text: cleaned, + modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId, + outputFormat: directive?.outputFormat ?? 
self.defaultOutputFormat, + speed: TalkModeRuntime.resolveSpeed( + speed: directive?.speed, + rateWPM: directive?.rateWPM), + stability: TalkModeRuntime.validatedUnit(directive?.stability), + similarity: TalkModeRuntime.validatedUnit(directive?.similarity), + style: TalkModeRuntime.validatedUnit(directive?.style), + speakerBoost: directive?.speakerBoost, + seed: TalkModeRuntime.validatedSeed(directive?.seed), + normalize: TalkModeRuntime.validatedNormalize(directive?.normalize), + language: TalkModeRuntime.validatedLanguage(directive?.language)) + let audio = try await ElevenLabsClient(apiKey: apiKey).synthesize( + voiceId: voiceId, + request: request) + try await self.playAudio(data: audio) + } catch { + self.statusText = "Speak failed: \(error.localizedDescription)" + } + + self.isSpeaking = false + } + + private func playAudio(data: Data) async throws { + self.player?.stop() + let player = try AVAudioPlayer(data: data) + self.player = player + player.prepareToPlay() + player.play() + while player.isPlaying { + try? await Task.sleep(nanoseconds: 120_000_000) + } + } + + private func stopSpeaking() { + guard self.isSpeaking else { return } + self.lastInterruptedAtSeconds = self.player?.currentTime + self.player?.stop() + self.player = nil + self.isSpeaking = false + } + + private func shouldInterrupt(with transcript: String) -> Bool { + let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines) + guard trimmed.count >= 3 else { return false } + if let spoken = self.lastSpokenText?.lowercased(), spoken.contains(trimmed.lowercased()) { + return false + } + return true + } + + private func reloadConfig() async { + guard let bridge else { return } + do { + let res = try await bridge.request(method: "config.get", paramsJSON: "{}", timeoutSeconds: 8) + guard let json = try JSONSerialization.jsonObject(with: res) as? [String: Any] else { return } + guard let config = json["config"] as? [String: Any] else { return } + let talk = config["talk"] as? 
[String: Any] + self.defaultVoiceId = (talk?["voiceId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) + self.currentVoiceId = self.defaultVoiceId + self.defaultModelId = (talk?["modelId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) + self.currentModelId = self.defaultModelId + self.defaultOutputFormat = (talk?["outputFormat"] as? String)? + .trimmingCharacters(in: .whitespacesAndNewlines) + if let interrupt = talk?["interruptOnSpeech"] as? Bool { + self.interruptOnSpeech = interrupt + } + } catch { + // ignore + } + } + + private static func configureAudioSession() throws { + let session = AVAudioSession.sharedInstance() + try session.setCategory(.playAndRecord, mode: .measurement, options: [ + .duckOthers, + .mixWithOthers, + .allowBluetoothHFP, + .defaultToSpeaker, + ]) + try session.setActive(true, options: []) + } + + private nonisolated static func requestMicrophonePermission() async -> Bool { + await withCheckedContinuation(isolation: nil) { cont in + AVAudioApplication.requestRecordPermission { ok in + cont.resume(returning: ok) + } + } + } + + private nonisolated static func requestSpeechPermission() async -> Bool { + await withCheckedContinuation(isolation: nil) { cont in + SFSpeechRecognizer.requestAuthorization { status in + cont.resume(returning: status == .authorized) + } + } + } +} + +private struct ElevenLabsRequest { + let text: String + let modelId: String? + let outputFormat: String? + let speed: Double? + let stability: Double? + let similarity: Double? + let style: Double? + let speakerBoost: Bool? + let seed: UInt32? + let normalize: String? + let language: String? +} + +private struct ElevenLabsClient { + let apiKey: String + let baseUrl = URL(string: "https://api.elevenlabs.io")! 
+ + func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data { + var url = self.baseUrl + url.appendPathComponent("v1") + url.appendPathComponent("text-to-speech") + url.appendPathComponent(voiceId) + + var payload: [String: Any] = [ + "text": request.text, + ] + if let modelId = request.modelId, !modelId.isEmpty { + payload["model_id"] = modelId + } + if let outputFormat = request.outputFormat, !outputFormat.isEmpty { + payload["output_format"] = outputFormat + } + if let seed = request.seed { + payload["seed"] = seed + } + if let normalize = request.normalize { + payload["apply_text_normalization"] = normalize + } + if let language = request.language { + payload["language_code"] = language + } + var voiceSettings: [String: Any] = [:] + if let speed = request.speed { voiceSettings["speed"] = speed } + if let stability = request.stability { voiceSettings["stability"] = stability } + if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity } + if let style = request.style { voiceSettings["style"] = style } + if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost } + if !voiceSettings.isEmpty { payload["voice_settings"] = voiceSettings } + + let body = try JSONSerialization.data(withJSONObject: payload, options: []) + var req = URLRequest(url: url) + req.httpMethod = "POST" + req.httpBody = body + req.setValue("application/json", forHTTPHeaderField: "Content-Type") + req.setValue("audio/mpeg", forHTTPHeaderField: "Accept") + req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key") + + let (data, response) = try await URLSession.shared.data(for: req) + if let http = response as? HTTPURLResponse, http.statusCode >= 400 { + let message = String(data: data, encoding: .utf8) ?? 
"unknown" + throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [ + NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)", + ]) + } + return data + } +} + +private enum TalkModeRuntime { + static func resolveSpeed(speed: Double?, rateWPM: Int?) -> Double? { + if let rateWPM, rateWPM > 0 { + let resolved = Double(rateWPM) / 175.0 + if resolved <= 0.5 || resolved >= 2.0 { return nil } + return resolved + } + if let speed { + if speed <= 0.5 || speed >= 2.0 { return nil } + return speed + } + return nil + } + + static func validatedUnit(_ value: Double?) -> Double? { + guard let value else { return nil } + if value < 0 || value > 1 { return nil } + return value + } + + static func validatedSeed(_ value: Int?) -> UInt32? { + guard let value else { return nil } + if value < 0 || value > 4294967295 { return nil } + return UInt32(value) + } + + static func validatedNormalize(_ value: String?) -> String? { + guard let value else { return nil } + let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + return ["auto", "on", "off"].contains(normalized) ? normalized : nil + } + + static func validatedLanguage(_ value: String?) -> String? 
{ + guard let value else { return nil } + let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { return nil } + return normalized + } +} diff --git a/apps/ios/Sources/Voice/VoiceTab.swift b/apps/ios/Sources/Voice/VoiceTab.swift index 59e1cd6d4..4fedd0ce9 100644 --- a/apps/ios/Sources/Voice/VoiceTab.swift +++ b/apps/ios/Sources/Voice/VoiceTab.swift @@ -4,6 +4,7 @@ struct VoiceTab: View { @Environment(NodeAppModel.self) private var appModel @Environment(VoiceWakeManager.self) private var voiceWake @AppStorage("voiceWake.enabled") private var voiceWakeEnabled: Bool = false + @AppStorage("talk.enabled") private var talkEnabled: Bool = false var body: some View { NavigationStack { @@ -14,6 +15,7 @@ struct VoiceTab: View { Text(self.voiceWake.statusText) .font(.footnote) .foregroundStyle(.secondary) + LabeledContent("Talk Mode", value: self.talkEnabled ? "Enabled" : "Disabled") } Section("Notes") { @@ -36,6 +38,9 @@ struct VoiceTab: View { .onChange(of: self.voiceWakeEnabled) { _, newValue in self.appModel.setVoiceWakeEnabled(newValue) } + .onChange(of: self.talkEnabled) { _, newValue in + self.appModel.setTalkEnabled(newValue) + } } } } diff --git a/apps/macos/Sources/Clawdis/AppState.swift b/apps/macos/Sources/Clawdis/AppState.swift index 53d81c02d..94e20538a 100644 --- a/apps/macos/Sources/Clawdis/AppState.swift +++ b/apps/macos/Sources/Clawdis/AppState.swift @@ -121,6 +121,15 @@ final class AppState { forKey: voicePushToTalkEnabledKey) } } } + var talkEnabled: Bool { + didSet { + self.ifNotPreview { + UserDefaults.standard.set(self.talkEnabled, forKey: talkEnabledKey) + Task { await TalkModeController.shared.setEnabled(self.talkEnabled) } + } + } + } + var iconOverride: IconOverrideSelection { didSet { self.ifNotPreview { UserDefaults.standard.set(self.iconOverride.rawValue, forKey: iconOverrideKey) } } } @@ -216,6 +225,7 @@ final class AppState { 
.stringArray(forKey: voiceWakeAdditionalLocalesKey) ?? [] self.voicePushToTalkEnabled = UserDefaults.standard .object(forKey: voicePushToTalkEnabledKey) as? Bool ?? false + self.talkEnabled = UserDefaults.standard.bool(forKey: talkEnabledKey) if let storedHeartbeats = UserDefaults.standard.object(forKey: heartbeatsEnabledKey) as? Bool { self.heartbeatsEnabled = storedHeartbeats } else { @@ -256,9 +266,13 @@ final class AppState { if self.swabbleEnabled, !PermissionManager.voiceWakePermissionsGranted() { self.swabbleEnabled = false } + if self.talkEnabled, !PermissionManager.voiceWakePermissionsGranted() { + self.talkEnabled = false + } if !self.isPreview { Task { await VoiceWakeRuntime.shared.refresh(state: self) } + Task { await TalkModeController.shared.setEnabled(self.talkEnabled) } } } @@ -312,6 +326,23 @@ final class AppState { Task { await VoiceWakeRuntime.shared.refresh(state: self) } } + func setTalkEnabled(_ enabled: Bool) async { + guard voiceWakeSupported else { + self.talkEnabled = false + return + } + + self.talkEnabled = enabled + guard !self.isPreview else { return } + + if !enabled { return } + + if PermissionManager.voiceWakePermissionsGranted() { return } + + let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true) + self.talkEnabled = granted + } + // MARK: - Global wake words sync (Gateway-owned) func applyGlobalVoiceWakeTriggers(_ triggers: [String]) { @@ -367,6 +398,7 @@ extension AppState { state.voiceWakeLocaleID = Locale.current.identifier state.voiceWakeAdditionalLocaleIDs = ["en-US", "de-DE"] state.voicePushToTalkEnabled = false + state.talkEnabled = false state.iconOverride = .system state.heartbeatsEnabled = true state.connectionMode = .local diff --git a/apps/macos/Sources/Clawdis/ConfigSettings.swift b/apps/macos/Sources/Clawdis/ConfigSettings.swift index 043139351..cbbf04d5a 100644 --- a/apps/macos/Sources/Clawdis/ConfigSettings.swift +++ b/apps/macos/Sources/Clawdis/ConfigSettings.swift @@ -30,6 +30,10 @@ 
struct ConfigSettings: View { @State private var browserColorHex: String = "#FF4500" @State private var browserAttachOnly: Bool = false + // Talk mode settings (stored in ~/.clawdis/clawdis.json under "talk") + @State private var talkVoiceId: String = "" + @State private var talkInterruptOnSpeech: Bool = true + var body: some View { ScrollView { self.content } .onChange(of: self.modelCatalogPath) { _, _ in @@ -53,6 +57,7 @@ struct ConfigSettings: View { self.header self.agentSection self.heartbeatSection + self.talkSection self.browserSection Spacer(minLength: 0) } @@ -266,6 +271,37 @@ struct ConfigSettings: View { .frame(maxWidth: .infinity, alignment: .leading) } + private var talkSection: some View { + GroupBox("Talk Mode") { + Grid(alignment: .leadingFirstTextBaseline, horizontalSpacing: 14, verticalSpacing: 10) { + GridRow { + self.gridLabel("Voice ID") + VStack(alignment: .leading, spacing: 6) { + ComboBox("ElevenLabs voice ID", text: self.$talkVoiceId) { + ForEach(self.talkVoiceSuggestions, id: \.self) { value in + Text(value).tag(value) + } + } + .textFieldStyle(.roundedBorder) + .frame(maxWidth: .infinity) + .onChange(of: self.talkVoiceId) { _, _ in self.autosaveConfig() } + Text("Defaults to ELEVENLABS_VOICE_ID / SAG_VOICE_ID if unset.") + .font(.footnote) + .foregroundStyle(.secondary) + } + } + GridRow { + self.gridLabel("Interrupt") + Toggle("Stop speaking when you start talking", isOn: self.$talkInterruptOnSpeech) + .labelsHidden() + .toggleStyle(.checkbox) + .onChange(of: self.talkInterruptOnSpeech) { _, _ in self.autosaveConfig() } + } + } + } + .frame(maxWidth: .infinity, alignment: .leading) + } + private func gridLabel(_ text: String) -> some View { Text(text) .foregroundStyle(.secondary) @@ -278,6 +314,7 @@ struct ConfigSettings: View { let heartbeatMinutes = agent?["heartbeatMinutes"] as? Int let heartbeatBody = agent?["heartbeatBody"] as? String let browser = parsed["browser"] as? [String: Any] + let talk = parsed["talk"] as? 
[String: Any] let loadedModel = (agent?["model"] as? String) ?? "" if !loadedModel.isEmpty { @@ -297,6 +334,13 @@ struct ConfigSettings: View { if let color = browser["color"] as? String, !color.isEmpty { self.browserColorHex = color } if let attachOnly = browser["attachOnly"] as? Bool { self.browserAttachOnly = attachOnly } } + + if let talk { + if let voice = talk["voiceId"] as? String { self.talkVoiceId = voice } + if let interrupt = talk["interruptOnSpeech"] as? Bool { + self.talkInterruptOnSpeech = interrupt + } + } } private func autosaveConfig() { @@ -312,6 +356,7 @@ struct ConfigSettings: View { var root = self.loadConfigDict() var agent = root["agent"] as? [String: Any] ?? [:] var browser = root["browser"] as? [String: Any] ?? [:] + var talk = root["talk"] as? [String: Any] ?? [:] let chosenModel = (self.configModel == "__custom__" ? self.customModel : self.configModel) .trimmingCharacters(in: .whitespacesAndNewlines) @@ -337,6 +382,15 @@ struct ConfigSettings: View { browser["attachOnly"] = self.browserAttachOnly root["browser"] = browser + let trimmedVoice = self.talkVoiceId.trimmingCharacters(in: .whitespacesAndNewlines) + if trimmedVoice.isEmpty { + talk.removeValue(forKey: "voiceId") + } else { + talk["voiceId"] = trimmedVoice + } + talk["interruptOnSpeech"] = self.talkInterruptOnSpeech + root["talk"] = talk + ClawdisConfigFile.saveDict(root) } @@ -354,6 +408,20 @@ struct ConfigSettings: View { return Color(red: r, green: g, blue: b) } + private var talkVoiceSuggestions: [String] { + let env = ProcessInfo.processInfo.environment + let candidates = [ + self.talkVoiceId, + env["ELEVENLABS_VOICE_ID"] ?? "", + env["SAG_VOICE_ID"] ?? "", + ] + var seen = Set() + return candidates + .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) } + .filter { !$0.isEmpty } + .filter { seen.insert($0).inserted } + } + private var browserPathLabel: String? 
{ guard self.browserEnabled else { return nil } diff --git a/apps/macos/Sources/Clawdis/Constants.swift b/apps/macos/Sources/Clawdis/Constants.swift index 966d1744a..589091261 100644 --- a/apps/macos/Sources/Clawdis/Constants.swift +++ b/apps/macos/Sources/Clawdis/Constants.swift @@ -16,6 +16,7 @@ let voiceWakeMicKey = "clawdis.voiceWakeMicID" let voiceWakeLocaleKey = "clawdis.voiceWakeLocaleID" let voiceWakeAdditionalLocalesKey = "clawdis.voiceWakeAdditionalLocaleIDs" let voicePushToTalkEnabledKey = "clawdis.voicePushToTalkEnabled" +let talkEnabledKey = "clawdis.talkEnabled" let iconOverrideKey = "clawdis.iconOverride" let connectionModeKey = "clawdis.connectionMode" let remoteTargetKey = "clawdis.remoteTarget" diff --git a/apps/macos/Sources/Clawdis/MenuContentView.swift b/apps/macos/Sources/Clawdis/MenuContentView.swift index 6a5dc1e89..748ce018d 100644 --- a/apps/macos/Sources/Clawdis/MenuContentView.swift +++ b/apps/macos/Sources/Clawdis/MenuContentView.swift @@ -72,6 +72,11 @@ struct MenuContent: View { if self.showVoiceWakeMicPicker { self.voiceWakeMicMenu } + Toggle(isOn: self.talkBinding) { + Label("Talk", systemImage: "bubble.left.and.waveform") + } + .disabled(!voiceWakeSupported) + .opacity(voiceWakeSupported ? 
1 : 0.5) Divider() Button { Task { @MainActor in @@ -331,6 +336,14 @@ struct MenuContent: View { }) } + private var talkBinding: Binding { + Binding( + get: { self.state.talkEnabled }, + set: { newValue in + Task { await self.state.setTalkEnabled(newValue) } + }) + } + private var showVoiceWakeMicPicker: Bool { voiceWakeSupported && self.state.swabbleEnabled } diff --git a/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift b/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift new file mode 100644 index 000000000..f72de1d02 --- /dev/null +++ b/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift @@ -0,0 +1,54 @@ +import AVFoundation +import Foundation +import OSLog + +@MainActor +final class TalkAudioPlayer: NSObject, AVAudioPlayerDelegate { + static let shared = TalkAudioPlayer() + + private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts") + private var player: AVAudioPlayer? + private var continuation: CheckedContinuation? + + func play(data: Data) async -> TalkPlaybackResult { + self.stopInternal(interrupted: true) + do { + let player = try AVAudioPlayer(data: data) + self.player = player + player.delegate = self + player.prepareToPlay() + player.play() + return await withCheckedContinuation { continuation in + self.continuation = continuation + } + } catch { + self.logger.error("talk audio player failed: \(error.localizedDescription, privacy: .public)") + return TalkPlaybackResult(finished: false, interruptedAt: nil) + } + } + + func stop() -> Double? { + guard let player else { return nil } + let time = player.currentTime + self.stopInternal(interrupted: true, interruptedAt: time) + return time + } + + func audioPlayerDidFinishPlaying(_: AVAudioPlayer, successfully flag: Bool) { + self.stopInternal(interrupted: !flag) + } + + private func stopInternal(interrupted: Bool, interruptedAt: Double? 
= nil) { + self.player?.stop() + self.player = nil + if let continuation { + self.continuation = nil + continuation.resume(returning: TalkPlaybackResult(finished: !interrupted, interruptedAt: interruptedAt)) + } + } +} + +struct TalkPlaybackResult: Sendable { + let finished: Bool + let interruptedAt: Double? +} diff --git a/apps/macos/Sources/Clawdis/TalkModeController.swift b/apps/macos/Sources/Clawdis/TalkModeController.swift new file mode 100644 index 000000000..920af0539 --- /dev/null +++ b/apps/macos/Sources/Clawdis/TalkModeController.swift @@ -0,0 +1,42 @@ +import Observation +import OSLog + +@MainActor +@Observable +final class TalkModeController { + static let shared = TalkModeController() + + private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.controller") + + func setEnabled(_ enabled: Bool) async { + self.logger.info("talk enabled=\(enabled)") + if enabled { + TalkOverlayController.shared.present() + } else { + TalkOverlayController.shared.dismiss() + } + await TalkModeRuntime.shared.setEnabled(enabled) + } + + func updatePhase(_ phase: TalkModePhase) { + TalkOverlayController.shared.updatePhase(phase) + } + + func updateLevel(_ level: Double) { + TalkOverlayController.shared.updateLevel(level) + } + + func stopSpeaking(reason: TalkStopReason = .userTap) { + Task { await TalkModeRuntime.shared.stopSpeaking(reason: reason) } + } + + func exitTalkMode() { + Task { await AppStateStore.shared.setTalkEnabled(false) } + } +} + +enum TalkStopReason { + case userTap + case speech + case manual +} diff --git a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift new file mode 100644 index 000000000..955d9ceda --- /dev/null +++ b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift @@ -0,0 +1,684 @@ +import AVFoundation +import ClawdisChatUI +import ClawdisKit +import Foundation +import OSLog +import Speech + +actor TalkModeRuntime { + static let shared = TalkModeRuntime() + + private let logger 
= Logger(subsystem: "com.steipete.clawdis", category: "talk.runtime") + + private var recognizer: SFSpeechRecognizer? + private var audioEngine: AVAudioEngine? + private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest? + private var recognitionTask: SFSpeechRecognitionTask? + private var recognitionGeneration: Int = 0 + + private var captureTask: Task? + private var silenceTask: Task? + private var phase: TalkModePhase = .idle + private var isEnabled = false + + private var lastHeard: Date? + private var noiseFloorRMS: Double = 1e-4 + private var lastTranscript: String = "" + private var lastSpeechEnergyAt: Date? + + private var defaultVoiceId: String? + private var currentVoiceId: String? + private var defaultModelId: String? + private var currentModelId: String? + private var voiceOverrideActive = false + private var modelOverrideActive = false + private var defaultOutputFormat: String? + private var interruptOnSpeech: Bool = true + private var lastInterruptedAtSeconds: Double? + private var lastSpokenText: String? 
+ + private let silenceWindow: TimeInterval = 0.7 + private let minSpeechRMS: Double = 1e-3 + private let speechBoostFactor: Double = 6.0 + + // MARK: - Lifecycle + + func setEnabled(_ enabled: Bool) async { + guard enabled != self.isEnabled else { return } + self.isEnabled = enabled + if enabled { + await self.start() + } else { + await self.stop() + } + } + + private func start() async { + guard voiceWakeSupported else { return } + guard PermissionManager.voiceWakePermissionsGranted() else { + self.logger.debug("talk runtime not starting: permissions missing") + return + } + await self.reloadConfig() + await self.startRecognition() + self.phase = .listening + await MainActor.run { TalkModeController.shared.updatePhase(.listening) } + self.startSilenceMonitor() + } + + private func stop() async { + self.captureTask?.cancel() + self.captureTask = nil + self.silenceTask?.cancel() + self.silenceTask = nil + self.lastTranscript = "" + self.lastHeard = nil + self.lastSpeechEnergyAt = nil + self.phase = .idle + await self.stopRecognition() + await self.stopSpeaking(reason: .manual) + await MainActor.run { + TalkModeController.shared.updateLevel(0) + TalkModeController.shared.updatePhase(.idle) + } + } + + // MARK: - Speech recognition + + private struct RecognitionUpdate { + let transcript: String? + let segments: [SFTranscriptionSegment] + let isFinal: Bool + let error: Error? 
+ let generation: Int + } + + private func startRecognition() async { + await self.stopRecognition() + self.recognitionGeneration &+= 1 + let generation = self.recognitionGeneration + + let locale = await MainActor.run { AppStateStore.shared.voiceWakeLocaleID } + self.recognizer = SFSpeechRecognizer(locale: Locale(identifier: locale)) + guard let recognizer, recognizer.isAvailable else { + self.logger.error("talk recognizer unavailable") + return + } + + self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest() + self.recognitionRequest?.shouldReportPartialResults = true + guard let request = self.recognitionRequest else { return } + + if self.audioEngine == nil { + self.audioEngine = AVAudioEngine() + } + guard let audioEngine = self.audioEngine else { return } + + let input = audioEngine.inputNode + let format = input.outputFormat(forBus: 0) + input.removeTap(onBus: 0) + input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak self, weak request] buffer, _ in + request?.append(buffer) + if let rms = Self.rmsLevel(buffer: buffer) { + Task.detached { [weak self] in + await self?.noteAudioLevel(rms: rms) + } + } + } + + audioEngine.prepare() + do { + try audioEngine.start() + } catch { + self.logger.error("talk audio engine start failed: \(error.localizedDescription, privacy: .public)") + return + } + + self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self, generation] result, error in + guard let self else { return } + let transcript = result?.bestTranscription.formattedString + let update = RecognitionUpdate( + transcript: transcript, + segments: result?.bestTranscription.segments ?? [], + isFinal: result?.isFinal ?? 
false, + error: error, + generation: generation) + Task { await self.handleRecognition(update) } + } + } + + private func stopRecognition() async { + self.recognitionGeneration &+= 1 + self.recognitionTask?.cancel() + self.recognitionTask = nil + self.recognitionRequest?.endAudio() + self.recognitionRequest = nil + self.audioEngine?.inputNode.removeTap(onBus: 0) + self.audioEngine?.stop() + self.audioEngine = nil + self.recognizer = nil + } + + private func handleRecognition(_ update: RecognitionUpdate) async { + guard update.generation == self.recognitionGeneration else { return } + if let error = update.error { + self.logger.debug("talk recognition error: \(error.localizedDescription, privacy: .public)") + } + guard let transcript = update.transcript else { return } + + let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines) + if self.phase == .speaking, self.interruptOnSpeech { + if await self.shouldInterrupt(transcript: trimmed, segments: update.segments) { + await self.stopSpeaking(reason: .speech) + self.lastTranscript = "" + self.lastHeard = nil + await self.startListening() + } + return + } + + guard self.phase == .listening else { return } + + if !trimmed.isEmpty { + self.lastTranscript = trimmed + self.lastHeard = Date() + } + + if update.isFinal { + self.lastTranscript = trimmed + } + } + + // MARK: - Silence handling + + private func startSilenceMonitor() { + self.silenceTask?.cancel() + self.silenceTask = Task { [weak self] in + guard let self else { return } + while self.isEnabled { + try? 
await Task.sleep(nanoseconds: 200_000_000) + await self.checkSilence() + } + } + } + + private func checkSilence() async { + guard self.phase == .listening else { return } + let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines) + guard !transcript.isEmpty else { return } + guard let lastHeard else { return } + let elapsed = Date().timeIntervalSince(lastHeard) + guard elapsed >= self.silenceWindow else { return } + await self.finalizeTranscript(transcript) + } + + private func startListening() async { + self.phase = .listening + self.lastTranscript = "" + self.lastHeard = nil + await MainActor.run { + TalkModeController.shared.updatePhase(.listening) + TalkModeController.shared.updateLevel(0) + } + } + + private func finalizeTranscript(_ text: String) async { + self.lastTranscript = "" + self.lastHeard = nil + self.phase = .thinking + await MainActor.run { TalkModeController.shared.updatePhase(.thinking) } + await self.stopRecognition() + await self.sendAndSpeak(text) + } + + // MARK: - Gateway + TTS + + private func sendAndSpeak(_ transcript: String) async { + await self.reloadConfig() + let prompt = self.buildPrompt(transcript: transcript) + let runId = UUID().uuidString + + do { + let response = try await GatewayConnection.shared.chatSend( + sessionKey: "main", + message: prompt, + thinking: "low", + idempotencyKey: runId, + attachments: []) + let completion = await self.waitForChatCompletion( + runId: response.runId, + timeoutSeconds: 120) + guard completion == .final else { + await self.startListening() + await self.startRecognition() + return + } + + guard let assistantText = await self.latestAssistantText(sessionKey: "main") else { + await self.startListening() + await self.startRecognition() + return + } + + await self.playAssistant(text: assistantText) + await self.startListening() + await self.startRecognition() + return + } catch { + self.logger.error("talk chat.send failed: \(error.localizedDescription, privacy: .public)") 
+ await self.startListening() + await self.startRecognition() + return + } + } + + private func buildPrompt(transcript: String) -> String { + var lines: [String] = [ + "Talk Mode active. Reply in a concise, spoken tone.", + "You may optionally prefix the response with JSON (first line) to set ElevenLabs voice, e.g. {\"voice\":\"\",\"once\":true}.", + ] + + if let interrupted = self.lastInterruptedAtSeconds { + let formatted = String(format: "%.1f", interrupted) + lines.append("Assistant speech interrupted at \(formatted)s.") + self.lastInterruptedAtSeconds = nil + } + + lines.append("") + lines.append(transcript) + return lines.joined(separator: "\n") + } + + private enum ChatCompletionState { + case final + case aborted + case error + case timeout + } + + private func waitForChatCompletion(runId: String, timeoutSeconds: Int) async -> ChatCompletionState { + await withTaskGroup(of: ChatCompletionState.self) { group in + group.addTask { [runId] in + let stream = GatewayConnection.shared.subscribe() + for await push in stream { + if case let .event(evt) = push, evt.event == "chat", let payload = evt.payload { + if let chat = try? JSONDecoder().decode( + ClawdisChatEventPayload.self, + from: JSONEncoder().encode(payload)) + { + guard chat.runId == runId else { continue } + switch chat.state { + case .some("final"): return .final + case .some("aborted"): return .aborted + case .some("error"): return .error + default: break + } + } + } + } + return .timeout + } + group.addTask { + try? await Task.sleep(nanoseconds: UInt64(timeoutSeconds) * 1_000_000_000) + return .timeout + } + let result = await group.next() ?? .timeout + group.cancelAll() + return result + } + } + + private func latestAssistantText(sessionKey: String) async -> String? { + do { + let history = try await GatewayConnection.shared.chatHistory(sessionKey: sessionKey) + let messages = history.messages ?? [] + let decoded = messages.compactMap { item in + guard let data = try? 
JSONEncoder().encode(item) else { return nil } + return try? JSONDecoder().decode(ClawdisChatMessage.self, from: data) + } + guard let assistant = decoded.last(where: { $0.role == "assistant" }) else { return nil } + let text = assistant.content.compactMap { $0.text }.joined(separator: "\n") + let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + return trimmed.isEmpty ? nil : trimmed + } catch { + self.logger.error("talk history fetch failed: \(error.localizedDescription, privacy: .public)") + return nil + } + } + + private func playAssistant(text: String) async { + let parse = TalkDirectiveParser.parse(text) + let directive = parse.directive + let cleaned = parse.stripped.trimmingCharacters(in: .whitespacesAndNewlines) + guard !cleaned.isEmpty else { return } + + if !parse.unknownKeys.isEmpty { + self.logger.warning("talk directive ignored keys: \(parse.unknownKeys.joined(separator: ","), privacy: .public)") + } + + if let voice = directive?.voiceId { + if directive?.once == true { + self.logger.info("talk voice override (once) voiceId=\(voice, privacy: .public)") + } else { + self.currentVoiceId = voice + self.voiceOverrideActive = true + self.logger.info("talk voice override voiceId=\(voice, privacy: .public)") + } + } + + if let model = directive?.modelId { + if directive?.once == true { + self.logger.info("talk model override (once) modelId=\(model, privacy: .public)") + } else { + self.currentModelId = model + self.modelOverrideActive = true + } + } + + let voiceId = + directive?.voiceId ?? + self.currentVoiceId ?? + self.defaultVoiceId + + guard let voiceId, !voiceId.isEmpty else { + self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID") + return + } + + let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] ?? 
"" + if apiKey.isEmpty { + self.logger.error("talk missing ELEVENLABS_API_KEY") + return + } + + await self.startRecognition() + await MainActor.run { TalkModeController.shared.updatePhase(.speaking) } + self.phase = .speaking + self.lastSpokenText = cleaned + + let resolvedSpeed = Self.resolveSpeed( + speed: directive?.speed, + rateWPM: directive?.rateWPM, + logger: self.logger) + + let request = ElevenLabsRequest( + text: cleaned, + modelId: directive?.modelId ?? self.currentModelId ?? self.defaultModelId, + outputFormat: directive?.outputFormat ?? self.defaultOutputFormat, + speed: resolvedSpeed, + stability: Self.validatedUnit(directive?.stability, name: "stability", logger: self.logger), + similarity: Self.validatedUnit(directive?.similarity, name: "similarity", logger: self.logger), + style: Self.validatedUnit(directive?.style, name: "style", logger: self.logger), + speakerBoost: directive?.speakerBoost, + seed: Self.validatedSeed(directive?.seed, logger: self.logger), + normalize: Self.validatedNormalize(directive?.normalize, logger: self.logger), + language: Self.validatedLanguage(directive?.language, logger: self.logger)) + + do { + let audio = try await ElevenLabsClient(apiKey: apiKey).synthesize( + voiceId: voiceId, + request: request) + let result = await MainActor.run { await TalkAudioPlayer.shared.play(data: audio) } + if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking { + if self.interruptOnSpeech { + self.lastInterruptedAtSeconds = interruptedAt + } + } + } catch { + self.logger.error("talk TTS failed: \(error.localizedDescription, privacy: .public)") + } + + self.phase = .thinking + await MainActor.run { TalkModeController.shared.updatePhase(.thinking) } + } + + func stopSpeaking(reason: TalkStopReason) async { + guard self.phase == .speaking else { return } + let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() } + if reason == .speech, let interruptedAt { + self.lastInterruptedAtSeconds = 
interruptedAt + } + self.phase = .thinking + await MainActor.run { TalkModeController.shared.updatePhase(.thinking) } + } + + // MARK: - Config + + private func reloadConfig() async { + let cfg = await self.fetchTalkConfig() + self.defaultVoiceId = cfg.voiceId + if !self.voiceOverrideActive { + self.currentVoiceId = cfg.voiceId + } + self.defaultModelId = cfg.modelId + if !self.modelOverrideActive { + self.currentModelId = cfg.modelId + } + self.defaultOutputFormat = cfg.outputFormat + self.interruptOnSpeech = cfg.interruptOnSpeech + } + + private struct TalkRuntimeConfig { + let voiceId: String? + let modelId: String? + let outputFormat: String? + let interruptOnSpeech: Bool + } + + private func fetchTalkConfig() async -> TalkRuntimeConfig { + let env = ProcessInfo.processInfo.environment + let envVoice = env["ELEVENLABS_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines) + let sagVoice = env["SAG_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines) + + do { + let snap: ConfigSnapshot = try await GatewayConnection.shared.requestDecoded( + method: .configGet, + params: nil, + timeoutMs: 8000) + let talk = snap.config?["talk"]?.dictionaryValue + let voice = talk?["voiceId"]?.stringValue + let model = talk?["modelId"]?.stringValue + let outputFormat = talk?["outputFormat"]?.stringValue + let interrupt = talk?["interruptOnSpeech"]?.boolValue + let resolvedVoice = + (voice?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? voice : nil) ?? + (envVoice?.isEmpty == false ? envVoice : nil) ?? + (sagVoice?.isEmpty == false ? sagVoice : nil) + return TalkRuntimeConfig( + voiceId: resolvedVoice, + modelId: model, + outputFormat: outputFormat, + interruptOnSpeech: interrupt ?? true) + } catch { + let resolvedVoice = + (envVoice?.isEmpty == false ? envVoice : nil) ?? + (sagVoice?.isEmpty == false ? 
sagVoice : nil) + return TalkRuntimeConfig( + voiceId: resolvedVoice, + modelId: nil, + outputFormat: nil, + interruptOnSpeech: true) + } + } + + // MARK: - Audio level handling + + private func noteAudioLevel(rms: Double) async { + if self.phase != .listening && self.phase != .speaking { return } + let alpha: Double = rms < self.noiseFloorRMS ? 0.08 : 0.01 + self.noiseFloorRMS = max(1e-7, self.noiseFloorRMS + (rms - self.noiseFloorRMS) * alpha) + + let threshold = max(self.minSpeechRMS, self.noiseFloorRMS * self.speechBoostFactor) + if rms >= threshold { + let now = Date() + self.lastHeard = now + self.lastSpeechEnergyAt = now + } + + if self.phase == .listening { + let clamped = min(1.0, max(0.0, rms / max(self.minSpeechRMS, threshold))) + await MainActor.run { TalkModeController.shared.updateLevel(clamped) } + } + } + + private static func rmsLevel(buffer: AVAudioPCMBuffer) -> Double? { + guard let channelData = buffer.floatChannelData?.pointee else { return nil } + let frameCount = Int(buffer.frameLength) + guard frameCount > 0 else { return nil } + var sum: Double = 0 + for i in 0.. Bool { + let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines) + guard trimmed.count >= 3 else { return false } + if self.isLikelyEcho(of: trimmed) { return false } + let now = Date() + if let lastSpeechEnergyAt, now.timeIntervalSince(lastSpeechEnergyAt) > 0.35 { + return false + } + let hasConfidence = segments.contains { $0.confidence > 0.6 } + return hasConfidence + } + + private func isLikelyEcho(of transcript: String) -> Bool { + guard let spoken = self.lastSpokenText?.lowercased(), !spoken.isEmpty else { return false } + let probe = transcript.lowercased() + if probe.count < 6 { + return spoken.contains(probe) + } + return spoken.contains(probe) + } + + private static func resolveSpeed(speed: Double?, rateWPM: Int?, logger: Logger) -> Double? 
{ + if let rateWPM, rateWPM > 0 { + let resolved = Double(rateWPM) / 175.0 + if resolved <= 0.5 || resolved >= 2.0 { + logger.warning("talk rateWPM out of range: \(rateWPM, privacy: .public)") + return nil + } + return resolved + } + if let speed { + if speed <= 0.5 || speed >= 2.0 { + logger.warning("talk speed out of range: \(speed, privacy: .public)") + return nil + } + return speed + } + return nil + } + + private static func validatedUnit(_ value: Double?, name: String, logger: Logger) -> Double? { + guard let value else { return nil } + if value < 0 || value > 1 { + logger.warning("talk \(name, privacy: .public) out of range: \(value, privacy: .public)") + return nil + } + return value + } + + private static func validatedSeed(_ value: Int?, logger: Logger) -> UInt32? { + guard let value else { return nil } + if value < 0 || value > 4294967295 { + logger.warning("talk seed out of range: \(value, privacy: .public)") + return nil + } + return UInt32(value) + } + + private static func validatedNormalize(_ value: String?, logger: Logger) -> String? { + guard let value else { return nil } + let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + guard ["auto", "on", "off"].contains(normalized) else { + logger.warning("talk normalize invalid: \(normalized, privacy: .public)") + return nil + } + return normalized + } + + private static func validatedLanguage(_ value: String?, logger: Logger) -> String? { + guard let value else { return nil } + let normalized = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + guard normalized.count == 2, normalized.allSatisfy({ $0 >= "a" && $0 <= "z" }) else { + logger.warning("talk language invalid: \(normalized, privacy: .public)") + return nil + } + return normalized + } +} + +private struct ElevenLabsRequest { + let text: String + let modelId: String? + let outputFormat: String? + let speed: Double? + let stability: Double? + let similarity: Double? + let style: Double? 
+ let speakerBoost: Bool? + let seed: UInt32? + let normalize: String? + let language: String? +} + +private struct ElevenLabsClient { + let apiKey: String + let baseUrl: URL = URL(string: "https://api.elevenlabs.io")! + + func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data { + var url = self.baseUrl + url.appendPathComponent("v1") + url.appendPathComponent("text-to-speech") + url.appendPathComponent(voiceId) + + var payload: [String: Any] = [ + "text": request.text, + ] + if let modelId = request.modelId, !modelId.isEmpty { + payload["model_id"] = modelId + } + if let outputFormat = request.outputFormat, !outputFormat.isEmpty { + payload["output_format"] = outputFormat + } + if let seed = request.seed { + payload["seed"] = seed + } + if let normalize = request.normalize { + payload["apply_text_normalization"] = normalize + } + if let language = request.language { + payload["language_code"] = language + } + var voiceSettings: [String: Any] = [:] + if let speed = request.speed { voiceSettings["speed"] = speed } + if let stability = request.stability { voiceSettings["stability"] = stability } + if let similarity = request.similarity { voiceSettings["similarity_boost"] = similarity } + if let style = request.style { voiceSettings["style"] = style } + if let speakerBoost = request.speakerBoost { voiceSettings["use_speaker_boost"] = speakerBoost } + if !voiceSettings.isEmpty { + payload["voice_settings"] = voiceSettings + } + + let body = try JSONSerialization.data(withJSONObject: payload, options: []) + var req = URLRequest(url: url) + req.httpMethod = "POST" + req.httpBody = body + req.setValue("application/json", forHTTPHeaderField: "Content-Type") + req.setValue("audio/mpeg", forHTTPHeaderField: "Accept") + req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key") + + let (data, response) = try await URLSession.shared.data(for: req) + if let http = response as? 
HTTPURLResponse, http.statusCode >= 400 { + let message = String(data: data, encoding: .utf8) ?? "unknown" + throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [ + NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)", + ]) + } + return data + } +} diff --git a/apps/macos/Sources/Clawdis/TalkModeTypes.swift b/apps/macos/Sources/Clawdis/TalkModeTypes.swift new file mode 100644 index 000000000..3ae978255 --- /dev/null +++ b/apps/macos/Sources/Clawdis/TalkModeTypes.swift @@ -0,0 +1,8 @@ +import Foundation + +enum TalkModePhase: String { + case idle + case listening + case thinking + case speaking +} diff --git a/apps/macos/Sources/Clawdis/TalkOverlay.swift b/apps/macos/Sources/Clawdis/TalkOverlay.swift new file mode 100644 index 000000000..63c9d5dce --- /dev/null +++ b/apps/macos/Sources/Clawdis/TalkOverlay.swift @@ -0,0 +1,119 @@ +import AppKit +import Observation +import OSLog +import SwiftUI + +@MainActor +@Observable +final class TalkOverlayController { + static let shared = TalkOverlayController() + + private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.overlay") + + struct Model { + var isVisible: Bool = false + var phase: TalkModePhase = .idle + var level: Double = 0 + } + + var model = Model() + private var window: NSPanel? + private var hostingView: NSHostingView? 
+ + private let width: CGFloat = 92 + private let height: CGFloat = 92 + private let padding: CGFloat = 8 + + func present() { + self.ensureWindow() + self.hostingView?.rootView = TalkOverlayView(controller: self) + let target = self.targetFrame() + + guard let window else { return } + if !self.model.isVisible { + self.model.isVisible = true + let start = target.offsetBy(dx: 0, dy: -6) + window.setFrame(start, display: true) + window.alphaValue = 0 + window.orderFrontRegardless() + NSAnimationContext.runAnimationGroup { context in + context.duration = 0.18 + context.timingFunction = CAMediaTimingFunction(name: .easeOut) + window.animator().setFrame(target, display: true) + window.animator().alphaValue = 1 + } + } else { + window.setFrame(target, display: true) + window.orderFrontRegardless() + } + } + + func dismiss() { + guard let window else { + self.model.isVisible = false + return + } + + let target = window.frame.offsetBy(dx: 6, dy: 6) + NSAnimationContext.runAnimationGroup { context in + context.duration = 0.16 + context.timingFunction = CAMediaTimingFunction(name: .easeOut) + window.animator().setFrame(target, display: true) + window.animator().alphaValue = 0 + } completionHandler: { + Task { @MainActor in + window.orderOut(nil) + self.model.isVisible = false + } + } + } + + func updatePhase(_ phase: TalkModePhase) { + guard self.model.phase != phase else { return } + self.logger.info("talk overlay phase=\(phase.rawValue, privacy: .public)") + self.model.phase = phase + } + + func updateLevel(_ level: Double) { + guard self.model.isVisible else { return } + self.model.level = max(0, min(1, level)) + } + + // MARK: - Private + + private func ensureWindow() { + if self.window != nil { return } + let panel = NSPanel( + contentRect: NSRect(x: 0, y: 0, width: self.width, height: self.height), + styleMask: [.nonactivatingPanel, .borderless], + backing: .buffered, + defer: false) + panel.isOpaque = false + panel.backgroundColor = .clear + panel.hasShadow = false + 
panel.level = NSWindow.Level(rawValue: NSWindow.Level.popUpMenu.rawValue - 4) + panel.collectionBehavior = [.canJoinAllSpaces, .fullScreenAuxiliary, .transient] + panel.hidesOnDeactivate = false + panel.isMovable = false + panel.isFloatingPanel = true + panel.becomesKeyOnlyIfNeeded = true + panel.titleVisibility = .hidden + panel.titlebarAppearsTransparent = true + + let host = NSHostingView(rootView: TalkOverlayView(controller: self)) + host.translatesAutoresizingMaskIntoConstraints = false + panel.contentView = host + self.hostingView = host + self.window = panel + } + + private func targetFrame() -> NSRect { + guard let screen = NSScreen.main else { return .zero } + let size = NSSize(width: self.width, height: self.height) + let visible = screen.visibleFrame + let origin = CGPoint( + x: visible.maxX - size.width - self.padding, + y: visible.maxY - size.height - self.padding) + return NSRect(origin: origin, size: size) + } +} diff --git a/apps/macos/Sources/Clawdis/TalkOverlayView.swift b/apps/macos/Sources/Clawdis/TalkOverlayView.swift new file mode 100644 index 000000000..2f2be75ca --- /dev/null +++ b/apps/macos/Sources/Clawdis/TalkOverlayView.swift @@ -0,0 +1,139 @@ +import SwiftUI + +struct TalkOverlayView: View { + var controller: TalkOverlayController + @State private var hovering = false + + var body: some View { + ZStack(alignment: .topLeading) { + TalkCloudView(phase: self.controller.model.phase, level: self.controller.model.level) + .frame(width: 76, height: 64) + .contentShape(Rectangle()) + .onTapGesture { + TalkModeController.shared.stopSpeaking(reason: .userTap) + } + .padding(8) + + Button { + TalkModeController.shared.exitTalkMode() + } label: { + Image(systemName: "xmark") + .font(.system(size: 10, weight: .bold)) + .foregroundStyle(Color.white.opacity(self.hovering ? 0.95 : 0.7)) + .frame(width: 18, height: 18) + .background(Color.black.opacity(self.hovering ? 
0.45 : 0.3)) + .clipShape(Circle()) + } + .buttonStyle(.plain) + .contentShape(Circle()) + .padding(4) + .onHover { self.hovering = $0 } + } + .frame(width: 92, height: 92, alignment: .center) + } +} + +private struct TalkCloudView: View { + let phase: TalkModePhase + let level: Double + + var body: some View { + TimelineView(.animation) { context in + let t = context.date.timeIntervalSinceReferenceDate + let pulse = phase == .speaking ? (1 + 0.04 * sin(t * 6)) : 1 + let sink = phase == .thinking ? (3 + 2 * sin(t * 2)) : 0 + let listenScale = phase == .listening ? (1 + CGFloat(self.level) * 0.14) : 1 + let baseScale = phase == .thinking ? 0.94 : 1 + + ZStack { + CloudShape() + .fill(self.cloudGradient) + .overlay( + CloudShape() + .stroke(Color.white.opacity(0.35), lineWidth: 0.8)) + .shadow(color: Color.black.opacity(0.18), radius: 8, x: 0, y: 4) + .scaleEffect(baseScale * pulse * listenScale) + .offset(y: sink) + + if phase == .listening { + Circle() + .stroke(self.ringGradient, lineWidth: 1) + .scaleEffect(1 + CGFloat(self.level) * 0.45) + .opacity(0.3 + CGFloat(self.level) * 0.4) + .animation(.easeOut(duration: 0.08), value: self.level) + } + + if phase == .thinking { + TalkThinkingDots(time: t) + .offset(y: 18) + } + + if phase == .speaking { + TalkSpeakingRings(time: t) + } + } + } + } + + private var cloudGradient: LinearGradient { + LinearGradient( + colors: [Color(red: 0.95, green: 0.98, blue: 1.0), Color(red: 0.75, green: 0.88, blue: 1.0)], + startPoint: .topLeading, + endPoint: .bottomTrailing) + } + + private var ringGradient: LinearGradient { + LinearGradient( + colors: [Color.white.opacity(0.6), Color.white.opacity(0.1)], + startPoint: .top, + endPoint: .bottom) + } +} + +private struct TalkThinkingDots: View { + let time: TimeInterval + + var body: some View { + HStack(spacing: 4) { + ForEach(0..<3, id: \.self) { idx in + let phase = (time * 2 + Double(idx) * 0.45).truncatingRemainder(dividingBy: 1) + Circle() + .fill(Color.white.opacity(0.75)) + 
.frame(width: 5, height: 5) + .opacity(0.35 + 0.55 * phase) + } + } + } +} + +private struct TalkSpeakingRings: View { + let time: TimeInterval + + var body: some View { + ZStack { + ForEach(0..<3, id: \.self) { idx in + let phase = (time * 1.1 + Double(idx) / 3).truncatingRemainder(dividingBy: 1) + Circle() + .stroke(Color.white.opacity(0.6 - phase * 0.5), lineWidth: 1) + .scaleEffect(0.8 + phase * 0.7) + .opacity(0.6 - phase * 0.6) + } + } + } +} + +private struct CloudShape: Shape { + func path(in rect: CGRect) -> Path { + let w = rect.width + let h = rect.height + let baseHeight = h * 0.44 + let baseRect = CGRect(x: rect.minX, y: rect.minY + h * 0.46, width: w, height: baseHeight) + + var path = Path() + path.addRoundedRect(in: baseRect, cornerSize: CGSize(width: baseHeight / 2, height: baseHeight / 2)) + path.addEllipse(in: CGRect(x: rect.minX + w * 0.05, y: rect.minY + h * 0.28, width: w * 0.36, height: h * 0.36)) + path.addEllipse(in: CGRect(x: rect.minX + w * 0.28, y: rect.minY + h * 0.05, width: w * 0.44, height: h * 0.44)) + path.addEllipse(in: CGRect(x: rect.minX + w * 0.62, y: rect.minY + h * 0.3, width: w * 0.3, height: h * 0.3)) + return path + } +} diff --git a/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift new file mode 100644 index 000000000..6bc4c0195 --- /dev/null +++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift @@ -0,0 +1,194 @@ +import Foundation + +public struct TalkDirective: Equatable, Sendable { + public var voiceId: String? + public var modelId: String? + public var speed: Double? + public var rateWPM: Int? + public var stability: Double? + public var similarity: Double? + public var style: Double? + public var speakerBoost: Bool? + public var seed: Int? + public var normalize: String? + public var language: String? + public var outputFormat: String? + public var latencyTier: Int? + public var once: Bool? + + public init( + voiceId: String? 
= nil, + modelId: String? = nil, + speed: Double? = nil, + rateWPM: Int? = nil, + stability: Double? = nil, + similarity: Double? = nil, + style: Double? = nil, + speakerBoost: Bool? = nil, + seed: Int? = nil, + normalize: String? = nil, + language: String? = nil, + outputFormat: String? = nil, + latencyTier: Int? = nil, + once: Bool? = nil) + { + self.voiceId = voiceId + self.modelId = modelId + self.speed = speed + self.rateWPM = rateWPM + self.stability = stability + self.similarity = similarity + self.style = style + self.speakerBoost = speakerBoost + self.seed = seed + self.normalize = normalize + self.language = language + self.outputFormat = outputFormat + self.latencyTier = latencyTier + self.once = once + } +} + +public struct TalkDirectiveParseResult: Equatable, Sendable { + public let directive: TalkDirective? + public let stripped: String + public let unknownKeys: [String] + + public init(directive: TalkDirective?, stripped: String, unknownKeys: [String]) { + self.directive = directive + self.stripped = stripped + self.unknownKeys = unknownKeys + } +} + +public enum TalkDirectiveParser { + public static func parse(_ text: String) -> TalkDirectiveParseResult { + let normalized = text.replacingOccurrences(of: "\r\n", with: "\n") + var lines = normalized.split(separator: "\n", omittingEmptySubsequences: false) + guard !lines.isEmpty else { return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) } + + guard let firstNonEmpty = lines.firstIndex(where: { !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }) + else { + return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) + } + + let head = lines[firstNonEmpty].trimmingCharacters(in: .whitespacesAndNewlines) + guard head.hasPrefix("{"), head.hasSuffix("}") else { + return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) + } + + guard let data = head.data(using: .utf8), + let json = try? 
JSONSerialization.jsonObject(with: data) as? [String: Any] + else { + return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) + } + + let speakerBoost = boolValue(json, keys: ["speaker_boost", "speakerBoost"]) + ?? boolValue(json, keys: ["no_speaker_boost", "noSpeakerBoost"]).map { !$0 } + + let directive = TalkDirective( + voiceId: stringValue(json, keys: ["voice", "voice_id", "voiceId"]), + modelId: stringValue(json, keys: ["model", "model_id", "modelId"]), + speed: doubleValue(json, keys: ["speed"]), + rateWPM: intValue(json, keys: ["rate", "wpm"]), + stability: doubleValue(json, keys: ["stability"]), + similarity: doubleValue(json, keys: ["similarity", "similarity_boost", "similarityBoost"]), + style: doubleValue(json, keys: ["style"]), + speakerBoost: speakerBoost, + seed: intValue(json, keys: ["seed"]), + normalize: stringValue(json, keys: ["normalize", "apply_text_normalization"]), + language: stringValue(json, keys: ["lang", "language_code", "language"]), + outputFormat: stringValue(json, keys: ["output_format", "format"]), + latencyTier: intValue(json, keys: ["latency", "latency_tier", "latencyTier"]), + once: boolValue(json, keys: ["once"])) + + let hasDirective = [ + directive.voiceId, + directive.modelId, + directive.speed.map { "\($0)" }, + directive.rateWPM.map { "\($0)" }, + directive.stability.map { "\($0)" }, + directive.similarity.map { "\($0)" }, + directive.style.map { "\($0)" }, + directive.speakerBoost.map { "\($0)" }, + directive.seed.map { "\($0)" }, + directive.normalize, + directive.language, + directive.outputFormat, + directive.latencyTier.map { "\($0)" }, + directive.once.map { "\($0)" }, + ].contains { $0 != nil } + + guard hasDirective else { + return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) + } + + let knownKeys = Set([ + "voice", "voice_id", "voiceid", + "model", "model_id", "modelid", + "speed", "rate", "wpm", + "stability", "similarity", "similarity_boost", 
"similarityboost", + "style", + "speaker_boost", "speakerboost", + "no_speaker_boost", "nospeakerboost", + "seed", + "normalize", "apply_text_normalization", + "lang", "language_code", "language", + "output_format", "format", + "latency", "latency_tier", "latencytier", + "once", + ]) + let unknownKeys = json.keys.filter { !knownKeys.contains($0.lowercased()) }.sorted() + + lines.remove(at: firstNonEmpty) + if firstNonEmpty < lines.count { + let next = lines[firstNonEmpty].trimmingCharacters(in: .whitespacesAndNewlines) + if next.isEmpty { + lines.remove(at: firstNonEmpty) + } + } + + let stripped = lines.joined(separator: "\n") + return TalkDirectiveParseResult(directive: directive, stripped: stripped, unknownKeys: unknownKeys) + } + + private static func stringValue(_ dict: [String: Any], keys: [String]) -> String? { + for key in keys { + if let value = dict[key] as? String { + let trimmed = value.trimmingCharacters(in: .whitespacesAndNewlines) + if !trimmed.isEmpty { return trimmed } + } + } + return nil + } + + private static func doubleValue(_ dict: [String: Any], keys: [String]) -> Double? { + for key in keys { + if let value = dict[key] as? Double { return value } + if let value = dict[key] as? Int { return Double(value) } + if let value = dict[key] as? String, let parsed = Double(value) { return parsed } + } + return nil + } + + private static func intValue(_ dict: [String: Any], keys: [String]) -> Int? { + for key in keys { + if let value = dict[key] as? Int { return value } + if let value = dict[key] as? Double { return Int(value) } + if let value = dict[key] as? String, let parsed = Int(value) { return parsed } + } + return nil + } + + private static func boolValue(_ dict: [String: Any], keys: [String]) -> Bool? { + for key in keys { + if let value = dict[key] as? Bool { return value } + if let value = dict[key] as? 
String { + let trimmed = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() + if ["true", "yes", "1"].contains(trimmed) { return true } + if ["false", "no", "0"].contains(trimmed) { return false } + } + } + return nil + } +} diff --git a/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkDirectiveTests.swift b/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkDirectiveTests.swift new file mode 100644 index 000000000..cbfdb572b --- /dev/null +++ b/apps/shared/ClawdisKit/Tests/ClawdisKitTests/TalkDirectiveTests.swift @@ -0,0 +1,62 @@ +import XCTest +@testable import ClawdisKit + +final class TalkDirectiveTests: XCTestCase { + func testParsesDirectiveAndStripsLine() { + let text = """ + {"voice":"abc123","once":true} + Hello there. + """ + let result = TalkDirectiveParser.parse(text) + XCTAssertEqual(result.directive?.voiceId, "abc123") + XCTAssertEqual(result.directive?.once, true) + XCTAssertEqual(result.stripped, "Hello there.") + } + + func testIgnoresNonDirective() { + let text = "Hello world." + let result = TalkDirectiveParser.parse(text) + XCTAssertNil(result.directive) + XCTAssertEqual(result.stripped, text) + } + + func testKeepsDirectiveLineIfNoRecognizedFields() { + let text = """ + {"unknown":"value"} + Hello. + """ + let result = TalkDirectiveParser.parse(text) + XCTAssertNil(result.directive) + XCTAssertEqual(result.stripped, text) + } + + func testParsesExtendedOptions() { + let text = """ + {"voice_id":"v1","model_id":"m1","rate":200,"stability":0.5,"similarity":0.8,"style":0.2,"speaker_boost":true,"seed":1234,"normalize":"auto","lang":"en","output_format":"mp3_44100_128"} + Hello. 
+ """ + let result = TalkDirectiveParser.parse(text) + XCTAssertEqual(result.directive?.voiceId, "v1") + XCTAssertEqual(result.directive?.modelId, "m1") + XCTAssertEqual(result.directive?.rateWPM, 200) + XCTAssertEqual(result.directive?.stability, 0.5) + XCTAssertEqual(result.directive?.similarity, 0.8) + XCTAssertEqual(result.directive?.style, 0.2) + XCTAssertEqual(result.directive?.speakerBoost, true) + XCTAssertEqual(result.directive?.seed, 1234) + XCTAssertEqual(result.directive?.normalize, "auto") + XCTAssertEqual(result.directive?.language, "en") + XCTAssertEqual(result.directive?.outputFormat, "mp3_44100_128") + XCTAssertEqual(result.stripped, "Hello.") + } + + func testTracksUnknownKeys() { + let text = """ + {"voice":"abc","mystery":"value","extra":1} + Hi. + """ + let result = TalkDirectiveParser.parse(text) + XCTAssertEqual(result.directive?.voiceId, "abc") + XCTAssertEqual(result.unknownKeys, ["extra", "mystery"]) + } +} diff --git a/docs/configuration.md b/docs/configuration.md index a6838f4cc..f15a8f046 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -195,6 +195,21 @@ Controls inbound/outbound prefixes and timestamps. } ``` +### `talk` + +Defaults for Talk mode (macOS/iOS/Android). Voice IDs fall back to `ELEVENLABS_VOICE_ID` or `SAG_VOICE_ID` when unset. + +```json5 +{ + talk: { + voiceId: "elevenlabs_voice_id", + modelId: "eleven_v3", + outputFormat: "mp3_44100_128", + interruptOnSpeech: true + } +} +``` + ### `agent` Controls the embedded agent runtime (model/thinking/verbose/timeouts). 
diff --git a/docs/talk.md b/docs/talk.md new file mode 100644 index 000000000..4c3cf53cb --- /dev/null +++ b/docs/talk.md @@ -0,0 +1,72 @@ +--- +summary: "Talk mode: continuous speech conversations with ElevenLabs TTS" +read_when: + - Implementing Talk mode on macOS/iOS/Android + - Changing voice/TTS/interrupt behavior +--- +# Talk Mode + +Talk mode is a continuous voice conversation loop: +1) Listen for speech +2) Send transcript to the model (main session, chat.send) +3) Wait for the response +4) Speak it via ElevenLabs + +## Behavior (macOS) +- **Always-on overlay** while Talk mode is enabled. +- **Listening → Thinking → Speaking** phase transitions. +- On a **short pause** (silence window), the current transcript is sent. +- Replies are **written to WebChat** (same as typing). +- **Interrupt on speech** (default on): if the user starts talking while the assistant is speaking, we stop playback and note the interruption timestamp for the next prompt. + +## Voice directives in replies +The assistant may prefix its reply with a **single JSON line** to control voice: + +```json +{"voice":"","once":true} +``` + +Rules: +- First non-empty line only. +- Unknown keys are ignored. +- `once: true` applies to the current reply only. +- Without `once`, the voice becomes the new default for Talk mode. +- The JSON line is stripped before TTS playback. 
+ +Supported keys: +- `voice` / `voice_id` / `voiceId` +- `model` / `model_id` / `modelId` +- `speed`, `rate` (WPM), `stability`, `similarity`, `style`, `speakerBoost` +- `seed`, `normalize`, `lang`, `output_format`, `latency_tier` +- `once` + +## Config (clawdis.json) +```json5 +{ + "talk": { + "voiceId": "elevenlabs_voice_id", + "modelId": "eleven_v3", + "outputFormat": "mp3_44100_128", + "interruptOnSpeech": true + } +} +``` + +Defaults: +- `interruptOnSpeech`: true +- `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` + +## macOS UI +- Menu bar toggle: **Talk** +- Config tab: **Talk Mode** group (voice id + interrupt toggle) +- Overlay: + - **Listening**: cloud pulses with mic level + - **Thinking**: sinking animation + - **Speaking**: radiating rings + - Click cloud: stop speaking + - Click X: exit Talk mode + +## Notes +- Requires Speech + Microphone permissions. +- Uses `chat.send` against session key `main`. +- TTS uses ElevenLabs API with `ELEVENLABS_API_KEY`. diff --git a/src/config/config.ts b/src/config/config.ts index a9cbb2243..40ae5da06 100644 --- a/src/config/config.ts +++ b/src/config/config.ts @@ -219,6 +219,17 @@ export type CanvasHostConfig = { port?: number; }; +export type TalkConfig = { + /** Default ElevenLabs voice ID for Talk mode. */ + voiceId?: string; + /** Default ElevenLabs model ID for Talk mode. */ + modelId?: string; + /** Default ElevenLabs output format (e.g. mp3_44100_128). */ + outputFormat?: string; + /** Stop speaking when user starts talking (default: true). */ + interruptOnSpeech?: boolean; +}; + export type GatewayControlUiConfig = { /** If false, the Gateway will not serve the Control UI (/). Default: true. 
*/ enabled?: boolean; @@ -391,6 +402,7 @@ export type ClawdisConfig = { bridge?: BridgeConfig; discovery?: DiscoveryConfig; canvasHost?: CanvasHostConfig; + talk?: TalkConfig; gateway?: GatewayConfig; skills?: Record; }; @@ -785,6 +797,14 @@ const ClawdisSchema = z.object({ port: z.number().int().positive().optional(), }) .optional(), + talk: z + .object({ + voiceId: z.string().optional(), + modelId: z.string().optional(), + outputFormat: z.string().optional(), + interruptOnSpeech: z.boolean().optional(), + }) + .optional(), gateway: z .object({ mode: z.union([z.literal("local"), z.literal("remote")]).optional(), From 3c338d1858382c54f1426c497990fd9c0f1d0bed Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 23:26:38 +0100 Subject: [PATCH 022/100] fix: adjust android talk parser for kotlin json --- .../clawdis/node/voice/TalkDirectiveParser.kt | 15 ++++++--------- .../clawdis/node/voice/TalkModeManager.kt | 6 +++--- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkDirectiveParser.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkDirectiveParser.kt index 539f556ff..8dd059279 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkDirectiveParser.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkDirectiveParser.kt @@ -162,30 +162,27 @@ object TalkDirectiveParser { } } -private fun JsonElement?.asStringOrNull(): String? = (this as? JsonPrimitive)?.contentOrNull +private fun JsonElement?.asStringOrNull(): String? = + (this as? JsonPrimitive)?.takeIf { it.isString }?.content private fun JsonElement?.asDoubleOrNull(): Double? { val primitive = this as? JsonPrimitive ?: return null - if (primitive.isString) return primitive.content.toDoubleOrNull() - return primitive.doubleOrNull + return primitive.content.toDoubleOrNull() } private fun JsonElement?.asIntOrNull(): Int? { val primitive = this as? 
JsonPrimitive ?: return null - if (primitive.isString) return primitive.content.toIntOrNull() - return primitive.intOrNull + return primitive.content.toIntOrNull() } private fun JsonElement?.asLongOrNull(): Long? { val primitive = this as? JsonPrimitive ?: return null - if (primitive.isString) return primitive.content.toLongOrNull() - return primitive.longOrNull + return primitive.content.toLongOrNull() } private fun JsonElement?.asBooleanOrNull(): Boolean? { val primitive = this as? JsonPrimitive ?: return null - if (primitive.booleanOrNull != null) return primitive.booleanOrNull - val content = primitive.contentOrNull?.trim()?.lowercase() ?: return null + val content = primitive.content.trim().lowercase() return when (content) { "true", "yes", "1" -> true "false", "no", "0" -> false diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt index ecbc51869..920466739 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt @@ -699,12 +699,12 @@ class TalkModeManager( private fun JsonElement?.asObjectOrNull(): JsonObject? = this as? JsonObject -private fun JsonElement?.asStringOrNull(): String? = (this as? JsonPrimitive)?.contentOrNull +private fun JsonElement?.asStringOrNull(): String? = + (this as? JsonPrimitive)?.takeIf { it.isString }?.content private fun JsonElement?.asBooleanOrNull(): Boolean? { val primitive = this as? 
JsonPrimitive ?: return null - if (primitive.booleanOrNull != null) return primitive.booleanOrNull - val content = primitive.contentOrNull?.trim()?.lowercase() ?: return null + val content = primitive.content.trim().lowercase() return when (content) { "true", "yes", "1" -> true "false", "no", "0" -> false From 303954ae8cdd8b036d00ded09b302bd215593372 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 23:42:22 +0100 Subject: [PATCH 023/100] feat: extend status activity indicators --- CHANGELOG.md | 2 + .../steipete/clawdis/node/MainViewModel.kt | 3 + .../com/steipete/clawdis/node/NodeRuntime.kt | 25 ++++-- .../steipete/clawdis/node/ui/RootScreen.kt | 81 +++++++++++++++++-- apps/ios/Sources/Model/NodeAppModel.swift | 4 + apps/ios/Sources/RootCanvas.swift | 74 ++++++++++++----- apps/ios/Sources/RootTabs.swift | 64 ++++++++++++++- .../Sources/Clawdis/MenuContentView.swift | 8 ++ .../Clawdis/NodePairingApprovalPrompter.swift | 16 ++++ 9 files changed, 241 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c17709774..01821b781 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ - iOS/Android nodes: bridge auto-connect refreshes stale tokens and settings now show richer bridge/device details. - iOS/Android nodes: status pill now surfaces camera activity instead of overlay toasts. - iOS/Android/macOS nodes: camera snaps recompress to keep base64 payloads under 5 MB. +- iOS/Android nodes: status pill now surfaces pairing, screen recording, voice wake, and foreground-required states. +- macOS menu: top status line now shows pending node pairing approvals (incl. repairs). - CLI: avoid spurious gateway close errors after successful request/response cycles. - Agent runtime: clamp tool-result images to the 5MB Anthropic limit to avoid hard request rejections. - Tests: add Swift Testing coverage for camera errors and Kotest coverage for Android bridge endpoints. 
diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/MainViewModel.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/MainViewModel.kt index ee1c83c9b..f1fef1640 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/MainViewModel.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/MainViewModel.kt @@ -23,9 +23,11 @@ class MainViewModel(app: Application) : AndroidViewModel(app) { val statusText: StateFlow = runtime.statusText val serverName: StateFlow = runtime.serverName val remoteAddress: StateFlow = runtime.remoteAddress + val isForeground: StateFlow = runtime.isForeground val cameraHud: StateFlow = runtime.cameraHud val cameraFlashToken: StateFlow = runtime.cameraFlashToken + val screenRecordActive: StateFlow = runtime.screenRecordActive val instanceId: StateFlow = runtime.instanceId val displayName: StateFlow = runtime.displayName diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt index 4984f7e0f..21a22a428 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt @@ -111,6 +111,9 @@ class NodeRuntime(context: Context) { private val _cameraFlashToken = MutableStateFlow(0L) val cameraFlashToken: StateFlow = _cameraFlashToken.asStateFlow() + private val _screenRecordActive = MutableStateFlow(false) + val 
screenRecordActive: StateFlow = _screenRecordActive.asStateFlow() + private val _serverName = MutableStateFlow(null) val serverName: StateFlow = _serverName.asStateFlow() @@ -756,14 +759,20 @@ class NodeRuntime(context: Context) { } } ClawdisScreenCommand.Record.rawValue -> { - val res = - try { - screenRecorder.record(paramsJson) - } catch (err: Throwable) { - val (code, message) = invokeErrorFromThrowable(err) - return BridgeSession.InvokeResult.error(code = code, message = message) - } - BridgeSession.InvokeResult.ok(res.payloadJson) + // Status pill mirrors screen recording state so it stays visible without overlay stacking. + _screenRecordActive.value = true + try { + val res = + try { + screenRecorder.record(paramsJson) + } catch (err: Throwable) { + val (code, message) = invokeErrorFromThrowable(err) + return BridgeSession.InvokeResult.error(code = code, message = message) + } + BridgeSession.InvokeResult.ok(res.payloadJson) + } finally { + _screenRecordActive.value = false + } } else -> BridgeSession.InvokeResult.error( diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt index 2594449b8..86d5a334e 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt @@ -36,6 +36,10 @@ import androidx.compose.material.icons.filled.CheckCircle import androidx.compose.material.icons.filled.Error import androidx.compose.material.icons.filled.FiberManualRecord import androidx.compose.material.icons.filled.PhotoCamera +import androidx.compose.material.icons.filled.RecordVoiceOver +import androidx.compose.material.icons.filled.Refresh +import androidx.compose.material.icons.filled.Report +import androidx.compose.material.icons.filled.ScreenShare import androidx.compose.material.icons.filled.Settings import androidx.compose.runtime.Composable import 
androidx.compose.runtime.collectAsState @@ -65,39 +69,100 @@ fun RootScreen(viewModel: MainViewModel) { val statusText by viewModel.statusText.collectAsState() val cameraHud by viewModel.cameraHud.collectAsState() val cameraFlashToken by viewModel.cameraFlashToken.collectAsState() + val screenRecordActive by viewModel.screenRecordActive.collectAsState() + val isForeground by viewModel.isForeground.collectAsState() + val voiceWakeStatusText by viewModel.voiceWakeStatusText.collectAsState() val activity = - remember(cameraHud) { - // Status pill owns transient capture state so it doesn't overlap the connection indicator. - cameraHud?.let { hud -> - when (hud.kind) { + remember(cameraHud, screenRecordActive, isForeground, statusText, voiceWakeStatusText) { + // Status pill owns transient activity state so it doesn't overlap the connection indicator. + if (!isForeground) { + return@remember StatusActivity( + title = "Foreground required", + icon = Icons.Default.Report, + contentDescription = "Foreground required", + ) + } + + val lowerStatus = statusText.lowercase() + if (lowerStatus.contains("repair")) { + return@remember StatusActivity( + title = "Repairing…", + icon = Icons.Default.Refresh, + contentDescription = "Repairing", + ) + } + if (lowerStatus.contains("pairing") || lowerStatus.contains("approval")) { + return@remember StatusActivity( + title = "Approval pending", + icon = Icons.Default.RecordVoiceOver, + contentDescription = "Approval pending", + ) + } + if (lowerStatus.contains("reconnecting") || lowerStatus.contains("connecting")) { + return@remember StatusActivity( + title = "Gateway reconnecting…", + icon = Icons.Default.Refresh, + contentDescription = "Gateway reconnecting", + ) + } + + if (screenRecordActive) { + return@remember StatusActivity( + title = "Recording screen…", + icon = Icons.Default.ScreenShare, + contentDescription = "Recording screen", + tint = androidx.compose.ui.graphics.Color.Red, + ) + } + + if (cameraHud != null) { + 
return@remember when (cameraHud.kind) { CameraHudKind.Photo -> StatusActivity( - title = hud.message, + title = cameraHud.message, icon = Icons.Default.PhotoCamera, contentDescription = "Taking photo", ) CameraHudKind.Recording -> StatusActivity( - title = hud.message, + title = cameraHud.message, icon = Icons.Default.FiberManualRecord, contentDescription = "Recording", tint = androidx.compose.ui.graphics.Color.Red, ) CameraHudKind.Success -> StatusActivity( - title = hud.message, + title = cameraHud.message, icon = Icons.Default.CheckCircle, contentDescription = "Capture finished", ) CameraHudKind.Error -> StatusActivity( - title = hud.message, + title = cameraHud.message, icon = Icons.Default.Error, contentDescription = "Capture failed", tint = androidx.compose.ui.graphics.Color.Red, ) } } + + if (voiceWakeStatusText.contains("Microphone permission", ignoreCase = true)) { + return@remember StatusActivity( + title = "Mic permission", + icon = Icons.Default.Error, + contentDescription = "Mic permission required", + ) + } + if (voiceWakeStatusText == "Paused") { + val suffix = if (!isForeground) " (background)" else "" + return@remember StatusActivity( + title = "Voice Wake paused$suffix", + icon = Icons.Default.RecordVoiceOver, + contentDescription = "Voice Wake paused", + ) + } + + null } val bridgeState = diff --git a/apps/ios/Sources/Model/NodeAppModel.swift b/apps/ios/Sources/Model/NodeAppModel.swift index 4c491ea55..8c2935ffc 100644 --- a/apps/ios/Sources/Model/NodeAppModel.swift +++ b/apps/ios/Sources/Model/NodeAppModel.swift @@ -36,6 +36,7 @@ final class NodeAppModel { var cameraHUDText: String? var cameraHUDKind: CameraHUDKind? 
var cameraFlashNonce: Int = 0 + var screenRecordActive: Bool = false init() { self.voiceWake.configure { [weak self] cmd in @@ -598,6 +599,9 @@ final class NodeAppModel { NSLocalizedDescriptionKey: "INVALID_REQUEST: screen format must be mp4", ]) } + // Status pill mirrors screen recording state so it stays visible without overlay stacking. + self.screenRecordActive = true + defer { self.screenRecordActive = false } let path = try await self.screenRecorder.record( screenIndex: params.screenIndex, durationMs: params.durationMs, diff --git a/apps/ios/Sources/RootCanvas.swift b/apps/ios/Sources/RootCanvas.swift index c02eceb69..b55f84cc1 100644 --- a/apps/ios/Sources/RootCanvas.swift +++ b/apps/ios/Sources/RootCanvas.swift @@ -119,6 +119,7 @@ struct RootCanvas: View { } private struct CanvasContent: View { + @Environment(NodeAppModel.self) private var appModel var systemColorScheme: ColorScheme var bridgeStatus: StatusPill.BridgeState var voiceWakeEnabled: Bool @@ -173,28 +174,63 @@ private struct CanvasContent: View { } private var statusActivity: StatusPill.Activity? { - // Status pill owns transient capture state so it doesn't overlap the connection indicator. - guard let cameraHUDText, !cameraHUDText.isEmpty, let cameraHUDKind else { return nil } - let systemImage: String - let tint: Color? - switch cameraHUDKind { - case .photo: - systemImage = "camera.fill" - tint = nil - case .recording: - systemImage = "video.fill" - tint = .red - case .success: - systemImage = "checkmark.circle.fill" - tint = .green - case .error: - systemImage = "exclamationmark.triangle.fill" - tint = .red + // Status pill owns transient activity state so it doesn't overlap the connection indicator. 
+ if self.appModel.isBackgrounded { + return StatusPill.Activity( + title: "Foreground required", + systemImage: "exclamationmark.triangle.fill", + tint: .orange) } - return StatusPill.Activity(title: cameraHUDText, systemImage: systemImage, tint: tint) - } + let bridgeStatus = self.appModel.bridgeStatusText.trimmingCharacters(in: .whitespacesAndNewlines) + let bridgeLower = bridgeStatus.lowercased() + if bridgeLower.contains("repair") { + return StatusPill.Activity(title: "Repairing…", systemImage: "wrench.and.screwdriver", tint: .orange) + } + if bridgeLower.contains("approval") || bridgeLower.contains("pairing") { + return StatusPill.Activity(title: "Approval pending", systemImage: "person.crop.circle.badge.clock") + } + if bridgeLower.contains("reconnecting") || bridgeLower.contains("connecting") { + return StatusPill.Activity(title: "Gateway reconnecting…", systemImage: "arrow.triangle.2.circlepath") + } + if self.appModel.screenRecordActive { + return StatusPill.Activity(title: "Recording screen…", systemImage: "record.circle.fill", tint: .red) + } + + if let cameraHUDText, !cameraHUDText.isEmpty, let cameraHUDKind { + let systemImage: String + let tint: Color? + switch cameraHUDKind { + case .photo: + systemImage = "camera.fill" + tint = nil + case .recording: + systemImage = "video.fill" + tint = .red + case .success: + systemImage = "checkmark.circle.fill" + tint = .green + case .error: + systemImage = "exclamationmark.triangle.fill" + tint = .red + } + return StatusPill.Activity(title: cameraHUDText, systemImage: systemImage, tint: tint) + } + + if self.voiceWakeEnabled { + let voiceStatus = self.appModel.voiceWake.statusText + if voiceStatus.localizedCaseInsensitiveContains("microphone permission") { + return StatusPill.Activity(title: "Mic permission", systemImage: "mic.slash", tint: .orange) + } + if voiceStatus == "Paused" { + let suffix = self.appModel.isBackgrounded ? 
" (background)" : "" + return StatusPill.Activity(title: "Voice Wake paused\(suffix)", systemImage: "pause.circle.fill") + } + } + + return nil + } } private struct OverlayButton: View { diff --git a/apps/ios/Sources/RootTabs.swift b/apps/ios/Sources/RootTabs.swift index 913073d4a..a480fc8ab 100644 --- a/apps/ios/Sources/RootTabs.swift +++ b/apps/ios/Sources/RootTabs.swift @@ -26,7 +26,7 @@ struct RootTabs: View { StatusPill( bridge: self.bridgeStatus, voiceWakeEnabled: self.voiceWakeEnabled, - activity: nil, + activity: self.statusActivity, onTap: { self.selectedTab = 2 }) .padding(.leading, 10) .safeAreaPadding(.top, 10) @@ -80,4 +80,66 @@ struct RootTabs: View { return .disconnected } + + private var statusActivity: StatusPill.Activity? { + // Keep the top pill consistent across tabs (camera + voice wake + pairing states). + if self.appModel.isBackgrounded { + return StatusPill.Activity( + title: "Foreground required", + systemImage: "exclamationmark.triangle.fill", + tint: .orange) + } + + let bridgeStatus = self.appModel.bridgeStatusText.trimmingCharacters(in: .whitespacesAndNewlines) + let bridgeLower = bridgeStatus.lowercased() + if bridgeLower.contains("repair") { + return StatusPill.Activity(title: "Repairing…", systemImage: "wrench.and.screwdriver", tint: .orange) + } + if bridgeLower.contains("approval") || bridgeLower.contains("pairing") { + return StatusPill.Activity(title: "Approval pending", systemImage: "person.crop.circle.badge.clock") + } + if bridgeLower.contains("reconnecting") || bridgeLower.contains("connecting") { + return StatusPill.Activity(title: "Gateway reconnecting…", systemImage: "arrow.triangle.2.circlepath") + } + + if self.appModel.screenRecordActive { + return StatusPill.Activity(title: "Recording screen…", systemImage: "record.circle.fill", tint: .red) + } + + if let cameraHUDText = self.appModel.cameraHUDText, + let cameraHUDKind = self.appModel.cameraHUDKind, + !cameraHUDText.isEmpty + { + let systemImage: String + let tint: 
Color? + switch cameraHUDKind { + case .photo: + systemImage = "camera.fill" + tint = nil + case .recording: + systemImage = "video.fill" + tint = .red + case .success: + systemImage = "checkmark.circle.fill" + tint = .green + case .error: + systemImage = "exclamationmark.triangle.fill" + tint = .red + } + return StatusPill.Activity(title: cameraHUDText, systemImage: systemImage, tint: tint) + } + + if self.voiceWakeEnabled { + let voiceStatus = self.appModel.voiceWake.statusText + if voiceStatus.localizedCaseInsensitiveContains("microphone permission") { + return StatusPill.Activity(title: "Mic permission", systemImage: "mic.slash", tint: .orange) + } + if voiceStatus == "Paused" { + let suffix = self.appModel.isBackgrounded ? " (background)" : "" + return StatusPill.Activity(title: "Voice Wake paused\(suffix)", systemImage: "pause.circle.fill") + } + } + + return nil + } } diff --git a/apps/macos/Sources/Clawdis/MenuContentView.swift b/apps/macos/Sources/Clawdis/MenuContentView.swift index 748ce018d..dee70ed5d 100644 --- a/apps/macos/Sources/Clawdis/MenuContentView.swift +++ b/apps/macos/Sources/Clawdis/MenuContentView.swift @@ -14,6 +14,7 @@ struct MenuContent: View { private let heartbeatStore = HeartbeatStore.shared private let controlChannel = ControlChannel.shared private let activityStore = WorkActivityStore.shared + @Bindable private var pairingPrompter = NodePairingApprovalPrompter.shared @Environment(\.openSettings) private var openSettings @State private var availableMics: [AudioInputDevice] = [] @State private var loadingMics = false @@ -32,6 +33,13 @@ struct MenuContent: View { VStack(alignment: .leading, spacing: 2) { Text(self.connectionLabel) self.statusLine(label: self.healthStatus.label, color: self.healthStatus.color) + if self.pairingPrompter.pendingCount > 0 { + let repairCount = self.pairingPrompter.pendingRepairCount + let repairSuffix = repairCount > 0 ? 
" · \(repairCount) repair" : "" + self.statusLine( + label: "Pairing approval pending (\(self.pairingPrompter.pendingCount))\(repairSuffix)", + color: .orange) + } } } .disabled(self.state.connectionMode == .unconfigured) diff --git a/apps/macos/Sources/Clawdis/NodePairingApprovalPrompter.swift b/apps/macos/Sources/Clawdis/NodePairingApprovalPrompter.swift index 85d8a9f12..932f272f6 100644 --- a/apps/macos/Sources/Clawdis/NodePairingApprovalPrompter.swift +++ b/apps/macos/Sources/Clawdis/NodePairingApprovalPrompter.swift @@ -2,6 +2,7 @@ import AppKit import ClawdisIPC import ClawdisProtocol import Foundation +import Observation import OSLog import UserNotifications @@ -15,6 +16,7 @@ enum NodePairingReconcilePolicy { } @MainActor +@Observable final class NodePairingApprovalPrompter { static let shared = NodePairingApprovalPrompter() @@ -26,6 +28,8 @@ final class NodePairingApprovalPrompter { private var isStopping = false private var isPresenting = false private var queue: [PendingRequest] = [] + var pendingCount: Int = 0 + var pendingRepairCount: Int = 0 private var activeAlert: NSAlert? private var activeRequestId: String? private var alertHostWindow: NSWindow? 
@@ -104,6 +108,7 @@ final class NodePairingApprovalPrompter { self.reconcileOnceTask?.cancel() self.reconcileOnceTask = nil self.queue.removeAll(keepingCapacity: false) + self.updatePendingCounts() self.isPresenting = false self.activeRequestId = nil self.alertHostWindow?.orderOut(nil) @@ -292,6 +297,7 @@ final class NodePairingApprovalPrompter { private func enqueue(_ req: PendingRequest) { if self.queue.contains(req) { return } self.queue.append(req) + self.updatePendingCounts() self.presentNextIfNeeded() self.updateReconcileLoop() } @@ -362,6 +368,7 @@ final class NodePairingApprovalPrompter { } else { self.queue.removeAll { $0 == request } } + self.updatePendingCounts() self.isPresenting = false self.presentNextIfNeeded() self.updateReconcileLoop() @@ -501,6 +508,8 @@ final class NodePairingApprovalPrompter { } else { self.queue.removeAll { $0 == req } } + + self.updatePendingCounts() self.isPresenting = false self.presentNextIfNeeded() self.updateReconcileLoop() @@ -599,6 +608,12 @@ final class NodePairingApprovalPrompter { } } + private func updatePendingCounts() { + // Keep a cheap observable summary for the menu bar status line. 
+ self.pendingCount = self.queue.count + self.pendingRepairCount = self.queue.filter { $0.isRepair == true }.count + } + private func reconcileOnce(timeoutMs: Double) async { if self.isStopping { return } if self.reconcileInFlight { return } @@ -643,6 +658,7 @@ final class NodePairingApprovalPrompter { return } self.queue.removeAll { $0.requestId == resolved.requestId } + self.updatePendingCounts() Task { @MainActor in await self.notify(resolution: resolution, request: request, via: "remote") } From 857cd6a28ae318c49a50fa1db45ca8d0b584ea26 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 29 Dec 2025 23:45:58 +0100 Subject: [PATCH 024/100] fix: align ios lint and android build --- .../java/com/steipete/clawdis/node/MainViewModel.kt | 1 - .../java/com/steipete/clawdis/node/ui/RootScreen.kt | 12 ++++++------ apps/ios/Sources/Status/StatusPill.swift | 4 ++-- apps/ios/Sources/Voice/TalkModeManager.swift | 9 ++++++--- apps/ios/SwiftSources.input.xcfilelist | 2 ++ .../Sources/ClawdisKit/TalkDirective.swift | 7 ++++--- 6 files changed, 20 insertions(+), 15 deletions(-) diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/MainViewModel.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/MainViewModel.kt index f1fef1640..69b394a63 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/MainViewModel.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/MainViewModel.kt @@ -36,7 +36,6 @@ class MainViewModel(app: Application) : AndroidViewModel(app) { val wakeWords: StateFlow> = runtime.wakeWords val voiceWakeMode: StateFlow = runtime.voiceWakeMode val voiceWakeStatusText: StateFlow = runtime.voiceWakeStatusText - val voiceWakeStatusText: StateFlow = runtime.voiceWakeStatusText val voiceWakeIsListening: StateFlow = runtime.voiceWakeIsListening val talkEnabled: StateFlow = runtime.talkEnabled val talkStatusText: StateFlow = runtime.talkStatusText diff --git 
a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt index 86d5a334e..cb11cf303 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt @@ -115,30 +115,30 @@ fun RootScreen(viewModel: MainViewModel) { ) } - if (cameraHud != null) { - return@remember when (cameraHud.kind) { + cameraHud?.let { hud -> + return@remember when (hud.kind) { CameraHudKind.Photo -> StatusActivity( - title = cameraHud.message, + title = hud.message, icon = Icons.Default.PhotoCamera, contentDescription = "Taking photo", ) CameraHudKind.Recording -> StatusActivity( - title = cameraHud.message, + title = hud.message, icon = Icons.Default.FiberManualRecord, contentDescription = "Recording", tint = androidx.compose.ui.graphics.Color.Red, ) CameraHudKind.Success -> StatusActivity( - title = cameraHud.message, + title = hud.message, icon = Icons.Default.CheckCircle, contentDescription = "Capture finished", ) CameraHudKind.Error -> StatusActivity( - title = cameraHud.message, + title = hud.message, icon = Icons.Default.Error, contentDescription = "Capture failed", tint = androidx.compose.ui.graphics.Color.Red, diff --git a/apps/ios/Sources/Status/StatusPill.swift b/apps/ios/Sources/Status/StatusPill.swift index f5df8e7df..1e30ad16d 100644 --- a/apps/ios/Sources/Status/StatusPill.swift +++ b/apps/ios/Sources/Status/StatusPill.swift @@ -31,12 +31,12 @@ struct StatusPill: View { struct Activity: Equatable { var title: String var systemImage: String - var tint: Color? = nil + var tint: Color? } var bridge: BridgeState var voiceWakeEnabled: Bool - var activity: Activity? = nil + var activity: Activity? 
var brighten: Bool = false var onTap: () -> Void diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift index 649eaa03a..82cd451c3 100644 --- a/apps/ios/Sources/Voice/TalkModeManager.swift +++ b/apps/ios/Sources/Voice/TalkModeManager.swift @@ -233,7 +233,7 @@ final class TalkModeManager: NSObject { "sessionKey": "main", "message": message, "thinking": "low", - "timeoutMs": 30_000, + "timeoutMs": 30000, "idempotencyKey": UUID().uuidString, ] let data = try JSONSerialization.data(withJSONObject: payload) @@ -260,7 +260,10 @@ final class TalkModeManager: NSObject { } private func fetchLatestAssistantText(bridge: BridgeSession) async throws -> String? { - let res = try await bridge.request(method: "chat.history", paramsJSON: "{\"sessionKey\":\"main\"}", timeoutSeconds: 15) + let res = try await bridge.request( + method: "chat.history", + paramsJSON: "{\"sessionKey\":\"main\"}", + timeoutSeconds: 15) guard let json = try JSONSerialization.jsonObject(with: res) as? [String: Any] else { return nil } guard let messages = json["messages"] as? [[String: Any]] else { return nil } for msg in messages.reversed() { @@ -499,7 +502,7 @@ private enum TalkModeRuntime { static func validatedSeed(_ value: Int?) -> UInt32? 
{ guard let value else { return nil } - if value < 0 || value > 4294967295 { return nil } + if value < 0 || value > 4_294_967_295 { return nil } return UInt32(value) } diff --git a/apps/ios/SwiftSources.input.xcfilelist b/apps/ios/SwiftSources.input.xcfilelist index 3e2a9a7b0..81d42dce1 100644 --- a/apps/ios/SwiftSources.input.xcfilelist +++ b/apps/ios/SwiftSources.input.xcfilelist @@ -54,4 +54,6 @@ Sources/Voice/VoiceWakePreferences.swift ../shared/ClawdisKit/Sources/ClawdisKit/ScreenCommands.swift ../shared/ClawdisKit/Sources/ClawdisKit/StoragePaths.swift ../shared/ClawdisKit/Sources/ClawdisKit/SystemCommands.swift +../shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift ../../Swabble/Sources/SwabbleKit/WakeWordGate.swift +Sources/Voice/TalkModeManager.swift diff --git a/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift index 6bc4c0195..af0e2365f 100644 --- a/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift +++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/TalkDirective.swift @@ -67,7 +67,8 @@ public enum TalkDirectiveParser { var lines = normalized.split(separator: "\n", omittingEmptySubsequences: false) guard !lines.isEmpty else { return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) } - guard let firstNonEmpty = lines.firstIndex(where: { !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }) + guard let firstNonEmpty = + lines.firstIndex(where: { !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }) else { return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) } @@ -83,8 +84,8 @@ public enum TalkDirectiveParser { return TalkDirectiveParseResult(directive: nil, stripped: text, unknownKeys: []) } - let speakerBoost = boolValue(json, keys: ["speaker_boost", "speakerBoost"]) - ?? 
boolValue(json, keys: ["no_speaker_boost", "noSpeakerBoost"]).map { !$0 } + let speakerBoost = self.boolValue(json, keys: ["speaker_boost", "speakerBoost"]) + ?? self.boolValue(json, keys: ["no_speaker_boost", "noSpeakerBoost"]).map { !$0 } let directive = TalkDirective( voiceId: stringValue(json, keys: ["voice", "voice_id", "voiceId"]), From c56292a6ec17905b1fce2794ea12a9750f0afe76 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 30 Dec 2025 00:01:21 +0100 Subject: [PATCH 025/100] feat: move talk mode to overlay button --- CHANGELOG.md | 2 ++ .../steipete/clawdis/node/ui/RootScreen.kt | 33 +++++++++++++++++++ .../steipete/clawdis/node/ui/SettingsSheet.kt | 24 -------------- apps/ios/Sources/RootCanvas.swift | 15 +++++++++ apps/ios/Sources/Settings/SettingsTab.swift | 3 ++ 5 files changed, 53 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 01821b781..cd94a3ce5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ ### Fixes - macOS: Voice Wake now fully tears down the Speech pipeline when disabled (cancel pending restarts, drop stale callbacks) to avoid high CPU in the background. +- macOS menu: add a Talk Mode action alongside the Open Dashboard/Chat/Canvas entries. - iOS/Android nodes: enable scrolling for loaded web pages in the Canvas WebView (default scaffold stays touch-first). - macOS menu: device list now uses `node.list` (devices only; no agent/tool presence entries). - macOS menu: device list now shows connected nodes only. @@ -15,6 +16,7 @@ - iOS/Android nodes: status pill now surfaces camera activity instead of overlay toasts. - iOS/Android/macOS nodes: camera snaps recompress to keep base64 payloads under 5 MB. - iOS/Android nodes: status pill now surfaces pairing, screen recording, voice wake, and foreground-required states. +- iOS/Android nodes: Talk Mode now lives on a side bubble (with an iOS toggle to hide it), and Android settings no longer show the Talk Mode switch. 
- macOS menu: top status line now shows pending node pairing approvals (incl. repairs). - CLI: avoid spurious gateway close errors after successful request/response cycles. - Agent runtime: clamp tool-result images to the 5MB Anthropic limit to avoid hard request rejections. diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt index cb11cf303..791f76325 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt @@ -13,6 +13,8 @@ import android.webkit.WebResourceError import android.webkit.WebResourceRequest import android.webkit.WebResourceResponse import android.webkit.WebViewClient +import androidx.activity.compose.rememberLauncherForActivityResult +import androidx.activity.result.contract.ActivityResultContracts import androidx.compose.foundation.layout.Arrangement import androidx.compose.foundation.layout.Box import androidx.compose.foundation.layout.Column @@ -28,6 +30,8 @@ import androidx.compose.material3.ExperimentalMaterial3Api import androidx.compose.material3.FilledTonalIconButton import androidx.compose.material3.Icon import androidx.compose.material3.IconButtonDefaults +import androidx.compose.material3.LocalContentColor +import androidx.compose.material3.MaterialTheme import androidx.compose.material3.ModalBottomSheet import androidx.compose.material3.rememberModalBottomSheetState import androidx.compose.material.icons.Icons @@ -72,6 +76,11 @@ fun RootScreen(viewModel: MainViewModel) { val screenRecordActive by viewModel.screenRecordActive.collectAsState() val isForeground by viewModel.isForeground.collectAsState() val voiceWakeStatusText by viewModel.voiceWakeStatusText.collectAsState() + val talkEnabled by viewModel.talkEnabled.collectAsState() + val audioPermissionLauncher = + 
rememberLauncherForActivityResult(ActivityResultContracts.RequestPermission()) { granted -> + if (granted) viewModel.setTalkEnabled(true) + } val activity = remember(cameraHud, screenRecordActive, isForeground, statusText, voiceWakeStatusText) { // Status pill owns transient activity state so it doesn't overlap the connection indicator. @@ -211,6 +220,30 @@ fun RootScreen(viewModel: MainViewModel) { icon = { Icon(Icons.Default.ChatBubble, contentDescription = "Chat") }, ) + // Talk mode gets a dedicated side bubble instead of burying it in settings. + OverlayIconButton( + onClick = { + val next = !talkEnabled + if (next) { + val micOk = + ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) == + PackageManager.PERMISSION_GRANTED + if (!micOk) audioPermissionLauncher.launch(Manifest.permission.RECORD_AUDIO) + viewModel.setTalkEnabled(true) + } else { + viewModel.setTalkEnabled(false) + } + }, + icon = { + val tint = if (talkEnabled) MaterialTheme.colorScheme.primary else LocalContentColor.current + Icon( + Icons.Default.RecordVoiceOver, + contentDescription = "Talk Mode", + tint = tint, + ) + }, + ) + OverlayIconButton( onClick = { sheet = Sheet.Settings }, icon = { Icon(Icons.Default.Settings, contentDescription = "Settings") }, diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/SettingsSheet.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/SettingsSheet.kt index 2ec4a7119..c7d011892 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/SettingsSheet.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/SettingsSheet.kt @@ -62,8 +62,6 @@ fun SettingsSheet(viewModel: MainViewModel) { val wakeWords by viewModel.wakeWords.collectAsState() val voiceWakeMode by viewModel.voiceWakeMode.collectAsState() val voiceWakeStatusText by viewModel.voiceWakeStatusText.collectAsState() - val talkEnabled by viewModel.talkEnabled.collectAsState() - val talkStatusText by 
viewModel.talkStatusText.collectAsState() val isConnected by viewModel.isConnected.collectAsState() val manualEnabled by viewModel.manualEnabled.collectAsState() val manualHost by viewModel.manualHost.collectAsState() @@ -309,28 +307,6 @@ fun SettingsSheet(viewModel: MainViewModel) { // Voice item { Text("Voice", style = MaterialTheme.typography.titleSmall) } - item { - ListItem( - headlineContent = { Text("Talk Mode") }, - supportingContent = { Text(talkStatusText) }, - trailingContent = { - Switch( - checked = talkEnabled, - onCheckedChange = { on -> - if (on) { - val micOk = - ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) == - PackageManager.PERMISSION_GRANTED - if (!micOk) audioPermissionLauncher.launch(Manifest.permission.RECORD_AUDIO) - viewModel.setTalkEnabled(true) - } else { - viewModel.setTalkEnabled(false) - } - }, - ) - }, - ) - } item { val enabled = voiceWakeMode != VoiceWakeMode.Off ListItem( diff --git a/apps/ios/Sources/RootCanvas.swift b/apps/ios/Sources/RootCanvas.swift index b55f84cc1..910d96a3d 100644 --- a/apps/ios/Sources/RootCanvas.swift +++ b/apps/ios/Sources/RootCanvas.swift @@ -120,6 +120,8 @@ struct RootCanvas: View { private struct CanvasContent: View { @Environment(NodeAppModel.self) private var appModel + @AppStorage("talk.enabled") private var talkEnabled: Bool = false + @AppStorage("talk.button.enabled") private var talkButtonEnabled: Bool = true var systemColorScheme: ColorScheme var bridgeStatus: StatusPill.BridgeState var voiceWakeEnabled: Bool @@ -141,6 +143,19 @@ private struct CanvasContent: View { } .accessibilityLabel("Chat") + if self.talkButtonEnabled { + // Talk mode lives on a side bubble so it doesn't get buried in settings. + OverlayButton( + systemImage: self.appModel.talkMode.isEnabled ? 
"waveform.circle.fill" : "waveform.circle", + brighten: self.brightenButtons) + { + let next = !self.appModel.talkMode.isEnabled + self.talkEnabled = next + self.appModel.setTalkEnabled(next) + } + .accessibilityLabel("Talk Mode") + } + OverlayButton(systemImage: "gearshape.fill", brighten: self.brightenButtons) { self.openSettings() } diff --git a/apps/ios/Sources/Settings/SettingsTab.swift b/apps/ios/Sources/Settings/SettingsTab.swift index 265b7069c..34b05dfc9 100644 --- a/apps/ios/Sources/Settings/SettingsTab.swift +++ b/apps/ios/Sources/Settings/SettingsTab.swift @@ -21,6 +21,7 @@ struct SettingsTab: View { @AppStorage("node.instanceId") private var instanceId: String = UUID().uuidString @AppStorage("voiceWake.enabled") private var voiceWakeEnabled: Bool = false @AppStorage("talk.enabled") private var talkEnabled: Bool = false + @AppStorage("talk.button.enabled") private var talkButtonEnabled: Bool = true @AppStorage("camera.enabled") private var cameraEnabled: Bool = true @AppStorage("screen.preventSleep") private var preventSleep: Bool = true @AppStorage("bridge.preferredStableID") private var preferredBridgeStableID: String = "" @@ -161,6 +162,8 @@ struct SettingsTab: View { .onChange(of: self.talkEnabled) { _, newValue in self.appModel.setTalkEnabled(newValue) } + // Keep this separate so users can hide the side bubble without disabling Talk Mode. 
+ Toggle("Show Talk Button", isOn: self.$talkButtonEnabled) NavigationLink { VoiceWakeWordsSettingsView() From 53eccc1c1ecd4fe8f71719bf10c8a9239cab370a Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 30 Dec 2025 00:17:10 +0100 Subject: [PATCH 026/100] fix: wire talk menu + mac build --- AGENTS.md | 1 + .../Sources/Clawdis/ConfigSettings.swift | 23 ++++++++--- .../Sources/Clawdis/MenuContentView.swift | 20 ++++----- .../Sources/Clawdis/TalkAudioPlayer.swift | 2 +- .../Sources/Clawdis/TalkModeRuntime.swift | 41 ++++++++++--------- 5 files changed, 48 insertions(+), 39 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 0abed7948..8226882d7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -41,6 +41,7 @@ - Also read the shared guardrails at `~/Projects/oracle/AGENTS.md` and `~/Projects/agent-scripts/AGENTS.MD` before making changes; align with any cross-repo rules noted there. - SwiftUI state management (iOS/macOS): prefer the `Observation` framework (`@Observable`, `@Bindable`) over `ObservableObject`/`@StateObject`; don’t introduce new `ObservableObject` unless required for compatibility, and migrate existing usages when touching related code. - **Restart apps:** “restart iOS/Android apps” means rebuild (recompile/install) and relaunch, not just kill/launch. +- **Device checks:** before testing, verify connected real devices (iOS/Android) before reaching for simulators/emulators. - iOS Team ID lookup: `security find-identity -p codesigning -v` → use Apple Development (…) TEAMID. Fallback: `defaults read com.apple.dt.Xcode IDEProvisioningTeamIdentifiers`. - A2UI bundle hash: `src/canvas-host/a2ui/.bundle.hash` is auto-generated; regenerate via `pnpm canvas:a2ui:bundle` (or `scripts/bundle-a2ui.sh`) instead of manual conflict resolution. - Notary key file lives at `~/Library/CloudStorage/Dropbox/Backup/AppStore/AuthKey_NJF3NFGTS3.p8` (Sparkle keys live under `~/Library/CloudStorage/Dropbox/Backup/Sparkle`). 
diff --git a/apps/macos/Sources/Clawdis/ConfigSettings.swift b/apps/macos/Sources/Clawdis/ConfigSettings.swift index cbbf04d5a..7c0867d79 100644 --- a/apps/macos/Sources/Clawdis/ConfigSettings.swift +++ b/apps/macos/Sources/Clawdis/ConfigSettings.swift @@ -277,14 +277,25 @@ struct ConfigSettings: View { GridRow { self.gridLabel("Voice ID") VStack(alignment: .leading, spacing: 6) { - ComboBox("ElevenLabs voice ID", text: self.$talkVoiceId) { - ForEach(self.talkVoiceSuggestions, id: \.self) { value in - Text(value).tag(value) + HStack(spacing: 8) { + TextField("ElevenLabs voice ID", text: self.$talkVoiceId) + .textFieldStyle(.roundedBorder) + .frame(maxWidth: .infinity) + .onChange(of: self.talkVoiceId) { _, _ in self.autosaveConfig() } + if !self.talkVoiceSuggestions.isEmpty { + Menu { + ForEach(self.talkVoiceSuggestions, id: \.self) { value in + Button(value) { + self.talkVoiceId = value + self.autosaveConfig() + } + } + } label: { + Label("Suggestions", systemImage: "chevron.up.chevron.down") + } + .fixedSize() } } - .textFieldStyle(.roundedBorder) - .frame(maxWidth: .infinity) - .onChange(of: self.talkVoiceId) { _, _ in self.autosaveConfig() } Text("Defaults to ELEVENLABS_VOICE_ID / SAG_VOICE_ID if unset.") .font(.footnote) .foregroundStyle(.secondary) diff --git a/apps/macos/Sources/Clawdis/MenuContentView.swift b/apps/macos/Sources/Clawdis/MenuContentView.swift index dee70ed5d..e1453e5a2 100644 --- a/apps/macos/Sources/Clawdis/MenuContentView.swift +++ b/apps/macos/Sources/Clawdis/MenuContentView.swift @@ -80,11 +80,6 @@ struct MenuContent: View { if self.showVoiceWakeMicPicker { self.voiceWakeMicMenu } - Toggle(isOn: self.talkBinding) { - Label("Talk", systemImage: "bubble.left.and.waveform") - } - .disabled(!voiceWakeSupported) - .opacity(voiceWakeSupported ? 
1 : 0.5) Divider() Button { Task { @MainActor in @@ -115,6 +110,13 @@ struct MenuContent: View { systemImage: "rectangle.inset.filled.on.rectangle") } } + Button { + Task { await self.state.setTalkEnabled(!self.state.talkEnabled) } + } label: { + Label(self.state.talkEnabled ? "Stop Talk Mode" : "Talk Mode", systemImage: "bubble.left.and.waveform") + } + .disabled(!voiceWakeSupported) + .opacity(voiceWakeSupported ? 1 : 0.5) Divider() Button("Settings…") { self.open(tab: .general) } .keyboardShortcut(",", modifiers: [.command]) @@ -344,14 +346,6 @@ struct MenuContent: View { }) } - private var talkBinding: Binding { - Binding( - get: { self.state.talkEnabled }, - set: { newValue in - Task { await self.state.setTalkEnabled(newValue) } - }) - } - private var showVoiceWakeMicPicker: Bool { voiceWakeSupported && self.state.swabbleEnabled } diff --git a/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift b/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift index f72de1d02..b1df3886b 100644 --- a/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift +++ b/apps/macos/Sources/Clawdis/TalkAudioPlayer.swift @@ -3,7 +3,7 @@ import Foundation import OSLog @MainActor -final class TalkAudioPlayer: NSObject, AVAudioPlayerDelegate { +final class TalkAudioPlayer: NSObject, @preconcurrency AVAudioPlayerDelegate { static let shared = TalkAudioPlayer() private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts") diff --git a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift index 955d9ceda..3be350d9d 100644 --- a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift +++ b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift @@ -87,9 +87,9 @@ actor TalkModeRuntime { private struct RecognitionUpdate { let transcript: String? - let segments: [SFTranscriptionSegment] + let hasConfidence: Bool let isFinal: Bool - let error: Error? + let errorDescription: String? 
let generation: Int } @@ -136,12 +136,13 @@ actor TalkModeRuntime { self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self, generation] result, error in guard let self else { return } + let segments = result?.bestTranscription.segments ?? [] let transcript = result?.bestTranscription.formattedString let update = RecognitionUpdate( transcript: transcript, - segments: result?.bestTranscription.segments ?? [], + hasConfidence: segments.contains { $0.confidence > 0.6 }, isFinal: result?.isFinal ?? false, - error: error, + errorDescription: error?.localizedDescription, generation: generation) Task { await self.handleRecognition(update) } } @@ -161,14 +162,14 @@ actor TalkModeRuntime { private func handleRecognition(_ update: RecognitionUpdate) async { guard update.generation == self.recognitionGeneration else { return } - if let error = update.error { - self.logger.debug("talk recognition error: \(error.localizedDescription, privacy: .public)") + if let errorDescription = update.errorDescription { + self.logger.debug("talk recognition error: \(errorDescription, privacy: .public)") } guard let transcript = update.transcript else { return } let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines) if self.phase == .speaking, self.interruptOnSpeech { - if await self.shouldInterrupt(transcript: trimmed, segments: update.segments) { + if await self.shouldInterrupt(transcript: trimmed, hasConfidence: update.hasConfidence) { await self.stopSpeaking(reason: .speech) self.lastTranscript = "" self.lastHeard = nil @@ -194,11 +195,14 @@ actor TalkModeRuntime { private func startSilenceMonitor() { self.silenceTask?.cancel() self.silenceTask = Task { [weak self] in - guard let self else { return } - while self.isEnabled { - try? await Task.sleep(nanoseconds: 200_000_000) - await self.checkSilence() - } + await self?.silenceLoop() + } + } + + private func silenceLoop() async { + while self.isEnabled { + try? 
await Task.sleep(nanoseconds: 200_000_000) + await self.checkSilence() } } @@ -297,9 +301,9 @@ actor TalkModeRuntime { } private func waitForChatCompletion(runId: String, timeoutSeconds: Int) async -> ChatCompletionState { - await withTaskGroup(of: ChatCompletionState.self) { group in + let stream = await GatewayConnection.shared.subscribe() + return await withTaskGroup(of: ChatCompletionState.self) { group in group.addTask { [runId] in - let stream = GatewayConnection.shared.subscribe() for await push in stream { if case let .event(evt) = push, evt.event == "chat", let payload = evt.payload { if let chat = try? JSONDecoder().decode( @@ -332,13 +336,13 @@ actor TalkModeRuntime { do { let history = try await GatewayConnection.shared.chatHistory(sessionKey: sessionKey) let messages = history.messages ?? [] - let decoded = messages.compactMap { item in + let decoded: [ClawdisChatMessage] = messages.compactMap { item in guard let data = try? JSONEncoder().encode(item) else { return nil } return try? JSONDecoder().decode(ClawdisChatMessage.self, from: data) } guard let assistant = decoded.last(where: { $0.role == "assistant" }) else { return nil } let text = assistant.content.compactMap { $0.text }.joined(separator: "\n") - let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + let trimmed = text.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines) return trimmed.isEmpty ? 
nil : trimmed } catch { self.logger.error("talk history fetch failed: \(error.localizedDescription, privacy: .public)") @@ -418,7 +422,7 @@ actor TalkModeRuntime { let audio = try await ElevenLabsClient(apiKey: apiKey).synthesize( voiceId: voiceId, request: request) - let result = await MainActor.run { await TalkAudioPlayer.shared.play(data: audio) } + let result = await TalkAudioPlayer.shared.play(data: audio) if !result.finished, let interruptedAt = result.interruptedAt, self.phase == .speaking { if self.interruptOnSpeech { self.lastInterruptedAtSeconds = interruptedAt @@ -533,7 +537,7 @@ actor TalkModeRuntime { return sqrt(sum / Double(frameCount)) } - private func shouldInterrupt(transcript: String, segments: [SFTranscriptionSegment]) async -> Bool { + private func shouldInterrupt(transcript: String, hasConfidence: Bool) async -> Bool { let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines) guard trimmed.count >= 3 else { return false } if self.isLikelyEcho(of: trimmed) { return false } @@ -541,7 +545,6 @@ actor TalkModeRuntime { if let lastSpeechEnergyAt, now.timeIntervalSince(lastSpeechEnergyAt) > 0.35 { return false } - let hasConfidence = segments.contains { $0.confidence > 0.6 } return hasConfidence } From 39fccc36998a1152d801e6070f6afb6205e0b1b0 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 30 Dec 2025 00:51:17 +0100 Subject: [PATCH 027/100] fix: talk overlay + elevenlabs defaults --- CHANGELOG.md | 1 + .../Sources/Clawdis/ConfigSettings.swift | 63 ++++++++++ .../Sources/Clawdis/MenuContentView.swift | 2 +- .../Sources/Clawdis/TalkModeRuntime.swift | 112 +++++++++++++++-- .../Sources/Clawdis/TalkOverlayView.swift | 118 +++++++----------- 5 files changed, 208 insertions(+), 88 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cd94a3ce5..62c9357ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ ### Fixes - macOS: Voice Wake now fully tears down the Speech pipeline when disabled (cancel pending restarts, 
drop stale callbacks) to avoid high CPU in the background. - macOS menu: add a Talk Mode action alongside the Open Dashboard/Chat/Canvas entries. +- macOS Talk Mode: orb overlay refresh, ElevenLabs request logging, API key status in settings, and auto-select first voice when none is configured. - iOS/Android nodes: enable scrolling for loaded web pages in the Canvas WebView (default scaffold stays touch-first). - macOS menu: device list now uses `node.list` (devices only; no agent/tool presence entries). - macOS menu: device list now shows connected nodes only. diff --git a/apps/macos/Sources/Clawdis/ConfigSettings.swift b/apps/macos/Sources/Clawdis/ConfigSettings.swift index 7c0867d79..eb22490c0 100644 --- a/apps/macos/Sources/Clawdis/ConfigSettings.swift +++ b/apps/macos/Sources/Clawdis/ConfigSettings.swift @@ -33,6 +33,7 @@ struct ConfigSettings: View { // Talk mode settings (stored in ~/.clawdis/clawdis.json under "talk") @State private var talkVoiceId: String = "" @State private var talkInterruptOnSpeech: Bool = true + @State private var talkApiKey: String = "" var body: some View { ScrollView { self.content } @@ -301,6 +302,30 @@ struct ConfigSettings: View { .foregroundStyle(.secondary) } } + GridRow { + self.gridLabel("API key") + VStack(alignment: .leading, spacing: 6) { + HStack(spacing: 8) { + SecureField("ELEVENLABS_API_KEY", text: self.$talkApiKey) + .textFieldStyle(.roundedBorder) + .frame(maxWidth: .infinity) + .disabled(self.hasEnvApiKey) + .onChange(of: self.talkApiKey) { _, _ in self.autosaveConfig() } + if !self.hasEnvApiKey && !self.talkApiKey.isEmpty { + Button("Clear") { + self.talkApiKey = "" + self.autosaveConfig() + } + } + } + self.statusLine(label: self.apiKeyStatusLabel, color: self.apiKeyStatusColor) + if self.hasEnvApiKey { + Text("Using ELEVENLABS_API_KEY from the environment.") + .font(.footnote) + .foregroundStyle(.secondary) + } + } + } GridRow { self.gridLabel("Interrupt") Toggle("Stop speaking when you start talking", isOn: 
self.$talkInterruptOnSpeech) @@ -319,6 +344,18 @@ struct ConfigSettings: View { .frame(width: self.labelColumnWidth, alignment: .leading) } + private func statusLine(label: String, color: Color) -> some View { + HStack(spacing: 6) { + Circle() + .fill(color) + .frame(width: 6, height: 6) + Text(label) + .font(.footnote) + .foregroundStyle(.secondary) + } + .padding(.top, 2) + } + private func loadConfig() { let parsed = self.loadConfigDict() let agent = parsed["agent"] as? [String: Any] @@ -348,6 +385,7 @@ struct ConfigSettings: View { if let talk { if let voice = talk["voiceId"] as? String { self.talkVoiceId = voice } + if let apiKey = talk["apiKey"] as? String { self.talkApiKey = apiKey } if let interrupt = talk["interruptOnSpeech"] as? Bool { self.talkInterruptOnSpeech = interrupt } @@ -399,6 +437,12 @@ struct ConfigSettings: View { } else { talk["voiceId"] = trimmedVoice } + let trimmedApiKey = self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines) + if trimmedApiKey.isEmpty { + talk.removeValue(forKey: "apiKey") + } else { + talk["apiKey"] = trimmedApiKey + } talk["interruptOnSpeech"] = self.talkInterruptOnSpeech root["talk"] = talk @@ -433,6 +477,25 @@ struct ConfigSettings: View { .filter { seen.insert($0).inserted } } + private var hasEnvApiKey: Bool { + let raw = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] ?? "" + return !raw.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty + } + + private var apiKeyStatusLabel: String { + if self.hasEnvApiKey { return "ElevenLabs API key: found (environment)" } + if !self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + return "ElevenLabs API key: stored in config" + } + return "ElevenLabs API key: missing" + } + + private var apiKeyStatusColor: Color { + if self.hasEnvApiKey { return .green } + if !self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { return .green } + return .red + } + private var browserPathLabel: String? 
{ guard self.browserEnabled else { return nil } diff --git a/apps/macos/Sources/Clawdis/MenuContentView.swift b/apps/macos/Sources/Clawdis/MenuContentView.swift index e1453e5a2..c43986925 100644 --- a/apps/macos/Sources/Clawdis/MenuContentView.swift +++ b/apps/macos/Sources/Clawdis/MenuContentView.swift @@ -113,7 +113,7 @@ struct MenuContent: View { Button { Task { await self.state.setTalkEnabled(!self.state.talkEnabled) } } label: { - Label(self.state.talkEnabled ? "Stop Talk Mode" : "Talk Mode", systemImage: "bubble.left.and.waveform") + Label(self.state.talkEnabled ? "Stop Talk Mode" : "Talk Mode", systemImage: "waveform.circle.fill") } .disabled(!voiceWakeSupported) .opacity(voiceWakeSupported ? 1 : 0.5) diff --git a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift index 3be350d9d..0443e26ea 100644 --- a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift +++ b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift @@ -9,6 +9,7 @@ actor TalkModeRuntime { static let shared = TalkModeRuntime() private let logger = Logger(subsystem: "com.steipete.clawdis", category: "talk.runtime") + private let ttsLogger = Logger(subsystem: "com.steipete.clawdis", category: "talk.tts") private var recognizer: SFSpeechRecognizer? private var audioEngine: AVAudioEngine? @@ -36,6 +37,8 @@ actor TalkModeRuntime { private var interruptOnSpeech: Bool = true private var lastInterruptedAtSeconds: Double? private var lastSpokenText: String? + private var apiKey: String? + private var fallbackVoiceId: String? private let silenceWindow: TimeInterval = 0.7 private let minSpeechRMS: Double = 1e-3 @@ -379,19 +382,17 @@ actor TalkModeRuntime { } } - let voiceId = - directive?.voiceId ?? - self.currentVoiceId ?? 
- self.defaultVoiceId - - guard let voiceId, !voiceId.isEmpty else { - self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID") + guard let apiKey = self.apiKey, !apiKey.isEmpty else { + self.logger.error("talk missing ELEVENLABS_API_KEY") return } - let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] ?? "" - if apiKey.isEmpty { - self.logger.error("talk missing ELEVENLABS_API_KEY") + let requestedVoice = + directive?.voiceId ?? + self.currentVoiceId ?? + self.defaultVoiceId + guard let voiceId = await self.resolveVoiceId(preferred: requestedVoice, apiKey: apiKey) else { + self.logger.error("talk missing voiceId; set talk.voiceId or ELEVENLABS_VOICE_ID") return } @@ -419,7 +420,7 @@ actor TalkModeRuntime { language: Self.validatedLanguage(directive?.language, logger: self.logger)) do { - let audio = try await ElevenLabsClient(apiKey: apiKey).synthesize( + let audio = try await ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger).synthesize( voiceId: voiceId, request: request) let result = await TalkAudioPlayer.shared.play(data: audio) @@ -436,6 +437,33 @@ actor TalkModeRuntime { await MainActor.run { TalkModeController.shared.updatePhase(.thinking) } } + private func resolveVoiceId(preferred: String?, apiKey: String) async -> String? { + let trimmed = preferred?.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" + if !trimmed.isEmpty { return trimmed } + if let fallbackVoiceId { return fallbackVoiceId } + + do { + let voices = try await ElevenLabsClient(apiKey: apiKey, logger: self.ttsLogger).listVoices() + guard let first = voices.first else { + self.ttsLogger.error("elevenlabs voices list empty") + return nil + } + self.fallbackVoiceId = first.voiceId + if self.defaultVoiceId == nil { + self.defaultVoiceId = first.voiceId + } + if !self.voiceOverrideActive { + self.currentVoiceId = first.voiceId + } + let name = first.name ?? 
"unknown" + self.ttsLogger.info("talk default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))") + return first.voiceId + } catch { + self.ttsLogger.error("elevenlabs list voices failed: \(error.localizedDescription, privacy: .public)") + return nil + } + } + func stopSpeaking(reason: TalkStopReason) async { guard self.phase == .speaking else { return } let interruptedAt = await MainActor.run { TalkAudioPlayer.shared.stop() } @@ -460,6 +488,7 @@ actor TalkModeRuntime { } self.defaultOutputFormat = cfg.outputFormat self.interruptOnSpeech = cfg.interruptOnSpeech + self.apiKey = cfg.apiKey } private struct TalkRuntimeConfig { @@ -467,12 +496,14 @@ actor TalkModeRuntime { let modelId: String? let outputFormat: String? let interruptOnSpeech: Bool + let apiKey: String? } private func fetchTalkConfig() async -> TalkRuntimeConfig { let env = ProcessInfo.processInfo.environment let envVoice = env["ELEVENLABS_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines) let sagVoice = env["SAG_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines) + let envApiKey = env["ELEVENLABS_API_KEY"]?.trimmingCharacters(in: .whitespacesAndNewlines) do { let snap: ConfigSnapshot = try await GatewayConnection.shared.requestDecoded( @@ -484,24 +515,31 @@ actor TalkModeRuntime { let model = talk?["modelId"]?.stringValue let outputFormat = talk?["outputFormat"]?.stringValue let interrupt = talk?["interruptOnSpeech"]?.boolValue + let apiKey = talk?["apiKey"]?.stringValue let resolvedVoice = (voice?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? voice : nil) ?? (envVoice?.isEmpty == false ? envVoice : nil) ?? (sagVoice?.isEmpty == false ? sagVoice : nil) + let resolvedApiKey = + (envApiKey?.isEmpty == false ? envApiKey : nil) ?? + (apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? 
apiKey : nil) return TalkRuntimeConfig( voiceId: resolvedVoice, modelId: model, outputFormat: outputFormat, - interruptOnSpeech: interrupt ?? true) + interruptOnSpeech: interrupt ?? true, + apiKey: resolvedApiKey) } catch { let resolvedVoice = (envVoice?.isEmpty == false ? envVoice : nil) ?? (sagVoice?.isEmpty == false ? sagVoice : nil) + let resolvedApiKey = envApiKey?.isEmpty == false ? envApiKey : nil return TalkRuntimeConfig( voiceId: resolvedVoice, modelId: nil, outputFormat: nil, - interruptOnSpeech: true) + interruptOnSpeech: true, + apiKey: resolvedApiKey) } } @@ -631,6 +669,7 @@ private struct ElevenLabsRequest { private struct ElevenLabsClient { let apiKey: String + let logger: Logger let baseUrl: URL = URL(string: "https://api.elevenlabs.io")! func synthesize(voiceId: String, request: ElevenLabsRequest) async throws -> Data { @@ -639,6 +678,11 @@ private struct ElevenLabsClient { url.appendPathComponent("text-to-speech") url.appendPathComponent(voiceId) + let charCount = request.text.count + self.logger.info( + "elevenlabs tts request voice=\(voiceId, privacy: .public) model=\(request.modelId ?? "default", privacy: .public) chars=\(charCount, privacy: .public)") + let startedAt = Date() + var payload: [String: Any] = [ "text": request.text, ] @@ -678,10 +722,52 @@ private struct ElevenLabsClient { let (data, response) = try await URLSession.shared.data(for: req) if let http = response as? HTTPURLResponse, http.statusCode >= 400 { let message = String(data: data, encoding: .utf8) ?? 
"unknown" + self.logger.error( + "elevenlabs tts failed status=\(http.statusCode, privacy: .public) message=\(message, privacy: .public)") throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [ NSLocalizedDescriptionKey: "ElevenLabs failed: \(http.statusCode) \(message)", ]) } + let elapsed = Date().timeIntervalSince(startedAt) + self.logger.info("elevenlabs tts ok bytes=\(data.count, privacy: .public) dur=\(elapsed, privacy: .public)s") return data } + + func listVoices() async throws -> [ElevenLabsVoice] { + var url = self.baseUrl + url.appendPathComponent("v1") + url.appendPathComponent("voices") + + self.logger.info("elevenlabs voices list request") + var req = URLRequest(url: url) + req.httpMethod = "GET" + req.setValue(self.apiKey, forHTTPHeaderField: "xi-api-key") + + let (data, response) = try await URLSession.shared.data(for: req) + if let http = response as? HTTPURLResponse, http.statusCode >= 400 { + let message = String(data: data, encoding: .utf8) ?? "unknown" + self.logger.error( + "elevenlabs voices list failed status=\(http.statusCode, privacy: .public) message=\(message, privacy: .public)") + throw NSError(domain: "TalkTTS", code: http.statusCode, userInfo: [ + NSLocalizedDescriptionKey: "ElevenLabs voices failed: \(http.statusCode) \(message)", + ]) + } + + let decoded = try JSONDecoder().decode(ElevenLabsVoicesResponse.self, from: data) + return decoded.voices + } +} + +private struct ElevenLabsVoice: Decodable { + let voiceId: String + let name: String? 
+ + enum CodingKeys: String, CodingKey { + case voiceId = "voice_id" + case name + } +} + +private struct ElevenLabsVoicesResponse: Decodable { + let voices: [ElevenLabsVoice] } diff --git a/apps/macos/Sources/Clawdis/TalkOverlayView.swift b/apps/macos/Sources/Clawdis/TalkOverlayView.swift index 2f2be75ca..29d7a6914 100644 --- a/apps/macos/Sources/Clawdis/TalkOverlayView.swift +++ b/apps/macos/Sources/Clawdis/TalkOverlayView.swift @@ -6,13 +6,13 @@ struct TalkOverlayView: View { var body: some View { ZStack(alignment: .topLeading) { - TalkCloudView(phase: self.controller.model.phase, level: self.controller.model.level) - .frame(width: 76, height: 64) + TalkOrbView(phase: self.controller.model.phase, level: self.controller.model.level) + .frame(width: 72, height: 72) .contentShape(Rectangle()) .onTapGesture { TalkModeController.shared.stopSpeaking(reason: .userTap) } - .padding(8) + .padding(10) Button { TalkModeController.shared.exitTalkMode() @@ -33,107 +33,77 @@ struct TalkOverlayView: View { } } -private struct TalkCloudView: View { +private struct TalkOrbView: View { let phase: TalkModePhase let level: Double var body: some View { TimelineView(.animation) { context in let t = context.date.timeIntervalSinceReferenceDate - let pulse = phase == .speaking ? (1 + 0.04 * sin(t * 6)) : 1 - let sink = phase == .thinking ? (3 + 2 * sin(t * 2)) : 0 - let listenScale = phase == .listening ? (1 + CGFloat(self.level) * 0.14) : 1 - let baseScale = phase == .thinking ? 0.94 : 1 + let listenScale = phase == .listening ? (1 + CGFloat(self.level) * 0.12) : 1 + let pulse = phase == .speaking ? 
(1 + 0.06 * sin(t * 6)) : 1 ZStack { - CloudShape() - .fill(self.cloudGradient) - .overlay( - CloudShape() - .stroke(Color.white.opacity(0.35), lineWidth: 0.8)) - .shadow(color: Color.black.opacity(0.18), radius: 8, x: 0, y: 4) - .scaleEffect(baseScale * pulse * listenScale) - .offset(y: sink) + Circle() + .fill(self.orbGradient) + .overlay(Circle().stroke(Color.white.opacity(0.45), lineWidth: 1)) + .shadow(color: Color.black.opacity(0.22), radius: 10, x: 0, y: 5) + .scaleEffect(pulse * listenScale) - if phase == .listening { - Circle() - .stroke(self.ringGradient, lineWidth: 1) - .scaleEffect(1 + CGFloat(self.level) * 0.45) - .opacity(0.3 + CGFloat(self.level) * 0.4) - .animation(.easeOut(duration: 0.08), value: self.level) - } + TalkWaveRings(phase: phase, level: level, time: t) if phase == .thinking { - TalkThinkingDots(time: t) - .offset(y: 18) - } - - if phase == .speaking { - TalkSpeakingRings(time: t) + TalkOrbitArcs(time: t) } } } } - private var cloudGradient: LinearGradient { - LinearGradient( - colors: [Color(red: 0.95, green: 0.98, blue: 1.0), Color(red: 0.75, green: 0.88, blue: 1.0)], - startPoint: .topLeading, - endPoint: .bottomTrailing) - } - - private var ringGradient: LinearGradient { - LinearGradient( - colors: [Color.white.opacity(0.6), Color.white.opacity(0.1)], - startPoint: .top, - endPoint: .bottom) + private var orbGradient: RadialGradient { + RadialGradient( + colors: [Color.white, Color(red: 0.62, green: 0.88, blue: 1.0)], + center: .topLeading, + startRadius: 4, + endRadius: 52) } } -private struct TalkThinkingDots: View { - let time: TimeInterval - - var body: some View { - HStack(spacing: 4) { - ForEach(0..<3, id: \.self) { idx in - let phase = (time * 2 + Double(idx) * 0.45).truncatingRemainder(dividingBy: 1) - Circle() - .fill(Color.white.opacity(0.75)) - .frame(width: 5, height: 5) - .opacity(0.35 + 0.55 * phase) - } - } - } -} - -private struct TalkSpeakingRings: View { +private struct TalkWaveRings: View { + let phase: 
TalkModePhase + let level: Double let time: TimeInterval var body: some View { ZStack { ForEach(0..<3, id: \.self) { idx in - let phase = (time * 1.1 + Double(idx) / 3).truncatingRemainder(dividingBy: 1) + let speed = phase == .speaking ? 1.4 : phase == .listening ? 0.9 : 0.6 + let progress = (time * speed + Double(idx) * 0.28).truncatingRemainder(dividingBy: 1) + let amplitude = phase == .speaking ? 0.95 : phase == .listening ? 0.5 + level * 0.7 : 0.35 + let scale = 0.75 + progress * amplitude + (phase == .listening ? level * 0.15 : 0) + let alpha = phase == .speaking ? 0.55 : phase == .listening ? 0.45 + level * 0.25 : 0.28 Circle() - .stroke(Color.white.opacity(0.6 - phase * 0.5), lineWidth: 1) - .scaleEffect(0.8 + phase * 0.7) - .opacity(0.6 - phase * 0.6) + .stroke(Color.white.opacity(alpha - progress * 0.35), lineWidth: 1.2) + .scaleEffect(scale) + .opacity(alpha - progress * 0.6) } } } } -private struct CloudShape: Shape { - func path(in rect: CGRect) -> Path { - let w = rect.width - let h = rect.height - let baseHeight = h * 0.44 - let baseRect = CGRect(x: rect.minX, y: rect.minY + h * 0.46, width: w, height: baseHeight) +private struct TalkOrbitArcs: View { + let time: TimeInterval - var path = Path() - path.addRoundedRect(in: baseRect, cornerSize: CGSize(width: baseHeight / 2, height: baseHeight / 2)) - path.addEllipse(in: CGRect(x: rect.minX + w * 0.05, y: rect.minY + h * 0.28, width: w * 0.36, height: h * 0.36)) - path.addEllipse(in: CGRect(x: rect.minX + w * 0.28, y: rect.minY + h * 0.05, width: w * 0.44, height: h * 0.44)) - path.addEllipse(in: CGRect(x: rect.minX + w * 0.62, y: rect.minY + h * 0.3, width: w * 0.3, height: h * 0.3)) - return path + var body: some View { + ZStack { + Circle() + .trim(from: 0.08, to: 0.26) + .stroke(Color.white.opacity(0.75), style: StrokeStyle(lineWidth: 1.4, lineCap: .round)) + .rotationEffect(.degrees(time * 42)) + Circle() + .trim(from: 0.62, to: 0.86) + .stroke(Color.white.opacity(0.55), style: 
StrokeStyle(lineWidth: 1.2, lineCap: .round)) + .rotationEffect(.degrees(-time * 35)) + } + .scaleEffect(1.05) } } From 37f85bb2d118d8d2b503845bd6f44bc7bfb3b509 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 30 Dec 2025 00:58:58 +0100 Subject: [PATCH 028/100] fix: expand talk overlay bounds --- apps/macos/Sources/Clawdis/TalkOverlay.swift | 6 +++--- apps/macos/Sources/Clawdis/TalkOverlayView.swift | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/apps/macos/Sources/Clawdis/TalkOverlay.swift b/apps/macos/Sources/Clawdis/TalkOverlay.swift index 63c9d5dce..59555a104 100644 --- a/apps/macos/Sources/Clawdis/TalkOverlay.swift +++ b/apps/macos/Sources/Clawdis/TalkOverlay.swift @@ -20,9 +20,9 @@ final class TalkOverlayController { private var window: NSPanel? private var hostingView: NSHostingView? - private let width: CGFloat = 92 - private let height: CGFloat = 92 - private let padding: CGFloat = 8 + private let width: CGFloat = 120 + private let height: CGFloat = 120 + private let padding: CGFloat = 6 func present() { self.ensureWindow() diff --git a/apps/macos/Sources/Clawdis/TalkOverlayView.swift b/apps/macos/Sources/Clawdis/TalkOverlayView.swift index 29d7a6914..f5484c439 100644 --- a/apps/macos/Sources/Clawdis/TalkOverlayView.swift +++ b/apps/macos/Sources/Clawdis/TalkOverlayView.swift @@ -7,12 +7,12 @@ struct TalkOverlayView: View { var body: some View { ZStack(alignment: .topLeading) { TalkOrbView(phase: self.controller.model.phase, level: self.controller.model.level) - .frame(width: 72, height: 72) + .frame(width: 80, height: 80) .contentShape(Rectangle()) .onTapGesture { TalkModeController.shared.stopSpeaking(reason: .userTap) } - .padding(10) + .padding(16) Button { TalkModeController.shared.exitTalkMode() @@ -29,7 +29,7 @@ struct TalkOverlayView: View { .padding(4) .onHover { self.hovering = $0 } } - .frame(width: 92, height: 92, alignment: .center) + .frame(width: 120, height: 120, alignment: .center) } } From 
7aabe73521d99ceb549ed222b4621658053ea189 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 30 Dec 2025 00:59:30 +0100 Subject: [PATCH 029/100] chore: sync pending changes --- .../Sources/Screen/ScreenRecordService.swift | 187 +++++++++--------- 1 file changed, 96 insertions(+), 91 deletions(-) diff --git a/apps/ios/Sources/Screen/ScreenRecordService.swift b/apps/ios/Sources/Screen/ScreenRecordService.swift index 861704310..b5d75b57d 100644 --- a/apps/ios/Sources/Screen/ScreenRecordService.swift +++ b/apps/ios/Sources/Screen/ScreenRecordService.swift @@ -68,114 +68,119 @@ final class ScreenRecordService: @unchecked Sendable { try? FileManager.default.removeItem(at: outURL) let state = CaptureState() + let recordQueue = DispatchQueue(label: "com.steipete.clawdis.screenrecord") try await withCheckedThrowingContinuation { (cont: CheckedContinuation) in let handler: @Sendable (CMSampleBuffer, RPSampleBufferType, Error?) -> Void = { sample, type, error in - if let error { - state.withLock { state in - if state.handlerError == nil { state.handlerError = error } - } - return - } - guard CMSampleBufferDataIsReady(sample) else { return } - - switch type { - case .video: - let pts = CMSampleBufferGetPresentationTimeStamp(sample) - let shouldSkip = state.withLock { state in - if let lastVideoTime = state.lastVideoTime { - let delta = CMTimeSubtract(pts, lastVideoTime) - return delta.seconds < (1.0 / fpsValue) + // ReplayKit can call the capture handler on a background queue. + // Serialize writes to avoid queue asserts. 
+ recordQueue.async { + if let error { + state.withLock { state in + if state.handlerError == nil { state.handlerError = error } } - return false + return } - if shouldSkip { return } + guard CMSampleBufferDataIsReady(sample) else { return } - if state.withLock({ $0.writer == nil }) { - guard let imageBuffer = CMSampleBufferGetImageBuffer(sample) else { - state.withLock { state in - if state.handlerError == nil { - state.handlerError = ScreenRecordError.captureFailed("Missing image buffer") - } + switch type { + case .video: + let pts = CMSampleBufferGetPresentationTimeStamp(sample) + let shouldSkip = state.withLock { state in + if let lastVideoTime = state.lastVideoTime { + let delta = CMTimeSubtract(pts, lastVideoTime) + return delta.seconds < (1.0 / fpsValue) } - return + return false } - let width = CVPixelBufferGetWidth(imageBuffer) - let height = CVPixelBufferGetHeight(imageBuffer) - do { - let w = try AVAssetWriter(outputURL: outURL, fileType: .mp4) - let settings: [String: Any] = [ - AVVideoCodecKey: AVVideoCodecType.h264, - AVVideoWidthKey: width, - AVVideoHeightKey: height, - ] - let vInput = AVAssetWriterInput(mediaType: .video, outputSettings: settings) - vInput.expectsMediaDataInRealTime = true - guard w.canAdd(vInput) else { - throw ScreenRecordError.writeFailed("Cannot add video input") - } - w.add(vInput) + if shouldSkip { return } - if includeAudio { - let aInput = AVAssetWriterInput(mediaType: .audio, outputSettings: nil) - aInput.expectsMediaDataInRealTime = true - if w.canAdd(aInput) { - w.add(aInput) - state.withLock { state in - state.audioInput = aInput - } - } - } - - guard w.startWriting() else { - throw ScreenRecordError - .writeFailed(w.error?.localizedDescription ?? 
"Failed to start writer") - } - w.startSession(atSourceTime: pts) - state.withLock { state in - state.writer = w - state.videoInput = vInput - state.started = true - } - } catch { - state.withLock { state in - if state.handlerError == nil { state.handlerError = error } - } - return - } - } - - let vInput = state.withLock { $0.videoInput } - let isStarted = state.withLock { $0.started } - guard let vInput, isStarted else { return } - if vInput.isReadyForMoreMediaData { - if vInput.append(sample) { - state.withLock { state in - state.sawVideo = true - state.lastVideoTime = pts - } - } else { - let err = state.withLock { $0.writer?.error } - if let err { + if state.withLock({ $0.writer == nil }) { + guard let imageBuffer = CMSampleBufferGetImageBuffer(sample) else { state.withLock { state in if state.handlerError == nil { - state.handlerError = ScreenRecordError.writeFailed(err.localizedDescription) + state.handlerError = ScreenRecordError.captureFailed("Missing image buffer") + } + } + return + } + let width = CVPixelBufferGetWidth(imageBuffer) + let height = CVPixelBufferGetHeight(imageBuffer) + do { + let w = try AVAssetWriter(outputURL: outURL, fileType: .mp4) + let settings: [String: Any] = [ + AVVideoCodecKey: AVVideoCodecType.h264, + AVVideoWidthKey: width, + AVVideoHeightKey: height, + ] + let vInput = AVAssetWriterInput(mediaType: .video, outputSettings: settings) + vInput.expectsMediaDataInRealTime = true + guard w.canAdd(vInput) else { + throw ScreenRecordError.writeFailed("Cannot add video input") + } + w.add(vInput) + + if includeAudio { + let aInput = AVAssetWriterInput(mediaType: .audio, outputSettings: nil) + aInput.expectsMediaDataInRealTime = true + if w.canAdd(aInput) { + w.add(aInput) + state.withLock { state in + state.audioInput = aInput + } + } + } + + guard w.startWriting() else { + throw ScreenRecordError + .writeFailed(w.error?.localizedDescription ?? 
"Failed to start writer") + } + w.startSession(atSourceTime: pts) + state.withLock { state in + state.writer = w + state.videoInput = vInput + state.started = true + } + } catch { + state.withLock { state in + if state.handlerError == nil { state.handlerError = error } + } + return + } + } + + let vInput = state.withLock { $0.videoInput } + let isStarted = state.withLock { $0.started } + guard let vInput, isStarted else { return } + if vInput.isReadyForMoreMediaData { + if vInput.append(sample) { + state.withLock { state in + state.sawVideo = true + state.lastVideoTime = pts + } + } else { + let err = state.withLock { $0.writer?.error } + if let err { + state.withLock { state in + if state.handlerError == nil { + state.handlerError = ScreenRecordError.writeFailed(err.localizedDescription) + } } } } } - } - case .audioApp, .audioMic: - let aInput = state.withLock { $0.audioInput } - let isStarted = state.withLock { $0.started } - guard includeAudio, let aInput, isStarted else { return } - if aInput.isReadyForMoreMediaData { - _ = aInput.append(sample) - } + case .audioApp, .audioMic: + let aInput = state.withLock { $0.audioInput } + let isStarted = state.withLock { $0.started } + guard includeAudio, let aInput, isStarted else { return } + if aInput.isReadyForMoreMediaData { + _ = aInput.append(sample) + } - @unknown default: - break + @unknown default: + break + } } } From 10e1e7fd4481255290bc5501f1c7de87aed2ea31 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 30 Dec 2025 00:15:57 +0000 Subject: [PATCH 030/100] chore: apply biome formatting --- src/agents/tool-images.ts | 9 +++++++-- src/web/auto-reply.test.ts | 36 +++++++++++++++++++----------------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/src/agents/tool-images.ts b/src/agents/tool-images.ts index 167470ad7..5182d5f3c 100644 --- a/src/agents/tool-images.ts +++ b/src/agents/tool-images.ts @@ -60,7 +60,10 @@ async function resizeImageBase64IfNeeded(params: { } const qualities = 
[85, 75, 65, 55, 45, 35]; - const sideStart = maxDim > 0 ? Math.min(params.maxDimensionPx, maxDim) : params.maxDimensionPx; + const sideStart = + maxDim > 0 + ? Math.min(params.maxDimensionPx, maxDim) + : params.maxDimensionPx; const sideGrid = [sideStart, 1800, 1600, 1400, 1200, 1000, 800] .map((v) => Math.min(params.maxDimensionPx, v)) .filter((v, i, arr) => v > 0 && arr.indexOf(v) === i) @@ -91,7 +94,9 @@ async function resizeImageBase64IfNeeded(params: { const best = smallest?.buffer ?? buf; const maxMb = (params.maxBytes / (1024 * 1024)).toFixed(0); const gotMb = (best.byteLength / (1024 * 1024)).toFixed(2); - throw new Error(`Image could not be reduced below ${maxMb}MB (got ${gotMb}MB)`); + throw new Error( + `Image could not be reduced below ${maxMb}MB (got ${gotMb}MB)`, + ); } export async function sanitizeContentBlocksImages( diff --git a/src/web/auto-reply.test.ts b/src/web/auto-reply.test.ts index 8edf2dcf7..271ccceec 100644 --- a/src/web/auto-reply.test.ts +++ b/src/web/auto-reply.test.ts @@ -318,23 +318,25 @@ describe("web auto-reply", () => { let capturedOnMessage: | ((msg: import("./inbound.js").WebInboundMessage) => Promise) | undefined; - const listenerFactory = vi.fn(async (opts: { - onMessage: ( - msg: import("./inbound.js").WebInboundMessage, - ) => Promise; - }) => { - capturedOnMessage = opts.onMessage; - let resolveClose: (reason: unknown) => void = () => {}; - const onClose = new Promise((res) => { - resolveClose = res; - closeResolvers.push(res); - }); - return { - close: vi.fn(), - onClose, - signalClose: (reason?: unknown) => resolveClose(reason), - }; - }); + const listenerFactory = vi.fn( + async (opts: { + onMessage: ( + msg: import("./inbound.js").WebInboundMessage, + ) => Promise; + }) => { + capturedOnMessage = opts.onMessage; + let resolveClose: (reason: unknown) => void = () => {}; + const onClose = new Promise((res) => { + resolveClose = res; + closeResolvers.push(res); + }); + return { + close: vi.fn(), + onClose, + signalClose: 
(reason?: unknown) => resolveClose(reason), + }; + }, + ); const runtime = { log: vi.fn(), error: vi.fn(), From 02db68aa6711d2338d26d57cf714eb8ee2194bab Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 30 Dec 2025 01:40:51 +0100 Subject: [PATCH 031/100] fix(macos): hide Restart Gateway when remote --- CHANGELOG.md | 1 + .../Sources/Clawdis/CommandResolver.swift | 46 +++++++++++++------ .../macos/Sources/Clawdis/DebugSettings.swift | 18 ++++++-- .../Sources/Clawdis/MenuContentView.swift | 10 ++-- .../Sources/Clawdis/SettingsRootView.swift | 2 +- .../CommandResolverTests.swift | 13 ++++-- .../ConnectionsSettingsSmokeTests.swift | 6 ++- 7 files changed, 67 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 62c9357ac..8f8317f04 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ ### Fixes - macOS: Voice Wake now fully tears down the Speech pipeline when disabled (cancel pending restarts, drop stale callbacks) to avoid high CPU in the background. - macOS menu: add a Talk Mode action alongside the Open Dashboard/Chat/Canvas entries. +- macOS Debug: hide “Restart Gateway” when the app won’t start a local gateway (remote mode / attach-only). - macOS Talk Mode: orb overlay refresh, ElevenLabs request logging, API key status in settings, and auto-select first voice when none is configured. - iOS/Android nodes: enable scrolling for loaded web pages in the Canvas WebView (default scaffold stays touch-first). - macOS menu: device list now uses `node.list` (devices only; no agent/tool presence entries). diff --git a/apps/macos/Sources/Clawdis/CommandResolver.swift b/apps/macos/Sources/Clawdis/CommandResolver.swift index d418cd5e0..2e8b5c6ce 100644 --- a/apps/macos/Sources/Clawdis/CommandResolver.swift +++ b/apps/macos/Sources/Clawdis/CommandResolver.swift @@ -16,6 +16,10 @@ enum CommandResolver { RuntimeLocator.resolve(searchPaths: self.preferredPaths()) } + static func runtimeResolution(searchPaths: [String]?) 
-> Result { + RuntimeLocator.resolve(searchPaths: searchPaths ?? self.preferredPaths()) + } + static func makeRuntimeCommand( runtime: RuntimeResolution, entrypoint: String, @@ -152,8 +156,8 @@ enum CommandResolver { return paths } - static func findExecutable(named name: String) -> String? { - for dir in self.preferredPaths() { + static func findExecutable(named name: String, searchPaths: [String]? = nil) -> String? { + for dir in (searchPaths ?? self.preferredPaths()) { let candidate = (dir as NSString).appendingPathComponent(name) if FileManager.default.isExecutableFile(atPath: candidate) { return candidate @@ -162,8 +166,14 @@ enum CommandResolver { return nil } - static func clawdisExecutable() -> String? { - self.findExecutable(named: self.helperName) + static func clawdisExecutable(searchPaths: [String]? = nil) -> String? { + self.findExecutable(named: self.helperName, searchPaths: searchPaths) + } + + static func projectClawdisExecutable(projectRoot: URL? = nil) -> String? { + let root = projectRoot ?? self.projectRoot() + let candidate = root.appendingPathComponent("node_modules/.bin").appendingPathComponent(self.helperName).path + return FileManager.default.isExecutableFile(atPath: candidate) ? candidate : nil } static func nodeCliPath() -> String? { @@ -171,17 +181,18 @@ enum CommandResolver { return FileManager.default.isReadableFile(atPath: candidate) ? candidate : nil } - static func hasAnyClawdisInvoker() -> Bool { - if self.clawdisExecutable() != nil { return true } - if self.findExecutable(named: "pnpm") != nil { return true } - if self.findExecutable(named: "node") != nil, self.nodeCliPath() != nil { return true } + static func hasAnyClawdisInvoker(searchPaths: [String]? 
= nil) -> Bool { + if self.clawdisExecutable(searchPaths: searchPaths) != nil { return true } + if self.findExecutable(named: "pnpm", searchPaths: searchPaths) != nil { return true } + if self.findExecutable(named: "node", searchPaths: searchPaths) != nil, self.nodeCliPath() != nil { return true } return false } static func clawdisNodeCommand( subcommand: String, extraArgs: [String] = [], - defaults: UserDefaults = .standard) -> [String] + defaults: UserDefaults = .standard, + searchPaths: [String]? = nil) -> [String] { let settings = self.connectionSettings(defaults: defaults) if settings.mode == .remote, let ssh = self.sshNodeCommand( @@ -192,25 +203,29 @@ enum CommandResolver { return ssh } - let runtimeResult = self.runtimeResolution() + let runtimeResult = self.runtimeResolution(searchPaths: searchPaths) switch runtimeResult { case let .success(runtime): - if let clawdisPath = self.clawdisExecutable() { + let root = self.projectRoot() + if let clawdisPath = self.projectClawdisExecutable(projectRoot: root) { return [clawdisPath, subcommand] + extraArgs } - if let entry = self.gatewayEntrypoint(in: self.projectRoot()) { + if let entry = self.gatewayEntrypoint(in: root) { return self.makeRuntimeCommand( runtime: runtime, entrypoint: entry, subcommand: subcommand, extraArgs: extraArgs) } - if let pnpm = self.findExecutable(named: "pnpm") { + if let pnpm = self.findExecutable(named: "pnpm", searchPaths: searchPaths) { // Use --silent to avoid pnpm lifecycle banners that would corrupt JSON outputs. return [pnpm, "--silent", "clawdis", subcommand] + extraArgs } + if let clawdisPath = self.clawdisExecutable(searchPaths: searchPaths) { + return [clawdisPath, subcommand] + extraArgs + } let missingEntry = """ clawdis entrypoint missing (looked for dist/index.js or bin/clawdis.js); run pnpm build. 
@@ -226,9 +241,10 @@ enum CommandResolver { static func clawdisCommand( subcommand: String, extraArgs: [String] = [], - defaults: UserDefaults = .standard) -> [String] + defaults: UserDefaults = .standard, + searchPaths: [String]? = nil) -> [String] { - self.clawdisNodeCommand(subcommand: subcommand, extraArgs: extraArgs, defaults: defaults) + self.clawdisNodeCommand(subcommand: subcommand, extraArgs: extraArgs, defaults: defaults, searchPaths: searchPaths) } // MARK: - SSH helpers diff --git a/apps/macos/Sources/Clawdis/DebugSettings.swift b/apps/macos/Sources/Clawdis/DebugSettings.swift index a730d5ef1..a30cf917a 100644 --- a/apps/macos/Sources/Clawdis/DebugSettings.swift +++ b/apps/macos/Sources/Clawdis/DebugSettings.swift @@ -1,8 +1,10 @@ import AppKit +import Observation import SwiftUI import UniformTypeIdentifiers struct DebugSettings: View { + @Bindable var state: AppState private let isPreview = ProcessInfo.processInfo.isPreview private let labelColumnWidth: CGFloat = 140 @AppStorage(modelCatalogPathKey) private var modelCatalogPath: String = ModelCatalogLoader.defaultPath @@ -36,6 +38,10 @@ struct DebugSettings: View { @State private var canvasEvalResult: String? @State private var canvasSnapshotPath: String? 
+ init(state: AppState = AppStateStore.shared) { + self.state = state + } + var body: some View { ScrollView(.vertical) { VStack(alignment: .leading, spacing: 14) { @@ -194,7 +200,9 @@ struct DebugSettings: View { .overlay(RoundedRectangle(cornerRadius: 6).stroke(Color.secondary.opacity(0.2))) HStack(spacing: 8) { - Button("Restart Gateway") { DebugActions.restartGateway() } + if self.canRestartGateway { + Button("Restart Gateway") { DebugActions.restartGateway() } + } Button("Clear log") { GatewayProcessManager.shared.clearLog() } Spacer(minLength: 0) } @@ -762,6 +770,10 @@ struct DebugSettings: View { CommandResolver.connectionSettings().mode == .remote } + private var canRestartGateway: Bool { + self.state.connectionMode == .local && !self.attachExistingGatewayOnly + } + private func configURL() -> URL { FileManager.default.homeDirectoryForCurrentUser .appendingPathComponent(".clawdis") @@ -902,7 +914,7 @@ private struct PlainSettingsGroupBoxStyle: GroupBoxStyle { #if DEBUG struct DebugSettings_Previews: PreviewProvider { static var previews: some View { - DebugSettings() + DebugSettings(state: .preview) .frame(width: SettingsTab.windowWidth, height: SettingsTab.windowHeight) } } @@ -910,7 +922,7 @@ struct DebugSettings_Previews: PreviewProvider { @MainActor extension DebugSettings { static func exerciseForTesting() async { - let view = DebugSettings() + let view = DebugSettings(state: .preview) view.modelsCount = 3 view.modelsLoading = false view.modelsError = "Failed to load models" diff --git a/apps/macos/Sources/Clawdis/MenuContentView.swift b/apps/macos/Sources/Clawdis/MenuContentView.swift index c43986925..7cb1d420b 100644 --- a/apps/macos/Sources/Clawdis/MenuContentView.swift +++ b/apps/macos/Sources/Clawdis/MenuContentView.swift @@ -209,10 +209,12 @@ struct MenuContent: View { Label("Send Test Notification", systemImage: "bell") } Divider() - Button { - DebugActions.restartGateway() - } label: { - Label("Restart Gateway", systemImage: "arrow.clockwise") 
+ if self.state.connectionMode == .local, !AppStateStore.attachExistingGatewayOnly { + Button { + DebugActions.restartGateway() + } label: { + Label("Restart Gateway", systemImage: "arrow.clockwise") + } } Button { DebugActions.restartApp() diff --git a/apps/macos/Sources/Clawdis/SettingsRootView.swift b/apps/macos/Sources/Clawdis/SettingsRootView.swift index 9ede06efb..8cfd39ae5 100644 --- a/apps/macos/Sources/Clawdis/SettingsRootView.swift +++ b/apps/macos/Sources/Clawdis/SettingsRootView.swift @@ -57,7 +57,7 @@ struct SettingsRootView: View { .tag(SettingsTab.permissions) if self.state.debugPaneEnabled { - DebugSettings() + DebugSettings(state: self.state) .tabItem { Label("Debug", systemImage: "ant") } .tag(SettingsTab.debug) } diff --git a/apps/macos/Tests/ClawdisIPCTests/CommandResolverTests.swift b/apps/macos/Tests/ClawdisIPCTests/CommandResolverTests.swift index f0a543a87..9a4a650aa 100644 --- a/apps/macos/Tests/ClawdisIPCTests/CommandResolverTests.swift +++ b/apps/macos/Tests/ClawdisIPCTests/CommandResolverTests.swift @@ -52,12 +52,17 @@ import Testing try FileManager.default.setAttributes([.posixPermissions: 0o755], ofItemAtPath: nodePath.path) try self.makeExec(at: scriptPath) - let cmd = CommandResolver.clawdisCommand(subcommand: "rpc", defaults: defaults) + let cmd = CommandResolver.clawdisCommand( + subcommand: "rpc", + defaults: defaults, + searchPaths: [tmp.appendingPathComponent("node_modules/.bin").path]) #expect(cmd.count >= 3) - #expect(cmd[0] == nodePath.path) - #expect(cmd[1] == scriptPath.path) - #expect(cmd[2] == "rpc") + if cmd.count >= 3 { + #expect(cmd[0] == nodePath.path) + #expect(cmd[1] == scriptPath.path) + #expect(cmd[2] == "rpc") + } } @Test func fallsBackToPnpm() async throws { diff --git a/apps/macos/Tests/ClawdisIPCTests/ConnectionsSettingsSmokeTests.swift b/apps/macos/Tests/ClawdisIPCTests/ConnectionsSettingsSmokeTests.swift index 4941b0524..a9ba93a5f 100644 --- 
a/apps/macos/Tests/ClawdisIPCTests/ConnectionsSettingsSmokeTests.swift +++ b/apps/macos/Tests/ClawdisIPCTests/ConnectionsSettingsSmokeTests.swift @@ -43,7 +43,8 @@ struct ConnectionsSettingsSmokeTests { elapsedMs: 120, bot: ProvidersStatusSnapshot.TelegramBot(id: 123, username: "clawdisbot"), webhook: ProvidersStatusSnapshot.TelegramWebhook(url: "https://example.com/hook", hasCustomCert: false)), - lastProbeAt: 1_700_000_050_000)) + lastProbeAt: 1_700_000_050_000), + discord: nil) store.whatsappLoginMessage = "Scan QR" store.whatsappLoginQrDataUrl = @@ -92,7 +93,8 @@ struct ConnectionsSettingsSmokeTests { elapsedMs: 120, bot: nil, webhook: nil), - lastProbeAt: 1_700_000_100_000)) + lastProbeAt: 1_700_000_100_000), + discord: nil) let view = ConnectionsSettings(store: store) _ = view.body From e119a823345f8c73770b9d8f354e423012d4deba Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 30 Dec 2025 01:57:45 +0100 Subject: [PATCH 032/100] feat: talk mode key distribution and tts polling --- CHANGELOG.md | 2 + .../clawdis/node/voice/TalkModeManager.kt | 35 +++++++++++-- apps/ios/Sources/Voice/TalkModeManager.swift | 34 +++++++++++-- apps/macos/Sources/Clawdis/AppState.swift | 12 ++++- .../Sources/Clawdis/ConfigSettings.swift | 22 +++++++++ .../Sources/Clawdis/GatewayConnection.swift | 7 +++ .../Sources/Clawdis/TalkModeController.swift | 1 + .../Sources/Clawdis/TalkModeRuntime.swift | 32 ++++++++++-- apps/macos/Sources/Clawdis/TalkOverlay.swift | 6 +-- .../Sources/Clawdis/TalkOverlayView.swift | 17 ++++--- docs/configuration.md | 2 + docs/talk.md | 2 + src/config/config.test.ts | 47 ++++++++++++++++++ src/config/config.ts | 49 ++++++++++++++++++- src/gateway/protocol/index.ts | 5 ++ src/gateway/protocol/schema.ts | 10 ++++ src/gateway/server.ts | 44 +++++++++++++++++ 17 files changed, 303 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f8317f04..97643cee5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ - macOS menu: 
add a Talk Mode action alongside the Open Dashboard/Chat/Canvas entries. - macOS Debug: hide “Restart Gateway” when the app won’t start a local gateway (remote mode / attach-only). - macOS Talk Mode: orb overlay refresh, ElevenLabs request logging, API key status in settings, and auto-select first voice when none is configured. +- Talk Mode: wait for chat history to surface the assistant reply before starting TTS (macOS/iOS/Android). +- Gateway config: inject `talk.apiKey` from `ELEVENLABS_API_KEY`/shell profile so nodes can fetch it on demand. - iOS/Android nodes: enable scrolling for loaded web pages in the Canvas WebView (default scaffold stays touch-first). - macOS menu: device list now uses `node.list` (devices only; no agent/tool presence entries). - macOS menu: device list now shows connected nodes only. diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt index 920466739..5cad01140 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt @@ -76,6 +76,7 @@ class TalkModeManager( private var defaultModelId: String? = null private var currentModelId: String? = null private var defaultOutputFormat: String? = null + private var apiKey: String? 
= null private var interruptOnSpeech: Boolean = true private var voiceOverrideActive = false private var modelOverrideActive = false @@ -268,6 +269,7 @@ class TalkModeManager( } try { + val startedAt = System.currentTimeMillis().toDouble() / 1000.0 val runId = sendChat(prompt, bridge) val ok = waitForChatFinal(runId) if (!ok) { @@ -275,7 +277,7 @@ class TalkModeManager( start() return } - val assistant = fetchLatestAssistantText(bridge) + val assistant = waitForAssistantText(bridge, startedAt, 12_000) if (assistant.isNullOrBlank()) { _statusText.value = "No reply" start() @@ -345,13 +347,34 @@ class TalkModeManager( return result } - private suspend fun fetchLatestAssistantText(bridge: BridgeSession): String? { + private suspend fun waitForAssistantText( + bridge: BridgeSession, + sinceSeconds: Double, + timeoutMs: Long, + ): String? { + val deadline = SystemClock.elapsedRealtime() + timeoutMs + while (SystemClock.elapsedRealtime() < deadline) { + val text = fetchLatestAssistantText(bridge, sinceSeconds) + if (!text.isNullOrBlank()) return text + delay(300) + } + return null + } + + private suspend fun fetchLatestAssistantText( + bridge: BridgeSession, + sinceSeconds: Double? = null, + ): String? { val res = bridge.request("chat.history", "{\"sessionKey\":\"main\"}") val root = json.parseToJsonElement(res).asObjectOrNull() ?: return null val messages = root["messages"] as? JsonArray ?: return null for (item in messages.reversed()) { val obj = item.asObjectOrNull() ?: continue if (obj["role"].asStringOrNull() != "assistant") continue + if (sinceSeconds != null) { + val timestamp = obj["timestamp"].asDoubleOrNull() + if (timestamp != null && timestamp < sinceSeconds - 0.5) continue + } val content = obj["content"] as? 
JsonArray ?: continue val text = content.mapNotNull { entry -> @@ -390,7 +413,9 @@ class TalkModeManager( return } - val apiKey = System.getenv("ELEVENLABS_API_KEY")?.trim() + val apiKey = + apiKey?.trim()?.takeIf { it.isNotEmpty() } + ?: System.getenv("ELEVENLABS_API_KEY")?.trim() if (apiKey.isNullOrEmpty()) { _statusText.value = "Missing ELEVENLABS_API_KEY" return @@ -495,6 +520,7 @@ class TalkModeManager( val bridge = session ?: return val envVoice = System.getenv("ELEVENLABS_VOICE_ID")?.trim() val sagVoice = System.getenv("SAG_VOICE_ID")?.trim() + val envKey = System.getenv("ELEVENLABS_API_KEY")?.trim() try { val res = bridge.request("config.get", "{}") val root = json.parseToJsonElement(res).asObjectOrNull() @@ -503,6 +529,7 @@ class TalkModeManager( val voice = talk?.get("voiceId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } val model = talk?.get("modelId")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } val outputFormat = talk?.get("outputFormat")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } + val key = talk?.get("apiKey")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() } val interrupt = talk?.get("interruptOnSpeech")?.asBooleanOrNull() defaultVoiceId = voice ?: envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } @@ -510,9 +537,11 @@ class TalkModeManager( defaultModelId = model if (!modelOverrideActive) currentModelId = defaultModelId defaultOutputFormat = outputFormat + apiKey = key ?: envKey?.takeIf { it.isNotEmpty() } if (interrupt != null) interruptOnSpeech = interrupt } catch (_: Throwable) { defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() } + apiKey = envKey?.takeIf { it.isNotEmpty() } } } diff --git a/apps/ios/Sources/Voice/TalkModeManager.swift b/apps/ios/Sources/Voice/TalkModeManager.swift index 82cd451c3..3766845b1 100644 --- a/apps/ios/Sources/Voice/TalkModeManager.swift +++ b/apps/ios/Sources/Voice/TalkModeManager.swift @@ -28,6 +28,7 @@ final class 
TalkModeManager: NSObject { private var defaultModelId: String? private var currentModelId: String? private var defaultOutputFormat: String? + private var apiKey: String? private var interruptOnSpeech: Bool = true private var bridge: BridgeSession? @@ -189,6 +190,7 @@ final class TalkModeManager: NSObject { } do { + let startedAt = Date().timeIntervalSince1970 let runId = try await self.sendChat(prompt, bridge: bridge) let ok = await self.waitForChatFinal(runId: runId, bridge: bridge) if !ok { @@ -197,7 +199,11 @@ final class TalkModeManager: NSObject { return } - guard let assistantText = try await self.fetchLatestAssistantText(bridge: bridge) else { + guard let assistantText = try await self.waitForAssistantText( + bridge: bridge, + since: startedAt, + timeoutSeconds: 12) + else { self.statusText = "No reply" await self.start() return @@ -259,7 +265,22 @@ final class TalkModeManager: NSObject { return false } - private func fetchLatestAssistantText(bridge: BridgeSession) async throws -> String? { + private func waitForAssistantText( + bridge: BridgeSession, + since: Double, + timeoutSeconds: Int) async throws -> String? + { + let deadline = Date().addingTimeInterval(TimeInterval(timeoutSeconds)) + while Date() < deadline { + if let text = try await self.fetchLatestAssistantText(bridge: bridge, since: since) { + return text + } + try? await Task.sleep(nanoseconds: 300_000_000) + } + return nil + } + + private func fetchLatestAssistantText(bridge: BridgeSession, since: Double? = nil) async throws -> String? { let res = try await bridge.request( method: "chat.history", paramsJSON: "{\"sessionKey\":\"main\"}", @@ -268,6 +289,9 @@ final class TalkModeManager: NSObject { guard let messages = json["messages"] as? [[String: Any]] else { return nil } for msg in messages.reversed() { guard (msg["role"] as? String) == "assistant" else { continue } + if let since, let timestamp = msg["timestamp"] as? 
Double, timestamp < since - 0.5 { + continue + } guard let content = msg["content"] as? [[String: Any]] else { continue } let text = content.compactMap { $0["text"] as? String }.joined(separator: "\n") let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) @@ -299,7 +323,10 @@ final class TalkModeManager: NSObject { return } - guard let apiKey = ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"], !apiKey.isEmpty else { + let resolvedKey = + (self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ?? + ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"] + guard let apiKey = resolvedKey, !apiKey.isEmpty else { self.statusText = "Missing ELEVENLABS_API_KEY" return } @@ -375,6 +402,7 @@ final class TalkModeManager: NSObject { self.currentModelId = self.defaultModelId self.defaultOutputFormat = (talk?["outputFormat"] as? String)? .trimmingCharacters(in: .whitespacesAndNewlines) + self.apiKey = (talk?["apiKey"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) if let interrupt = talk?["interruptOnSpeech"] as? 
Bool { self.interruptOnSpeech = interrupt } diff --git a/apps/macos/Sources/Clawdis/AppState.swift b/apps/macos/Sources/Clawdis/AppState.swift index 94e20538a..c73383241 100644 --- a/apps/macos/Sources/Clawdis/AppState.swift +++ b/apps/macos/Sources/Clawdis/AppState.swift @@ -329,18 +329,26 @@ final class AppState { func setTalkEnabled(_ enabled: Bool) async { guard voiceWakeSupported else { self.talkEnabled = false + await GatewayConnection.shared.talkMode(enabled: false, phase: "disabled") return } self.talkEnabled = enabled guard !self.isPreview else { return } - if !enabled { return } + if !enabled { + await GatewayConnection.shared.talkMode(enabled: false, phase: "disabled") + return + } - if PermissionManager.voiceWakePermissionsGranted() { return } + if PermissionManager.voiceWakePermissionsGranted() { + await GatewayConnection.shared.talkMode(enabled: true, phase: "enabled") + return + } let granted = await PermissionManager.ensureVoiceWakePermissions(interactive: true) self.talkEnabled = granted + await GatewayConnection.shared.talkMode(enabled: granted, phase: granted ? 
"enabled" : "denied") } // MARK: - Global wake words sync (Gateway-owned) diff --git a/apps/macos/Sources/Clawdis/ConfigSettings.swift b/apps/macos/Sources/Clawdis/ConfigSettings.swift index eb22490c0..784fe7a71 100644 --- a/apps/macos/Sources/Clawdis/ConfigSettings.swift +++ b/apps/macos/Sources/Clawdis/ConfigSettings.swift @@ -34,6 +34,7 @@ struct ConfigSettings: View { @State private var talkVoiceId: String = "" @State private var talkInterruptOnSpeech: Bool = true @State private var talkApiKey: String = "" + @State private var gatewayApiKeyFound = false var body: some View { ScrollView { self.content } @@ -49,6 +50,7 @@ struct ConfigSettings: View { self.hasLoaded = true self.loadConfig() await self.loadModels() + await self.refreshGatewayTalkApiKey() self.allowAutosave = true } } @@ -323,6 +325,10 @@ struct ConfigSettings: View { Text("Using ELEVENLABS_API_KEY from the environment.") .font(.footnote) .foregroundStyle(.secondary) + } else if self.gatewayApiKeyFound && self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + Text("Using API key from the gateway profile.") + .font(.footnote) + .foregroundStyle(.secondary) } } } @@ -392,6 +398,20 @@ struct ConfigSettings: View { } } + private func refreshGatewayTalkApiKey() async { + do { + let snap: ConfigSnapshot = try await GatewayConnection.shared.requestDecoded( + method: .configGet, + params: nil, + timeoutMs: 8000) + let talk = snap.config?["talk"]?.dictionaryValue + let apiKey = talk?["apiKey"]?.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines) + self.gatewayApiKeyFound = !(apiKey ?? 
"").isEmpty + } catch { + self.gatewayApiKeyFound = false + } + } + private func autosaveConfig() { guard self.allowAutosave else { return } Task { await self.saveConfig() } @@ -487,12 +507,14 @@ struct ConfigSettings: View { if !self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { return "ElevenLabs API key: stored in config" } + if self.gatewayApiKeyFound { return "ElevenLabs API key: found (gateway)" } return "ElevenLabs API key: missing" } private var apiKeyStatusColor: Color { if self.hasEnvApiKey { return .green } if !self.talkApiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { return .green } + if self.gatewayApiKeyFound { return .green } return .red } diff --git a/apps/macos/Sources/Clawdis/GatewayConnection.swift b/apps/macos/Sources/Clawdis/GatewayConnection.swift index f255f69b9..289507cc5 100644 --- a/apps/macos/Sources/Clawdis/GatewayConnection.swift +++ b/apps/macos/Sources/Clawdis/GatewayConnection.swift @@ -51,6 +51,7 @@ actor GatewayConnection { case providersStatus = "providers.status" case configGet = "config.get" case configSet = "config.set" + case talkMode = "talk.mode" case webLoginStart = "web.login.start" case webLoginWait = "web.login.wait" case webLogout = "web.logout" @@ -483,6 +484,12 @@ extension GatewayConnection { return res.aborted ?? false } + func talkMode(enabled: Bool, phase: String? = nil) async { + var params: [String: AnyCodable] = ["enabled": AnyCodable(enabled)] + if let phase { params["phase"] = AnyCodable(phase) } + try? 
await self.requestVoid(method: .talkMode, params: params) + } + // MARK: - VoiceWake func voiceWakeGetTriggers() async throws -> [String] { diff --git a/apps/macos/Sources/Clawdis/TalkModeController.swift b/apps/macos/Sources/Clawdis/TalkModeController.swift index 920af0539..707b56995 100644 --- a/apps/macos/Sources/Clawdis/TalkModeController.swift +++ b/apps/macos/Sources/Clawdis/TalkModeController.swift @@ -20,6 +20,7 @@ final class TalkModeController { func updatePhase(_ phase: TalkModePhase) { TalkOverlayController.shared.updatePhase(phase) + Task { await GatewayConnection.shared.talkMode(enabled: AppStateStore.shared.talkEnabled, phase: phase.rawValue) } } func updateLevel(_ level: Double) { diff --git a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift index 0443e26ea..54804337e 100644 --- a/apps/macos/Sources/Clawdis/TalkModeRuntime.swift +++ b/apps/macos/Sources/Clawdis/TalkModeRuntime.swift @@ -244,6 +244,7 @@ actor TalkModeRuntime { await self.reloadConfig() let prompt = self.buildPrompt(transcript: transcript) let runId = UUID().uuidString + let startedAt = Date().timeIntervalSince1970 do { let response = try await GatewayConnection.shared.chatSend( @@ -261,7 +262,11 @@ actor TalkModeRuntime { return } - guard let assistantText = await self.latestAssistantText(sessionKey: "main") else { + guard let assistantText = await self.waitForAssistantText( + sessionKey: "main", + since: startedAt, + timeoutSeconds: 12) + else { await self.startListening() await self.startRecognition() return @@ -335,7 +340,22 @@ actor TalkModeRuntime { } } - private func latestAssistantText(sessionKey: String) async -> String? { + private func waitForAssistantText( + sessionKey: String, + since: Double, + timeoutSeconds: Int) async -> String? 
+ { + let deadline = Date().addingTimeInterval(TimeInterval(timeoutSeconds)) + while Date() < deadline { + if let text = await self.latestAssistantText(sessionKey: sessionKey, since: since) { + return text + } + try? await Task.sleep(nanoseconds: 300_000_000) + } + return nil + } + + private func latestAssistantText(sessionKey: String, since: Double? = nil) async -> String? { do { let history = try await GatewayConnection.shared.chatHistory(sessionKey: sessionKey) let messages = history.messages ?? [] @@ -343,7 +363,13 @@ actor TalkModeRuntime { guard let data = try? JSONEncoder().encode(item) else { return nil } return try? JSONDecoder().decode(ClawdisChatMessage.self, from: data) } - guard let assistant = decoded.last(where: { $0.role == "assistant" }) else { return nil } + let assistant = decoded.last { message in + guard message.role == "assistant" else { return false } + guard let since else { return true } + guard let timestamp = message.timestamp else { return false } + return timestamp >= since - 0.5 + } + guard let assistant else { return nil } let text = assistant.content.compactMap { $0.text }.joined(separator: "\n") let trimmed = text.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines) return trimmed.isEmpty ? nil : trimmed diff --git a/apps/macos/Sources/Clawdis/TalkOverlay.swift b/apps/macos/Sources/Clawdis/TalkOverlay.swift index 59555a104..e41d758f7 100644 --- a/apps/macos/Sources/Clawdis/TalkOverlay.swift +++ b/apps/macos/Sources/Clawdis/TalkOverlay.swift @@ -20,9 +20,9 @@ final class TalkOverlayController { private var window: NSPanel? private var hostingView: NSHostingView? 
- private let width: CGFloat = 120 - private let height: CGFloat = 120 - private let padding: CGFloat = 6 + private let width: CGFloat = 160 + private let height: CGFloat = 160 + private let padding: CGFloat = 8 func present() { self.ensureWindow() diff --git a/apps/macos/Sources/Clawdis/TalkOverlayView.swift b/apps/macos/Sources/Clawdis/TalkOverlayView.swift index f5484c439..d7b400ed3 100644 --- a/apps/macos/Sources/Clawdis/TalkOverlayView.swift +++ b/apps/macos/Sources/Clawdis/TalkOverlayView.swift @@ -7,12 +7,12 @@ struct TalkOverlayView: View { var body: some View { ZStack(alignment: .topLeading) { TalkOrbView(phase: self.controller.model.phase, level: self.controller.model.level) - .frame(width: 80, height: 80) + .frame(width: 96, height: 96) .contentShape(Rectangle()) .onTapGesture { TalkModeController.shared.stopSpeaking(reason: .userTap) } - .padding(16) + .padding(26) Button { TalkModeController.shared.exitTalkMode() @@ -29,7 +29,7 @@ struct TalkOverlayView: View { .padding(4) .onHover { self.hovering = $0 } } - .frame(width: 120, height: 120, alignment: .center) + .frame(width: 160, height: 160, alignment: .center) } } @@ -72,6 +72,7 @@ private struct TalkWaveRings: View { let phase: TalkModePhase let level: Double let time: TimeInterval + private let ringColor = Color(red: 0.82, green: 0.94, blue: 1.0) var body: some View { ZStack { @@ -80,9 +81,9 @@ private struct TalkWaveRings: View { let progress = (time * speed + Double(idx) * 0.28).truncatingRemainder(dividingBy: 1) let amplitude = phase == .speaking ? 0.95 : phase == .listening ? 0.5 + level * 0.7 : 0.35 let scale = 0.75 + progress * amplitude + (phase == .listening ? level * 0.15 : 0) - let alpha = phase == .speaking ? 0.55 : phase == .listening ? 0.45 + level * 0.25 : 0.28 + let alpha = phase == .speaking ? 0.72 : phase == .listening ? 
0.58 + level * 0.28 : 0.4 Circle() - .stroke(Color.white.opacity(alpha - progress * 0.35), lineWidth: 1.2) + .stroke(self.ringColor.opacity(alpha - progress * 0.3), lineWidth: 1.6) .scaleEffect(scale) .opacity(alpha - progress * 0.6) } @@ -97,13 +98,13 @@ private struct TalkOrbitArcs: View { ZStack { Circle() .trim(from: 0.08, to: 0.26) - .stroke(Color.white.opacity(0.75), style: StrokeStyle(lineWidth: 1.4, lineCap: .round)) + .stroke(Color.white.opacity(0.88), style: StrokeStyle(lineWidth: 1.6, lineCap: .round)) .rotationEffect(.degrees(time * 42)) Circle() .trim(from: 0.62, to: 0.86) - .stroke(Color.white.opacity(0.55), style: StrokeStyle(lineWidth: 1.2, lineCap: .round)) + .stroke(Color.white.opacity(0.7), style: StrokeStyle(lineWidth: 1.4, lineCap: .round)) .rotationEffect(.degrees(-time * 35)) } - .scaleEffect(1.05) + .scaleEffect(1.08) } } diff --git a/docs/configuration.md b/docs/configuration.md index f15a8f046..a49e916f8 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -198,6 +198,7 @@ Controls inbound/outbound prefixes and timestamps. ### `talk` Defaults for Talk mode (macOS/iOS/Android). Voice IDs fall back to `ELEVENLABS_VOICE_ID` or `SAG_VOICE_ID` when unset. +`apiKey` falls back to `ELEVENLABS_API_KEY` (or the gateway’s shell profile) when unset. ```json5 { @@ -205,6 +206,7 @@ Defaults for Talk mode (macOS/iOS/Android). 
Voice IDs fall back to `ELEVENLABS_V voiceId: "elevenlabs_voice_id", modelId: "eleven_v3", outputFormat: "mp3_44100_128", + apiKey: "elevenlabs_api_key", interruptOnSpeech: true } } diff --git a/docs/talk.md b/docs/talk.md index 4c3cf53cb..41f8239a4 100644 --- a/docs/talk.md +++ b/docs/talk.md @@ -47,6 +47,7 @@ Supported keys: "voiceId": "elevenlabs_voice_id", "modelId": "eleven_v3", "outputFormat": "mp3_44100_128", + "apiKey": "elevenlabs_api_key", "interruptOnSpeech": true } } @@ -55,6 +56,7 @@ Supported keys: Defaults: - `interruptOnSpeech`: true - `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` +- `apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available) ## macOS UI - Menu bar toggle: **Talk** diff --git a/src/config/config.test.ts b/src/config/config.test.ts index c6669848a..ff2009a35 100644 --- a/src/config/config.test.ts +++ b/src/config/config.test.ts @@ -174,3 +174,50 @@ describe("config identity defaults", () => { }); }); }); + +describe("talk api key fallback", () => { + let previousEnv: string | undefined; + + beforeEach(() => { + previousEnv = process.env.ELEVENLABS_API_KEY; + delete process.env.ELEVENLABS_API_KEY; + }); + + afterEach(() => { + process.env.ELEVENLABS_API_KEY = previousEnv; + }); + + it("injects talk.apiKey from profile when config is missing", async () => { + await withTempHome(async (home) => { + await fs.writeFile( + path.join(home, ".profile"), + "export ELEVENLABS_API_KEY=profile-key\n", + "utf-8", + ); + + vi.resetModules(); + const { readConfigFileSnapshot } = await import("./config.js"); + const snap = await readConfigFileSnapshot(); + + expect(snap.config?.talk?.apiKey).toBe("profile-key"); + expect(snap.exists).toBe(false); + }); + }); + + it("prefers ELEVENLABS_API_KEY env over profile", async () => { + await withTempHome(async (home) => { + await fs.writeFile( + path.join(home, ".profile"), + "export ELEVENLABS_API_KEY=profile-key\n", + "utf-8", + ); + process.env.ELEVENLABS_API_KEY 
= "env-key"; + + vi.resetModules(); + const { readConfigFileSnapshot } = await import("./config.js"); + const snap = await readConfigFileSnapshot(); + + expect(snap.config?.talk?.apiKey).toBe("env-key"); + }); + }); +}); diff --git a/src/config/config.ts b/src/config/config.ts index 40ae5da06..870ba9621 100644 --- a/src/config/config.ts +++ b/src/config/config.ts @@ -226,6 +226,8 @@ export type TalkConfig = { modelId?: string; /** Default ElevenLabs output format (e.g. mp3_44100_128). */ outputFormat?: string; + /** ElevenLabs API key (optional; falls back to ELEVENLABS_API_KEY). */ + apiKey?: string; /** Stop speaking when user starts talking (default: true). */ interruptOnSpeech?: boolean; }; @@ -802,6 +804,7 @@ const ClawdisSchema = z.object({ voiceId: z.string().optional(), modelId: z.string().optional(), outputFormat: z.string().optional(), + apiKey: z.string().optional(), interruptOnSpeech: z.boolean().optional(), }) .optional(), @@ -964,17 +967,59 @@ export function parseConfigJson5( } } +function readTalkApiKeyFromProfile(): string | null { + const home = os.homedir(); + const candidates = [".profile", ".zprofile", ".zshrc", ".bashrc"].map( + (name) => path.join(home, name), + ); + for (const candidate of candidates) { + if (!fs.existsSync(candidate)) continue; + try { + const text = fs.readFileSync(candidate, "utf-8"); + const match = text.match( + /(?:^|\n)\s*(?:export\s+)?ELEVENLABS_API_KEY\s*=\s*["']?([^\n"']+)["']?/, + ); + const value = match?.[1]?.trim(); + if (value) return value; + } catch { + // Ignore profile read errors. + } + } + return null; +} + +function resolveTalkApiKey(): string | null { + const envValue = (process.env.ELEVENLABS_API_KEY ?? 
"").trim(); + if (envValue) return envValue; + return readTalkApiKeyFromProfile(); +} + +function applyTalkApiKey(config: ClawdisConfig): ClawdisConfig { + const resolved = resolveTalkApiKey(); + if (!resolved) return config; + const existing = config.talk?.apiKey?.trim(); + if (existing) return config; + return { + ...config, + talk: { + ...config.talk, + apiKey: resolved, + }, + }; +} + export async function readConfigFileSnapshot(): Promise { const configPath = CONFIG_PATH_CLAWDIS; const exists = fs.existsSync(configPath); if (!exists) { + const config = applyTalkApiKey({}); return { path: configPath, exists: false, raw: null, parsed: {}, valid: true, - config: {}, + config, issues: [], }; } @@ -1015,7 +1060,7 @@ export async function readConfigFileSnapshot(): Promise { raw, parsed: parsedRes.parsed, valid: true, - config: validated.config, + config: applyTalkApiKey(validated.config), issues: [], }; } catch (err) { diff --git a/src/gateway/protocol/index.ts b/src/gateway/protocol/index.ts index 725f37cc6..42a46160a 100644 --- a/src/gateway/protocol/index.ts +++ b/src/gateway/protocol/index.ts @@ -95,6 +95,8 @@ import { SnapshotSchema, type StateVersion, StateVersionSchema, + type TalkModeParams, + TalkModeParamsSchema, type TickEvent, TickEventSchema, type WakeParams, @@ -169,6 +171,8 @@ export const validateConfigGetParams = ajv.compile( export const validateConfigSetParams = ajv.compile( ConfigSetParamsSchema, ); +export const validateTalkModeParams = + ajv.compile(TalkModeParamsSchema); export const validateProvidersStatusParams = ajv.compile( ProvidersStatusParamsSchema, ); @@ -297,6 +301,7 @@ export type { NodePairApproveParams, ConfigGetParams, ConfigSetParams, + TalkModeParams, ProvidersStatusParams, WebLoginStartParams, WebLoginWaitParams, diff --git a/src/gateway/protocol/schema.ts b/src/gateway/protocol/schema.ts index 7ea3fc23a..ffd5260f8 100644 --- a/src/gateway/protocol/schema.ts +++ b/src/gateway/protocol/schema.ts @@ -339,6 +339,14 @@ export const 
ConfigSetParamsSchema = Type.Object( { additionalProperties: false }, ); +export const TalkModeParamsSchema = Type.Object( + { + enabled: Type.Boolean(), + phase: Type.Optional(Type.String()), + }, + { additionalProperties: false }, +); + export const ProvidersStatusParamsSchema = Type.Object( { probe: Type.Optional(Type.Boolean()), @@ -668,6 +676,7 @@ export const ProtocolSchemas: Record = { SessionsCompactParams: SessionsCompactParamsSchema, ConfigGetParams: ConfigGetParamsSchema, ConfigSetParams: ConfigSetParamsSchema, + TalkModeParams: TalkModeParamsSchema, ProvidersStatusParams: ProvidersStatusParamsSchema, WebLoginStartParams: WebLoginStartParamsSchema, WebLoginWaitParams: WebLoginWaitParamsSchema, @@ -724,6 +733,7 @@ export type SessionsDeleteParams = Static; export type SessionsCompactParams = Static; export type ConfigGetParams = Static; export type ConfigSetParams = Static; +export type TalkModeParams = Static; export type ProvidersStatusParams = Static; export type WebLoginStartParams = Static; export type WebLoginWaitParams = Static; diff --git a/src/gateway/server.ts b/src/gateway/server.ts index 819e8fefb..db6e506f8 100644 --- a/src/gateway/server.ts +++ b/src/gateway/server.ts @@ -393,6 +393,7 @@ import { validateSkillsInstallParams, validateSkillsStatusParams, validateSkillsUpdateParams, + validateTalkModeParams, validateWakeParams, validateWebLoginStartParams, validateWebLoginWaitParams, @@ -469,6 +470,7 @@ const METHODS = [ "status", "config.get", "config.set", + "talk.mode", "models.list", "skills.status", "skills.install", @@ -518,6 +520,7 @@ const EVENTS = [ "chat", "presence", "tick", + "talk.mode", "shutdown", "health", "heartbeat", @@ -2379,6 +2382,25 @@ export async function startGatewayServer( }), }; } + case "talk.mode": { + const params = parseParams(); + if (!validateTalkModeParams(params)) { + return { + ok: false, + error: { + code: ErrorCodes.INVALID_REQUEST, + message: `invalid talk.mode params: 
${formatValidationErrors(validateTalkModeParams.errors)}`, + }, + }; + } + const payload = { + enabled: (params as { enabled: boolean }).enabled, + phase: (params as { phase?: string }).phase ?? null, + ts: Date.now(), + }; + broadcast("talk.mode", payload, { dropIfSlow: true }); + return { ok: true, payloadJSON: JSON.stringify(payload) }; + } case "models.list": { const params = parseParams(); if (!validateModelsListParams(params)) { @@ -4615,6 +4637,28 @@ export async function startGatewayServer( ); break; } + case "talk.mode": { + const params = (req.params ?? {}) as Record; + if (!validateTalkModeParams(params)) { + respond( + false, + undefined, + errorShape( + ErrorCodes.INVALID_REQUEST, + `invalid talk.mode params: ${formatValidationErrors(validateTalkModeParams.errors)}`, + ), + ); + break; + } + const payload = { + enabled: (params as { enabled: boolean }).enabled, + phase: (params as { phase?: string }).phase ?? null, + ts: Date.now(), + }; + broadcast("talk.mode", payload, { dropIfSlow: true }); + respond(true, payload, undefined); + break; + } case "skills.status": { const params = (req.params ?? {}) as Record; if (!validateSkillsStatusParams(params)) { From 5c7c1af44ef67a33993c44360b3ed8bd43ea029f Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 30 Dec 2025 02:05:14 +0100 Subject: [PATCH 033/100] fix: android talk timestamp parsing --- .../java/com/steipete/clawdis/node/voice/TalkModeManager.kt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt index 5cad01140..2bf35a276 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/voice/TalkModeManager.kt @@ -731,6 +731,11 @@ private fun JsonElement?.asObjectOrNull(): JsonObject? = this as? 
JsonObject private fun JsonElement?.asStringOrNull(): String? = (this as? JsonPrimitive)?.takeIf { it.isString }?.content +private fun JsonElement?.asDoubleOrNull(): Double? { + val primitive = this as? JsonPrimitive ?: return null + return primitive.content.toDoubleOrNull() +} + private fun JsonElement?.asBooleanOrNull(): Boolean? { val primitive = this as? JsonPrimitive ?: return null val content = primitive.content.trim().lowercase() From 9846c46434ea67a0b751fa7ab85609cab18906b8 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 30 Dec 2025 03:49:24 +0100 Subject: [PATCH 034/100] fix: tag A2UI platform and boost Android canvas --- CHANGELOG.md | 1 + apps/android/app/build.gradle.kts | 1 + .../com/steipete/clawdis/node/NodeRuntime.kt | 2 +- .../clawdis/node/bridge/BridgeSession.kt | 42 +++++++++++++++++- .../clawdis/node/node/CanvasController.kt | 8 ++++ .../steipete/clawdis/node/ui/RootScreen.kt | 44 ++++++++++++++++++- apps/ios/Sources/Model/NodeAppModel.swift | 2 +- .../macos/Sources/Clawdis/CanvasManager.swift | 2 +- .../Clawdis/NodeMode/MacNodeRuntime.swift | 2 +- .../Resources/CanvasScaffold/scaffold.html | 24 ++++++++++ src/canvas-host/a2ui/index.html | 24 ++++++++++ 11 files changed, 145 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 97643cee5..2ef7f5e85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ - macOS Talk Mode: orb overlay refresh, ElevenLabs request logging, API key status in settings, and auto-select first voice when none is configured. - Talk Mode: wait for chat history to surface the assistant reply before starting TTS (macOS/iOS/Android). - Gateway config: inject `talk.apiKey` from `ELEVENLABS_API_KEY`/shell profile so nodes can fetch it on demand. +- Canvas A2UI: tag requests with `platform=android|ios|macos` and boost Android canvas background contrast. - iOS/Android nodes: enable scrolling for loaded web pages in the Canvas WebView (default scaffold stays touch-first). 
- macOS menu: device list now uses `node.list` (devices only; no agent/tool presence entries). - macOS menu: device list now shows connected nodes only. diff --git a/apps/android/app/build.gradle.kts b/apps/android/app/build.gradle.kts index 1b353d83f..db3b17dca 100644 --- a/apps/android/app/build.gradle.kts +++ b/apps/android/app/build.gradle.kts @@ -64,6 +64,7 @@ dependencies { implementation("androidx.core:core-ktx:1.17.0") implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.10.0") implementation("androidx.activity:activity-compose:1.12.2") + implementation("androidx.webkit:webkit:1.14.0") implementation("androidx.compose.ui:ui") implementation("androidx.compose.ui:ui-tooling-preview") diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt index 21a22a428..50fbd3251 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/NodeRuntime.kt @@ -815,7 +815,7 @@ class NodeRuntime(context: Context) { val raw = session.currentCanvasHostUrl()?.trim().orEmpty() if (raw.isBlank()) return null val base = raw.trimEnd('/') - return "${base}/__clawdis__/a2ui/" + return "${base}/__clawdis__/a2ui/?platform=android" } private suspend fun ensureA2uiReady(a2uiUrl: String): Boolean { diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/bridge/BridgeSession.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/bridge/BridgeSession.kt index e50488d37..5f01959ec 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/bridge/BridgeSession.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/bridge/BridgeSession.kt @@ -11,6 +11,7 @@ import kotlinx.coroutines.launch import kotlinx.coroutines.sync.Mutex import kotlinx.coroutines.sync.withLock import kotlinx.coroutines.withContext +import com.steipete.clawdis.node.BuildConfig import 
kotlinx.serialization.json.Json import kotlinx.serialization.json.JsonArray import kotlinx.serialization.json.JsonObject @@ -23,6 +24,7 @@ import java.io.BufferedWriter import java.io.InputStreamReader import java.io.OutputStreamWriter import java.net.InetSocketAddress +import java.net.URI import java.net.Socket import java.util.UUID import java.util.concurrent.ConcurrentHashMap @@ -213,7 +215,14 @@ class BridgeSession( when (first["type"].asStringOrNull()) { "hello-ok" -> { val name = first["serverName"].asStringOrNull() ?: "Bridge" - canvasHostUrl = first["canvasHostUrl"].asStringOrNull()?.trim()?.ifEmpty { null } + val rawCanvasUrl = first["canvasHostUrl"].asStringOrNull()?.trim()?.ifEmpty { null } + canvasHostUrl = normalizeCanvasHostUrl(rawCanvasUrl, endpoint) + if (BuildConfig.DEBUG) { + android.util.Log.d( + "ClawdisBridge", + "canvasHostUrl resolved=${canvasHostUrl ?: "none"} (raw=${rawCanvasUrl ?: "none"})", + ) + } onConnected(name, conn.remoteAddress) } "error" -> { @@ -292,6 +301,37 @@ class BridgeSession( conn.closeQuietly() } } + + private fun normalizeCanvasHostUrl(raw: String?, endpoint: BridgeEndpoint): String? 
{ + val trimmed = raw?.trim().orEmpty() + val parsed = trimmed.takeIf { it.isNotBlank() }?.let { runCatching { URI(it) }.getOrNull() } + val host = parsed?.host?.trim().orEmpty() + val port = parsed?.port ?: -1 + val scheme = parsed?.scheme?.trim().orEmpty().ifBlank { "http" } + + if (trimmed.isNotBlank() && !isLoopbackHost(host)) { + return trimmed + } + + val fallbackHost = + endpoint.tailnetDns?.trim().takeIf { !it.isNullOrEmpty() } + ?: endpoint.lanHost?.trim().takeIf { !it.isNullOrEmpty() } + ?: endpoint.host.trim() + if (fallbackHost.isEmpty()) return trimmed.ifBlank { null } + + val fallbackPort = endpoint.canvasPort ?: if (port > 0) port else 18793 + val formattedHost = if (fallbackHost.contains(":")) "[${fallbackHost}]" else fallbackHost + return "$scheme://$formattedHost:$fallbackPort" + } + + private fun isLoopbackHost(raw: String?): Boolean { + val host = raw?.trim()?.lowercase().orEmpty() + if (host.isEmpty()) return false + if (host == "localhost") return true + if (host == "::1") return true + if (host == "0.0.0.0" || host == "::") return true + return host.startsWith("127.") + } } private fun JsonElement?.asObjectOrNull(): JsonObject? = this as? 
JsonObject diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/node/CanvasController.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/node/CanvasController.kt index 5b4a09b64..685acdcd2 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/node/CanvasController.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/node/CanvasController.kt @@ -3,6 +3,7 @@ package com.steipete.clawdis.node.node import android.graphics.Bitmap import android.graphics.Canvas import android.os.Looper +import android.util.Log import android.webkit.WebView import androidx.core.graphics.createBitmap import androidx.core.graphics.scale @@ -16,6 +17,7 @@ import kotlinx.serialization.json.Json import kotlinx.serialization.json.JsonElement import kotlinx.serialization.json.JsonObject import kotlinx.serialization.json.JsonPrimitive +import com.steipete.clawdis.node.BuildConfig import kotlin.coroutines.resume class CanvasController { @@ -81,8 +83,14 @@ class CanvasController { val currentUrl = url withWebViewOnMain { wv -> if (currentUrl == null) { + if (BuildConfig.DEBUG) { + Log.d("ClawdisCanvas", "load scaffold: $scaffoldAssetUrl") + } wv.loadUrl(scaffoldAssetUrl) } else { + if (BuildConfig.DEBUG) { + Log.d("ClawdisCanvas", "load url: $currentUrl") + } wv.loadUrl(currentUrl) } } diff --git a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt index 791f76325..f7681eb49 100644 --- a/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt +++ b/apps/android/app/src/main/java/com/steipete/clawdis/node/ui/RootScreen.kt @@ -7,6 +7,8 @@ import android.graphics.Color import android.util.Log import android.view.View import android.webkit.JavascriptInterface +import android.webkit.ConsoleMessage +import android.webkit.WebChromeClient import android.webkit.WebView import android.webkit.WebSettings import 
android.webkit.WebResourceError @@ -15,6 +17,8 @@ import android.webkit.WebResourceResponse import android.webkit.WebViewClient import androidx.activity.compose.rememberLauncherForActivityResult import androidx.activity.result.contract.ActivityResultContracts +import androidx.webkit.WebSettingsCompat +import androidx.webkit.WebViewFeature import androidx.compose.foundation.layout.Arrangement import androidx.compose.foundation.layout.Box import androidx.compose.foundation.layout.Column @@ -301,6 +305,15 @@ private fun CanvasView(viewModel: MainViewModel, modifier: Modifier = Modifier) // Some embedded web UIs (incl. the "background website") use localStorage/sessionStorage. settings.domStorageEnabled = true settings.mixedContentMode = WebSettings.MIXED_CONTENT_COMPATIBILITY_MODE + if (WebViewFeature.isFeatureSupported(WebViewFeature.FORCE_DARK)) { + WebSettingsCompat.setForceDark(settings, WebSettingsCompat.FORCE_DARK_OFF) + } + if (WebViewFeature.isFeatureSupported(WebViewFeature.ALGORITHMIC_DARKENING)) { + WebSettingsCompat.setAlgorithmicDarkeningAllowed(settings, false) + } + if (isDebuggable) { + Log.d("ClawdisWebView", "userAgent: ${settings.userAgentString}") + } isScrollContainer = true overScrollMode = View.OVER_SCROLL_IF_CONTENT_SCROLLS isVerticalScrollBarEnabled = true @@ -331,11 +344,38 @@ private fun CanvasView(viewModel: MainViewModel, modifier: Modifier = Modifier) } override fun onPageFinished(view: WebView, url: String?) 
{ + if (isDebuggable) { + Log.d("ClawdisWebView", "onPageFinished: $url") + } viewModel.canvas.onPageFinished() } + + override fun onRenderProcessGone( + view: WebView, + detail: android.webkit.RenderProcessGoneDetail, + ): Boolean { + if (isDebuggable) { + Log.e( + "ClawdisWebView", + "onRenderProcessGone didCrash=${detail.didCrash()} priorityAtExit=${detail.rendererPriorityAtExit()}", + ) + } + return true + } } - setBackgroundColor(Color.BLACK) - setLayerType(View.LAYER_TYPE_HARDWARE, null) + webChromeClient = + object : WebChromeClient() { + override fun onConsoleMessage(consoleMessage: ConsoleMessage?): Boolean { + if (!isDebuggable) return false + val msg = consoleMessage ?: return false + Log.d( + "ClawdisWebView", + "console ${msg.messageLevel()} @ ${msg.sourceId()}:${msg.lineNumber()} ${msg.message()}", + ) + return false + } + } + // Use default layer/background; avoid forcing a black fill over WebView content. val a2uiBridge = CanvasA2UIActionBridge { payload -> diff --git a/apps/ios/Sources/Model/NodeAppModel.swift b/apps/ios/Sources/Model/NodeAppModel.swift index 8c2935ffc..554441d1f 100644 --- a/apps/ios/Sources/Model/NodeAppModel.swift +++ b/apps/ios/Sources/Model/NodeAppModel.swift @@ -150,7 +150,7 @@ final class NodeAppModel { guard let raw = await self.bridge.currentCanvasHostUrl() else { return nil } let trimmed = raw.trimmingCharacters(in: .whitespacesAndNewlines) guard !trimmed.isEmpty, let base = URL(string: trimmed) else { return nil } - return base.appendingPathComponent("__clawdis__/a2ui/").absoluteString + return base.appendingPathComponent("__clawdis__/a2ui/").absoluteString + "?platform=ios" } private func showA2UIOnConnectIfNeeded() async { diff --git a/apps/macos/Sources/Clawdis/CanvasManager.swift b/apps/macos/Sources/Clawdis/CanvasManager.swift index c19c5d06d..32163744b 100644 --- a/apps/macos/Sources/Clawdis/CanvasManager.swift +++ b/apps/macos/Sources/Clawdis/CanvasManager.swift @@ -190,7 +190,7 @@ final class CanvasManager { 
private static func resolveA2UIHostUrl(from raw: String?) -> String? { let trimmed = raw?.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" guard !trimmed.isEmpty, let base = URL(string: trimmed) else { return nil } - return base.appendingPathComponent("__clawdis__/a2ui/").absoluteString + return base.appendingPathComponent("__clawdis__/a2ui/").absoluteString + "?platform=macos" } // MARK: - Anchoring diff --git a/apps/macos/Sources/Clawdis/NodeMode/MacNodeRuntime.swift b/apps/macos/Sources/Clawdis/NodeMode/MacNodeRuntime.swift index 4b6c8bbc8..b46831034 100644 --- a/apps/macos/Sources/Clawdis/NodeMode/MacNodeRuntime.swift +++ b/apps/macos/Sources/Clawdis/NodeMode/MacNodeRuntime.swift @@ -265,7 +265,7 @@ actor MacNodeRuntime { guard let raw = await GatewayConnection.shared.canvasHostUrl() else { return nil } let trimmed = raw.trimmingCharacters(in: .whitespacesAndNewlines) guard !trimmed.isEmpty, let baseUrl = URL(string: trimmed) else { return nil } - return baseUrl.appendingPathComponent("__clawdis__/a2ui/").absoluteString + return baseUrl.appendingPathComponent("__clawdis__/a2ui/").absoluteString + "?platform=macos" } private func isA2UIReady(poll: Bool = false) async -> Bool { diff --git a/apps/shared/ClawdisKit/Sources/ClawdisKit/Resources/CanvasScaffold/scaffold.html b/apps/shared/ClawdisKit/Sources/ClawdisKit/Resources/CanvasScaffold/scaffold.html index d9a9cebfd..f8942af6f 100644 --- a/apps/shared/ClawdisKit/Sources/ClawdisKit/Resources/CanvasScaffold/scaffold.html +++ b/apps/shared/ClawdisKit/Sources/ClawdisKit/Resources/CanvasScaffold/scaffold.html @@ -4,6 +4,21 @@ Canvas +