feat(camera): add snap/clip capture

This commit is contained in:
Peter Steinberger
2025-12-14 00:48:58 +00:00
parent 2454e67e09
commit a92eb1f33d
19 changed files with 1669 additions and 2 deletions

View File

@@ -0,0 +1,319 @@
import AVFoundation
import ClawdisKit
import Foundation
import UIKit
/// Captures photos (`snap`) and short video clips (`clip`) from the device
/// camera and returns them base64-encoded for transport over the bridge.
/// Modeled as an actor so concurrent bridge requests cannot run two capture
/// pipelines at once.
actor CameraController {
    /// Errors surfaced to the bridge; `errorDescription` is the user-visible text.
    enum CameraError: LocalizedError, Sendable {
        case cameraUnavailable
        case microphoneUnavailable
        case permissionDenied(kind: String)
        case invalidParams(String)
        case captureFailed(String)
        case exportFailed(String)

        var errorDescription: String? {
            switch self {
            case .cameraUnavailable:
                "Camera unavailable"
            case .microphoneUnavailable:
                "Microphone unavailable"
            case let .permissionDenied(kind):
                "\(kind) permission denied"
            case let .invalidParams(msg):
                msg
            case let .captureFailed(msg):
                msg
            case let .exportFailed(msg):
                msg
            }
        }
    }

    /// Captures a single JPEG photo.
    /// - Parameter params: Optional facing (defaults to front), max width, and
    ///   JPEG quality (clamped to 0.05...1.0).
    /// - Returns: Format ("jpg"), base64 data, and the re-encoded image's
    ///   rounded `size` as width/height.
    /// - Throws: `CameraError` on permission, device, or encoding failures.
    func snap(params: ClawdisCameraSnapParams) async throws -> (
        format: String,
        base64: String,
        width: Int,
        height: Int)
    {
        let facing = params.facing ?? .front
        // Treat non-positive maxWidth as "no limit".
        let maxWidth = params.maxWidth.flatMap { $0 > 0 ? $0 : nil }
        let quality = Self.clampQuality(params.quality)
        try await self.ensureAccess(for: .video)
        // A throwaway session per request; stopped via `defer` below.
        let session = AVCaptureSession()
        session.sessionPreset = .photo
        guard let device = Self.pickCamera(facing: facing) else {
            throw CameraError.cameraUnavailable
        }
        let input = try AVCaptureDeviceInput(device: device)
        guard session.canAddInput(input) else {
            throw CameraError.captureFailed("Failed to add camera input")
        }
        session.addInput(input)
        let output = AVCapturePhotoOutput()
        guard session.canAddOutput(output) else {
            throw CameraError.captureFailed("Failed to add photo output")
        }
        session.addOutput(output)
        output.maxPhotoQualityPrioritization = .quality
        session.startRunning()
        defer { session.stopRunning() }
        // Prefer JPEG straight from the capture pipeline when available.
        let settings: AVCapturePhotoSettings = {
            if output.availablePhotoCodecTypes.contains(.jpeg) {
                return AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg])
            }
            return AVCapturePhotoSettings()
        }()
        settings.photoQualityPrioritization = .quality
        // Bridge the delegate callback into async/await; the delegate resumes
        // the continuation exactly once.
        let rawData: Data = try await withCheckedThrowingContinuation { cont in
            output.capturePhoto(with: settings, delegate: PhotoCaptureDelegate(cont))
        }
        // Re-encode to apply maxWidth/quality regardless of the capture codec.
        let (finalData, size) = try Self.reencodeJPEG(
            imageData: rawData,
            maxWidth: maxWidth,
            quality: quality)
        return (
            format: "jpg",
            base64: finalData.base64EncodedString(),
            width: Int(size.width.rounded()),
            height: Int(size.height.rounded()))
    }

    /// Records a short MP4 clip, optionally with microphone audio.
    /// Recording stops only when `maxRecordedDuration` elapses; the movie
    /// delegate then resolves the continuation with the recorded file.
    /// - Returns: Format ("mp4"), base64 data, the clamped duration, and
    ///   whether audio was requested/captured.
    func clip(params: ClawdisCameraClipParams) async throws -> (
        format: String,
        base64: String,
        durationMs: Int,
        hasAudio: Bool)
    {
        let facing = params.facing ?? .front
        let durationMs = Self.clampDurationMs(params.durationMs)
        let includeAudio = params.includeAudio ?? true
        try await self.ensureAccess(for: .video)
        if includeAudio {
            try await self.ensureAccess(for: .audio)
        }
        let session = AVCaptureSession()
        session.sessionPreset = .high
        guard let camera = Self.pickCamera(facing: facing) else {
            throw CameraError.cameraUnavailable
        }
        let cameraInput = try AVCaptureDeviceInput(device: camera)
        guard session.canAddInput(cameraInput) else {
            throw CameraError.captureFailed("Failed to add camera input")
        }
        session.addInput(cameraInput)
        if includeAudio {
            guard let mic = AVCaptureDevice.default(for: .audio) else {
                throw CameraError.microphoneUnavailable
            }
            let micInput = try AVCaptureDeviceInput(device: mic)
            if session.canAddInput(micInput) {
                session.addInput(micInput)
            } else {
                throw CameraError.captureFailed("Failed to add microphone input")
            }
        }
        let output = AVCaptureMovieFileOutput()
        guard session.canAddOutput(output) else {
            throw CameraError.captureFailed("Failed to add movie output")
        }
        session.addOutput(output)
        // The duration limit is the only mechanism that stops this recording.
        output.maxRecordedDuration = CMTime(value: Int64(durationMs), timescale: 1000)
        session.startRunning()
        defer { session.stopRunning() }
        let movURL = FileManager.default.temporaryDirectory
            .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mov")
        let mp4URL = FileManager.default.temporaryDirectory
            .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mp4")
        // Both files are temporary; removed after the base64 payload is built.
        defer {
            try? FileManager.default.removeItem(at: movURL)
            try? FileManager.default.removeItem(at: mp4URL)
        }
        let recordedURL: URL = try await withCheckedThrowingContinuation { cont in
            let delegate = MovieFileDelegate(cont)
            output.startRecording(to: movURL, recordingDelegate: delegate)
        }
        // Transcode .mov -> .mp4 for easier downstream handling.
        try await Self.exportToMP4(inputURL: recordedURL, outputURL: mp4URL)
        let data = try Data(contentsOf: mp4URL)
        return (format: "mp4", base64: data.base64EncodedString(), durationMs: durationMs, hasAudio: includeAudio)
    }

    /// Ensures capture permission for `mediaType`, prompting when undetermined.
    /// - Throws: `CameraError.permissionDenied` when access is not granted.
    private func ensureAccess(for mediaType: AVMediaType) async throws {
        let status = AVCaptureDevice.authorizationStatus(for: mediaType)
        switch status {
        case .authorized:
            return
        case .notDetermined:
            // First use: surface the system permission prompt.
            let ok = await withCheckedContinuation(isolation: nil) { cont in
                AVCaptureDevice.requestAccess(for: mediaType) { granted in
                    cont.resume(returning: granted)
                }
            }
            if !ok {
                throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
            }
        case .denied, .restricted:
            throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
        @unknown default:
            throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
        }
    }

    /// Picks the built-in wide-angle camera for the requested facing, if present.
    private nonisolated static func pickCamera(facing: ClawdisCameraFacing) -> AVCaptureDevice? {
        let position: AVCaptureDevice.Position = (facing == .front) ? .front : .back
        return AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: position)
    }

    /// Clamps JPEG quality into 0.05...1.0, defaulting to 0.9.
    private nonisolated static func clampQuality(_ quality: Double?) -> Double {
        let q = quality ?? 0.9
        return min(1.0, max(0.05, q))
    }

    /// Clamps clip duration into 250...15000 ms, defaulting to 3000 ms.
    private nonisolated static func clampDurationMs(_ ms: Int?) -> Int {
        let v = ms ?? 3000
        // Keep clips short by default; avoid huge base64 payloads on the bridge.
        return min(15000, max(250, v))
    }

    /// Decodes the captured image, optionally downscales it, and re-encodes as JPEG.
    /// - Returns: The JPEG bytes and the final `UIImage.size` (points; the
    ///   renderer below draws at that point size).
    private nonisolated static func reencodeJPEG(
        imageData: Data,
        maxWidth: Int?,
        quality: Double) throws -> (data: Data, size: CGSize)
    {
        guard let image = UIImage(data: imageData) else {
            throw CameraError.captureFailed("Failed to decode captured image")
        }
        let finalImage: UIImage = if let maxWidth, maxWidth > 0 {
            Self.downscale(image: image, maxWidth: CGFloat(maxWidth))
        } else {
            image
        }
        guard let out = finalImage.jpegData(compressionQuality: quality) else {
            throw CameraError.captureFailed("Failed to encode JPEG")
        }
        return (out, finalImage.size)
    }

    /// Proportionally downscales `image` so its width is at most `maxWidth`.
    /// Returns the original image when it is already narrow enough.
    private nonisolated static func downscale(image: UIImage, maxWidth: CGFloat) -> UIImage {
        let w = image.size.width
        let h = image.size.height
        guard w > 0, h > 0 else { return image }
        guard w > maxWidth else { return image }
        let scale = maxWidth / w
        let target = CGSize(width: maxWidth, height: max(1, h * scale))
        let format = UIGraphicsImageRendererFormat.default()
        format.opaque = false
        let renderer = UIGraphicsImageRenderer(size: target, format: format)
        return renderer.image { _ in
            image.draw(in: CGRect(origin: .zero, size: target))
        }
    }

    /// Transcodes the recorded QuickTime movie to MP4.
    /// - Throws: `CameraError.exportFailed` when the export session cannot be
    ///   created or does not complete; a `.failed` export rethrows the
    ///   exporter's own error when available.
    private nonisolated static func exportToMP4(inputURL: URL, outputURL: URL) async throws {
        let asset = AVAsset(url: inputURL)
        guard let exporter = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetHighestQuality) else {
            throw CameraError.exportFailed("Failed to create export session")
        }
        exporter.outputURL = outputURL
        exporter.outputFileType = .mp4
        exporter.shouldOptimizeForNetworkUse = true
        try await withCheckedThrowingContinuation(isolation: nil) { cont in
            exporter.exportAsynchronously {
                switch exporter.status {
                case .completed:
                    cont.resume(returning: ())
                case .failed:
                    cont.resume(throwing: exporter.error ?? CameraError.exportFailed("Export failed"))
                case .cancelled:
                    cont.resume(throwing: CameraError.exportFailed("Export cancelled"))
                default:
                    cont.resume(throwing: CameraError.exportFailed("Export did not complete"))
                }
            }
        }
    }
}
/// Bridges `AVCapturePhotoOutput`'s delegate callback into a checked
/// continuation, resuming exactly once.
private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate {
    private let continuation: CheckedContinuation<Data, Error>
    private var didResume = false

    init(_ continuation: CheckedContinuation<Data, Error>) {
        self.continuation = continuation
    }

    func photoOutput(
        _ output: AVCapturePhotoOutput,
        didFinishProcessingPhoto photo: AVCapturePhoto,
        error: Error?)
    {
        // Guard against a repeated callback double-resuming the continuation.
        guard !self.didResume else { return }
        self.didResume = true
        if let error {
            self.continuation.resume(throwing: error)
            return
        }
        guard let data = photo.fileDataRepresentation() else {
            // Use the shared CameraError type (matching the macOS
            // CameraCaptureService delegate) instead of an ad-hoc NSError so
            // callers see a consistent LocalizedError.
            self.continuation.resume(
                throwing: CameraController.CameraError.captureFailed("photo data missing"))
            return
        }
        self.continuation.resume(returning: data)
    }
}
/// Bridges `AVCaptureMovieFileOutput`'s completion callback into a checked
/// continuation, resuming exactly once.
private final class MovieFileDelegate: NSObject, AVCaptureFileOutputRecordingDelegate {
    private let continuation: CheckedContinuation<URL, Error>
    private var didResume = false

    init(_ continuation: CheckedContinuation<URL, Error>) {
        self.continuation = continuation
    }

    func fileOutput(
        _ output: AVCaptureFileOutput,
        didFinishRecordingTo outputFileURL: URL,
        from connections: [AVCaptureConnection],
        error: Error?)
    {
        guard !self.didResume else { return }
        self.didResume = true
        if let error {
            // Hitting `maxRecordedDuration` is reported as an error, but the
            // recorded file is complete and playable. Since `clip` stops
            // recording ONLY via the duration limit, this is the normal
            // success path — failing here would make every clip fail.
            // (Mirrors the macOS CameraCaptureService delegate.)
            let ns = error as NSError
            if ns.domain == AVFoundationErrorDomain,
               ns.code == AVError.maximumDurationReached.rawValue
            {
                self.continuation.resume(returning: outputFileURL)
                return
            }
            self.continuation.resume(throwing: error)
            return
        }
        self.continuation.resume(returning: outputFileURL)
    }
}

View File

@@ -26,6 +26,8 @@
</array>
<key>NSLocalNetworkUsageDescription</key>
<string>Clawdis discovers and connects to your Clawdis bridge on the local network.</string>
<key>NSCameraUsageDescription</key>
<string>Clawdis can capture photos or short video clips when requested via the bridge.</string>
<key>NSMicrophoneUsageDescription</key>
<string>Clawdis needs microphone access for voice wake.</string>
<key>NSSpeechRecognitionUsageDescription</key>

View File

@@ -6,6 +6,7 @@ import SwiftUI
final class NodeAppModel: ObservableObject {
@Published var isBackgrounded: Bool = false
let screen = ScreenController()
let camera = CameraController()
@Published var bridgeStatusText: String = "Not connected"
@Published var bridgeServerName: String?
@Published var bridgeRemoteAddress: String?
@@ -182,13 +183,22 @@ final class NodeAppModel: ObservableObject {
}
private func handleInvoke(_ req: BridgeInvokeRequest) async -> BridgeInvokeResponse {
if req.command.hasPrefix("screen."), self.isBackgrounded {
if req.command.hasPrefix("screen.") || req.command.hasPrefix("camera."), self.isBackgrounded {
return BridgeInvokeResponse(
id: req.id,
ok: false,
error: ClawdisNodeError(
code: .backgroundUnavailable,
message: "NODE_BACKGROUND_UNAVAILABLE: screen commands require foreground"))
message: "NODE_BACKGROUND_UNAVAILABLE: screen/camera commands require foreground"))
}
if req.command.hasPrefix("camera."), !self.isCameraEnabled() {
return BridgeInvokeResponse(
id: req.id,
ok: false,
error: ClawdisNodeError(
code: .unavailable,
message: "CAMERA_DISABLED: enable Camera in iOS Settings → Camera → Allow Camera"))
}
do {
@@ -222,6 +232,46 @@ final class NodeAppModel: ObservableObject {
let payload = try Self.encodePayload(["format": "png", "base64": base64])
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
case ClawdisCameraCommand.snap.rawValue:
let params = (try? Self.decodeParams(ClawdisCameraSnapParams.self, from: req.paramsJSON)) ??
ClawdisCameraSnapParams()
let res = try await self.camera.snap(params: params)
struct Payload: Codable {
var format: String
var base64: String
var width: Int
var height: Int
}
let payload = try Self.encodePayload(Payload(
format: res.format,
base64: res.base64,
width: res.width,
height: res.height))
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
case ClawdisCameraCommand.clip.rawValue:
let params = (try? Self.decodeParams(ClawdisCameraClipParams.self, from: req.paramsJSON)) ??
ClawdisCameraClipParams()
let suspended = (params.includeAudio ?? true) ? self.voiceWake.suspendForExternalAudioCapture() : false
defer { self.voiceWake.resumeAfterExternalAudioCapture(wasSuspended: suspended) }
let res = try await self.camera.clip(params: params)
struct Payload: Codable {
var format: String
var base64: String
var durationMs: Int
var hasAudio: Bool
}
let payload = try Self.encodePayload(Payload(
format: res.format,
base64: res.base64,
durationMs: res.durationMs,
hasAudio: res.hasAudio))
return BridgeInvokeResponse(id: req.id, ok: true, payloadJSON: payload)
default:
return BridgeInvokeResponse(
id: req.id,
@@ -254,4 +304,10 @@ final class NodeAppModel: ObservableObject {
}
return json
}
/// Whether the user allows camera commands on this node.
/// Default-on: a missing key means the toggle was never touched.
private func isCameraEnabled() -> Bool {
    let defaults = UserDefaults.standard
    guard defaults.object(forKey: "camera.enabled") != nil else { return true }
    return defaults.bool(forKey: "camera.enabled")
}
}

View File

@@ -205,6 +205,37 @@ final class VoiceWakeManager: NSObject, ObservableObject {
try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
}
/// Temporarily releases the microphone so other subsystems (e.g. camera video capture) can record audio.
/// Returns `true` when listening was active and was suspended.
func suspendForExternalAudioCapture() -> Bool {
    // Nothing to do unless wake-word listening is currently running.
    guard self.isEnabled, self.isListening else { return false }
    self.isListening = false
    self.statusText = "Paused"
    // Tear down in dependency order: first stop draining buffered audio…
    self.tapDrainTask?.cancel()
    self.tapDrainTask = nil
    self.tapQueue?.clear()
    self.tapQueue = nil
    // …then cancel speech recognition…
    self.recognitionTask?.cancel()
    self.recognitionTask = nil
    self.recognitionRequest = nil
    // …then stop the engine and remove the input tap so the mic is released.
    if self.audioEngine.isRunning {
        self.audioEngine.stop()
        self.audioEngine.inputNode.removeTap(onBus: 0)
    }
    // Deactivate the shared session so the external capturer can claim input.
    try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
    return true
}
/// Restarts wake-word listening after an external audio capture finished.
/// No-op when `suspendForExternalAudioCapture()` did not actually suspend.
func resumeAfterExternalAudioCapture(wasSuspended: Bool) {
    if !wasSuspended { return }
    Task { await self.start() }
}
private func startRecognition() throws {
self.recognitionTask?.cancel()
self.recognitionTask = nil

View File

@@ -54,5 +54,6 @@ targets:
NSLocalNetworkUsageDescription: Clawdis discovers and connects to your Clawdis bridge on the local network.
NSBonjourServices:
- _clawdis-bridge._tcp
NSCameraUsageDescription: Clawdis can capture photos or short video clips when requested via the bridge.
NSMicrophoneUsageDescription: Clawdis needs microphone access for voice wake.
NSSpeechRecognitionUsageDescription: Clawdis uses on-device speech recognition for voice wake.

View File

@@ -0,0 +1,341 @@
import AVFoundation
import ClawdisIPC
import CoreGraphics
import Foundation
import ImageIO
import OSLog
import UniformTypeIdentifiers
/// macOS-side camera capture: photos via ImageIO re-encoding, clips via
/// `AVCaptureMovieFileOutput` plus an MP4 export. Actor-isolated so only one
/// capture pipeline runs at a time.
actor CameraCaptureService {
    /// Errors surfaced to CLI/control clients; `errorDescription` is shown verbatim.
    enum CameraError: LocalizedError, Sendable {
        case cameraUnavailable
        case microphoneUnavailable
        case permissionDenied(kind: String)
        case captureFailed(String)
        case exportFailed(String)

        var errorDescription: String? {
            switch self {
            case .cameraUnavailable:
                "Camera unavailable"
            case .microphoneUnavailable:
                "Microphone unavailable"
            case let .permissionDenied(kind):
                "\(kind) permission denied"
            case let .captureFailed(msg):
                msg
            case let .exportFailed(msg):
                msg
            }
        }
    }

    private let logger = Logger(subsystem: "com.steipete.clawdis", category: "camera")

    /// Captures a single photo and returns JPEG bytes plus the final image's
    /// dimensions (CGImage width/height in pixels).
    /// - Parameters:
    ///   - facing: Camera to use; defaults to front when nil.
    ///   - maxWidth: Optional width cap; non-positive values are ignored.
    ///   - quality: JPEG quality, clamped to 0.05...1.0 (default 0.9).
    /// - Throws: `CameraError` on permission, device, or encoding failures.
    func snap(facing: CameraFacing?, maxWidth: Int?, quality: Double?) async throws -> (data: Data, size: CGSize) {
        let facing = facing ?? .front
        // Treat non-positive maxWidth as "no limit".
        let maxWidth = maxWidth.flatMap { $0 > 0 ? $0 : nil }
        let quality = Self.clampQuality(quality)
        try await self.ensureAccess(for: .video)
        // A throwaway session per request; stopped via `defer` below.
        let session = AVCaptureSession()
        session.sessionPreset = .photo
        guard let device = Self.pickCamera(facing: facing) else {
            throw CameraError.cameraUnavailable
        }
        let input = try AVCaptureDeviceInput(device: device)
        guard session.canAddInput(input) else {
            throw CameraError.captureFailed("Failed to add camera input")
        }
        session.addInput(input)
        let output = AVCapturePhotoOutput()
        guard session.canAddOutput(output) else {
            throw CameraError.captureFailed("Failed to add photo output")
        }
        session.addOutput(output)
        output.maxPhotoQualityPrioritization = .quality
        session.startRunning()
        defer { session.stopRunning() }
        // Prefer JPEG straight from the capture pipeline when available.
        let settings: AVCapturePhotoSettings = {
            if output.availablePhotoCodecTypes.contains(.jpeg) {
                return AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg])
            }
            return AVCapturePhotoSettings()
        }()
        settings.photoQualityPrioritization = .quality
        // Bridge the delegate callback into async/await; the delegate resumes
        // the continuation at most once.
        let rawData: Data = try await withCheckedThrowingContinuation(isolation: nil) { cont in
            output.capturePhoto(with: settings, delegate: PhotoCaptureDelegate(cont))
        }
        // Re-encode to apply maxWidth/quality regardless of the capture codec.
        return try Self.reencodeJPEG(imageData: rawData, maxWidth: maxWidth, quality: quality)
    }

    /// Records a short MP4 clip to `outPath` (or a temp file when nil/blank).
    /// Recording stops only when `maxRecordedDuration` elapses; the delegate
    /// treats the resulting `maximumDurationReached` callback as success.
    /// - Returns: The written file's path, the clamped duration, and whether
    ///   audio was requested/captured.
    func clip(
        facing: CameraFacing?,
        durationMs: Int?,
        includeAudio: Bool,
        outPath: String?) async throws -> (path: String, durationMs: Int, hasAudio: Bool)
    {
        let facing = facing ?? .front
        let durationMs = Self.clampDurationMs(durationMs)
        try await self.ensureAccess(for: .video)
        if includeAudio {
            try await self.ensureAccess(for: .audio)
        }
        let session = AVCaptureSession()
        session.sessionPreset = .high
        guard let camera = Self.pickCamera(facing: facing) else {
            throw CameraError.cameraUnavailable
        }
        let cameraInput = try AVCaptureDeviceInput(device: camera)
        guard session.canAddInput(cameraInput) else {
            throw CameraError.captureFailed("Failed to add camera input")
        }
        session.addInput(cameraInput)
        if includeAudio {
            guard let mic = AVCaptureDevice.default(for: .audio) else {
                throw CameraError.microphoneUnavailable
            }
            let micInput = try AVCaptureDeviceInput(device: mic)
            guard session.canAddInput(micInput) else {
                throw CameraError.captureFailed("Failed to add microphone input")
            }
            session.addInput(micInput)
        }
        let output = AVCaptureMovieFileOutput()
        guard session.canAddOutput(output) else {
            throw CameraError.captureFailed("Failed to add movie output")
        }
        session.addOutput(output)
        // The duration limit is the only mechanism that stops this recording.
        output.maxRecordedDuration = CMTime(value: Int64(durationMs), timescale: 1000)
        session.startRunning()
        defer { session.stopRunning() }
        // Record to a temp .mov first; it is deleted after the MP4 export.
        let tmpMovURL = FileManager.default.temporaryDirectory
            .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mov")
        defer { try? FileManager.default.removeItem(at: tmpMovURL) }
        let outputURL: URL = {
            if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
                return URL(fileURLWithPath: outPath)
            }
            return FileManager.default.temporaryDirectory
                .appendingPathComponent("clawdis-camera-\(UUID().uuidString).mp4")
        }()
        // Ensure we don't fail exporting due to an existing file.
        try? FileManager.default.removeItem(at: outputURL)
        // Capture the logger locally so the delegate (a plain NSObject) can log.
        let logger = self.logger
        let recordedURL: URL = try await withCheckedThrowingContinuation(isolation: nil) { cont in
            output.startRecording(to: tmpMovURL, recordingDelegate: MovieFileDelegate(cont, logger: logger))
        }
        try await Self.exportToMP4(inputURL: recordedURL, outputURL: outputURL)
        return (path: outputURL.path, durationMs: durationMs, hasAudio: includeAudio)
    }

    /// Ensures capture permission for `mediaType`, prompting when undetermined.
    /// - Throws: `CameraError.permissionDenied` when access is not granted.
    private func ensureAccess(for mediaType: AVMediaType) async throws {
        let status = AVCaptureDevice.authorizationStatus(for: mediaType)
        switch status {
        case .authorized:
            return
        case .notDetermined:
            // First use: surface the system permission prompt.
            let ok = await withCheckedContinuation(isolation: nil) { cont in
                AVCaptureDevice.requestAccess(for: mediaType) { granted in
                    cont.resume(returning: granted)
                }
            }
            if !ok {
                throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
            }
        case .denied, .restricted:
            throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
        @unknown default:
            throw CameraError.permissionDenied(kind: mediaType == .video ? "Camera" : "Microphone")
        }
    }

    /// Picks the built-in wide-angle camera for the requested facing, falling
    /// back to the system default video device.
    private nonisolated static func pickCamera(facing: CameraFacing) -> AVCaptureDevice? {
        let position: AVCaptureDevice.Position = (facing == .front) ? .front : .back
        if let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: position) {
            return device
        }
        // Many macOS cameras report `unspecified` position; fall back to any default.
        return AVCaptureDevice.default(for: .video)
    }

    /// Clamps JPEG quality into 0.05...1.0, defaulting to 0.9.
    private nonisolated static func clampQuality(_ quality: Double?) -> Double {
        let q = quality ?? 0.9
        return min(1.0, max(0.05, q))
    }

    /// Clamps clip duration into 250...15000 ms, defaulting to 3000 ms.
    private nonisolated static func clampDurationMs(_ ms: Int?) -> Int {
        let v = ms ?? 3000
        return min(15_000, max(250, v))
    }

    /// Decodes the captured image via ImageIO, optionally downscales it, and
    /// re-encodes it as JPEG at `quality`.
    /// - Returns: The JPEG bytes and the final image's pixel dimensions.
    private nonisolated static func reencodeJPEG(
        imageData: Data,
        maxWidth: Int?,
        quality: Double) throws -> (data: Data, size: CGSize)
    {
        guard let src = CGImageSourceCreateWithData(imageData as CFData, nil),
              let img = CGImageSourceCreateImageAtIndex(src, 0, nil)
        else {
            throw CameraError.captureFailed("Failed to decode captured image")
        }
        let finalImage: CGImage
        if let maxWidth, img.width > maxWidth {
            guard let scaled = self.downscale(image: img, maxWidth: maxWidth) else {
                throw CameraError.captureFailed("Failed to downscale image")
            }
            finalImage = scaled
        } else {
            finalImage = img
        }
        let out = NSMutableData()
        guard let dest = CGImageDestinationCreateWithData(out, UTType.jpeg.identifier as CFString, 1, nil) else {
            throw CameraError.captureFailed("Failed to create JPEG destination")
        }
        let props = [kCGImageDestinationLossyCompressionQuality: quality] as CFDictionary
        CGImageDestinationAddImage(dest, finalImage, props)
        guard CGImageDestinationFinalize(dest) else {
            throw CameraError.captureFailed("Failed to encode JPEG")
        }
        return (out as Data, CGSize(width: finalImage.width, height: finalImage.height))
    }

    /// Proportionally downscales a CGImage so its width is at most `maxWidth`.
    /// Returns the original image when it is already narrow enough; nil only
    /// when the bitmap context cannot be created.
    private nonisolated static func downscale(image: CGImage, maxWidth: Int) -> CGImage? {
        guard image.width > 0, image.height > 0 else { return image }
        guard image.width > maxWidth else { return image }
        let scale = Double(maxWidth) / Double(image.width)
        let targetW = maxWidth
        let targetH = max(1, Int((Double(image.height) * scale).rounded()))
        let cs = CGColorSpaceCreateDeviceRGB()
        let bitmapInfo = CGImageAlphaInfo.premultipliedLast.rawValue
        guard let ctx = CGContext(
            data: nil,
            width: targetW,
            height: targetH,
            bitsPerComponent: 8,
            bytesPerRow: 0,
            space: cs,
            bitmapInfo: bitmapInfo)
        else { return nil }
        ctx.interpolationQuality = .high
        ctx.draw(image, in: CGRect(x: 0, y: 0, width: targetW, height: targetH))
        return ctx.makeImage()
    }

    /// Transcodes the recorded QuickTime movie to MP4 (medium-quality preset
    /// keeps payloads small for transport).
    /// - Throws: `CameraError.exportFailed` unless the export completes.
    private nonisolated static func exportToMP4(inputURL: URL, outputURL: URL) async throws {
        let asset = AVAsset(url: inputURL)
        guard let export = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetMediumQuality) else {
            throw CameraError.exportFailed("Failed to create export session")
        }
        export.outputURL = outputURL
        export.outputFileType = .mp4
        export.shouldOptimizeForNetworkUse = true
        // Wait for the callback, then inspect `status` outside the continuation.
        await withCheckedContinuation { cont in
            export.exportAsynchronously {
                cont.resume()
            }
        }
        switch export.status {
        case .completed:
            return
        case .failed:
            throw CameraError.exportFailed(export.error?.localizedDescription ?? "export failed")
        case .cancelled:
            throw CameraError.exportFailed("export cancelled")
        default:
            throw CameraError.exportFailed("export did not complete (\(export.status.rawValue))")
        }
    }
}
/// Routes the photo-capture callback into a continuation, resuming at most once.
private final class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate {
    private var cont: CheckedContinuation<Data, Error>?

    init(_ cont: CheckedContinuation<Data, Error>) {
        self.cont = cont
    }

    func photoOutput(
        _ output: AVCapturePhotoOutput,
        didFinishProcessingPhoto photo: AVCapturePhoto,
        error: Error?)
    {
        // Take the continuation out so a repeated callback cannot double-resume.
        guard let continuation = self.cont else { return }
        self.cont = nil
        if let error {
            continuation.resume(throwing: error)
        } else if let data = photo.fileDataRepresentation() {
            continuation.resume(returning: data)
        } else {
            continuation.resume(throwing: CameraCaptureService.CameraError.captureFailed("No photo data"))
        }
    }
}
/// Routes the movie-file completion callback into a continuation, resuming at
/// most once. A `maximumDurationReached` error still delivers a complete,
/// playable file and is therefore treated as success.
private final class MovieFileDelegate: NSObject, AVCaptureFileOutputRecordingDelegate {
    private var cont: CheckedContinuation<URL, Error>?
    private let logger: Logger

    init(_ cont: CheckedContinuation<URL, Error>, logger: Logger) {
        self.cont = cont
        self.logger = logger
    }

    func fileOutput(
        _ output: AVCaptureFileOutput,
        didFinishRecordingTo outputFileURL: URL,
        from connections: [AVCaptureConnection],
        error: Error?)
    {
        // Take the continuation out so a repeated callback cannot double-resume.
        guard let continuation = self.cont else { return }
        self.cont = nil
        guard let error else {
            continuation.resume(returning: outputFileURL)
            return
        }
        let ns = error as NSError
        let hitDurationLimit = ns.domain == AVFoundationErrorDomain
            && ns.code == AVError.maximumDurationReached.rawValue
        if hitDurationLimit {
            // Normal termination path: the duration cap stopped the recording.
            continuation.resume(returning: outputFileURL)
        } else {
            self.logger.error("camera record failed: \(error.localizedDescription, privacy: .public)")
            continuation.resume(throwing: error)
        }
    }
}

View File

@@ -24,6 +24,7 @@ let webChatEnabledKey = "clawdis.webChatEnabled"
let webChatSwiftUIEnabledKey = "clawdis.webChatSwiftUIEnabled"
let webChatPortKey = "clawdis.webChatPort"
let canvasEnabledKey = "clawdis.canvasEnabled"
let cameraEnabledKey = "clawdis.cameraEnabled"
let peekabooBridgeEnabledKey = "clawdis.peekabooBridgeEnabled"
let deepLinkAgentEnabledKey = "clawdis.deepLinkAgentEnabled"
let deepLinkKeyKey = "clawdis.deepLinkKey"

View File

@@ -3,6 +3,8 @@ import Foundation
import OSLog
enum ControlRequestHandler {
private static let cameraCapture = CameraCaptureService()
static func process(
request: Request,
notifier: NotificationManager = NotificationManager(),
@@ -77,6 +79,16 @@ enum ControlRequestHandler {
command: command,
paramsJSON: paramsJSON,
logger: logger)
case let .cameraSnap(facing, maxWidth, quality, outPath):
return await self.handleCameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath)
case let .cameraClip(facing, durationMs, includeAudio, outPath):
return await self.handleCameraClip(
facing: facing,
durationMs: durationMs,
includeAudio: includeAudio,
outPath: outPath)
}
}
@@ -173,6 +185,10 @@ enum ControlRequestHandler {
UserDefaults.standard.object(forKey: canvasEnabledKey) as? Bool ?? true
}
/// Whether the user opted in to agent camera access (opt-in: defaults to off).
private static func cameraEnabled() -> Bool {
    guard let flag = UserDefaults.standard.object(forKey: cameraEnabledKey) as? Bool else { return false }
    return flag
}
private static func handleCanvasShow(
session: String,
path: String?,
@@ -254,4 +270,46 @@ enum ControlRequestHandler {
return Response(ok: false, message: error.localizedDescription)
}
}
/// Captures a single photo, writes it to `outPath` (or a temp file), and
/// replies with the written file's path in `message`.
private static func handleCameraSnap(
    facing: CameraFacing?,
    maxWidth: Int?,
    quality: Double?,
    outPath: String?) async -> Response
{
    guard self.cameraEnabled() else { return Response(ok: false, message: "Camera disabled by user") }
    do {
        let res = try await self.cameraCapture.snap(facing: facing, maxWidth: maxWidth, quality: quality)
        // Honor an explicit destination; otherwise drop the JPEG in tmp.
        let destination: URL
        if let outPath, !outPath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
            destination = URL(fileURLWithPath: outPath)
        } else {
            destination = FileManager.default.temporaryDirectory
                .appendingPathComponent("clawdis-camera-snap-\(UUID().uuidString).jpg")
        }
        try res.data.write(to: destination, options: [.atomic])
        return Response(ok: true, message: destination.path)
    } catch {
        return Response(ok: false, message: error.localizedDescription)
    }
}
/// Records a short video clip and replies with the resulting file's path in
/// `message`; the capture service itself resolves `outPath` vs a temp file.
private static func handleCameraClip(
    facing: CameraFacing?,
    durationMs: Int?,
    includeAudio: Bool,
    outPath: String?) async -> Response
{
    guard self.cameraEnabled() else { return Response(ok: false, message: "Camera disabled by user") }
    do {
        let clip = try await self.cameraCapture.clip(
            facing: facing,
            durationMs: durationMs,
            includeAudio: includeAudio,
            outPath: outPath)
        return Response(ok: true, message: clip.path)
    } catch {
        return Response(ok: false, message: error.localizedDescription)
    }
}
}

View File

@@ -9,6 +9,7 @@ struct DebugSettings: View {
@AppStorage(modelCatalogReloadKey) private var modelCatalogReloadBump: Int = 0
@AppStorage(iconOverrideKey) private var iconOverrideRaw: String = IconOverrideSelection.system.rawValue
@AppStorage(canvasEnabledKey) private var canvasEnabled: Bool = true
@AppStorage(cameraEnabledKey) private var cameraEnabled: Bool = false
@AppStorage(deepLinkAgentEnabledKey) private var deepLinkAgentEnabled: Bool = false
@State private var modelsCount: Int?
@State private var modelsLoading = false
@@ -48,6 +49,7 @@ struct DebugSettings: View {
self.pathsSection
self.quickActionsSection
self.canvasSection
self.cameraSection
self.experimentsSection
Spacer(minLength: 0)
@@ -571,6 +573,20 @@ struct DebugSettings: View {
}
}
/// Debug-settings card exposing the opt-in "Allow Camera (agent)" toggle.
private var cameraSection: some View {
    GroupBox("Camera") {
        VStack(alignment: .leading, spacing: 10) {
            // Backed by `cameraEnabledKey` via @AppStorage; off by default.
            Toggle("Allow Camera (agent)", isOn: self.$cameraEnabled)
                .toggleStyle(.checkbox)
                .help("When off, camera requests return “Camera disabled by user”.")
            Text("Allows Clawdis to capture a photo or short video via the built-in camera.")
                .font(.caption)
                .foregroundStyle(.secondary)
        }
    }
}
private var experimentsSection: some View {
GroupBox("Experiments") {
Grid(alignment: .leadingFirstTextBaseline, horizontalSpacing: 14, verticalSpacing: 10) {

View File

@@ -52,6 +52,7 @@ struct ClawdisCLI {
enum Kind {
case generic
case mediaPath
}
}
@@ -91,6 +92,9 @@ struct ClawdisCLI {
case "canvas":
return try self.parseCanvas(args: &args)
case "camera":
return try self.parseCamera(args: &args)
default:
throw CLIError.help
}
@@ -292,6 +296,62 @@ struct ClawdisCLI {
}
}
/// Parses `camera snap|clip` CLI arguments into a control request.
/// Unknown flags and malformed option values are silently ignored.
/// - Throws: `CLIError.help` when the subcommand is missing or unrecognized.
private static func parseCamera(args: inout [String]) throws -> ParsedCLIRequest {
    // NOTE(review): `popFirst()` on [String] is presumably a project Array
    // extension (stdlib Array has none) — confirm it removes and returns the head.
    guard let sub = args.popFirst() else { throw CLIError.help }
    switch sub {
    case "snap":
        var facing: CameraFacing?
        var maxWidth: Int?
        var quality: Double?
        var outPath: String?
        while !args.isEmpty {
            let arg = args.removeFirst()
            switch arg {
            case "--facing":
                // Invalid facing values are dropped, leaving the default.
                if let val = args.popFirst(), let f = CameraFacing(rawValue: val) { facing = f }
            case "--max-width":
                maxWidth = args.popFirst().flatMap(Int.init)
            case "--quality":
                quality = args.popFirst().flatMap(Double.init)
            case "--out":
                outPath = args.popFirst()
            default:
                break
            }
        }
        // `.mediaPath` makes the CLI print `MEDIA:<path>` in text mode.
        return ParsedCLIRequest(
            request: .cameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath),
            kind: .mediaPath)
    case "clip":
        var facing: CameraFacing?
        var durationMs: Int?
        // Audio defaults to on; `--no-audio` opts out.
        var includeAudio = true
        var outPath: String?
        while !args.isEmpty {
            let arg = args.removeFirst()
            switch arg {
            case "--facing":
                if let val = args.popFirst(), let f = CameraFacing(rawValue: val) { facing = f }
            case "--duration-ms":
                durationMs = args.popFirst().flatMap(Int.init)
            case "--no-audio":
                includeAudio = false
            case "--out":
                outPath = args.popFirst()
            default:
                break
            }
        }
        return ParsedCLIRequest(
            request: .cameraClip(facing: facing, durationMs: durationMs, includeAudio: includeAudio, outPath: outPath),
            kind: .mediaPath)
    default:
        throw CLIError.help
    }
}
private static func parseCanvasPlacement(
args: inout [String],
session: inout String,
@@ -334,6 +394,10 @@ struct ClawdisCLI {
if let message = response.message, !message.isEmpty {
FileHandle.standardOutput.write(Data((message + "\n").utf8))
}
case .mediaPath:
if let message = response.message, !message.isEmpty {
print("MEDIA:\(message)")
}
}
}
@@ -352,6 +416,8 @@ struct ClawdisCLI {
output["payload"] = text
}
}
case .mediaPath:
break
}
let json = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted])
@@ -406,6 +472,10 @@ struct ClawdisCLI {
clawdis-mac canvas eval --js <code> [--session <key>]
clawdis-mac canvas snapshot [--out <path>] [--session <key>]
Camera:
clawdis-mac camera snap [--facing <front|back>] [--max-width <px>] [--quality <0-1>] [--out <path>]
clawdis-mac camera clip [--facing <front|back>] [--duration-ms <ms>] [--no-audio] [--out <path>]
Browser (clawd):
clawdis-mac browser status|start|stop|tabs|open|focus|close|screenshot|eval|query|dom|snapshot
@@ -433,6 +503,7 @@ struct ClawdisCLI {
Output:
Default output is text. Use --json for machine-readable output.
In text mode, `browser screenshot` prints MEDIA:<path>.
In text mode, `camera snap` and `camera clip` print MEDIA:<path>.
"""
print(usage)
}

View File

@@ -13,6 +13,11 @@ public enum Capability: String, Codable, CaseIterable, Sendable {
case speechRecognition
}
public enum CameraFacing: String, Codable, Sendable {
case front
case back
}
// MARK: - Requests
/// Notification interruption level (maps to UNNotificationInterruptionLevel)
@@ -74,6 +79,8 @@ public enum Request: Sendable {
case canvasSnapshot(session: String, outPath: String?)
case nodeList
case nodeInvoke(nodeId: String, command: String, paramsJSON: String?)
case cameraSnap(facing: CameraFacing?, maxWidth: Int?, quality: Double?, outPath: String?)
case cameraClip(facing: CameraFacing?, durationMs: Int?, includeAudio: Bool, outPath: String?)
}
// MARK: - Responses
@@ -104,6 +111,11 @@ extension Request: Codable {
case path
case javaScript
case outPath
case facing
case maxWidth
case quality
case durationMs
case includeAudio
case placement
case nodeId
case nodeCommand
@@ -124,6 +136,8 @@ extension Request: Codable {
case canvasSnapshot
case nodeList
case nodeInvoke
case cameraSnap
case cameraClip
}
public func encode(to encoder: Encoder) throws {
@@ -198,6 +212,20 @@ extension Request: Codable {
try container.encode(nodeId, forKey: .nodeId)
try container.encode(command, forKey: .nodeCommand)
try container.encodeIfPresent(paramsJSON, forKey: .paramsJSON)
case let .cameraSnap(facing, maxWidth, quality, outPath):
try container.encode(Kind.cameraSnap, forKey: .type)
try container.encodeIfPresent(facing, forKey: .facing)
try container.encodeIfPresent(maxWidth, forKey: .maxWidth)
try container.encodeIfPresent(quality, forKey: .quality)
try container.encodeIfPresent(outPath, forKey: .outPath)
case let .cameraClip(facing, durationMs, includeAudio, outPath):
try container.encode(Kind.cameraClip, forKey: .type)
try container.encodeIfPresent(facing, forKey: .facing)
try container.encodeIfPresent(durationMs, forKey: .durationMs)
try container.encode(includeAudio, forKey: .includeAudio)
try container.encodeIfPresent(outPath, forKey: .outPath)
}
}
@@ -274,6 +302,20 @@ extension Request: Codable {
let command = try container.decode(String.self, forKey: .nodeCommand)
let paramsJSON = try container.decodeIfPresent(String.self, forKey: .paramsJSON)
self = .nodeInvoke(nodeId: nodeId, command: command, paramsJSON: paramsJSON)
case .cameraSnap:
let facing = try container.decodeIfPresent(CameraFacing.self, forKey: .facing)
let maxWidth = try container.decodeIfPresent(Int.self, forKey: .maxWidth)
let quality = try container.decodeIfPresent(Double.self, forKey: .quality)
let outPath = try container.decodeIfPresent(String.self, forKey: .outPath)
self = .cameraSnap(facing: facing, maxWidth: maxWidth, quality: quality, outPath: outPath)
case .cameraClip:
let facing = try container.decodeIfPresent(CameraFacing.self, forKey: .facing)
let durationMs = try container.decodeIfPresent(Int.self, forKey: .durationMs)
let includeAudio = (try? container.decode(Bool.self, forKey: .includeAudio)) ?? true
let outPath = try container.decodeIfPresent(String.self, forKey: .outPath)
self = .cameraClip(facing: facing, durationMs: durationMs, includeAudio: includeAudio, outPath: outPath)
}
}
}

View File

@@ -0,0 +1,62 @@
import ClawdisIPC
import Foundation
import Testing
@Suite struct CameraIPCTests {
    /// A `cameraSnap` request must survive a JSON encode/decode roundtrip with every field intact.
    @Test func cameraSnapCodableRoundtrip() throws {
        let original: Request = .cameraSnap(
            facing: .front,
            maxWidth: 640,
            quality: 0.85,
            outPath: "/tmp/test.jpg")
        let encoded = try JSONEncoder().encode(original)
        let roundtripped = try JSONDecoder().decode(Request.self, from: encoded)
        guard case let .cameraSnap(facing, maxWidth, quality, outPath) = roundtripped else {
            Issue.record("expected cameraSnap, got \(roundtripped)")
            return
        }
        #expect(facing == .front)
        #expect(maxWidth == 640)
        #expect(quality == 0.85)
        #expect(outPath == "/tmp/test.jpg")
    }

    /// A `cameraClip` request must roundtrip facing, duration, audio flag, and output path.
    @Test func cameraClipCodableRoundtrip() throws {
        let original: Request = .cameraClip(
            facing: .back,
            durationMs: 3000,
            includeAudio: false,
            outPath: "/tmp/test.mp4")
        let encoded = try JSONEncoder().encode(original)
        let roundtripped = try JSONDecoder().decode(Request.self, from: encoded)
        guard case let .cameraClip(facing, durationMs, includeAudio, outPath) = roundtripped else {
            Issue.record("expected cameraClip, got \(roundtripped)")
            return
        }
        #expect(facing == .back)
        #expect(durationMs == 3000)
        #expect(includeAudio == false)
        #expect(outPath == "/tmp/test.mp4")
    }

    /// A wire payload that omits `includeAudio` must decode with the flag defaulted to true.
    @Test func cameraClipDefaultsIncludeAudioToTrueWhenMissing() throws {
        let wire = Data(#"{"type":"cameraClip","durationMs":1234}"#.utf8)
        let decoded = try JSONDecoder().decode(Request.self, from: wire)
        guard case let .cameraClip(_, durationMs, includeAudio, _) = decoded else {
            Issue.record("expected cameraClip, got \(decoded)")
            return
        }
        #expect(durationMs == 1234)
        #expect(includeAudio == true)
    }
}

View File

@@ -0,0 +1,58 @@
import Foundation
/// Node-invokable camera commands; raw values are the wire command strings.
public enum ClawdisCameraCommand: String, Codable, Sendable {
/// Capture a single photo.
case snap = "camera.snap"
/// Capture a short video clip.
case clip = "camera.clip"
}
/// Which device camera a capture command should use.
public enum ClawdisCameraFacing: String, Codable, Sendable {
/// Rear (world-facing) camera.
case back
/// Front (user-facing) camera.
case front
}
/// Accepted still-image formats for `camera.snap` ("jpg" and its alias "jpeg").
public enum ClawdisCameraImageFormat: String, Codable, Sendable {
case jpg
case jpeg
}
/// Accepted video formats for `camera.clip`; currently only MP4.
public enum ClawdisCameraVideoFormat: String, Codable, Sendable {
case mp4
}
/// Parameters for a `camera.snap` request.
/// All fields are optional; nil fields fall back to handler defaults
/// (the macOS handler defaults facing to `.front`).
public struct ClawdisCameraSnapParams: Codable, Sendable, Equatable {
/// Which camera to use; nil lets the handler pick its default.
public var facing: ClawdisCameraFacing?
/// Optional max output width in pixels; non-positive values are ignored by the macOS handler.
public var maxWidth: Int?
/// JPEG quality; the handler clamps this to a valid range.
public var quality: Double?
/// Requested image format; currently jpg/jpeg.
public var format: ClawdisCameraImageFormat?
/// Memberwise initializer; every field defaults to nil ("use handler default").
public init(
facing: ClawdisCameraFacing? = nil,
maxWidth: Int? = nil,
quality: Double? = nil,
format: ClawdisCameraImageFormat? = nil)
{
self.facing = facing
self.maxWidth = maxWidth
self.quality = quality
self.format = format
}
}
/// Parameters for a `camera.clip` request.
/// All fields are optional; nil fields fall back to handler defaults.
public struct ClawdisCameraClipParams: Codable, Sendable, Equatable {
/// Which camera to use; nil lets the handler pick its default.
public var facing: ClawdisCameraFacing?
/// Clip length in milliseconds; nil uses the handler default (docs: 3000 ms, clamped to a max).
public var durationMs: Int?
/// Whether to record audio; nil is treated as true by the handler.
public var includeAudio: Bool?
/// Requested video format; currently mp4 only.
public var format: ClawdisCameraVideoFormat?
/// Memberwise initializer; every field defaults to nil ("use handler default").
public init(
facing: ClawdisCameraFacing? = nil,
durationMs: Int? = nil,
includeAudio: Bool? = nil,
format: ClawdisCameraVideoFormat? = nil)
{
self.facing = facing
self.durationMs = durationMs
self.includeAudio = includeAudio
self.format = format
}
}

98
docs/camera.md Normal file
View File

@@ -0,0 +1,98 @@
---
summary: "Camera capture (iOS node + macOS app) for agent use: photos (jpg) and short video clips (mp4)"
read_when:
- Adding or modifying camera capture on iOS nodes or macOS
- Extending agent-accessible MEDIA temp-file workflows
---
# Camera capture (agent)
Clawdis supports **camera capture** for agent workflows:
- **iOS node** (paired via Gateway): capture a **photo** (`jpg`) or **short video clip** (`mp4`, with optional audio) via `node.invoke`.
- **macOS app** (local control socket): capture a **photo** (`jpg`) or **short video clip** (`mp4`, with optional audio) via `clawdis-mac`.
All camera access is gated behind **user-controlled settings**.
## iOS node
### User setting (default on)
- iOS Settings tab → **Camera****Allow Camera** (`camera.enabled`)
- Default: **on** (missing key is treated as enabled).
- When off: `camera.*` commands return `CAMERA_DISABLED`.
### Commands (via Gateway `node.invoke`)
- `camera.snap`
- Params:
- `facing`: `front|back` (default: `front`)
- `maxWidth`: number (optional)
- `quality`: `0..1` (optional; default `0.9`)
- `format`: currently `jpg`
- Response payload:
- `format: "jpg"`
- `base64: "<...>"`
- `width`, `height`
- `camera.clip`
- Params:
- `facing`: `front|back` (default: `front`)
- `durationMs`: number (default `3000`, clamped to a max)
- `includeAudio`: boolean (default `true`)
- `format`: currently `mp4`
- Response payload:
- `format: "mp4"`
- `base64: "<...>"`
- `durationMs`
- `hasAudio`
### Foreground requirement
Like `screen.*`, the iOS node only allows `camera.*` commands in the **foreground**. Background invocations return `NODE_BACKGROUND_UNAVAILABLE`.
### CLI helper (temp files + MEDIA)
The easiest way to get attachments is via the CLI helper, which writes decoded media to a temp file and prints `MEDIA:<path>`.
Examples:
```bash
clawdis nodes camera snap --node <id> # default: both front + back (2 MEDIA lines)
clawdis nodes camera snap --node <id> --facing front
clawdis nodes camera clip --node <id> --duration 3000
clawdis nodes camera clip --node <id> --no-audio
```
Notes:
- `nodes camera snap` defaults to **both** facings to give the agent both views.
- Output files are written to the OS temp directory and are not cleaned up by the CLI; copy them elsewhere if you need to keep them.
## macOS app
### User setting (default off)
The macOS companion app exposes a checkbox:
- **Settings → Debug → Camera → Allow Camera (agent)** (`clawdis.cameraEnabled`)
- Default: **off**
- When off: camera requests return “Camera disabled by user”.
### CLI helper (local control socket)
The `clawdis-mac` helper talks to the running menu bar app over the local control socket.
Examples:
```bash
clawdis-mac camera snap # prints MEDIA:<path>
clawdis-mac camera snap --max-width 1280
clawdis-mac camera clip --duration-ms 3000 # prints MEDIA:<path>
clawdis-mac camera clip --no-audio
```
## Safety + practical limits
- Camera and microphone access trigger the usual OS permission prompts (and require usage strings in Info.plist).
- Video clips are intentionally short to avoid oversized bridge payloads (base64 overhead + WebSocket message limits).

View File

@@ -98,6 +98,8 @@ cat > "$APP_ROOT/Contents/Info.plist" <<PLIST
<string>Clawdis needs notification permission to show alerts for agent actions.</string>
<key>NSScreenCaptureDescription</key>
<string>Clawdis captures the screen when the agent needs screenshots for context.</string>
<key>NSCameraUsageDescription</key>
<string>Clawdis can capture photos or short video clips when requested by the agent.</string>
<key>NSMicrophoneUsageDescription</key>
<string>Clawdis needs the mic for Voice Wake tests and agent audio capture.</string>
<key>NSSpeechRecognitionUsageDescription</key>

View File

@@ -0,0 +1,64 @@
import * as fs from "node:fs/promises";
import * as os from "node:os";
import * as path from "node:path";
import { describe, expect, it } from "vitest";
import {
cameraTempPath,
parseCameraClipPayload,
parseCameraSnapPayload,
writeBase64ToFile,
} from "./nodes-camera.js";
describe("nodes camera helpers", () => {
  it("parses camera.snap payload", () => {
    // A well-formed snap payload is returned unchanged.
    const snap = {
      format: "jpg",
      base64: "aGk=",
      width: 10,
      height: 20,
    };
    expect(parseCameraSnapPayload(snap)).toEqual({ ...snap });
  });

  it("rejects invalid camera.snap payload", () => {
    // Missing base64/width/height must throw the canonical error.
    expect(() => parseCameraSnapPayload({ format: "jpg" })).toThrow(
      /invalid camera\.snap payload/i,
    );
  });

  it("parses camera.clip payload", () => {
    const clip = {
      format: "mp4",
      base64: "AAEC",
      durationMs: 1234,
      hasAudio: true,
    };
    expect(parseCameraClipPayload(clip)).toEqual({ ...clip });
  });

  it("builds stable temp paths when id provided", () => {
    // With an explicit id and tmpDir the path is fully deterministic.
    const result = cameraTempPath({
      kind: "snap",
      facing: "front",
      ext: "jpg",
      tmpDir: "/tmp",
      id: "id1",
    });
    expect(result).toBe(path.join("/tmp", "clawdis-camera-snap-front-id1.jpg"));
  });

  it("writes base64 to file", async () => {
    // "aGk=" decodes to "hi"; verify the bytes land on disk, then clean up.
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdis-test-"));
    const out = path.join(dir, "x.bin");
    await writeBase64ToFile(out, "aGk=");
    const contents = await fs.readFile(out, "utf8");
    expect(contents).toBe("hi");
    await fs.rm(dir, { recursive: true, force: true });
  });
});

92
src/cli/nodes-camera.ts Normal file
View File

@@ -0,0 +1,92 @@
import { randomUUID } from "node:crypto";
import * as fs from "node:fs/promises";
import * as os from "node:os";
import * as path from "node:path";
/** Camera selector accepted by the node camera commands. */
export type CameraFacing = "front" | "back";
/** Decoded payload a node returns for `camera.snap` (image as base64). */
export type CameraSnapPayload = {
format: string;
base64: string;
width: number;
height: number;
};
/** Decoded payload a node returns for `camera.clip` (video as base64). */
export type CameraClipPayload = {
format: string;
base64: string;
durationMs: number;
hasAudio: boolean;
};
/** Narrows an unknown value to a string-keyed record; null and non-objects yield an empty record. */
function asRecord(value: unknown): Record<string, unknown> {
  if (value === null || typeof value !== "object") {
    return {};
  }
  return value as Record<string, unknown>;
}
/** Returns the value when it is a string, otherwise `undefined`. */
function asString(value: unknown): string | undefined {
  if (typeof value !== "string") return undefined;
  return value;
}
/** Returns the value when it is a finite number; NaN, infinities, and non-numbers yield `undefined`. */
function asNumber(value: unknown): number | undefined {
  if (typeof value !== "number") return undefined;
  if (!Number.isFinite(value)) return undefined;
  return value;
}
/** Returns the value when it is a boolean, otherwise `undefined`. */
function asBoolean(value: unknown): boolean | undefined {
  if (typeof value !== "boolean") return undefined;
  return value;
}
/**
 * Validates and narrows a raw `camera.snap` node payload.
 * Throws when any required field is missing, empty, or non-finite.
 */
export function parseCameraSnapPayload(value: unknown): CameraSnapPayload {
  const raw = asRecord(value);
  const format = asString(raw.format);
  const base64 = asString(raw.base64);
  const width = asNumber(raw.width);
  const height = asNumber(raw.height);
  // Empty strings are rejected exactly like missing fields.
  if (
    format === undefined ||
    format === "" ||
    base64 === undefined ||
    base64 === "" ||
    width === undefined ||
    height === undefined
  ) {
    throw new Error("invalid camera.snap payload");
  }
  return { format, base64, width, height };
}
/**
 * Validates and narrows a raw `camera.clip` node payload.
 * Throws when any required field is missing, empty, or of the wrong type.
 */
export function parseCameraClipPayload(value: unknown): CameraClipPayload {
  const raw = asRecord(value);
  const format = asString(raw.format);
  const base64 = asString(raw.base64);
  const durationMs = asNumber(raw.durationMs);
  const hasAudio = asBoolean(raw.hasAudio);
  // Empty strings are rejected exactly like missing fields.
  if (
    format === undefined ||
    format === "" ||
    base64 === undefined ||
    base64 === "" ||
    durationMs === undefined ||
    hasAudio === undefined
  ) {
    throw new Error("invalid camera.clip payload");
  }
  return { format, base64, durationMs, hasAudio };
}
/**
 * Builds a temp-file path of the form
 * `<tmpDir>/clawdis-camera-<kind>[-<facing>]-<id><ext>`.
 * Defaults: `tmpDir` → OS temp dir, `id` → a fresh UUID.
 */
export function cameraTempPath(opts: {
  kind: "snap" | "clip";
  facing?: CameraFacing;
  ext: string;
  tmpDir?: string;
  id?: string;
}) {
  const dir = opts.tmpDir ?? os.tmpdir();
  const fileId = opts.id ?? randomUUID();
  // Accept the extension with or without a leading dot.
  const extension = opts.ext.startsWith(".") ? opts.ext : `.${opts.ext}`;
  const stem = ["clawdis-camera", opts.kind];
  if (opts.facing) {
    stem.push(opts.facing);
  }
  return path.join(dir, `${stem.join("-")}-${fileId}${extension}`);
}
/**
 * Decodes a base64 string and writes the raw bytes to `filePath`.
 * Resolves with the destination path and the number of bytes written.
 */
export async function writeBase64ToFile(filePath: string, base64: string) {
  const decoded = Buffer.from(base64, "base64");
  await fs.writeFile(filePath, decoded);
  return { path: filePath, bytes: decoded.length };
}

View File

@@ -1,6 +1,13 @@
import type { Command } from "commander";
import { callGateway, randomIdempotencyKey } from "../gateway/call.js";
import { defaultRuntime } from "../runtime.js";
import {
type CameraFacing,
cameraTempPath,
parseCameraClipPayload,
parseCameraSnapPayload,
writeBase64ToFile,
} from "./nodes-camera.js";
type NodesRpcOpts = {
url?: string;
@@ -12,6 +19,11 @@ type NodesRpcOpts = {
params?: string;
invokeTimeout?: string;
idempotencyKey?: string;
facing?: string;
maxWidth?: string;
quality?: string;
duration?: string;
audio?: boolean;
};
type NodeListNode = {
@@ -340,4 +352,203 @@ export function registerNodesCli(program: Command) {
}),
{ timeoutMs: 30_000 },
);
// Normalizes (trim + lowercase) and validates a --facing value; "both" is NOT
// accepted here — only snap fans out, and it handles "both" itself.
const parseFacing = (value: string): CameraFacing => {
  const normalized = String(value ?? "")
    .trim()
    .toLowerCase();
  if (normalized !== "front" && normalized !== "back") {
    throw new Error(`invalid facing: ${value} (expected front|back)`);
  }
  return normalized;
};
// `nodes camera` subtree: capture media from a paired node via node.invoke.
const camera = nodes
.command("camera")
.description("Capture camera media from a paired node");
// `nodes camera snap`: captures one photo per requested facing ("both" fans
// out to front + back), writes each decoded image to a temp file, and prints
// one MEDIA:<path> line per capture (or a JSON file list with --json).
nodesCallOpts(
camera
.command("snap")
.description("Capture a photo from a node camera (prints MEDIA:<path>)")
.requiredOption("--node <idOrNameOrIp>", "Node id, name, or IP")
.option("--facing <front|back|both>", "Camera facing", "both")
.option("--max-width <px>", "Max width in px (optional)")
.option("--quality <0-1>", "JPEG quality (default 0.9)")
.option(
"--invoke-timeout <ms>",
"Node invoke timeout in ms (default 20000)",
"20000",
)
.action(async (opts: NodesRpcOpts) => {
try {
const nodeId = await resolveNodeId(opts, String(opts.node ?? ""));
const facingOpt = String(opts.facing ?? "both")
.trim()
.toLowerCase();
// "both" expands to two sequential captures; anything else must be
// exactly "front" or "back".
const facings: CameraFacing[] =
facingOpt === "both"
? ["front", "back"]
: facingOpt === "front" || facingOpt === "back"
? [facingOpt]
: (() => {
throw new Error(
`invalid facing: ${String(opts.facing)} (expected front|back|both)`,
);
})();
const maxWidth = opts.maxWidth
? Number.parseInt(String(opts.maxWidth), 10)
: undefined;
const quality = opts.quality
? Number.parseFloat(String(opts.quality))
: undefined;
const timeoutMs = opts.invokeTimeout
? Number.parseInt(String(opts.invokeTimeout), 10)
: undefined;
const results: Array<{
facing: CameraFacing;
path: string;
width: number;
height: number;
}> = [];
for (const facing of facings) {
const invokeParams: Record<string, unknown> = {
nodeId,
command: "camera.snap",
params: {
// NaN from unparsable flags is dropped rather than sent.
facing,
maxWidth: Number.isFinite(maxWidth) ? maxWidth : undefined,
quality: Number.isFinite(quality) ? quality : undefined,
format: "jpg",
},
idempotencyKey: randomIdempotencyKey(),
};
if (typeof timeoutMs === "number" && Number.isFinite(timeoutMs)) {
invokeParams.timeoutMs = timeoutMs;
}
const raw = (await callGatewayCli(
"node.invoke",
opts,
invokeParams,
)) as unknown;
const res =
typeof raw === "object" && raw !== null
? (raw as { payload?: unknown })
: {};
const payload = parseCameraSnapPayload(res.payload);
// Decoded image goes to a temp file; "jpeg" is normalized to "jpg".
const filePath = cameraTempPath({
kind: "snap",
facing,
ext: payload.format === "jpeg" ? "jpg" : payload.format,
});
await writeBase64ToFile(filePath, payload.base64);
results.push({
facing,
path: filePath,
width: payload.width,
height: payload.height,
});
}
if (opts.json) {
defaultRuntime.log(JSON.stringify({ files: results }, null, 2));
return;
}
defaultRuntime.log(results.map((r) => `MEDIA:${r.path}`).join("\n"));
} catch (err) {
defaultRuntime.error(`nodes camera snap failed: ${String(err)}`);
defaultRuntime.exit(1);
}
}),
{ timeoutMs: 60_000 },
);
// `nodes camera clip`: records a short video clip (optionally without audio),
// writes the decoded mp4 to a temp file, and prints MEDIA:<path>
// (or a JSON file descriptor with --json).
nodesCallOpts(
camera
.command("clip")
.description(
"Capture a short video clip from a node camera (prints MEDIA:<path>)",
)
.requiredOption("--node <idOrNameOrIp>", "Node id, name, or IP")
.option("--facing <front|back>", "Camera facing", "front")
.option("--duration <ms>", "Duration in ms (default 3000)", "3000")
.option("--no-audio", "Disable audio capture")
.option(
"--invoke-timeout <ms>",
"Node invoke timeout in ms (default 45000)",
"45000",
)
.action(async (opts: NodesRpcOpts & { audio?: boolean }) => {
try {
const nodeId = await resolveNodeId(opts, String(opts.node ?? ""));
const facing = parseFacing(String(opts.facing ?? "front"));
const durationMs = Number.parseInt(
String(opts.duration ?? "3000"),
10,
);
// commander maps --no-audio to opts.audio === false.
const includeAudio = opts.audio !== false;
const timeoutMs = opts.invokeTimeout
? Number.parseInt(String(opts.invokeTimeout), 10)
: undefined;
const invokeParams: Record<string, unknown> = {
nodeId,
command: "camera.clip",
params: {
facing,
durationMs: Number.isFinite(durationMs) ? durationMs : undefined,
includeAudio,
format: "mp4",
},
idempotencyKey: randomIdempotencyKey(),
};
if (typeof timeoutMs === "number" && Number.isFinite(timeoutMs)) {
invokeParams.timeoutMs = timeoutMs;
}
const raw = (await callGatewayCli(
"node.invoke",
opts,
invokeParams,
)) as unknown;
const res =
typeof raw === "object" && raw !== null
? (raw as { payload?: unknown })
: {};
const payload = parseCameraClipPayload(res.payload);
const filePath = cameraTempPath({
kind: "clip",
facing,
ext: payload.format,
});
await writeBase64ToFile(filePath, payload.base64);
if (opts.json) {
defaultRuntime.log(
JSON.stringify(
{
file: {
facing,
path: filePath,
durationMs: payload.durationMs,
hasAudio: payload.hasAudio,
},
},
null,
2,
),
);
return;
}
defaultRuntime.log(`MEDIA:${filePath}`);
} catch (err) {
defaultRuntime.error(`nodes camera clip failed: ${String(err)}`);
defaultRuntime.exit(1);
}
}),
{ timeoutMs: 90_000 },
);
}

View File

@@ -1,3 +1,4 @@
import * as fs from "node:fs/promises";
import { beforeEach, describe, expect, it, vi } from "vitest";
const sendCommand = vi.fn();
@@ -148,4 +149,145 @@ describe("cli program", () => {
);
expect(runtime.log).toHaveBeenCalled();
});
// End-to-end CLI test: `nodes camera snap` with the default --facing both
// should invoke camera.snap twice (front, then back) and print one
// MEDIA:<path> line per capture, each file holding the decoded payload.
it("runs nodes camera snap and prints two MEDIA paths", async () => {
// First gateway call resolves the node list; the next two answer the invokes.
callGateway
.mockResolvedValueOnce({
ts: Date.now(),
nodes: [
{
nodeId: "ios-node",
displayName: "iOS Node",
remoteIp: "192.168.0.88",
connected: true,
},
],
})
.mockResolvedValueOnce({
ok: true,
nodeId: "ios-node",
command: "camera.snap",
payload: { format: "jpg", base64: "aGk=", width: 1, height: 1 },
})
.mockResolvedValueOnce({
ok: true,
nodeId: "ios-node",
command: "camera.snap",
payload: { format: "jpg", base64: "aGk=", width: 1, height: 1 },
});
const program = buildProgram();
runtime.log.mockClear();
await program.parseAsync(
["nodes", "camera", "snap", "--node", "ios-node"],
{
from: "user",
},
);
expect(callGateway).toHaveBeenNthCalledWith(
2,
expect.objectContaining({
method: "node.invoke",
params: expect.objectContaining({
nodeId: "ios-node",
command: "camera.snap",
timeoutMs: 20000,
idempotencyKey: "idem-test",
params: expect.objectContaining({ facing: "front", format: "jpg" }),
}),
}),
);
expect(callGateway).toHaveBeenNthCalledWith(
3,
expect.objectContaining({
method: "node.invoke",
params: expect.objectContaining({
nodeId: "ios-node",
command: "camera.snap",
timeoutMs: 20000,
idempotencyKey: "idem-test",
params: expect.objectContaining({ facing: "back", format: "jpg" }),
}),
}),
);
const out = String(runtime.log.mock.calls[0]?.[0] ?? "");
const mediaPaths = out
.split("\n")
.filter((l) => l.startsWith("MEDIA:"))
.map((l) => l.replace(/^MEDIA:/, ""))
.filter(Boolean);
expect(mediaPaths).toHaveLength(2);
// "aGk=" decodes to "hi"; verify both files, then always clean them up.
try {
for (const p of mediaPaths) {
await expect(fs.readFile(p, "utf8")).resolves.toBe("hi");
}
} finally {
await Promise.all(mediaPaths.map((p) => fs.unlink(p).catch(() => {})));
}
});
// `nodes camera clip` should invoke camera.clip once with the requested
// duration and print a single MEDIA:<path> pointing at the decoded clip.
it("runs nodes camera clip and prints one MEDIA path", async () => {
// First gateway call resolves the node list; the second answers the invoke.
callGateway
.mockResolvedValueOnce({
ts: Date.now(),
nodes: [
{
nodeId: "ios-node",
displayName: "iOS Node",
remoteIp: "192.168.0.88",
connected: true,
},
],
})
.mockResolvedValueOnce({
ok: true,
nodeId: "ios-node",
command: "camera.clip",
payload: {
format: "mp4",
base64: "aGk=",
durationMs: 3000,
hasAudio: true,
},
});
const program = buildProgram();
runtime.log.mockClear();
await program.parseAsync(
["nodes", "camera", "clip", "--node", "ios-node", "--duration", "3000"],
{ from: "user" },
);
expect(callGateway).toHaveBeenNthCalledWith(
2,
expect.objectContaining({
method: "node.invoke",
params: expect.objectContaining({
nodeId: "ios-node",
command: "camera.clip",
timeoutMs: 45000,
idempotencyKey: "idem-test",
params: expect.objectContaining({
facing: "front",
durationMs: 3000,
includeAudio: true,
format: "mp4",
}),
}),
}),
);
const out = String(runtime.log.mock.calls[0]?.[0] ?? "");
const mediaPath = out.replace(/^MEDIA:/, "").trim();
expect(mediaPath).toMatch(/clawdis-camera-clip-front-.*\.mp4$/);
// Verify the decoded contents ("hi"), then always remove the temp file.
try {
await expect(fs.readFile(mediaPath, "utf8")).resolves.toBe("hi");
} finally {
await fs.unlink(mediaPath).catch(() => {});
}
});
});