feat(discord): add voice message support

Adds support for sending Discord voice messages via the message tool with the asVoice: true parameter.

Voice messages require:
- OGG/Opus format (auto-converted if needed via ffmpeg)
- Waveform data (generated from audio samples)
- Duration in seconds
- Message flag 8192 (IS_VOICE_MESSAGE)

Implementation:
- New voice-message.ts with audio processing utilities
- getAudioDuration() using ffprobe
- generateWaveform() samples audio and creates base64 waveform
- ensureOggOpus() converts audio to required format
- sendDiscordVoiceMessage() handles 3-step Discord upload process

Usage:
  message(action='send', channel='discord', target='...', path='/path/to/audio.mp3', asVoice=true)

Note: Voice messages cannot include text content (Discord limitation)
2026-02-02 17:00:19 +01:00
parent aec3221391
commit a09e4fac3f
5 changed files with 444 additions and 1 deletions
--- a/src/agents/tools/discord-actions-messaging.ts
+++ b/src/agents/tools/discord-actions-messaging.ts
@@ -18,6 +18,7 @@ import {
  sendMessageDiscord,
  sendPollDiscord,
  sendStickerDiscord,
+  sendVoiceMessageDiscord,
  unpinMessageDiscord,
 } from "../../discord/send.js";
 import { resolveDiscordChannelId } from "../../discord/targets.js";
@@ -230,11 +231,25 @@ export async function handleDiscordMessagingAction(
      const to = readStringParam(params, "to", { required: true });
      const content = readStringParam(params, "content", {
        required: true,
+        allowEmpty: true,
      });
      const mediaUrl = readStringParam(params, "mediaUrl");
      const replyTo = readStringParam(params, "replyTo");
+      const asVoice = params.asVoice === true;
      const embeds =
        Array.isArray(params.embeds) && params.embeds.length > 0 ? params.embeds : undefined;
+
+      // Handle voice message sending
+      if (asVoice && mediaUrl) {
+        // Voice messages require a local file path or downloadable URL
+        // They cannot include text content (Discord limitation)
+        const result = await sendVoiceMessageDiscord(to, mediaUrl, {
+          ...(accountId ? { accountId } : {}),
+          replyTo,
+        });
+        return jsonResult({ ok: true, result, voiceMessage: true });
+      }
+
      const result = await sendMessageDiscord(to, content, {
        ...(accountId ? { accountId } : {}),
        mediaUrl,
--- a/src/channels/plugins/actions/discord/handle-action.ts
+++ b/src/channels/plugins/actions/discord/handle-action.ts
@@ -41,6 +41,7 @@ export async function handleDiscordMessageAction(
    const mediaUrl = readStringParam(params, "media", { trim: false });
    const replyTo = readStringParam(params, "replyTo");
    const embeds = Array.isArray(params.embeds) ? params.embeds : undefined;
+    const asVoice = params.asVoice === true;
    return await handleDiscordAction(
      {
        action: "sendMessage",
@@ -50,6 +51,7 @@ export async function handleDiscordMessageAction(
        mediaUrl: mediaUrl ?? undefined,
        replyTo: replyTo ?? undefined,
        embeds,
+        asVoice,
      },
      cfg,
    );
--- a/src/discord/send.outbound.ts
+++ b/src/discord/send.outbound.ts
@@ -1,6 +1,7 @@
 import type { RequestClient } from "@buape/carbon";
 import type { APIChannel } from "discord-api-types/v10";
 import { ChannelType, Routes } from "discord-api-types/v10";
+import fs from "node:fs/promises";
 import type { RetryConfig } from "../infra/retry.js";
 import type { PollInput } from "../polls.js";
 import type { DiscordSendResult } from "./send.types.js";
@@ -21,6 +22,11 @@ import {
  sendDiscordMedia,
  sendDiscordText,
 } from "./send.shared.js";
+import {
+  ensureOggOpus,
+  getVoiceMessageMetadata,
+  sendDiscordVoiceMessage,
+} from "./voice-message.js";

 type DiscordSendOpts = {
  token?: string;
@@ -31,6 +37,7 @@ type DiscordSendOpts = {
  replyTo?: string;
  retry?: RetryConfig;
  embeds?: unknown[];
+  silent?: boolean;
 };

 /** Discord thread names are capped at 100 characters. */
@@ -131,6 +138,7 @@ export async function sendMessageDiscord(
          accountInfo.config.maxLinesPerMessage,
          undefined,
          chunkMode,
+          opts.silent,
        );
        for (const chunk of afterMediaChunks) {
          await sendDiscordText(
@@ -142,6 +150,7 @@ export async function sendMessageDiscord(
            accountInfo.config.maxLinesPerMessage,
            undefined,
            chunkMode,
+            opts.silent,
          );
        }
      } else {
@@ -155,6 +164,7 @@ export async function sendMessageDiscord(
            accountInfo.config.maxLinesPerMessage,
            undefined,
            chunkMode,
+            opts.silent,
          );
        }
      }
@@ -191,6 +201,7 @@ export async function sendMessageDiscord(
        accountInfo.config.maxLinesPerMessage,
        opts.embeds,
        chunkMode,
+        opts.silent,
      );
    } else {
      result = await sendDiscordText(
@@ -202,6 +213,7 @@ export async function sendMessageDiscord(
        accountInfo.config.maxLinesPerMessage,
        opts.embeds,
        chunkMode,
+        opts.silent,
      );
    }
  } catch (err) {
@@ -277,3 +289,87 @@ export async function sendPollDiscord(
    channelId: String(res.channel_id ?? channelId),
  };
 }
+
+type VoiceMessageOpts = { // options accepted by sendVoiceMessageDiscord; the whole object is also handed to createDiscordClient
+  token?: string; // presumably a bot token override consumed by createDiscordClient — confirm
+  accountId?: string; // account used for the config lookup and for recipient resolution
+  verbose?: boolean; // not read by sendVoiceMessageDiscord itself; only travels inside opts
+  rest?: RequestClient; // presumably a pre-built REST client override for createDiscordClient — confirm
+  replyTo?: string; // ID of the message to reply to; forwarded to sendDiscordVoiceMessage
+  retry?: RetryConfig; // presumably the retry policy picked up by createDiscordClient — confirm
+  silent?: boolean; // forwarded as the last argument to sendDiscordVoiceMessage
+};
+
+/**
+ * Deliver an audio file to Discord as a native voice message.
+ *
+ * The recipient is resolved to a channel, the audio is converted to OGG/Opus
+ * when necessary, duration and waveform metadata are derived from the
+ * resulting file, and the upload itself is delegated to sendDiscordVoiceMessage.
+ * A temporary file produced by the conversion is removed afterwards, whether
+ * or not the send succeeded. Voice messages carry no text content.
+ *
+ * @param to - Recipient (user ID for DM or channel ID)
+ * @param audioPath - Path to local audio file (will be converted to OGG/Opus if needed)
+ * @param opts - Send options
+ * @returns The created message's ID ("unknown" if Discord returned none) and its channel ID
+ * @throws The error built by buildDiscordSendError when metadata extraction,
+ *   reading the file, or sending fails; conversion errors from ensureOggOpus
+ *   propagate unwrapped because the conversion runs before the try block
+ */
+export async function sendVoiceMessageDiscord(
+  to: string,
+  audioPath: string,
+  opts: VoiceMessageOpts = {},
+): Promise<DiscordSendResult> {
+  const cfg = loadConfig();
+  const accountInfo = resolveDiscordAccount({ cfg, accountId: opts.accountId });
+  const { token, rest, request } = createDiscordClient(opts, cfg);
+  const recipient = await parseAndResolveRecipient(to, opts.accountId);
+  const { channelId } = await resolveChannelId(rest, recipient, request);
+
+  // May yield the input path unchanged (cleanup === false) or a fresh temp file.
+  const converted = await ensureOggOpus(audioPath);
+
+  try {
+    const metadata = await getVoiceMessageMetadata(converted.path);
+    const audioBuffer = await fs.readFile(converted.path);
+
+    const sent = await sendDiscordVoiceMessage(
+      rest,
+      channelId,
+      audioBuffer,
+      metadata,
+      opts.replyTo,
+      request,
+      opts.silent,
+    );
+
+    recordChannelActivity({
+      channel: "discord",
+      accountId: accountInfo.accountId,
+      direction: "outbound",
+    });
+
+    const messageId = sent.id ? String(sent.id) : "unknown";
+    return { messageId, channelId: String(sent.channel_id ?? channelId) };
+  } catch (err) {
+    throw await buildDiscordSendError(err, {
+      channelId,
+      rest,
+      token,
+      hasMedia: true,
+    });
+  } finally {
+    // Only delete the file when the conversion step created it.
+    if (converted.cleanup) {
+      await fs.unlink(converted.path).catch(() => {
+        // Ignore cleanup errors
+      });
+    }
+  }
+}
--- a/src/discord/send.ts
+++ b/src/discord/send.ts
@@ -37,7 +37,12 @@ export {
  searchMessagesDiscord,
  unpinMessageDiscord,
 } from "./send.messages.js";
-export { sendMessageDiscord, sendPollDiscord, sendStickerDiscord } from "./send.outbound.js";
+export {
+  sendMessageDiscord,
+  sendPollDiscord,
+  sendStickerDiscord,
+  sendVoiceMessageDiscord,
+} from "./send.outbound.js";
 export {
  fetchChannelPermissionsDiscord,
  fetchReactionsDiscord,
--- a/src/discord/voice-message.ts
+++ b/src/discord/voice-message.ts
@@ -0,0 +1,325 @@
+/**
+ * Discord Voice Message Support
+ *
+ * Implements sending voice messages via Discord's API.
+ * Voice messages require:
+ * - OGG/Opus format audio
+ * - Waveform data (base64 encoded, up to 256 samples, 0-255 values)
+ * - Duration in seconds
+ * - Message flag 8192 (IS_VOICE_MESSAGE)
+ * - No other content (text, embeds, etc.)
+ */
+
+import type { RequestClient } from "@buape/carbon";
+import { execFile } from "node:child_process";
+import { randomUUID } from "node:crypto";
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { promisify } from "node:util";
+import type { RetryRunner } from "../infra/retry-policy.js";
+
+const execFileAsync = promisify(execFile); // promise-returning execFile, used for every ffmpeg/ffprobe invocation in this module
+
+const DISCORD_VOICE_MESSAGE_FLAG = 8192; // IS_VOICE_MESSAGE message flag (1 << 13)
+const WAVEFORM_SAMPLES = 256; // number of amplitude bytes in every generated waveform
+
+export type VoiceMessageMetadata = { // attachment metadata sent alongside the uploaded audio
+  durationSecs: number; // audio length in seconds; getAudioDuration rounds it to 2 decimal places
+  waveform: string; // base64 encoding of the amplitude bytes (one byte per sample, 0-255)
+};
+
+/**
+ * Probe an audio file with ffprobe and report its length.
+ *
+ * @param filePath - Path to the audio file to inspect
+ * @returns Duration in seconds, rounded to two decimal places
+ * @throws Error prefixed with "Failed to get audio duration" when ffprobe
+ *   fails or prints something that is not a number
+ */
+export async function getAudioDuration(filePath: string): Promise<number> {
+  // Ask only for the container duration, printed as a bare CSV value.
+  const probeArgs = [
+    "-v",
+    "error",
+    "-show_entries",
+    "format=duration",
+    "-of",
+    "csv=p=0",
+    filePath,
+  ];
+
+  try {
+    const probe = await execFileAsync("ffprobe", probeArgs);
+    const seconds = Number.parseFloat(probe.stdout.trim());
+    if (Number.isNaN(seconds)) {
+      throw new Error("Could not parse duration");
+    }
+    return Math.round(seconds * 100) / 100;
+  } catch (err) {
+    const reason = err instanceof Error ? err.message : err;
+    throw new Error(`Failed to get audio duration: ${reason}`);
+  }
+}
+
+/**
+ * Generate waveform data for an audio file.
+ *
+ * Delegates to generateWaveformFromPcm, which decodes the file with ffmpeg and
+ * samples its amplitudes. Generation is best-effort: if that step fails for
+ * any reason, a synthetic placeholder waveform is returned instead of throwing.
+ *
+ * @param filePath - Path to the audio file
+ * @returns Base64 encoded byte array of amplitude samples (0-255)
+ */
+export async function generateWaveform(filePath: string): Promise<string> {
+  try {
+    // A previous extra ffmpeg "astats" pass was removed: its output was never
+    // read, it doubled the decoding work, and its failure forced the
+    // placeholder even when PCM extraction would have succeeded.
+    return await generateWaveformFromPcm(filePath);
+  } catch {
+    // Fall back to a placeholder so a decoding failure does not surface as an error
+    return generatePlaceholderWaveform();
+  }
+}
+
+/**
+ * Generate waveform by extracting raw PCM data and sampling amplitudes.
+ *
+ * The file is decoded by ffmpeg into a temporary raw file (16-bit signed
+ * little-endian, mono, 8 kHz). The samples are split into WAVEFORM_SAMPLES
+ * equal segments and each segment's mean absolute amplitude is scaled to
+ * 0-255. Short inputs are zero-padded so the result always holds
+ * WAVEFORM_SAMPLES bytes.
+ *
+ * @param filePath - Path to the audio file to decode
+ * @returns Base64 encoding of the WAVEFORM_SAMPLES amplitude bytes
+ * @throws When ffmpeg fails or the decoded file cannot be read
+ */
+async function generateWaveformFromPcm(filePath: string): Promise<string> {
+  // A random suffix keeps concurrent calls from clobbering each other's temp
+  // file (a millisecond timestamp alone is not unique).
+  const tempPcm = path.join(os.tmpdir(), `waveform-${randomUUID()}.raw`);
+
+  try {
+    // Convert to raw 16-bit signed PCM, mono, 8kHz
+    await execFileAsync("ffmpeg", [
+      "-y",
+      "-i",
+      filePath,
+      "-f",
+      "s16le",
+      "-acodec",
+      "pcm_s16le",
+      "-ac",
+      "1",
+      "-ar",
+      "8000",
+      tempPcm,
+    ]);
+
+    const pcmData = await fs.readFile(tempPcm);
+    // Read samples with readInt16LE rather than viewing the Buffer through an
+    // Int16Array: the data is explicitly little-endian, and a typed-array view
+    // would depend on host endianness and on an even byteOffset.
+    const sampleCount = Math.floor(pcmData.byteLength / 2);
+
+    // Sample the PCM data to get WAVEFORM_SAMPLES points
+    const step = Math.max(1, Math.floor(sampleCount / WAVEFORM_SAMPLES));
+    const waveform: number[] = [];
+
+    for (let i = 0; i < WAVEFORM_SAMPLES && i * step < sampleCount; i++) {
+      const start = i * step;
+      const end = Math.min(start + step, sampleCount);
+
+      // Average absolute amplitude for this segment (end > start is guaranteed here)
+      let sum = 0;
+      for (let j = start; j < end; j++) {
+        sum += Math.abs(pcmData.readInt16LE(j * 2));
+      }
+      const avg = sum / (end - start);
+
+      // Normalize to 0-255 (16-bit signed max is 32767); clamp because |-32768| exceeds it
+      waveform.push(Math.min(255, Math.round((avg / 32767) * 255)));
+    }
+
+    // Pad with zeros if we don't have enough samples
+    while (waveform.length < WAVEFORM_SAMPLES) {
+      waveform.push(0);
+    }
+
+    return Buffer.from(waveform).toString("base64");
+  } finally {
+    // Clean up temp file
+    try {
+      await fs.unlink(tempPcm);
+    } catch {
+      // Ignore cleanup errors
+    }
+  }
+}
+
+/**
+ * Build a synthetic stand-in waveform for when the real audio cannot be analysed.
+ *
+ * @returns Base64 encoding of WAVEFORM_SAMPLES bytes tracing four sine periods
+ *   centred on 128 with an amplitude of 64
+ */
+function generatePlaceholderWaveform(): string {
+  const bytes = Array.from({ length: WAVEFORM_SAMPLES }, (_, index) => {
+    const phase = (index / WAVEFORM_SAMPLES) * Math.PI * 8;
+    const level = Math.round(128 + 64 * Math.sin(phase));
+    // Keep every value inside the single-byte range.
+    return Math.min(255, Math.max(0, level));
+  });
+  return Buffer.from(bytes).toString("base64");
+}
+
+/**
+ * Convert audio file to OGG/Opus format if needed.
+ *
+ * A file whose extension is ".ogg" and whose first audio stream is reported
+ * as "opus" by ffprobe is returned as-is. Anything else (including .ogg files
+ * that cannot be probed) is transcoded with ffmpeg/libopus at 64k into a
+ * uniquely named file in the OS temp directory.
+ *
+ * @param filePath - Path to the source audio file
+ * @returns `path` of the OGG/Opus file and `cleanup`, which is true only when
+ *   a temp file was created that the caller is expected to delete
+ * @throws When the ffmpeg conversion fails; any partial output is removed first
+ */
+export async function ensureOggOpus(filePath: string): Promise<{ path: string; cleanup: boolean }> {
+  const ext = path.extname(filePath).toLowerCase();
+
+  // Check if already OGG
+  if (ext === ".ogg") {
+    // Verify it's Opus codec, not Vorbis (Vorbis won't play on mobile)
+    try {
+      const { stdout } = await execFileAsync("ffprobe", [
+        "-v",
+        "error",
+        "-select_streams",
+        "a:0",
+        "-show_entries",
+        "stream=codec_name",
+        "-of",
+        "csv=p=0",
+        filePath,
+      ]);
+      if (stdout.trim().toLowerCase() === "opus") {
+        return { path: filePath, cleanup: false };
+      }
+    } catch {
+      // If probe fails, convert anyway
+    }
+  }
+
+  // Convert to OGG/Opus. The random suffix keeps concurrent conversions from
+  // overwriting (and later deleting) each other's output; a millisecond
+  // timestamp alone is not unique.
+  const outputPath = path.join(os.tmpdir(), `voice-${randomUUID()}.ogg`);
+
+  try {
+    await execFileAsync("ffmpeg", [
+      "-y",
+      "-i",
+      filePath,
+      "-c:a",
+      "libopus",
+      "-b:a",
+      "64k",
+      outputPath,
+    ]);
+  } catch (err) {
+    // The caller never learns the temp path on failure, so remove any partial
+    // output here before re-throwing the original error.
+    await fs.unlink(outputPath).catch(() => {
+      // Ignore cleanup errors (the file may never have been created)
+    });
+    throw err;
+  }
+
+  return { path: outputPath, cleanup: true };
+}
+
+/**
+ * Collect the metadata a voice message needs for one audio file.
+ *
+ * Duration probing and waveform generation are started together and awaited
+ * jointly.
+ *
+ * @param filePath - Path to the audio file
+ * @returns The file's duration in seconds and its base64 waveform
+ * @throws Whatever getAudioDuration throws
+ */
+export async function getVoiceMessageMetadata(filePath: string): Promise<VoiceMessageMetadata> {
+  const results = await Promise.all([getAudioDuration(filePath), generateWaveform(filePath)]);
+
+  return { durationSecs: results[0], waveform: results[1] };
+}
+
+type UploadUrlResponse = { // shape read from the POST /channels/{id}/attachments response
+  attachments: Array<{ // only the first entry is used
+    id: number; // not read by this module
+    upload_url: string; // target of the PUT that uploads the audio bytes
+    upload_filename: string; // echoed back as uploaded_filename when the message is created
+  }>;
+};
+
+/**
+ * Send a voice message to Discord
+ *
+ * This follows Discord's voice message protocol:
+ * 1. Request upload URL from Discord
+ * 2. Upload the OGG file to the provided URL
+ * 3. Send the message with flag 8192 and attachment metadata
+ *
+ * Steps 1 and 3 go through `request`; the upload in step 2 is a single plain
+ * `fetch` that does not go through it.
+ *
+ * @param rest - Discord REST client used for the two API calls
+ * @param channelId - ID of the channel to post in
+ * @param audioBuffer - Audio bytes to upload (sent with Content-Type audio/ogg)
+ * @param metadata - Duration and waveform attached to the message
+ * @param replyTo - Optional ID of a message to reply to
+ * @param request - Runner that wraps each REST call
+ * @param silent - When true, the SUPPRESS_NOTIFICATIONS flag is set in
+ *   addition to the voice message flag; defaults to a normal notifying send
+ * @returns The `id` and `channel_id` of the created message as returned by Discord
+ * @throws Error when Discord returns no upload slot or the upload responds
+ *   with a non-OK status; REST errors propagate from `request`
+ */
+export async function sendDiscordVoiceMessage(
+  rest: RequestClient,
+  channelId: string,
+  audioBuffer: Buffer,
+  metadata: VoiceMessageMetadata,
+  replyTo: string | undefined,
+  request: RetryRunner,
+  silent?: boolean,
+): Promise<{ id: string; channel_id: string }> {
+  // Discord message flag SUPPRESS_NOTIFICATIONS (1 << 12).
+  const SUPPRESS_NOTIFICATIONS_FLAG = 1 << 12;
+
+  const filename = "voice-message.ogg";
+  const fileSize = audioBuffer.byteLength;
+
+  // Step 1: Request upload URL
+  const uploadUrlResponse = (await request(
+    () =>
+      rest.post(`/channels/${channelId}/attachments`, {
+        body: {
+          files: [
+            {
+              filename,
+              file_size: fileSize,
+              id: "0",
+            },
+          ],
+        },
+      }) as Promise<UploadUrlResponse>,
+    "voice-upload-url",
+  )) as UploadUrlResponse;
+
+  if (!uploadUrlResponse.attachments?.[0]) {
+    throw new Error("Failed to get upload URL for voice message");
+  }
+
+  const { upload_url, upload_filename } = uploadUrlResponse.attachments[0];
+
+  // Step 2: Upload the file to Discord's CDN
+  const uploadResponse = await fetch(upload_url, {
+    method: "PUT",
+    headers: {
+      "Content-Type": "audio/ogg",
+    },
+    body: new Uint8Array(audioBuffer),
+  });
+
+  if (!uploadResponse.ok) {
+    throw new Error(`Failed to upload voice message: ${uploadResponse.status}`);
+  }
+
+  // Step 3: Send the message with voice message flag and metadata
+  const messagePayload: {
+    flags: number;
+    attachments: Array<{
+      id: string;
+      filename: string;
+      uploaded_filename: string;
+      duration_secs: number;
+      waveform: string;
+    }>;
+    message_reference?: { message_id: string; fail_if_not_exists: boolean };
+  } = {
+    // The voice flag is always set; the notification flag is OR-ed in on request.
+    flags: silent
+      ? DISCORD_VOICE_MESSAGE_FLAG | SUPPRESS_NOTIFICATIONS_FLAG
+      : DISCORD_VOICE_MESSAGE_FLAG,
+    attachments: [
+      {
+        // Must match the id used for the file in the upload-URL request above
+        id: "0",
+        filename,
+        uploaded_filename: upload_filename,
+        duration_secs: metadata.durationSecs,
+        waveform: metadata.waveform,
+      },
+    ],
+  };
+
+  // Note: Voice messages cannot have content, but can have message_reference for replies
+  if (replyTo) {
+    messagePayload.message_reference = {
+      message_id: replyTo,
+      // Still send the voice message if the referenced message no longer exists
+      fail_if_not_exists: false,
+    };
+  }
+
+  const res = (await request(
+    () =>
+      rest.post(`/channels/${channelId}/messages`, {
+        body: messagePayload,
+      }) as Promise<{ id: string; channel_id: string }>,
+    "voice-message",
+  )) as { id: string; channel_id: string };
+
+  return res;
+}