feat(discord): add voice message support
Adds support for sending Discord voice messages via the message tool
by passing the asVoice: true parameter.
Voice messages require:
- OGG/Opus format (auto-converted if needed via ffmpeg)
- Waveform data (generated from audio samples)
- Duration in seconds
- Message flag 8192 (IS_VOICE_MESSAGE)
Implementation:
- New voice-message.ts with audio processing utilities
- getAudioDuration() using ffprobe
- generateWaveform() samples audio and creates base64 waveform
- ensureOggOpus() converts audio to required format
- sendDiscordVoiceMessage() handles 3-step Discord upload process
Usage:
message(action='send', channel='discord', target='...',
path='/path/to/audio.mp3', asVoice=true)
Note: Voice messages cannot include text content (Discord limitation)
This commit is contained in:
@@ -18,6 +18,7 @@ import {
|
||||
sendMessageDiscord,
|
||||
sendPollDiscord,
|
||||
sendStickerDiscord,
|
||||
sendVoiceMessageDiscord,
|
||||
unpinMessageDiscord,
|
||||
} from "../../discord/send.js";
|
||||
import { resolveDiscordChannelId } from "../../discord/targets.js";
|
||||
@@ -230,11 +231,25 @@ export async function handleDiscordMessagingAction(
|
||||
const to = readStringParam(params, "to", { required: true });
|
||||
const content = readStringParam(params, "content", {
|
||||
required: true,
|
||||
allowEmpty: true,
|
||||
});
|
||||
const mediaUrl = readStringParam(params, "mediaUrl");
|
||||
const replyTo = readStringParam(params, "replyTo");
|
||||
const asVoice = params.asVoice === true;
|
||||
const embeds =
|
||||
Array.isArray(params.embeds) && params.embeds.length > 0 ? params.embeds : undefined;
|
||||
|
||||
// Handle voice message sending
|
||||
if (asVoice && mediaUrl) {
|
||||
// Voice messages require a local file path or downloadable URL
|
||||
// They cannot include text content (Discord limitation)
|
||||
const result = await sendVoiceMessageDiscord(to, mediaUrl, {
|
||||
...(accountId ? { accountId } : {}),
|
||||
replyTo,
|
||||
});
|
||||
return jsonResult({ ok: true, result, voiceMessage: true });
|
||||
}
|
||||
|
||||
const result = await sendMessageDiscord(to, content, {
|
||||
...(accountId ? { accountId } : {}),
|
||||
mediaUrl,
|
||||
|
||||
@@ -41,6 +41,7 @@ export async function handleDiscordMessageAction(
|
||||
const mediaUrl = readStringParam(params, "media", { trim: false });
|
||||
const replyTo = readStringParam(params, "replyTo");
|
||||
const embeds = Array.isArray(params.embeds) ? params.embeds : undefined;
|
||||
const asVoice = params.asVoice === true;
|
||||
return await handleDiscordAction(
|
||||
{
|
||||
action: "sendMessage",
|
||||
@@ -50,6 +51,7 @@ export async function handleDiscordMessageAction(
|
||||
mediaUrl: mediaUrl ?? undefined,
|
||||
replyTo: replyTo ?? undefined,
|
||||
embeds,
|
||||
asVoice,
|
||||
},
|
||||
cfg,
|
||||
);
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import type { RequestClient } from "@buape/carbon";
|
||||
import type { APIChannel } from "discord-api-types/v10";
|
||||
import { ChannelType, Routes } from "discord-api-types/v10";
|
||||
import fs from "node:fs/promises";
|
||||
import type { RetryConfig } from "../infra/retry.js";
|
||||
import type { PollInput } from "../polls.js";
|
||||
import type { DiscordSendResult } from "./send.types.js";
|
||||
@@ -21,6 +22,11 @@ import {
|
||||
sendDiscordMedia,
|
||||
sendDiscordText,
|
||||
} from "./send.shared.js";
|
||||
import {
|
||||
ensureOggOpus,
|
||||
getVoiceMessageMetadata,
|
||||
sendDiscordVoiceMessage,
|
||||
} from "./voice-message.js";
|
||||
|
||||
type DiscordSendOpts = {
|
||||
token?: string;
|
||||
@@ -31,6 +37,7 @@ type DiscordSendOpts = {
|
||||
replyTo?: string;
|
||||
retry?: RetryConfig;
|
||||
embeds?: unknown[];
|
||||
silent?: boolean;
|
||||
};
|
||||
|
||||
/** Discord thread names are capped at 100 characters. */
|
||||
@@ -131,6 +138,7 @@ export async function sendMessageDiscord(
|
||||
accountInfo.config.maxLinesPerMessage,
|
||||
undefined,
|
||||
chunkMode,
|
||||
opts.silent,
|
||||
);
|
||||
for (const chunk of afterMediaChunks) {
|
||||
await sendDiscordText(
|
||||
@@ -142,6 +150,7 @@ export async function sendMessageDiscord(
|
||||
accountInfo.config.maxLinesPerMessage,
|
||||
undefined,
|
||||
chunkMode,
|
||||
opts.silent,
|
||||
);
|
||||
}
|
||||
} else {
|
||||
@@ -155,6 +164,7 @@ export async function sendMessageDiscord(
|
||||
accountInfo.config.maxLinesPerMessage,
|
||||
undefined,
|
||||
chunkMode,
|
||||
opts.silent,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -191,6 +201,7 @@ export async function sendMessageDiscord(
|
||||
accountInfo.config.maxLinesPerMessage,
|
||||
opts.embeds,
|
||||
chunkMode,
|
||||
opts.silent,
|
||||
);
|
||||
} else {
|
||||
result = await sendDiscordText(
|
||||
@@ -202,6 +213,7 @@ export async function sendMessageDiscord(
|
||||
accountInfo.config.maxLinesPerMessage,
|
||||
opts.embeds,
|
||||
chunkMode,
|
||||
opts.silent,
|
||||
);
|
||||
}
|
||||
} catch (err) {
|
||||
@@ -277,3 +289,87 @@ export async function sendPollDiscord(
|
||||
channelId: String(res.channel_id ?? channelId),
|
||||
};
|
||||
}
|
||||
|
||||
/** Options accepted by `sendVoiceMessageDiscord`. */
type VoiceMessageOpts = {
  token?: string;
  /** Account used to resolve the Discord account and the recipient. */
  accountId?: string;
  verbose?: boolean;
  rest?: RequestClient;
  /** ID of the message to reply to; forwarded to the voice-message send call. */
  replyTo?: string;
  retry?: RetryConfig;
  /** Forwarded to the voice-message send call. */
  silent?: boolean;
};

/**
 * Deliver an audio file to Discord as a voice message.
 *
 * The recipient is resolved to a channel, the audio is converted to OGG/Opus
 * when it is not already in that format, duration and waveform metadata are
 * computed, and the result is posted as a voice message. Voice messages carry
 * no text content.
 *
 * @param to - Recipient (user ID for DM or channel ID)
 * @param audioPath - Path to the audio file (converted to OGG/Opus if needed)
 * @param opts - Send options
 * @returns The ID of the created message ("unknown" when Discord returns none)
 *   and the channel it was posted to.
 * @throws The error produced by `buildDiscordSendError` when metadata
 *   extraction, reading the file, or the send itself fails.
 */
export async function sendVoiceMessageDiscord(
  to: string,
  audioPath: string,
  opts: VoiceMessageOpts = {},
): Promise<DiscordSendResult> {
  const config = loadConfig();
  const account = resolveDiscordAccount({ cfg: config, accountId: opts.accountId });
  const {
    token: botToken,
    rest: restClient,
    request: withRetry,
  } = createDiscordClient(opts, config);
  const target = await parseAndResolveRecipient(to, opts.accountId);
  const { channelId } = await resolveChannelId(restClient, target, withRetry);

  // Transcode first; `cleanup` tells us whether a temporary file was produced.
  const prepared = await ensureOggOpus(audioPath);

  try {
    const voiceMeta = await getVoiceMessageMetadata(prepared.path);
    const oggBytes = await fs.readFile(prepared.path);

    const sent = await sendDiscordVoiceMessage(
      restClient,
      channelId,
      oggBytes,
      voiceMeta,
      opts.replyTo,
      withRetry,
      opts.silent,
    );

    recordChannelActivity({
      channel: "discord",
      accountId: account.accountId,
      direction: "outbound",
    });

    const messageId = sent.id ? String(sent.id) : "unknown";
    return { messageId, channelId: String(sent.channel_id ?? channelId) };
  } catch (err) {
    throw await buildDiscordSendError(err, {
      channelId,
      rest: restClient,
      token: botToken,
      hasMedia: true,
    });
  } finally {
    // Only remove the file when the conversion step created it.
    if (prepared.cleanup) {
      try {
        await fs.unlink(prepared.path);
      } catch {
        // Ignore cleanup errors
      }
    }
  }
}
|
||||
|
||||
@@ -37,7 +37,12 @@ export {
|
||||
searchMessagesDiscord,
|
||||
unpinMessageDiscord,
|
||||
} from "./send.messages.js";
|
||||
export { sendMessageDiscord, sendPollDiscord, sendStickerDiscord } from "./send.outbound.js";
|
||||
export {
|
||||
sendMessageDiscord,
|
||||
sendPollDiscord,
|
||||
sendStickerDiscord,
|
||||
sendVoiceMessageDiscord,
|
||||
} from "./send.outbound.js";
|
||||
export {
|
||||
fetchChannelPermissionsDiscord,
|
||||
fetchReactionsDiscord,
|
||||
|
||||
325
src/discord/voice-message.ts
Normal file
325
src/discord/voice-message.ts
Normal file
@@ -0,0 +1,325 @@
|
||||
/**
|
||||
* Discord Voice Message Support
|
||||
*
|
||||
* Implements sending voice messages via Discord's API.
|
||||
* Voice messages require:
|
||||
* - OGG/Opus format audio
|
||||
* - Waveform data (base64 encoded, up to 256 samples, 0-255 values)
|
||||
* - Duration in seconds
|
||||
* - Message flag 8192 (IS_VOICE_MESSAGE)
|
||||
* - No other content (text, embeds, etc.)
|
||||
*/
|
||||
|
||||
import type { RequestClient } from "@buape/carbon";
|
||||
import { execFile } from "node:child_process";
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { promisify } from "node:util";
|
||||
import type { RetryRunner } from "../infra/retry-policy.js";
|
||||
|
||||
/** Promise-returning form of `child_process.execFile`, used to run ffmpeg/ffprobe. */
const execFileAsync = promisify(execFile);

/** Discord message flag IS_VOICE_MESSAGE (bit 13, i.e. 8192). */
const DISCORD_VOICE_MESSAGE_FLAG = 1 << 13;

/** Number of amplitude bytes encoded into a voice message waveform. */
const WAVEFORM_SAMPLES = 256;
|
||||
|
||||
/** Metadata sent alongside the audio attachment of a voice message. */
export type VoiceMessageMetadata = {
  /** Audio duration in seconds. */
  durationSecs: number;
  /** Waveform amplitude bytes, base64 encoded. */
  waveform: string;
};
|
||||
|
||||
/**
 * Ask ffprobe how long an audio file plays.
 *
 * @param filePath - Audio file handed to ffprobe.
 * @returns Duration in seconds, rounded to two decimal places.
 * @throws Error with the prefix "Failed to get audio duration: " when ffprobe
 *   fails or prints something that is not a number.
 */
export async function getAudioDuration(filePath: string): Promise<number> {
  const probeArgs = [
    "-v",
    "error",
    "-show_entries",
    "format=duration",
    "-of",
    "csv=p=0",
    filePath,
  ];

  try {
    const probe = await execFileAsync("ffprobe", probeArgs);
    const seconds = Number.parseFloat(probe.stdout.trim());
    if (Number.isNaN(seconds)) {
      throw new Error("Could not parse duration");
    }
    // Keep two decimal places.
    return Math.round(seconds * 100) / 100;
  } catch (err) {
    throw new Error(`Failed to get audio duration: ${err instanceof Error ? err.message : err}`);
  }
}
|
||||
|
||||
/**
 * Generate waveform data for an audio file.
 *
 * The waveform is derived from the file's decoded PCM samples. This is
 * best-effort: if decoding fails for any reason, a synthetic placeholder
 * waveform is returned instead of throwing, so a voice message can still be
 * sent.
 *
 * @param filePath - Audio file to analyse.
 * @returns Base64 encoded byte array of amplitude samples (0-255).
 */
export async function generateWaveform(filePath: string): Promise<string> {
  try {
    // A previous ffmpeg `astats` pass ran here, but its output was never read;
    // it only cost an extra full decode, so the PCM path is used directly.
    return await generateWaveformFromPcm(filePath);
  } catch {
    // If the ffmpeg approach fails, generate a placeholder waveform
    return generatePlaceholderWaveform();
  }
}
|
||||
|
||||
/**
 * Reduce raw signed 16-bit little-endian mono PCM to `sampleCount` amplitude bytes.
 *
 * The audio is cut into equal-sized segments; each output byte is the mean
 * absolute amplitude of one segment scaled to 0-255. Output is zero-padded
 * when the audio holds fewer samples than `sampleCount`.
 */
function pcmToWaveformBytes(pcm: Buffer, sampleCount: number): number[] {
  // Ignore a trailing odd byte rather than failing on truncated data.
  const totalSamples = Math.floor(pcm.byteLength / 2);
  const step = Math.max(1, Math.floor(totalSamples / sampleCount));
  const waveform: number[] = [];

  for (let start = 0; waveform.length < sampleCount && start < totalSamples; start += step) {
    const end = Math.min(start + step, totalSamples);
    let sum = 0;
    for (let i = start; i < end; i++) {
      // readInt16LE is alignment-free and independent of host byte order.
      sum += Math.abs(pcm.readInt16LE(i * 2));
    }
    const avg = sum / (end - start);
    // Normalize to 0-255 (16-bit signed max is 32767)
    waveform.push(Math.min(255, Math.round((avg / 32767) * 255)));
  }

  // Pad with zeros if we don't have enough samples
  while (waveform.length < sampleCount) {
    waveform.push(0);
  }

  return waveform;
}

/**
 * Generate waveform by extracting raw PCM data and sampling amplitudes.
 *
 * ffmpeg decodes the input to 8 kHz mono 16-bit PCM inside a private
 * temporary directory, which is removed again whether or not decoding
 * succeeds.
 *
 * @param filePath - Audio file to decode.
 * @returns Base64 encoded waveform of WAVEFORM_SAMPLES bytes.
 * @throws When ffmpeg fails or the decoded file cannot be read.
 */
async function generateWaveformFromPcm(filePath: string): Promise<string> {
  // mkdtemp yields a unique directory, so concurrent sends cannot clobber
  // each other's scratch file the way a timestamp-based name could.
  const workDir = await fs.mkdtemp(path.join(os.tmpdir(), "waveform-"));
  const tempPcm = path.join(workDir, "audio.raw");

  try {
    // Convert to raw 16-bit signed PCM, mono, 8kHz
    await execFileAsync("ffmpeg", [
      "-y",
      "-i",
      filePath,
      "-f",
      "s16le",
      "-acodec",
      "pcm_s16le",
      "-ac",
      "1",
      "-ar",
      "8000",
      tempPcm,
    ]);

    const pcmData = await fs.readFile(tempPcm);
    const waveform = pcmToWaveformBytes(pcmData, WAVEFORM_SAMPLES);
    return Buffer.from(waveform).toString("base64");
  } finally {
    // Clean up the scratch directory and anything ffmpeg left in it
    try {
      await fs.rm(workDir, { recursive: true, force: true });
    } catch {
      // Ignore cleanup errors
    }
  }
}
|
||||
|
||||
/**
 * Build a synthetic waveform used when real audio analysis is unavailable.
 *
 * The result is WAVEFORM_SAMPLES bytes following a sine curve centred on 128
 * with amplitude 64 (four full periods), base64 encoded.
 */
function generatePlaceholderWaveform(): string {
  const bytes = Array.from({ length: WAVEFORM_SAMPLES }, (_unused, index) => {
    const level = Math.round(128 + 64 * Math.sin((index / WAVEFORM_SAMPLES) * Math.PI * 8));
    // Clamp into the byte range.
    return Math.max(0, Math.min(255, level));
  });
  return Buffer.from(bytes).toString("base64");
}
|
||||
|
||||
/**
 * Convert audio file to OGG/Opus format if needed.
 *
 * A `.ogg` input whose first audio stream is already Opus is returned as-is.
 * Anything else (including `.ogg` files with another codec, or files that
 * ffprobe cannot inspect) is transcoded with ffmpeg into a new temporary file.
 *
 * @param filePath - Source audio file.
 * @returns `path` of the OGG/Opus file and `cleanup`, which is true when
 *   `path` is a temporary file the caller must delete.
 * @throws When the ffmpeg conversion fails; any partial output is removed first.
 */
export async function ensureOggOpus(filePath: string): Promise<{ path: string; cleanup: boolean }> {
  const ext = path.extname(filePath).toLowerCase();

  // Check if already OGG
  if (ext === ".ogg") {
    // Verify it's Opus codec, not Vorbis (Vorbis won't play on mobile)
    try {
      const { stdout } = await execFileAsync("ffprobe", [
        "-v",
        "error",
        "-select_streams",
        "a:0",
        "-show_entries",
        "stream=codec_name",
        "-of",
        "csv=p=0",
        filePath,
      ]);
      if (stdout.trim().toLowerCase() === "opus") {
        return { path: filePath, cleanup: false };
      }
    } catch {
      // If probe fails, convert anyway
    }
  }

  // Convert to OGG/Opus. A timestamp alone can repeat when two conversions
  // start in the same millisecond, so pid and a random suffix are added.
  const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2, 10)}`;
  const outputPath = path.join(os.tmpdir(), `voice-${uniqueSuffix}.ogg`);

  try {
    await execFileAsync("ffmpeg", [
      "-y",
      "-i",
      filePath,
      "-c:a",
      "libopus",
      "-b:a",
      "64k",
      outputPath,
    ]);
  } catch (err) {
    // The caller never learns about outputPath on failure, so remove any
    // partial file here before surfacing the original error.
    try {
      await fs.unlink(outputPath);
    } catch {
      // Ignore cleanup errors (the file may never have been created)
    }
    throw err;
  }

  return { path: outputPath, cleanup: true };
}
|
||||
|
||||
/**
 * Collect the metadata a voice message needs: duration and waveform.
 *
 * Both lookups are started together and awaited jointly.
 *
 * @param filePath - Audio file to inspect.
 * @returns The duration in seconds and the base64 waveform.
 */
export async function getVoiceMessageMetadata(filePath: string): Promise<VoiceMessageMetadata> {
  const pending = [getAudioDuration(filePath), generateWaveform(filePath)] as const;
  const [durationSecs, waveform] = await Promise.all(pending);
  return { durationSecs, waveform };
}
|
||||
|
||||
/** Response body of the attachment upload-URL request. */
type UploadUrlResponse = {
  attachments: Array<{
    id: number;
    upload_url: string;
    upload_filename: string;
  }>;
};

/**
 * Send a voice message to Discord
 *
 * This follows Discord's voice message protocol:
 * 1. Request upload URL from Discord
 * 2. Upload the OGG file to the provided URL
 * 3. Send the message with flag 8192 and attachment metadata
 *
 * @param rest - Client used for the Discord REST calls.
 * @param channelId - Channel the voice message is posted to.
 * @param audioBuffer - OGG audio bytes to upload.
 * @param metadata - Duration and waveform attached to the message.
 * @param replyTo - Optional ID of a message to reply to.
 * @param request - Retry wrapper applied to the REST calls.
 * @param silent - When true, the message is also flagged SUPPRESS_NOTIFICATIONS.
 *   Optional, so existing six-argument callers are unaffected.
 * @returns The created message's `id` and `channel_id`.
 * @throws Error when Discord returns no upload slot or the upload responds
 *   with a non-OK status.
 */
export async function sendDiscordVoiceMessage(
  rest: RequestClient,
  channelId: string,
  audioBuffer: Buffer,
  metadata: VoiceMessageMetadata,
  replyTo: string | undefined,
  request: RetryRunner,
  silent?: boolean,
): Promise<{ id: string; channel_id: string }> {
  // Discord message flag SUPPRESS_NOTIFICATIONS (bit 12).
  const suppressNotificationsFlag = 1 << 12;

  const filename = "voice-message.ogg";
  const fileSize = audioBuffer.byteLength;

  // Step 1: Request upload URL
  const uploadUrlResponse = (await request(
    () =>
      rest.post(`/channels/${channelId}/attachments`, {
        body: {
          files: [
            {
              filename,
              file_size: fileSize,
              id: "0",
            },
          ],
        },
      }) as Promise<UploadUrlResponse>,
    "voice-upload-url",
  )) as UploadUrlResponse;

  const uploadSlot = uploadUrlResponse.attachments?.[0];
  if (!uploadSlot) {
    throw new Error("Failed to get upload URL for voice message");
  }

  const { upload_url, upload_filename } = uploadSlot;

  // Step 2: Upload the file to Discord's CDN
  const uploadResponse = await fetch(upload_url, {
    method: "PUT",
    headers: {
      "Content-Type": "audio/ogg",
    },
    body: new Uint8Array(audioBuffer),
  });

  if (!uploadResponse.ok) {
    throw new Error(`Failed to upload voice message: ${uploadResponse.status}`);
  }

  // Step 3: Send the message with voice message flag and metadata
  const messagePayload: {
    flags: number;
    attachments: Array<{
      id: string;
      filename: string;
      uploaded_filename: string;
      duration_secs: number;
      waveform: string;
    }>;
    message_reference?: { message_id: string; fail_if_not_exists: boolean };
  } = {
    flags: silent
      ? DISCORD_VOICE_MESSAGE_FLAG | suppressNotificationsFlag
      : DISCORD_VOICE_MESSAGE_FLAG,
    attachments: [
      {
        id: "0",
        filename,
        uploaded_filename: upload_filename,
        duration_secs: metadata.durationSecs,
        waveform: metadata.waveform,
      },
    ],
  };

  // Note: Voice messages cannot have content, but can have message_reference for replies
  if (replyTo) {
    messagePayload.message_reference = {
      message_id: replyTo,
      fail_if_not_exists: false,
    };
  }

  const res = (await request(
    () =>
      rest.post(`/channels/${channelId}/messages`, {
        body: messagePayload,
      }) as Promise<{ id: string; channel_id: string }>,
    "voice-message",
  )) as { id: string; channel_id: string };

  return res;
}
|
||||
Reference in New Issue
Block a user