diff --git a/scripts/test-parallel.mjs b/scripts/test-parallel.mjs
index bed23a431..5abfdf0fa 100644
--- a/scripts/test-parallel.mjs
+++ b/scripts/test-parallel.mjs
@@ -43,7 +43,6 @@ const unitIsolatedFilesRaw = [
   "src/agents/subagent-announce.format.test.ts",
   "src/infra/archive.test.ts",
   "src/cli/daemon-cli.coverage.test.ts",
-  "test/media-understanding.auto.test.ts",
   // Model normalization test imports config/model discovery stack; keep off unit-fast critical path.
   "src/agents/models-config.normalizes-gemini-3-ids-preview-google-providers.test.ts",
   // Auth profile rotation suite is retry-heavy and high-variance under vmForks contention.
diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts
index c798cfb28..3f6278065 100644
--- a/src/media-understanding/apply.test.ts
+++ b/src/media-understanding/apply.test.ts
@@ -6,6 +6,8 @@ import type { MsgContext } from "../auto-reply/templating.js";
 import type { OpenClawConfig } from "../config/config.js";
 import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
 import { fetchRemoteMedia } from "../media/fetch.js";
+import { withEnvAsync } from "../test-utils/env.js";
+import { clearMediaUnderstandingBinaryCacheForTests } from "./runner.js";
 
 vi.mock("../agents/model-auth.js", () => ({
   resolveApiKeyForProvider: vi.fn(async () => ({
@@ -115,12 +117,59 @@ async function createTempMediaFile(params: { fileName: string; content: Buffer |
   return mediaPath;
 }
 
+/**
+ * Writes a stub file named `name` into `dir` with mode `0o755` and returns its path.
+ *
+ * NOTE(review): the content is a placeholder; the auto-detect tests queue results on a
+ * mocked `runExec`, so the file presumably only has to be discoverable — confirm.
+ *
+ * @param dir - Directory to create the file in.
+ * @param name - File name of the stub.
+ * @returns The joined path of the written file.
+ */
+async function createMockExecutable(dir: string, name: string): Promise<string> {
+  const executablePath = path.join(dir, name);
+  await fs.writeFile(executablePath, "echo mocked\n", { mode: 0o755 });
+  return executablePath;
+}
+
+/**
+ * Invokes `run` through `withEnvAsync` with a baseline set of media auto-detect variables.
+ *
+ * The local model locations, provider API keys and agent directories listed below are
+ * passed as `undefined` (presumably unsetting them for the duration of `run` — confirm
+ * against `withEnvAsync`). `env` is spread last, so its entries win over the baseline.
+ *
+ * @param env - Per-test overrides merged over the baseline (for example `PATH`).
+ * @param run - Async callback handed to `withEnvAsync`.
+ * @returns The value resolved by `withEnvAsync`, typed as the result of `run`.
+ */
+async function withMediaAutoDetectEnv<T>(
+  env: Record<string, string | undefined>,
+  run: () => Promise<T>,
+): Promise<T> {
+  return await withEnvAsync(
+    {
+      SHERPA_ONNX_MODEL_DIR: undefined,
+      WHISPER_CPP_MODEL: undefined,
+      OPENAI_API_KEY: undefined,
+      GROQ_API_KEY: undefined,
+      DEEPGRAM_API_KEY: undefined,
+      GEMINI_API_KEY: undefined,
+      OPENCLAW_AGENT_DIR: undefined,
+      PI_CODING_AGENT_DIR: undefined,
+      ...env,
+    },
+    run,
+  );
+}
+
 async function createAudioCtx(params?: {
   body?: string;
   fileName?: string;
   mediaType?: string;
   content?: Buffer | string;
-}) {
+}): Promise<MsgContext> {
   const mediaPath = await createTempMediaFile({
     fileName: params?.fileName ?? "note.ogg",
     content: params?.content ?? Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8]),
@@ -179,6 +207,7 @@ describe("applyMediaUnderstanding", () => {
       contentType: "audio/ogg",
       fileName: "note.ogg",
     });
+    clearMediaUnderstandingBinaryCacheForTests();
   });
 
   afterAll(async () => {
@@ -357,6 +386,128 @@ describe("applyMediaUnderstanding", () => {
     expect(ctx.Body).toBe("[Audio]\nTranscript:\ncli transcript");
   });
 
+  // `audio: {}` configures no explicit model; PATH holds only the stub binary and
+  // SHERPA_ONNX_MODEL_DIR points at the fixture model directory.
+  it("auto-detects sherpa for audio when binary and model files are available", async () => {
+    const binDir = await createTempMediaDir();
+    const modelDir = await createTempMediaDir();
+    await createMockExecutable(binDir, "sherpa-onnx-offline");
+    // The model fixture files are independent of each other, so write them concurrently.
+    await Promise.all(
+      ["tokens.txt", "encoder.onnx", "decoder.onnx", "joiner.onnx"].map((fileName) =>
+        fs.writeFile(path.join(modelDir, fileName), "a"),
+      ),
+    );
+
+    const ctx = await createAudioCtx({
+      fileName: "sample.wav",
+      mediaType: "audio/wav",
+      content: "audio",
+    });
+    const cfg: OpenClawConfig = { tools: { media: { audio: {} } } };
+
+    const execModule = await import("../process/exec.js");
+    const mockedRunExec = vi.mocked(execModule.runExec);
+    // Queue a canned JSON result for the next runExec call; its "text" value is asserted below.
+    mockedRunExec.mockResolvedValueOnce({
+      stdout: '{"text":"sherpa ok"}',
+      stderr: "",
+    });
+
+    await withMediaAutoDetectEnv(
+      {
+        PATH: binDir,
+        SHERPA_ONNX_MODEL_DIR: modelDir,
+      },
+      async () => {
+        const result = await applyMediaUnderstanding({ ctx, cfg });
+        expect(result.appliedAudio).toBe(true);
+      },
+    );
+
+    expect(ctx.Transcript).toBe("sherpa ok");
+    expect(mockedRunExec).toHaveBeenCalledWith(
+      "sherpa-onnx-offline",
+      expect.any(Array),
+      expect.any(Object),
+    );
+  });
+
+  // Only the whisper-cli stub is on PATH and WHISPER_CPP_MODEL points at a fixture model file.
+  it("auto-detects whisper-cli when sherpa is unavailable", async () => {
+    const binDir = await createTempMediaDir();
+    const modelDir = await createTempMediaDir();
+    await createMockExecutable(binDir, "whisper-cli");
+    const modelPath = path.join(modelDir, "tiny.bin");
+    await fs.writeFile(modelPath, "model");
+
+    const ctx = await createAudioCtx({
+      fileName: "sample.wav",
+      mediaType: "audio/wav",
+      content: "audio",
+    });
+    const cfg: OpenClawConfig = { tools: { media: { audio: {} } } };
+
+    const execModule = await import("../process/exec.js");
+    const mockedRunExec = vi.mocked(execModule.runExec);
+    // Queue plain-text stdout; the transcript asserted below is that text without the newline.
+    mockedRunExec.mockResolvedValueOnce({
+      stdout: "whisper cpp ok\n",
+      stderr: "",
+    });
+
+    await withMediaAutoDetectEnv(
+      {
+        PATH: binDir,
+        WHISPER_CPP_MODEL: modelPath,
+      },
+      async () => {
+        const result = await applyMediaUnderstanding({ ctx, cfg });
+        expect(result.appliedAudio).toBe(true);
+      },
+    );
+
+    expect(ctx.Transcript).toBe("whisper cpp ok");
+    expect(mockedRunExec).toHaveBeenCalledWith(
+      "whisper-cli",
+      expect.any(Array),
+      expect.any(Object),
+    );
+  });
+
+  // PATH is an empty directory and both agent dir variables point at a fresh empty directory.
+  it("skips audio auto-detect when no supported binaries or provider keys are available", async () => {
+    const emptyBinDir = await createTempMediaDir();
+    const isolatedAgentDir = await createTempMediaDir();
+    const ctx = await createAudioCtx({
+      fileName: "sample.wav",
+      mediaType: "audio/wav",
+      content: "audio",
+    });
+    const cfg: OpenClawConfig = { tools: { media: { audio: {} } } };
+
+    const execModule = await import("../process/exec.js");
+    const mockedRunExec = vi.mocked(execModule.runExec);
+    // Reset the mock so the not.toHaveBeenCalled assertion below only sees this test's calls.
+    mockedRunExec.mockReset();
+
+    await withMediaAutoDetectEnv(
+      {
+        PATH: emptyBinDir,
+        OPENCLAW_AGENT_DIR: isolatedAgentDir,
+        PI_CODING_AGENT_DIR: isolatedAgentDir,
+      },
+      async () => {
+        const result = await applyMediaUnderstanding({ ctx, cfg });
+        expect(result.appliedAudio).toBe(false);
+      },
+    );
+
+    expect(ctx.Transcript).toBeUndefined();
+    expect(ctx.Body).toBe("<media:audio>");
+    expect(mockedRunExec).not.toHaveBeenCalled();
+  });
+
   it("uses CLI image understanding and preserves caption for commands", async () => {
     const imagePath = await createTempMediaFile({
       fileName: "photo.jpg",
diff --git a/test/media-understanding.auto.test.ts b/test/media-understanding.auto.test.ts
deleted file mode 100644
index 99358115d..000000000
--- a/test/media-understanding.auto.test.ts
+++ /dev/null
@@ -1,223 +0,0 @@
-import fs from "node:fs/promises";
-import path from "node:path";
-import { afterEach, beforeEach, describe, expect, it } from "vitest";
-import type { MsgContext } from "../src/auto-reply/templating.js";
-import type { OpenClawConfig } from "../src/config/config.js";
-import { resolvePreferredOpenClawTmpDir } from "../src/infra/tmp-openclaw-dir.js";
-import { applyMediaUnderstanding } from "../src/media-understanding/apply.js";
-import { clearMediaUnderstandingBinaryCacheForTests } from "../src/media-understanding/runner.js";
-
-const makeTempDir = async (prefix: string) => {
-  const baseDir = resolvePreferredOpenClawTmpDir();
-  await fs.mkdir(baseDir, { recursive: true });
-  return await fs.mkdtemp(path.join(baseDir, prefix));
-};
-
-const writeExecutable = async (dir: string, name: string, content: string) => {
-  const filePath = path.join(dir, name);
-  await fs.writeFile(filePath, content, { mode: 0o755 });
-  return filePath;
-};
-
-const makeTempMedia = async (ext: string) => {
-  const dir = await makeTempDir("openclaw-media-e2e-");
-  const filePath = path.join(dir, `sample${ext}`);
-  await fs.writeFile(filePath, "audio");
-  return { dir, filePath };
-};
-
-const envSnapshot = () => ({
-  PATH: process.env.PATH,
-  SHERPA_ONNX_MODEL_DIR: process.env.SHERPA_ONNX_MODEL_DIR,
-  WHISPER_CPP_MODEL: process.env.WHISPER_CPP_MODEL,
-  OPENAI_API_KEY: process.env.OPENAI_API_KEY,
-  GROQ_API_KEY: process.env.GROQ_API_KEY,
-  DEEPGRAM_API_KEY: process.env.DEEPGRAM_API_KEY,
-  GEMINI_API_KEY: process.env.GEMINI_API_KEY,
-  OPENCLAW_AGENT_DIR: process.env.OPENCLAW_AGENT_DIR,
-  PI_CODING_AGENT_DIR: process.env.PI_CODING_AGENT_DIR,
-});
-
-const restoreEnv = (snapshot: ReturnType<typeof envSnapshot>) => {
-  const restoreEnvVar = (key: string, value: string | undefined) => {
-    if (value === undefined) {
-      delete process.env[key];
-    } else {
-      process.env[key] = value;
-    }
-  };
-  restoreEnvVar("PATH", snapshot.PATH);
-  restoreEnvVar("SHERPA_ONNX_MODEL_DIR", snapshot.SHERPA_ONNX_MODEL_DIR);
-  restoreEnvVar("WHISPER_CPP_MODEL", snapshot.WHISPER_CPP_MODEL);
-  restoreEnvVar("OPENAI_API_KEY", snapshot.OPENAI_API_KEY);
-  restoreEnvVar("GROQ_API_KEY", snapshot.GROQ_API_KEY);
-  restoreEnvVar("DEEPGRAM_API_KEY", snapshot.DEEPGRAM_API_KEY);
-  restoreEnvVar("GEMINI_API_KEY", snapshot.GEMINI_API_KEY);
-  restoreEnvVar("OPENCLAW_AGENT_DIR", snapshot.OPENCLAW_AGENT_DIR);
-  restoreEnvVar("PI_CODING_AGENT_DIR", snapshot.PI_CODING_AGENT_DIR);
-};
-
-const withEnvSnapshot = async <T>(run: () => Promise<T>): Promise<T> => {
-  const snapshot = envSnapshot();
-  try {
-    return await run();
-  } finally {
-    restoreEnv(snapshot);
-  }
-};
-
-const createTrackedTempDir = async (tempPaths: string[], prefix: string) => {
-  const dir = await makeTempDir(prefix);
-  tempPaths.push(dir);
-  return dir;
-};
-
-const createTrackedTempMedia = async (tempPaths: string[], ext: string) => {
-  const media = await makeTempMedia(ext);
-  tempPaths.push(media.dir);
-  return media.filePath;
-};
-
-describe("media understanding auto-detect (e2e)", () => {
-  let tempPaths: string[] = [];
-
-  beforeEach(() => {
-    clearMediaUnderstandingBinaryCacheForTests();
-  });
-
-  afterEach(async () => {
-    for (const p of tempPaths) {
-      await fs.rm(p, { recursive: true, force: true }).catch(() => {});
-    }
-    tempPaths = [];
-  });
-
-  it.skipIf(process.platform === "win32")("uses sherpa-onnx-offline when available", async () => {
-    await withEnvSnapshot(async () => {
-      const binDir = await createTrackedTempDir(tempPaths, "openclaw-bin-sherpa-");
-      const modelDir = await createTrackedTempDir(tempPaths, "openclaw-sherpa-model-");
-
-      await fs.writeFile(path.join(modelDir, "tokens.txt"), "a");
-      await fs.writeFile(path.join(modelDir, "encoder.onnx"), "a");
-      await fs.writeFile(path.join(modelDir, "decoder.onnx"), "a");
-      await fs.writeFile(path.join(modelDir, "joiner.onnx"), "a");
-
-      await writeExecutable(
-        binDir,
-        "sherpa-onnx-offline",
-        `#!/usr/bin/env bash\necho "{\\"text\\":\\"sherpa ok\\"}"\n`,
-      );
-
-      process.env.PATH = `${binDir}:/usr/bin:/bin`;
-      process.env.SHERPA_ONNX_MODEL_DIR = modelDir;
-
-      const filePath = await createTrackedTempMedia(tempPaths, ".wav");
-
-      const ctx: MsgContext = {
-        Body: "<media:audio>",
-        MediaPath: filePath,
-        MediaType: "audio/wav",
-      };
-      const cfg: OpenClawConfig = { tools: { media: { audio: {} } } };
-
-      await applyMediaUnderstanding({ ctx, cfg });
-
-      expect(ctx.Transcript).toBe("sherpa ok");
-    });
-  });
-
-  it.skipIf(process.platform === "win32")("uses whisper-cli when sherpa is missing", async () => {
-    await withEnvSnapshot(async () => {
-      const binDir = await createTrackedTempDir(tempPaths, "openclaw-bin-whispercpp-");
-      const modelDir = await createTrackedTempDir(tempPaths, "openclaw-whispercpp-model-");
-
-      const modelPath = path.join(modelDir, "tiny.bin");
-      await fs.writeFile(modelPath, "model");
-
-      await writeExecutable(
-        binDir,
-        "whisper-cli",
-        "#!/usr/bin/env bash\n" +
-          'out=""\n' +
-          'prev=""\n' +
-          'for arg in "$@"; do\n' +
-          '  if [ "$prev" = "-of" ]; then out="$arg"; break; fi\n' +
-          '  prev="$arg"\n' +
-          "done\n" +
-          'if [ -n "$out" ]; then echo \'whisper cpp ok\' > "${out}.txt"; fi\n',
-      );
-
-      process.env.PATH = `${binDir}:/usr/bin:/bin`;
-      process.env.WHISPER_CPP_MODEL = modelPath;
-
-      const filePath = await createTrackedTempMedia(tempPaths, ".wav");
-
-      const ctx: MsgContext = {
-        Body: "<media:audio>",
-        MediaPath: filePath,
-        MediaType: "audio/wav",
-      };
-      const cfg: OpenClawConfig = { tools: { media: { audio: {} } } };
-
-      await applyMediaUnderstanding({ ctx, cfg });
-
-      expect(ctx.Transcript).toBe("whisper cpp ok");
-    });
-  });
-
-  it.skipIf(process.platform === "win32")("uses gemini CLI for images when available", async () => {
-    await withEnvSnapshot(async () => {
-      const binDir = await createTrackedTempDir(tempPaths, "openclaw-bin-gemini-");
-
-      await writeExecutable(
-        binDir,
-        "gemini",
-        `#!/usr/bin/env bash\necho '{"response":"gemini ok"}'\n`,
-      );
-
-      process.env.PATH = `${binDir}:/usr/bin:/bin`;
-
-      const filePath = await createTrackedTempMedia(tempPaths, ".png");
-
-      const ctx: MsgContext = {
-        Body: "<media:image>",
-        MediaPath: filePath,
-        MediaType: "image/png",
-      };
-      const cfg: OpenClawConfig = { tools: { media: { image: {} } } };
-
-      await applyMediaUnderstanding({ ctx, cfg });
-
-      expect(ctx.Body).toContain("gemini ok");
-    });
-  });
-
-  it("skips auto-detect when no supported binaries are available", async () => {
-    await withEnvSnapshot(async () => {
-      const emptyBinDir = await createTrackedTempDir(tempPaths, "openclaw-bin-empty-");
-      const isolatedAgentDir = await createTrackedTempDir(tempPaths, "openclaw-agent-empty-");
-      process.env.PATH = emptyBinDir;
-      delete process.env.SHERPA_ONNX_MODEL_DIR;
-      delete process.env.WHISPER_CPP_MODEL;
-      delete process.env.OPENAI_API_KEY;
-      delete process.env.GROQ_API_KEY;
-      delete process.env.DEEPGRAM_API_KEY;
-      delete process.env.GEMINI_API_KEY;
-      process.env.OPENCLAW_AGENT_DIR = isolatedAgentDir;
-      process.env.PI_CODING_AGENT_DIR = isolatedAgentDir;
-
-      const filePath = await createTrackedTempMedia(tempPaths, ".wav");
-      const ctx: MsgContext = {
-        Body: "<media:audio>",
-        MediaPath: filePath,
-        MediaType: "audio/wav",
-      };
-      const cfg: OpenClawConfig = { tools: { media: { audio: {} } } };
-
-      await applyMediaUnderstanding({ ctx, cfg });
-
-      expect(ctx.Transcript).toBeUndefined();
-      expect(ctx.Body).toBe("<media:audio>");
-    });
-  });
-});