diff --git a/CHANGELOG.md b/CHANGELOG.md index b9bbdf3cb..6814cff66 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ Docs: https://docs.openclaw.ai - Gateway/Config reload: compare array-valued config paths structurally during diffing so unchanged `memory.qmd.paths` and `memory.qmd.scope.rules` no longer trigger false restart-required reloads. (#23185) Thanks @rex05ai. - Cron/Scheduling: validate runtime cron expressions before schedule/stagger evaluation so malformed persisted jobs report a clear `invalid cron schedule: expr is required` error instead of crashing with `undefined.trim` failures and auto-disable churn. (#23223) Thanks @asimons81. - Memory/QMD: migrate legacy unscoped collection bindings (for example `memory-root`) to per-agent scoped names (for example `memory-root-main`) during startup when safe, so QMD-backed `memory_search` no longer fails with `Collection not found` after upgrades. (#23228, #20727) Thanks @JLDynamics and @AaronFaby. +- Memory/QMD: normalize Han-script BM25 search queries before invoking `qmd search` so mixed CJK+Latin prompts no longer return empty results due to tokenizer mismatch. (#23426) Thanks @LunaLee0130. - TUI/Input: enable multiline-paste burst coalescing on macOS Terminal.app and iTerm so pasted blocks no longer submit line-by-line as separate messages. (#18809) Thanks @fwends. - TUI/RTL: isolate right-to-left script lines (Arabic/Hebrew ranges) with Unicode bidi isolation marks in TUI text sanitization so RTL assistant output no longer renders in reversed visual order in terminal chat panes. (#21936) Thanks @Asm3r96. - TUI/Status: request immediate renders after setting `sending`/`waiting` activity states so in-flight runs always show visible progress indicators instead of appearing idle until completion. (#21549) Thanks @13Guinness. 
diff --git a/src/memory/qmd-manager.test.ts b/src/memory/qmd-manager.test.ts
index d8212bdd7..7e97fcca7 100644
--- a/src/memory/qmd-manager.test.ts
+++ b/src/memory/qmd-manager.test.ts
@@ -729,6 +729,121 @@ describe("QmdMemoryManager", () => {
     await manager.close();
   });
 
+  it("normalizes mixed Han-script BM25 queries before qmd search", async () => {
+    cfg = {
+      ...cfg,
+      memory: {
+        backend: "qmd",
+        qmd: {
+          includeDefaultMemory: false,
+          searchMode: "search",
+          update: { interval: "0s", debounceMs: 60_000, onBoot: false },
+          paths: [{ path: workspaceDir, pattern: "**/*.md", name: "workspace" }],
+        },
+      },
+    } as OpenClawConfig;
+    spawnMock.mockImplementation((_cmd: string, args: string[]) => {
+      if (args[0] === "search") {
+        const child = createMockChild({ autoClose: false });
+        emitAndClose(child, "stdout", "[]");
+        return child;
+      }
+      return createMockChild();
+    });
+
+    const { manager, resolved } = await createManager();
+    const maxResults = resolved.qmd?.limits.maxResults;
+    if (!maxResults) {
+      throw new Error("qmd maxResults missing");
+    }
+
+    await expect(
+      manager.search("記憶系統升級 QMD", { sessionKey: "agent:main:slack:dm:u123" }),
+    ).resolves.toEqual([]);
+
+    const searchCall = spawnMock.mock.calls.find(
+      (call: unknown[]) => (call[1] as string[])?.[0] === "search",
+    );
+    expect(searchCall?.[1]).toEqual([
+      "search",
+      "記憶 憶系 系統 統升 升級 qmd",
+      "--json",
+      "-n",
+      String(maxResults),
+      "-c",
+      "workspace-main",
+    ]);
+    await manager.close();
+  });
+
+  it("falls back to the original query when Han normalization yields no BM25 tokens", async () => {
+    cfg = {
+      ...cfg,
+      memory: {
+        backend: "qmd",
+        qmd: {
+          includeDefaultMemory: false,
+          searchMode: "search",
+          update: { interval: "0s", debounceMs: 60_000, onBoot: false },
+          paths: [{ path: workspaceDir, pattern: "**/*.md", name: "workspace" }],
+        },
+      },
+    } as OpenClawConfig;
+    spawnMock.mockImplementation((_cmd: string, args: string[]) => {
+      if (args[0] === "search") {
+        const child = createMockChild({ autoClose: false });
+        emitAndClose(child, "stdout", "[]");
+        return child;
+      }
+      return createMockChild();
+    });
+
+    const { manager } = await createManager();
+    await expect(manager.search("記", { sessionKey: "agent:main:slack:dm:u123" })).resolves.toEqual(
+      [],
+    );
+
+    const searchCall = spawnMock.mock.calls.find(
+      (call: unknown[]) => (call[1] as string[])?.[0] === "search",
+    );
+    expect(searchCall?.[1]?.[1]).toBe("記");
+    await manager.close();
+  });
+
+  it("keeps original Han queries in qmd query mode", async () => {
+    cfg = {
+      ...cfg,
+      memory: {
+        backend: "qmd",
+        qmd: {
+          includeDefaultMemory: false,
+          searchMode: "query",
+          update: { interval: "0s", debounceMs: 60_000, onBoot: false },
+          paths: [{ path: workspaceDir, pattern: "**/*.md", name: "workspace" }],
+        },
+      },
+    } as OpenClawConfig;
+    spawnMock.mockImplementation((_cmd: string, args: string[]) => {
+      if (args[0] === "query") {
+        const child = createMockChild({ autoClose: false });
+        emitAndClose(child, "stdout", "[]");
+        return child;
+      }
+      return createMockChild();
+    });
+
+    const { manager } = await createManager();
+    await expect(
+      manager.search("記憶系統升級 QMD", { sessionKey: "agent:main:slack:dm:u123" }),
+    ).resolves.toEqual([]);
+
+    const queryCall = spawnMock.mock.calls.find(
+      (call: unknown[]) => (call[1] as string[])?.[0] === "query",
+    );
+    expect(queryCall?.[1]?.[1]).toBe("記憶系統升級 QMD");
+    await manager.close();
+  });
+
   it("retries search with qmd query when configured mode rejects flags", async () => {
     cfg = {
       ...cfg,
diff --git a/src/memory/qmd-manager.ts b/src/memory/qmd-manager.ts
index 03f49de61..bb9215224 100644
--- a/src/memory/qmd-manager.ts
+++ b/src/memory/qmd-manager.ts
@@ -31,6 +31,7 @@ import type {
   ResolvedQmdMcporterConfig,
 } from "./backend-config.js";
 import { parseQmdQueryJson, type QmdQueryResult } from "./qmd-query-parser.js";
+import { extractKeywords } from "./query-expansion.js";
 
 const log = createSubsystemLogger("memory");
 
@@ -40,9 +41,60 @@ const MAX_QMD_OUTPUT_CHARS = 200_000;
 const NUL_MARKER_RE = /(?:\^@|\\0|\\x00|\\u0000|null\s*byte|nul\s*byte)/i;
 const QMD_EMBED_BACKOFF_BASE_MS = 60_000;
 const QMD_EMBED_BACKOFF_MAX_MS = 60 * 60 * 1000;
+/** Matches any character in the Unicode Han script, including non-BMP ideographs. */
+const HAN_SCRIPT_RE = /\p{Script=Han}/u;
+/** Upper bound on keywords forwarded to `qmd search` for a Han-script query. */
+const QMD_BM25_HAN_KEYWORD_LIMIT = 12;
 
 let qmdEmbedQueueTail: Promise<void> = Promise.resolve();
 
+/** Returns true when `value` contains at least one Han-script character. */
+function hasHanScript(value: string): boolean {
+  return HAN_SCRIPT_RE.test(value);
+}
+
+/**
+ * Rewrites a Han-script query into a space-separated keyword list for BM25 `qmd search`.
+ *
+ * Queries without Han characters are returned trimmed and otherwise unchanged. For Han
+ * queries, the keywords returned by `extractKeywords` are de-duplicated in order,
+ * one-character tokens are dropped, and at most `QMD_BM25_HAN_KEYWORD_LIMIT` are kept.
+ *
+ * @param query - Raw search query.
+ * @returns The normalized keyword string, or the trimmed query when no keyword survives.
+ */
+function normalizeHanBm25Query(query: string): string {
+  const trimmed = query.trim();
+  if (!trimmed || !hasHanScript(trimmed)) {
+    return trimmed;
+  }
+  const keywords = extractKeywords(trimmed);
+  const normalizedKeywords: string[] = [];
+  const seen = new Set<string>();
+  for (const keyword of keywords) {
+    const token = keyword.trim();
+    if (!token || seen.has(token)) {
+      continue;
+    }
+    const includesHan = hasHanScript(token);
+    // Han unigrams are usually too broad for BM25 and can drown signal. Count code points
+    // (not UTF-16 units) so a supplementary-plane ideograph still counts as one character.
+    if (includesHan && Array.from(token).length < 2) {
+      continue;
+    }
+    if (!includesHan && token.length < 2) {
+      continue;
+    }
+    seen.add(token);
+    normalizedKeywords.push(token);
+    if (normalizedKeywords.length >= QMD_BM25_HAN_KEYWORD_LIMIT) {
+      break;
+    }
+  }
+  // Fall back to the trimmed query rather than sending an empty search string.
+  return normalizedKeywords.length > 0 ? normalizedKeywords.join(" ") : trimmed;
+}
+
 async function runWithQmdEmbedLock<T>(task: () => Promise<T>): Promise<T> {
   const previous = qmdEmbedQueueTail;
   let release: (() => void) | undefined;
@@ -1728,10 +1780,12 @@ export class QmdMemoryManager implements MemorySearchManager {
     query: string,
     limit: number,
   ): string[] {
+    // Only BM25 `search` queries are normalized; every other command gets the query verbatim.
+    const normalizedQuery = command === "search" ? normalizeHanBm25Query(query) : query;
     if (command === "query") {
-      return ["query", query, "--json", "-n", String(limit)];
+      return ["query", normalizedQuery, "--json", "-n", String(limit)];
     }
-    return [command, query, "--json", "-n", String(limit)];
+    return [command, normalizedQuery, "--json", "-n", String(limit)];
   }
 }
 