Memory/QMD: normalize Han-script BM25 search queries
This commit is contained in:
@@ -729,6 +729,121 @@ describe("QmdMemoryManager", () => {
|
||||
await manager.close();
|
||||
});
|
||||
|
||||
// In BM25 "search" mode a mixed Han/Latin query must be rewritten into
// space-separated keywords before it is handed to the qmd CLI. The expected
// argv below pins the exact rewritten query string.
it("normalizes mixed Han-script BM25 queries before qmd search", async () => {
  const workspaceCollection = { path: workspaceDir, pattern: "**/*.md", name: "workspace" };
  cfg = {
    ...cfg,
    memory: {
      backend: "qmd",
      qmd: {
        includeDefaultMemory: false,
        searchMode: "search",
        update: { interval: "0s", debounceMs: 60_000, onBoot: false },
        paths: [workspaceCollection],
      },
    },
  } as OpenClawConfig;

  // Only the "search" subcommand gets a hand-driven child that prints an empty
  // JSON result set; every other spawn uses the default mock child.
  spawnMock.mockImplementation((_cmd: string, args: string[]) => {
    if (args[0] !== "search") {
      return createMockChild();
    }
    const searchChild = createMockChild({ autoClose: false });
    emitAndClose(searchChild, "stdout", "[]");
    return searchChild;
  });

  const { manager: qmdManager, resolved: resolvedConfig } = await createManager();
  const resultLimit = resolvedConfig.qmd?.limits.maxResults;
  if (!resultLimit) {
    throw new Error("qmd maxResults missing");
  }

  const pendingResults = qmdManager.search("記憶系統升級 QMD", {
    sessionKey: "agent:main:slack:dm:u123",
  });
  await expect(pendingResults).resolves.toEqual([]);

  const isSearchSpawn = (call: unknown[]) => (call[1] as string[])?.[0] === "search";
  const recordedSearch = spawnMock.mock.calls.find(isSearchSpawn);
  const expectedArgv = [
    "search",
    "記憶 憶系 系統 統升 升級 qmd",
    "--json",
    "-n",
    String(resultLimit),
    "-c",
    "workspace-main",
  ];
  expect(recordedSearch?.[1]).toEqual(expectedArgv);

  await qmdManager.close();
});
|
||||
|
||||
// A single Han character produces no usable BM25 keywords, so the query that
// reaches the qmd CLI must be the caller's original text.
it("falls back to the original query when Han normalization yields no BM25 tokens", async () => {
  const workspaceCollection = { path: workspaceDir, pattern: "**/*.md", name: "workspace" };
  cfg = {
    ...cfg,
    memory: {
      backend: "qmd",
      qmd: {
        includeDefaultMemory: false,
        searchMode: "search",
        update: { interval: "0s", debounceMs: 60_000, onBoot: false },
        paths: [workspaceCollection],
      },
    },
  } as OpenClawConfig;

  // Only the "search" subcommand gets a hand-driven child that prints an empty
  // JSON result set; every other spawn uses the default mock child.
  spawnMock.mockImplementation((_cmd: string, args: string[]) => {
    if (args[0] !== "search") {
      return createMockChild();
    }
    const searchChild = createMockChild({ autoClose: false });
    emitAndClose(searchChild, "stdout", "[]");
    return searchChild;
  });

  const { manager: qmdManager } = await createManager();
  const pendingResults = qmdManager.search("記", { sessionKey: "agent:main:slack:dm:u123" });
  await expect(pendingResults).resolves.toEqual([]);

  const isSearchSpawn = (call: unknown[]) => (call[1] as string[])?.[0] === "search";
  const recordedSearch = spawnMock.mock.calls.find(isSearchSpawn);
  // argv[1] is the query string passed to the CLI.
  const forwardedQuery = recordedSearch?.[1]?.[1];
  expect(forwardedQuery).toBe("記");

  await qmdManager.close();
});
|
||||
|
||||
// Han normalization applies to the BM25 "search" command only: with
// searchMode "query" the CLI must receive the text exactly as the caller typed it.
it("keeps original Han queries in qmd query mode", async () => {
  const workspaceCollection = { path: workspaceDir, pattern: "**/*.md", name: "workspace" };
  cfg = {
    ...cfg,
    memory: {
      backend: "qmd",
      qmd: {
        includeDefaultMemory: false,
        searchMode: "query",
        update: { interval: "0s", debounceMs: 60_000, onBoot: false },
        paths: [workspaceCollection],
      },
    },
  } as OpenClawConfig;

  // Only the "query" subcommand gets a hand-driven child that prints an empty
  // JSON result set; every other spawn uses the default mock child.
  spawnMock.mockImplementation((_cmd: string, args: string[]) => {
    if (args[0] !== "query") {
      return createMockChild();
    }
    const queryChild = createMockChild({ autoClose: false });
    emitAndClose(queryChild, "stdout", "[]");
    return queryChild;
  });

  const { manager: qmdManager } = await createManager();
  const pendingResults = qmdManager.search("記憶系統升級 QMD", {
    sessionKey: "agent:main:slack:dm:u123",
  });
  await expect(pendingResults).resolves.toEqual([]);

  const isQuerySpawn = (call: unknown[]) => (call[1] as string[])?.[0] === "query";
  const recordedQuery = spawnMock.mock.calls.find(isQuerySpawn);
  // argv[1] is the query string passed to the CLI.
  const forwardedQuery = recordedQuery?.[1]?.[1];
  expect(forwardedQuery).toBe("記憶系統升級 QMD");

  await qmdManager.close();
});
|
||||
|
||||
it("retries search with qmd query when configured mode rejects flags", async () => {
|
||||
cfg = {
|
||||
...cfg,
|
||||
|
||||
@@ -31,6 +31,7 @@ import type {
|
||||
ResolvedQmdMcporterConfig,
|
||||
} from "./backend-config.js";
|
||||
import { parseQmdQueryJson, type QmdQueryResult } from "./qmd-query-parser.js";
|
||||
import { extractKeywords } from "./query-expansion.js";
|
||||
|
||||
const log = createSubsystemLogger("memory");
|
||||
|
||||
@@ -40,9 +41,45 @@ const MAX_QMD_OUTPUT_CHARS = 200_000;
|
||||
/**
 * Case-insensitive pattern for textual NUL-byte markers: caret notation (`^@`),
 * the literal escape spellings `\0`, `\x00`, `\u0000`, and the phrases
 * "null byte" / "nul byte".
 */
const NUL_MARKER_RE = /(?:\^@|\\0|\\x00|\\u0000|null\s*byte|nul\s*byte)/i;

/** Base embed backoff in milliseconds (60 seconds). */
const QMD_EMBED_BACKOFF_BASE_MS = 60_000;

/** Upper bound for the embed backoff in milliseconds (one hour). */
const QMD_EMBED_BACKOFF_MAX_MS = 60 * 60 * 1000;

/**
 * Matches any single character whose Unicode script is Han.
 *
 * Uses the `Script=Han` property instead of a hand-written BMP range so that
 * CJK Compatibility Ideographs (U+F900–U+FAFF) and the supplementary-plane
 * extensions (U+20000 and above, e.g. 𠮷) are detected, and so that non-Han
 * code points that happen to sit inside U+3400–U+9FFF (the Yijing hexagram
 * symbols at U+4DC0–U+4DFF) are not misclassified as Han.
 *
 * The regex is intentionally non-global so `.test()` carries no `lastIndex` state.
 */
const HAN_SCRIPT_RE = /\p{Script=Han}/u;

/** Maximum number of keywords kept when a Han-script query is normalized for BM25. */
const QMD_BM25_HAN_KEYWORD_LIMIT = 12;

/** Tail of the embed task chain; `runWithQmdEmbedLock` reads it as the previously queued task. */
let qmdEmbedQueueTail: Promise<void> = Promise.resolve();
|
||||
|
||||
/**
 * Reports whether `value` contains at least one character matched by
 * `HAN_SCRIPT_RE`.
 *
 * @param value - Text to inspect.
 * @returns `true` when a match is found anywhere in `value`, otherwise `false`.
 */
function hasHanScript(value: string): boolean {
  const containsHan = HAN_SCRIPT_RE.test(value);
  return containsHan;
}
|
||||
|
||||
/**
 * Rewrites a Han-script query into a space-separated keyword list for BM25 search.
 *
 * Queries that are empty after trimming, or that contain no Han characters,
 * are returned trimmed and otherwise untouched. For the rest, the keywords
 * produced by `extractKeywords` are trimmed, de-duplicated in first-seen
 * order, filtered for minimum length, and capped at
 * `QMD_BM25_HAN_KEYWORD_LIMIT` entries.
 *
 * @param query - Raw user query.
 * @returns The kept keywords joined by single spaces, or the trimmed query
 *   when no keyword survives filtering.
 */
function normalizeHanBm25Query(query: string): string {
  const trimmed = query.trim();
  if (trimmed.length === 0) {
    return trimmed;
  }
  if (!hasHanScript(trimmed)) {
    return trimmed;
  }

  // A Set keeps insertion order, so it serves as both the de-dup index and the
  // ordered output list.
  const kept = new Set<string>();
  for (const rawKeyword of extractKeywords(trimmed)) {
    const candidate = rawKeyword.trim();
    if (candidate.length === 0 || kept.has(candidate)) {
      continue;
    }

    // Han unigrams are usually too broad for BM25 and can drown signal, so Han
    // tokens need two or more code points; other tokens need two or more
    // UTF-16 code units.
    const longEnough = hasHanScript(candidate)
      ? Array.from(candidate).length >= 2
      : candidate.length >= 2;
    if (!longEnough) {
      continue;
    }

    kept.add(candidate);
    if (kept.size >= QMD_BM25_HAN_KEYWORD_LIMIT) {
      break;
    }
  }

  if (kept.size === 0) {
    return trimmed;
  }
  return Array.from(kept).join(" ");
}
|
||||
|
||||
async function runWithQmdEmbedLock<T>(task: () => Promise<T>): Promise<T> {
|
||||
const previous = qmdEmbedQueueTail;
|
||||
let release: (() => void) | undefined;
|
||||
@@ -1728,10 +1765,11 @@ export class QmdMemoryManager implements MemorySearchManager {
|
||||
query: string,
|
||||
limit: number,
|
||||
): string[] {
|
||||
const normalizedQuery = command === "search" ? normalizeHanBm25Query(query) : query;
|
||||
if (command === "query") {
|
||||
return ["query", query, "--json", "-n", String(limit)];
|
||||
return ["query", normalizedQuery, "--json", "-n", String(limit)];
|
||||
}
|
||||
return [command, query, "--json", "-n", String(limit)];
|
||||
return [command, normalizedQuery, "--json", "-n", String(limit)];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user