feat(memory): add Japanese query expansion support for FTS (#23156)

* Memory: add Japanese query expansion support

* Docs/Changelog: credit Japanese FTS update
This commit is contained in:
Vincent Koc
2026-02-22 11:19:20 -05:00
committed by GitHub
parent f442a3539f
commit 21cbf59509
3 changed files with 100 additions and 4 deletions

View File

@@ -13,6 +13,7 @@ Docs: https://docs.openclaw.ai
- Channels/Config: unify channel preview streaming config handling with a shared resolver and canonical migration path.
- Discord/Allowlist: canonicalize resolved Discord allowlist names to IDs and split resolution flow for clearer fail-closed behavior.
- Memory/FTS: add Korean stop-word filtering and particle-aware keyword extraction (including mixed Korean/English stems) for query expansion in FTS-only search mode. (#18899) Thanks @ruypang.
- Memory/FTS: add Japanese-aware query expansion tokenization and stop-word filtering (including mixed-script terms like ASCII + katakana) for FTS-only search mode. (#23156) Thanks @vincentkoc.
- iOS/Talk: prefetch TTS segments and suppress expected speech-cancellation errors for smoother talk playback. (#22833) Thanks @ngutman.
### Breaking

View File

@@ -95,6 +95,28 @@ describe("extractKeywords", () => {
expect(keywords).toContain("논의");
});
it("extracts keywords from Japanese conversational query", () => {
const keywords = extractKeywords("昨日話したデプロイ戦略");
expect(keywords).toContain("デプロイ");
expect(keywords).toContain("戦略");
expect(keywords).not.toContain("昨日");
});
it("handles mixed Japanese and English query", () => {
const keywords = extractKeywords("昨日話したAPIのバグ");
expect(keywords).toContain("api");
expect(keywords).toContain("バグ");
expect(keywords).not.toContain("した");
});
it("filters Japanese stop words", () => {
const keywords = extractKeywords("これ それ そして どう");
expect(keywords).not.toContain("これ");
expect(keywords).not.toContain("それ");
expect(keywords).not.toContain("そして");
expect(keywords).not.toContain("どう");
});
it("handles empty query", () => {
expect(extractKeywords("")).toEqual([]);
expect(extractKeywords(" ")).toEqual([]);

View File

@@ -273,6 +273,59 @@ function isUsefulKoreanStem(stem: string): boolean {
return /^[a-z0-9_]+$/i.test(stem);
}
/**
 * Japanese stop words dropped during keyword extraction for FTS-only
 * query expansion. Tokens found in this set are skipped alongside the
 * English/Chinese/Korean stop-word sets. Entries are grouped by role
 * and stored space-separated, then flattened into a single Set.
 */
const STOP_WORDS_JA = new Set(
  [
    // Demonstratives / pronoun-like references (ko-so-a series)
    "これ それ あれ この その あの ここ そこ あそこ",
    // Light and auxiliary verbs, polite copula forms
    "する した して です ます いる ある なる できる",
    // Particles, formal nouns, and connectives
    "の こと もの ため そして しかし また でも から まで より だけ",
    // Interrogatives
    "なぜ どう 何 いつ どこ 誰 どれ",
    // Vague temporal words that rarely help recall
    "昨日 今日 明日 最近 今 さっき 前 後",
  ].flatMap((group) => group.split(" ")),
);
const STOP_WORDS_ZH = new Set([
// Pronouns
"我",
@@ -395,7 +448,7 @@ function isValidKeyword(token: string): boolean {
}
/**
* Simple tokenizer that handles English, Chinese, and Korean text.
* Simple tokenizer that handles English, Chinese, Korean, and Japanese text.
* For Chinese, we do character-based splitting since we don't have a proper segmenter.
* For English, we split on whitespace and punctuation.
*/
@@ -407,8 +460,23 @@ function tokenize(text: string): string[] {
const segments = normalized.split(/[\s\p{P}]+/u).filter(Boolean);
for (const segment of segments) {
// Check if segment contains CJK characters (Chinese)
if (/[\u4e00-\u9fff]/.test(segment)) {
// Japanese text often mixes scripts (kanji/kana/ASCII) without spaces.
// Extract script-specific chunks so technical terms like "API" / "バグ" are retained.
if (/[\u3040-\u30ff]/.test(segment)) {
const jpParts =
segment.match(/[a-z0-9_]+|[\u30a0-\u30ffー]+|[\u4e00-\u9fff]+|[\u3040-\u309f]{2,}/g) ?? [];
for (const part of jpParts) {
if (/^[\u4e00-\u9fff]+$/.test(part)) {
tokens.push(part);
for (let i = 0; i < part.length - 1; i++) {
tokens.push(part[i] + part[i + 1]);
}
} else {
tokens.push(part);
}
}
} else if (/[\u4e00-\u9fff]/.test(segment)) {
// Check if segment contains CJK characters (Chinese)
// For Chinese, extract character n-grams (unigrams and bigrams)
const chars = Array.from(segment).filter((c) => /[\u4e00-\u9fff]/.test(c));
// Add individual characters
@@ -453,7 +521,12 @@ export function extractKeywords(query: string): string[] {
for (const token of tokens) {
// Skip stop words
if (STOP_WORDS_EN.has(token) || STOP_WORDS_ZH.has(token) || STOP_WORDS_KO.has(token)) {
if (
STOP_WORDS_EN.has(token) ||
STOP_WORDS_ZH.has(token) ||
STOP_WORDS_KO.has(token) ||
STOP_WORDS_JA.has(token)
) {
continue;
}
// Skip invalid keywords