feat(memory): add Japanese query expansion support for FTS (#23156)
* Memory: add Japanese query expansion support
* Docs/Changelog: credit Japanese FTS update
This commit is contained in:
@@ -13,6 +13,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Channels/Config: unify channel preview streaming config handling with a shared resolver and canonical migration path.
|
||||
- Discord/Allowlist: canonicalize resolved Discord allowlist names to IDs and split resolution flow for clearer fail-closed behavior.
|
||||
- Memory/FTS: add Korean stop-word filtering and particle-aware keyword extraction (including mixed Korean/English stems) for query expansion in FTS-only search mode. (#18899) Thanks @ruypang.
|
||||
- Memory/FTS: add Japanese-aware query expansion tokenization and stop-word filtering (including mixed-script terms like ASCII + katakana) for FTS-only search mode. (#23156) Thanks @vincentkoc.
|
||||
- iOS/Talk: prefetch TTS segments and suppress expected speech-cancellation errors for smoother talk playback. (#22833) Thanks @ngutman.
|
||||
|
||||
### Breaking
|
||||
|
||||
@@ -95,6 +95,28 @@ describe("extractKeywords", () => {
|
||||
expect(keywords).toContain("논의");
|
||||
});
|
||||
|
||||
it("extracts keywords from Japanese conversational query", () => {
|
||||
const keywords = extractKeywords("昨日話したデプロイ戦略");
|
||||
expect(keywords).toContain("デプロイ");
|
||||
expect(keywords).toContain("戦略");
|
||||
expect(keywords).not.toContain("昨日");
|
||||
});
|
||||
|
||||
it("handles mixed Japanese and English query", () => {
|
||||
const keywords = extractKeywords("昨日話したAPIのバグ");
|
||||
expect(keywords).toContain("api");
|
||||
expect(keywords).toContain("バグ");
|
||||
expect(keywords).not.toContain("した");
|
||||
});
|
||||
|
||||
it("filters Japanese stop words", () => {
|
||||
const keywords = extractKeywords("これ それ そして どう");
|
||||
expect(keywords).not.toContain("これ");
|
||||
expect(keywords).not.toContain("それ");
|
||||
expect(keywords).not.toContain("そして");
|
||||
expect(keywords).not.toContain("どう");
|
||||
});
|
||||
|
||||
it("handles empty query", () => {
|
||||
expect(extractKeywords("")).toEqual([]);
|
||||
expect(extractKeywords(" ")).toEqual([]);
|
||||
|
||||
@@ -273,6 +273,59 @@ function isUsefulKoreanStem(stem: string): boolean {
|
||||
return /^[a-z0-9_]+$/i.test(stem);
|
||||
}
|
||||
|
||||
const STOP_WORDS_JA = new Set([
|
||||
// Pronouns and references
|
||||
"これ",
|
||||
"それ",
|
||||
"あれ",
|
||||
"この",
|
||||
"その",
|
||||
"あの",
|
||||
"ここ",
|
||||
"そこ",
|
||||
"あそこ",
|
||||
// Common auxiliaries / vague verbs
|
||||
"する",
|
||||
"した",
|
||||
"して",
|
||||
"です",
|
||||
"ます",
|
||||
"いる",
|
||||
"ある",
|
||||
"なる",
|
||||
"できる",
|
||||
// Particles / connectors
|
||||
"の",
|
||||
"こと",
|
||||
"もの",
|
||||
"ため",
|
||||
"そして",
|
||||
"しかし",
|
||||
"また",
|
||||
"でも",
|
||||
"から",
|
||||
"まで",
|
||||
"より",
|
||||
"だけ",
|
||||
// Question words
|
||||
"なぜ",
|
||||
"どう",
|
||||
"何",
|
||||
"いつ",
|
||||
"どこ",
|
||||
"誰",
|
||||
"どれ",
|
||||
// Time (vague)
|
||||
"昨日",
|
||||
"今日",
|
||||
"明日",
|
||||
"最近",
|
||||
"今",
|
||||
"さっき",
|
||||
"前",
|
||||
"後",
|
||||
]);
|
||||
|
||||
const STOP_WORDS_ZH = new Set([
|
||||
// Pronouns
|
||||
"我",
|
||||
@@ -395,7 +448,7 @@ function isValidKeyword(token: string): boolean {
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple tokenizer that handles English, Chinese, and Korean text.
|
||||
* Simple tokenizer that handles English, Chinese, Korean, and Japanese text.
|
||||
* For Chinese, we do character-based splitting since we don't have a proper segmenter.
|
||||
* For English, we split on whitespace and punctuation.
|
||||
*/
|
||||
@@ -407,8 +460,23 @@ function tokenize(text: string): string[] {
|
||||
const segments = normalized.split(/[\s\p{P}]+/u).filter(Boolean);
|
||||
|
||||
for (const segment of segments) {
|
||||
// Check if segment contains CJK characters (Chinese)
|
||||
if (/[\u4e00-\u9fff]/.test(segment)) {
|
||||
// Japanese text often mixes scripts (kanji/kana/ASCII) without spaces.
|
||||
// Extract script-specific chunks so technical terms like "API" / "バグ" are retained.
|
||||
if (/[\u3040-\u30ff]/.test(segment)) {
|
||||
const jpParts =
|
||||
segment.match(/[a-z0-9_]+|[\u30a0-\u30ffー]+|[\u4e00-\u9fff]+|[\u3040-\u309f]{2,}/g) ?? [];
|
||||
for (const part of jpParts) {
|
||||
if (/^[\u4e00-\u9fff]+$/.test(part)) {
|
||||
tokens.push(part);
|
||||
for (let i = 0; i < part.length - 1; i++) {
|
||||
tokens.push(part[i] + part[i + 1]);
|
||||
}
|
||||
} else {
|
||||
tokens.push(part);
|
||||
}
|
||||
}
|
||||
} else if (/[\u4e00-\u9fff]/.test(segment)) {
|
||||
// Check if segment contains CJK characters (Chinese)
|
||||
// For Chinese, extract character n-grams (unigrams and bigrams)
|
||||
const chars = Array.from(segment).filter((c) => /[\u4e00-\u9fff]/.test(c));
|
||||
// Add individual characters
|
||||
@@ -453,7 +521,12 @@ export function extractKeywords(query: string): string[] {
|
||||
|
||||
for (const token of tokens) {
|
||||
// Skip stop words
|
||||
if (STOP_WORDS_EN.has(token) || STOP_WORDS_ZH.has(token) || STOP_WORDS_KO.has(token)) {
|
||||
if (
|
||||
STOP_WORDS_EN.has(token) ||
|
||||
STOP_WORDS_ZH.has(token) ||
|
||||
STOP_WORDS_KO.has(token) ||
|
||||
STOP_WORDS_JA.has(token)
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
// Skip invalid keywords
|
||||
|
||||
Reference in New Issue
Block a user