diff --git a/CHANGELOG.md b/CHANGELOG.md index 54ae7a775..f44d8a266 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ Docs: https://docs.openclaw.ai - Channels/Config: unify channel preview streaming config handling with a shared resolver and canonical migration path. - Discord/Allowlist: canonicalize resolved Discord allowlist names to IDs and split resolution flow for clearer fail-closed behavior. - Memory/FTS: add Korean stop-word filtering and particle-aware keyword extraction (including mixed Korean/English stems) for query expansion in FTS-only search mode. (#18899) Thanks @ruypang. +- Memory/FTS: add Japanese-aware query expansion tokenization and stop-word filtering (including mixed-script terms like ASCII + katakana) for FTS-only search mode. Thanks @vincentkoc. - iOS/Talk: prefetch TTS segments and suppress expected speech-cancellation errors for smoother talk playback. (#22833) Thanks @ngutman. ### Breaking diff --git a/src/memory/query-expansion.test.ts b/src/memory/query-expansion.test.ts index 955e74858..708d24695 100644 --- a/src/memory/query-expansion.test.ts +++ b/src/memory/query-expansion.test.ts @@ -95,6 +95,28 @@ describe("extractKeywords", () => { expect(keywords).toContain("논의"); }); + it("extracts keywords from Japanese conversational query", () => { + const keywords = extractKeywords("昨日話したデプロイ戦略"); + expect(keywords).toContain("デプロイ"); + expect(keywords).toContain("戦略"); + expect(keywords).not.toContain("昨日"); + }); + + it("handles mixed Japanese and English query", () => { + const keywords = extractKeywords("昨日話したAPIのバグ"); + expect(keywords).toContain("api"); + expect(keywords).toContain("バグ"); + expect(keywords).not.toContain("した"); + }); + + it("filters Japanese stop words", () => { + const keywords = extractKeywords("これ それ そして どう"); + expect(keywords).not.toContain("これ"); + expect(keywords).not.toContain("それ"); + expect(keywords).not.toContain("そして"); + expect(keywords).not.toContain("どう"); + }); + it("handles empty 
query", () => { expect(extractKeywords("")).toEqual([]); expect(extractKeywords(" ")).toEqual([]); diff --git a/src/memory/query-expansion.ts b/src/memory/query-expansion.ts index efb940e04..7fea63b57 100644 --- a/src/memory/query-expansion.ts +++ b/src/memory/query-expansion.ts @@ -273,6 +273,59 @@ function isUsefulKoreanStem(stem: string): boolean { return /^[a-z0-9_]+$/i.test(stem); } +const STOP_WORDS_JA = new Set([ + // Pronouns and references + "これ", + "それ", + "あれ", + "この", + "その", + "あの", + "ここ", + "そこ", + "あそこ", + // Common auxiliaries / vague verbs + "する", + "した", + "して", + "です", + "ます", + "いる", + "ある", + "なる", + "できる", + // Particles / connectors + "の", + "こと", + "もの", + "ため", + "そして", + "しかし", + "また", + "でも", + "から", + "まで", + "より", + "だけ", + // Question words + "なぜ", + "どう", + "何", + "いつ", + "どこ", + "誰", + "どれ", + // Time (vague) + "昨日", + "今日", + "明日", + "最近", + "今", + "さっき", + "前", + "後", +]); + const STOP_WORDS_ZH = new Set([ // Pronouns "我", @@ -395,7 +448,7 @@ function isValidKeyword(token: string): boolean { } /** - * Simple tokenizer that handles English, Chinese, and Korean text. + * Simple tokenizer that handles English, Chinese, Korean, and Japanese text. * For Chinese, we do character-based splitting since we don't have a proper segmenter. * For English, we split on whitespace and punctuation. */ @@ -407,8 +460,23 @@ function tokenize(text: string): string[] { const segments = normalized.split(/[\s\p{P}]+/u).filter(Boolean); for (const segment of segments) { - // Check if segment contains CJK characters (Chinese) - if (/[\u4e00-\u9fff]/.test(segment)) { + // Japanese text often mixes scripts (kanji/kana/ASCII) without spaces. + // Extract script-specific chunks so technical terms like "API" / "バグ" are retained. + if (/[\u3040-\u30ff]/.test(segment)) { + const jpParts = + segment.match(/[a-z0-9_]+|[\u30a0-\u30ffー]+|[\u4e00-\u9fff]+|[\u3040-\u309f]{2,}/g) ?? 
[]; + for (const part of jpParts) { + if (/^[\u4e00-\u9fff]+$/.test(part)) { + tokens.push(part); + for (let i = 0; i < part.length - 1; i++) { + tokens.push(part[i] + part[i + 1]); + } + } else { + tokens.push(part); + } + } + } else if (/[\u4e00-\u9fff]/.test(segment)) { + // No kana in this segment, so its CJK ideographs are handled as Chinese (kanji-only Japanese lands here too) // For Chinese, extract character n-grams (unigrams and bigrams) const chars = Array.from(segment).filter((c) => /[\u4e00-\u9fff]/.test(c)); // Add individual characters @@ -453,7 +521,12 @@ export function extractKeywords(query: string): string[] { for (const token of tokens) { // Skip stop words - if (STOP_WORDS_EN.has(token) || STOP_WORDS_ZH.has(token) || STOP_WORDS_KO.has(token)) { + if ( + STOP_WORDS_EN.has(token) || + STOP_WORDS_ZH.has(token) || + STOP_WORDS_KO.has(token) || + STOP_WORDS_JA.has(token) + ) { continue; } // Skip invalid keywords