feat(memory): add Japanese query expansion support for FTS (#23156)
* Memory: add Japanese query expansion support
* Docs/Changelog: credit Japanese FTS update
This commit is contained in:
@@ -13,6 +13,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Channels/Config: unify channel preview streaming config handling with a shared resolver and canonical migration path.
|
||||
- Discord/Allowlist: canonicalize resolved Discord allowlist names to IDs and split resolution flow for clearer fail-closed behavior.
|
||||
- Memory/FTS: add Korean stop-word filtering and particle-aware keyword extraction (including mixed Korean/English stems) for query expansion in FTS-only search mode. (#18899) Thanks @ruypang.
|
||||
- Memory/FTS: add Japanese-aware query expansion tokenization and stop-word filtering (including mixed-script terms like ASCII + katakana) for FTS-only search mode. (#23156) Thanks @vincentkoc.
|
||||
- iOS/Talk: prefetch TTS segments and suppress expected speech-cancellation errors for smoother talk playback. (#22833) Thanks @ngutman.
|
||||
|
||||
### Breaking
|
||||
|
||||
@@ -95,6 +95,28 @@ describe("extractKeywords", () => {
|
||||
expect(keywords).toContain("논의");
|
||||
});
|
||||
|
||||
it("extracts keywords from Japanese conversational query", () => {
|
||||
const keywords = extractKeywords("昨日話したデプロイ戦略");
|
||||
expect(keywords).toContain("デプロイ");
|
||||
expect(keywords).toContain("戦略");
|
||||
expect(keywords).not.toContain("昨日");
|
||||
});
|
||||
|
||||
it("handles mixed Japanese and English query", () => {
|
||||
const keywords = extractKeywords("昨日話したAPIのバグ");
|
||||
expect(keywords).toContain("api");
|
||||
expect(keywords).toContain("バグ");
|
||||
expect(keywords).not.toContain("した");
|
||||
});
|
||||
|
||||
it("filters Japanese stop words", () => {
|
||||
const keywords = extractKeywords("これ それ そして どう");
|
||||
expect(keywords).not.toContain("これ");
|
||||
expect(keywords).not.toContain("それ");
|
||||
expect(keywords).not.toContain("そして");
|
||||
expect(keywords).not.toContain("どう");
|
||||
});
|
||||
|
||||
it("handles empty query", () => {
|
||||
expect(extractKeywords("")).toEqual([]);
|
||||
expect(extractKeywords(" ")).toEqual([]);
|
||||
|
||||
@@ -273,6 +273,59 @@ function isUsefulKoreanStem(stem: string): boolean {
|
||||
return /^[a-z0-9_]+$/i.test(stem);
|
||||
}
|
||||
|
||||
const STOP_WORDS_JA = new Set([
|
||||
// Pronouns and references
|
||||
"これ",
|
||||
"それ",
|
||||
"あれ",
|
||||
"この",
|
||||
"その",
|
||||
"あの",
|
||||
"ここ",
|
||||
"そこ",
|
||||
"あそこ",
|
||||
// Common auxiliaries / vague verbs
|
||||
"する",
|
||||
"した",
|
||||
"して",
|
||||
"です",
|
||||
"ます",
|
||||
"いる",
|
||||
"ある",
|
||||
"なる",
|
||||
"できる",
|
||||
// Particles / connectors
|
||||
"の",
|
||||
"こと",
|
||||
"もの",
|
||||
"ため",
|
||||
"そして",
|
||||
"しかし",
|
||||
"また",
|
||||
"でも",
|
||||
"から",
|
||||
"まで",
|
||||
"より",
|
||||
"だけ",
|
||||
// Question words
|
||||
"なぜ",
|
||||
"どう",
|
||||
"何",
|
||||
"いつ",
|
||||
"どこ",
|
||||
"誰",
|
||||
"どれ",
|
||||
// Time (vague)
|
||||
"昨日",
|
||||
"今日",
|
||||
"明日",
|
||||
"最近",
|
||||
"今",
|
||||
"さっき",
|
||||
"前",
|
||||
"後",
|
||||
]);
|
||||
|
||||
const STOP_WORDS_ZH = new Set([
|
||||
// Pronouns
|
||||
"我",
|
||||
@@ -395,7 +448,7 @@ function isValidKeyword(token: string): boolean {
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple tokenizer that handles English, Chinese, and Korean text.
|
||||
* Simple tokenizer that handles English, Chinese, Korean, and Japanese text.
|
||||
* For Chinese, we do character-based splitting since we don't have a proper segmenter.
|
||||
* For English, we split on whitespace and punctuation.
|
||||
*/
|
||||
@@ -407,8 +460,23 @@ function tokenize(text: string): string[] {
|
||||
const segments = normalized.split(/[\s\p{P}]+/u).filter(Boolean);
|
||||
|
||||
for (const segment of segments) {
|
||||
// Check if segment contains CJK characters (Chinese)
|
||||
if (/[\u4e00-\u9fff]/.test(segment)) {
|
||||
// Japanese text often mixes scripts (kanji/kana/ASCII) without spaces.
|
||||
// Extract script-specific chunks so technical terms like "API" / "バグ" are retained.
|
||||
if (/[\u3040-\u30ff]/.test(segment)) {
|
||||
const jpParts =
|
||||
segment.match(/[a-z0-9_]+|[\u30a0-\u30ffー]+|[\u4e00-\u9fff]+|[\u3040-\u309f]{2,}/g) ?? [];
|
||||
for (const part of jpParts) {
|
||||
if (/^[\u4e00-\u9fff]+$/.test(part)) {
|
||||
tokens.push(part);
|
||||
for (let i = 0; i < part.length - 1; i++) {
|
||||
tokens.push(part[i] + part[i + 1]);
|
||||
}
|
||||
} else {
|
||||
tokens.push(part);
|
||||
}
|
||||
}
|
||||
} else if (/[\u4e00-\u9fff]/.test(segment)) {
|
||||
// Check if segment contains CJK characters (Chinese)
|
||||
// For Chinese, extract character n-grams (unigrams and bigrams)
|
||||
const chars = Array.from(segment).filter((c) => /[\u4e00-\u9fff]/.test(c));
|
||||
// Add individual characters
|
||||
@@ -453,7 +521,12 @@ export function extractKeywords(query: string): string[] {
|
||||
|
||||
for (const token of tokens) {
|
||||
// Skip stop words
|
||||
if (STOP_WORDS_EN.has(token) || STOP_WORDS_ZH.has(token) || STOP_WORDS_KO.has(token)) {
|
||||
if (
|
||||
STOP_WORDS_EN.has(token) ||
|
||||
STOP_WORDS_ZH.has(token) ||
|
||||
STOP_WORDS_KO.has(token) ||
|
||||
STOP_WORDS_JA.has(token)
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
// Skip invalid keywords
|
||||
|
||||
Reference in New Issue
Block a user