From ec26ebcc9e53f67f6242266556ed13445e2f9688 Mon Sep 17 00:00:00 2001
From: うろちょろ <112748593+chiyonn@users.noreply.github.com>
Date: Tue, 02 Dec 2025 18:04:38 +0000
Subject: [PATCH] feat: improve search tokenization for CJK languages (#2231)
---
quartz/components/scripts/search.inline.ts | 48 ++++++++++-
quartz/components/scripts/search.test.ts | 163 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 206 insertions(+), 5 deletions(-)
diff --git a/quartz/components/scripts/search.inline.ts b/quartz/components/scripts/search.inline.ts
index 6a84a50..717f17f 100644
--- a/quartz/components/scripts/search.inline.ts
+++ b/quartz/components/scripts/search.inline.ts
@@ -16,11 +16,49 @@
type SearchType = "basic" | "tags"
let searchType: SearchType = "basic"
let currentSearchTerm: string = ""
-const encoder = (str: string) => {
- return str
- .toLowerCase()
- .split(/\s+/)
- .filter((token) => token.length > 0)
+const encoder = (str: string): string[] => { // CJK chars → one token each; other text splits on whitespace
+  const tokens: string[] = []
+  let bufferStart = -1
+  let bufferEnd = -1
+  const lower = str.toLowerCase()
+
+  let i = 0
+  for (const char of lower) { // for..of iterates code points, not UTF-16 units
+    const code = char.codePointAt(0)!
+
+    const isCJK =
+      (code >= 0x3040 && code <= 0x309f) || // Hiragana
+      (code >= 0x30a0 && code <= 0x30ff) || // Katakana
+      (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
+      (code >= 0xac00 && code <= 0xd7af) || // Hangul Syllables
+      (code >= 0x20000 && code <= 0x2a6df) // CJK Extension B
+
+    const isWhitespace = /\s/.test(char) // any Unicode whitespace (incl. U+3000), matching the old split(/\s+/)
+
+    if (isCJK) {
+      if (bufferStart !== -1) {
+        tokens.push(lower.slice(bufferStart, bufferEnd))
+        bufferStart = -1
+      }
+      tokens.push(char)
+    } else if (isWhitespace) {
+      if (bufferStart !== -1) {
+        tokens.push(lower.slice(bufferStart, bufferEnd))
+        bufferStart = -1
+      }
+    } else {
+      if (bufferStart === -1) bufferStart = i
+      bufferEnd = i + char.length // exclusive end; char.length covers surrogate pairs
+    }
+
+    i += char.length
+  }
+
+  if (bufferStart !== -1) {
+    tokens.push(lower.slice(bufferStart)) // flush trailing non-CJK word
+  }
+
+  return tokens
}
let index = new FlexSearch.Document<Item>({
diff --git a/quartz/components/scripts/search.test.ts b/quartz/components/scripts/search.test.ts
new file mode 100644
index 0000000..221da83
--- /dev/null
+++ b/quartz/components/scripts/search.test.ts
@@ -0,0 +1,163 @@
+import test, { describe } from "node:test"
+import assert from "node:assert"
+
+// Inline the encoder function from search.inline.ts for testing
+const encoder = (str: string): string[] => { // CJK chars → one token each; other text splits on whitespace
+  const tokens: string[] = []
+  let bufferStart = -1
+  let bufferEnd = -1
+  const lower = str.toLowerCase()
+
+  let i = 0
+  for (const char of lower) { // for..of iterates code points, not UTF-16 units
+    const code = char.codePointAt(0)!
+
+    const isCJK =
+      (code >= 0x3040 && code <= 0x309f) || // Hiragana
+      (code >= 0x30a0 && code <= 0x30ff) || // Katakana
+      (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
+      (code >= 0xac00 && code <= 0xd7af) || // Hangul Syllables
+      (code >= 0x20000 && code <= 0x2a6df) // CJK Extension B
+
+    const isWhitespace = /\s/.test(char) // any Unicode whitespace (incl. U+3000), matching the old split(/\s+/)
+
+    if (isCJK) {
+      if (bufferStart !== -1) {
+        tokens.push(lower.slice(bufferStart, bufferEnd))
+        bufferStart = -1
+      }
+      tokens.push(char)
+    } else if (isWhitespace) {
+      if (bufferStart !== -1) {
+        tokens.push(lower.slice(bufferStart, bufferEnd))
+        bufferStart = -1
+      }
+    } else {
+      if (bufferStart === -1) bufferStart = i
+      bufferEnd = i + char.length // exclusive end; char.length covers surrogate pairs
+    }
+
+    i += char.length
+  }
+
+  if (bufferStart !== -1) {
+    tokens.push(lower.slice(bufferStart)) // flush trailing non-CJK word
+  }
+
+  return tokens
+}
+
+describe("search encoder", () => { // covers English, CJK, mixed-script, and edge-case tokenization
+  describe("English text", () => {
+    test("should tokenize simple English words", () => {
+      const result = encoder("hello world")
+      assert.deepStrictEqual(result, ["hello", "world"])
+    })
+
+    test("should handle multiple spaces", () => {
+      const result = encoder("hello world") // NOTE(review): extraction may have collapsed runs of spaces — confirm this literal really contains consecutive spaces
+      assert.deepStrictEqual(result, ["hello", "world"])
+    })
+
+    test("should handle tabs and newlines", () => {
+      const result = encoder("hello\tworld\ntest")
+      assert.deepStrictEqual(result, ["hello", "world", "test"])
+    })
+
+    test("should lowercase all text", () => {
+      const result = encoder("Hello WORLD Test")
+      assert.deepStrictEqual(result, ["hello", "world", "test"])
+    })
+  })
+
+  describe("CJK text", () => {
+    test("should tokenize Japanese Hiragana character by character", () => {
+      const result = encoder("こんにちは")
+      assert.deepStrictEqual(result, ["こ", "ん", "に", "ち", "は"])
+    })
+
+    test("should tokenize Japanese Katakana character by character", () => {
+      const result = encoder("コントロール")
+      assert.deepStrictEqual(result, ["コ", "ン", "ト", "ロ", "ー", "ル"]) // "ー" (U+30FC) lies inside the Katakana block, so it is a single CJK token
+    })
+
+    test("should tokenize Japanese Kanji character by character", () => {
+      const result = encoder("日本語")
+      assert.deepStrictEqual(result, ["日", "本", "語"])
+    })
+
+    test("should tokenize Korean Hangul character by character", () => {
+      const result = encoder("안녕하세요")
+      assert.deepStrictEqual(result, ["안", "녕", "하", "세", "요"])
+    })
+
+    test("should tokenize Chinese characters character by character", () => {
+      const result = encoder("你好世界")
+      assert.deepStrictEqual(result, ["你", "好", "世", "界"])
+    })
+
+    test("should handle mixed Hiragana/Katakana/Kanji", () => {
+      const result = encoder("て以来")
+      assert.deepStrictEqual(result, ["て", "以", "来"])
+    })
+  })
+
+  describe("Mixed CJK and English", () => {
+    test("should handle Japanese with English words", () => {
+      const result = encoder("hello 世界")
+      assert.deepStrictEqual(result, ["hello", "世", "界"])
+    })
+
+    test("should handle English with Japanese words", () => {
+      const result = encoder("世界 hello world")
+      assert.deepStrictEqual(result, ["世", "界", "hello", "world"])
+    })
+
+    test("should handle complex mixed content", () => { // a CJK char must also flush any pending Latin buffer ("test" here)
+      const result = encoder("これはtest文章です")
+      assert.deepStrictEqual(result, ["こ", "れ", "は", "test", "文", "章", "で", "す"])
+    })
+
+    test("should handle mixed Korean and English", () => {
+      const result = encoder("hello 안녕 world")
+      assert.deepStrictEqual(result, ["hello", "안", "녕", "world"])
+    })
+
+    test("should handle mixed Chinese and English", () => {
+      const result = encoder("你好 world")
+      assert.deepStrictEqual(result, ["你", "好", "world"])
+    })
+  })
+
+  describe("Edge cases", () => {
+    test("should handle empty string", () => {
+      const result = encoder("")
+      assert.deepStrictEqual(result, [])
+    })
+
+    test("should handle only whitespace", () => {
+      const result = encoder(" \t\n ") // whitespace-only input must yield no tokens
+      assert.deepStrictEqual(result, [])
+    })
+
+    test("should handle single character", () => {
+      const result = encoder("a")
+      assert.deepStrictEqual(result, ["a"])
+    })
+
+    test("should handle single CJK character", () => {
+      const result = encoder("あ")
+      assert.deepStrictEqual(result, ["あ"])
+    })
+
+    test("should handle CJK with trailing whitespace", () => {
+      const result = encoder("日本語 ")
+      assert.deepStrictEqual(result, ["日", "本", "語"])
+    })
+
+    test("should handle English with trailing whitespace", () => {
+      const result = encoder("hello ") // trailing-buffer flush path must not emit an empty token
+      assert.deepStrictEqual(result, ["hello"])
+    })
+  })
+})
--
Gitblit v1.10.0