| New file |
| | |
| | | import test, { describe } from "node:test" |
| | | import assert from "node:assert" |
| | | |
// Inlined copy of the encoder function from search.inline.ts, kept here for testing.
// NOTE(review): because this is a copy, the Unicode-whitespace handling below
// presumably needs to be mirrored in search.inline.ts — confirm before merging.
/**
 * Splits a string into lowercase search tokens.
 *
 * - Every character inside one of the CJK ranges listed below becomes its own
 *   single-character token.
 * - Any other run of consecutive non-whitespace characters becomes one token.
 * - Whitespace only separates tokens and is never emitted. Whitespace is
 *   whatever the RegExp class `\s` matches, which includes the ideographic
 *   space U+3000 and the no-break space U+00A0 in addition to space, tab, CR
 *   and LF.
 *
 * @param str - Text to tokenize; it is lowercased before splitting.
 * @returns Tokens in order of appearance; an empty array when `str` is empty
 *   or contains only whitespace.
 */
const encoder = (str: string): string[] => {
  const tokens: string[] = []
  const lower = str.toLowerCase()
  // Hoisted so the loop does not rebuild the pattern per character. It has no
  // `g`/`y` flag, so `test` keeps no state between calls.
  const whitespace = /\s/u

  // UTF-16 offset in `lower` where the currently open non-CJK run started,
  // or -1 when no run is open.
  let runStart = -1
  // UTF-16 offset of the character being visited. `for...of` yields whole code
  // points, so this advances by `char.length` (1 or 2) to stay usable with `slice`.
  let offset = 0

  for (const char of lower) {
    // `char` is never empty when produced by string iteration, so a code point exists.
    const code = char.codePointAt(0)!

    const isCJK =
      (code >= 0x3040 && code <= 0x309f) || // Hiragana
      (code >= 0x30a0 && code <= 0x30ff) || // Katakana
      (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
      (code >= 0xac00 && code <= 0xd7af) || // Hangul Syllables
      (code >= 0x20000 && code <= 0x2a6df) // CJK Unified Ideographs Extension B

    if (isCJK || whitespace.test(char)) {
      // Both kinds of character terminate an open run; the run ends exactly at
      // the current offset because the previous character belonged to it.
      if (runStart !== -1) {
        tokens.push(lower.slice(runStart, offset))
        runStart = -1
      }
      if (isCJK) tokens.push(char)
    } else if (runStart === -1) {
      runStart = offset
    }

    offset += char.length
  }

  // A run still open here extends to the end of the string.
  if (runStart !== -1) {
    tokens.push(lower.slice(runStart))
  }

  return tokens
}
| | | |
// Test suite for `encoder`: plain English input, CJK-only input, mixed input,
// and boundary cases. Each test feeds one string to `encoder` and compares the
// complete token array with `assert.deepStrictEqual`.
describe("search encoder", () => {
  // Non-CJK text: tokens are whitespace-separated, lowercased words.
  describe("English text", () => {
    test("should tokenize simple English words", () => {
      const result = encoder("hello world")
      assert.deepStrictEqual(result, ["hello", "world"])
    })

    test("should handle multiple spaces", () => {
      // Several consecutive spaces must not yield empty tokens.
      const result = encoder("hello   world")
      assert.deepStrictEqual(result, ["hello", "world"])
    })

    test("should handle tabs and newlines", () => {
      const result = encoder("hello\tworld\ntest")
      assert.deepStrictEqual(result, ["hello", "world", "test"])
    })

    test("should lowercase all text", () => {
      const result = encoder("Hello WORLD Test")
      assert.deepStrictEqual(result, ["hello", "world", "test"])
    })
  })

  // CJK-only text: every character is expected as its own token.
  describe("CJK text", () => {
    test("should tokenize Japanese Hiragana character by character", () => {
      const result = encoder("こんにちは")
      assert.deepStrictEqual(result, ["こ", "ん", "に", "ち", "は"])
    })

    test("should tokenize Japanese Katakana character by character", () => {
      const result = encoder("コントロール")
      assert.deepStrictEqual(result, ["コ", "ン", "ト", "ロ", "ー", "ル"])
    })

    test("should tokenize Japanese Kanji character by character", () => {
      const result = encoder("日本語")
      assert.deepStrictEqual(result, ["日", "本", "語"])
    })

    test("should tokenize Korean Hangul character by character", () => {
      const result = encoder("안녕하세요")
      assert.deepStrictEqual(result, ["안", "녕", "하", "세", "요"])
    })

    test("should tokenize Chinese characters character by character", () => {
      const result = encoder("你好世界")
      assert.deepStrictEqual(result, ["你", "好", "世", "界"])
    })

    test("should handle mixed Hiragana/Katakana/Kanji", () => {
      const result = encoder("て以来")
      assert.deepStrictEqual(result, ["て", "以", "来"])
    })
  })

  // Mixed input: English words stay whole while adjacent CJK characters are
  // split, whether or not whitespace separates the two scripts.
  describe("Mixed CJK and English", () => {
    test("should handle Japanese with English words", () => {
      const result = encoder("hello 世界")
      assert.deepStrictEqual(result, ["hello", "世", "界"])
    })

    test("should handle English with Japanese words", () => {
      const result = encoder("世界 hello world")
      assert.deepStrictEqual(result, ["世", "界", "hello", "world"])
    })

    test("should handle complex mixed content", () => {
      // No whitespace between scripts: the CJK characters themselves delimit "test".
      const result = encoder("これはtest文章です")
      assert.deepStrictEqual(result, ["こ", "れ", "は", "test", "文", "章", "で", "す"])
    })

    test("should handle mixed Korean and English", () => {
      const result = encoder("hello 안녕 world")
      assert.deepStrictEqual(result, ["hello", "안", "녕", "world"])
    })

    test("should handle mixed Chinese and English", () => {
      const result = encoder("你好 world")
      assert.deepStrictEqual(result, ["你", "好", "world"])
    })
  })

  // Boundary inputs: empty, whitespace-only, single character, trailing whitespace.
  describe("Edge cases", () => {
    test("should handle empty string", () => {
      const result = encoder("")
      assert.deepStrictEqual(result, [])
    })

    test("should handle only whitespace", () => {
      const result = encoder(" \t\n ")
      assert.deepStrictEqual(result, [])
    })

    test("should handle single character", () => {
      const result = encoder("a")
      assert.deepStrictEqual(result, ["a"])
    })

    test("should handle single CJK character", () => {
      const result = encoder("あ")
      assert.deepStrictEqual(result, ["あ"])
    })

    test("should handle CJK with trailing whitespace", () => {
      const result = encoder("日本語 ")
      assert.deepStrictEqual(result, ["日", "本", "語"])
    })

    test("should handle English with trailing whitespace", () => {
      const result = encoder("hello ")
      assert.deepStrictEqual(result, ["hello"])
    })
  })
})