import test, { describe } from "node:test"
import assert from "node:assert"

// Inline the encoder function from search.inline.ts for testing
const encoder = (str: string): string[] => {
|
const tokens: string[] = []
|
let bufferStart = -1
|
let bufferEnd = -1
|
const lower = str.toLowerCase()
|
|
let i = 0
|
for (const char of lower) {
|
const code = char.codePointAt(0)!
|
|
const isCJK =
|
(code >= 0x3040 && code <= 0x309f) ||
|
(code >= 0x30a0 && code <= 0x30ff) ||
|
(code >= 0x4e00 && code <= 0x9fff) ||
|
(code >= 0xac00 && code <= 0xd7af) ||
|
(code >= 0x20000 && code <= 0x2a6df)
|
|
const isWhitespace = code === 32 || code === 9 || code === 10 || code === 13
|
|
if (isCJK) {
|
if (bufferStart !== -1) {
|
tokens.push(lower.slice(bufferStart, bufferEnd))
|
bufferStart = -1
|
}
|
tokens.push(char)
|
} else if (isWhitespace) {
|
if (bufferStart !== -1) {
|
tokens.push(lower.slice(bufferStart, bufferEnd))
|
bufferStart = -1
|
}
|
} else {
|
if (bufferStart === -1) bufferStart = i
|
bufferEnd = i + char.length
|
}
|
|
i += char.length
|
}
|
|
if (bufferStart !== -1) {
|
tokens.push(lower.slice(bufferStart))
|
}
|
|
return tokens
|
}
|
|
describe("search encoder", () => {
|
describe("English text", () => {
|
test("should tokenize simple English words", () => {
|
const result = encoder("hello world")
|
assert.deepStrictEqual(result, ["hello", "world"])
|
})
|
|
test("should handle multiple spaces", () => {
|
const result = encoder("hello world")
|
assert.deepStrictEqual(result, ["hello", "world"])
|
})
|
|
test("should handle tabs and newlines", () => {
|
const result = encoder("hello\tworld\ntest")
|
assert.deepStrictEqual(result, ["hello", "world", "test"])
|
})
|
|
test("should lowercase all text", () => {
|
const result = encoder("Hello WORLD Test")
|
assert.deepStrictEqual(result, ["hello", "world", "test"])
|
})
|
})
|
|
describe("CJK text", () => {
|
test("should tokenize Japanese Hiragana character by character", () => {
|
const result = encoder("γγγ«γ‘γ―")
|
assert.deepStrictEqual(result, ["γ", "γ", "γ«", "γ‘", "γ―"])
|
})
|
|
test("should tokenize Japanese Katakana character by character", () => {
|
const result = encoder("γ³γ³γγγΌγ«")
|
assert.deepStrictEqual(result, ["γ³", "γ³", "γ", "γ", "γΌ", "γ«"])
|
})
|
|
test("should tokenize Japanese Kanji character by character", () => {
|
const result = encoder("ζ₯ζ¬θͺ")
|
assert.deepStrictEqual(result, ["ζ₯", "ζ¬", "θͺ"])
|
})
|
|
test("should tokenize Korean Hangul character by character", () => {
|
const result = encoder("μλ
νμΈμ")
|
assert.deepStrictEqual(result, ["μ", "λ
", "ν", "μΈ", "μ"])
|
})
|
|
test("should tokenize Chinese characters character by character", () => {
|
const result = encoder("δ½ ε₯½δΈη")
|
assert.deepStrictEqual(result, ["δ½ ", "ε₯½", "δΈ", "η"])
|
})
|
|
test("should handle mixed Hiragana/Katakana/Kanji", () => {
|
const result = encoder("γ¦δ»₯ζ₯")
|
assert.deepStrictEqual(result, ["γ¦", "δ»₯", "ζ₯"])
|
})
|
})
|
|
describe("Mixed CJK and English", () => {
|
test("should handle Japanese with English words", () => {
|
const result = encoder("hello δΈη")
|
assert.deepStrictEqual(result, ["hello", "δΈ", "η"])
|
})
|
|
test("should handle English with Japanese words", () => {
|
const result = encoder("δΈη hello world")
|
assert.deepStrictEqual(result, ["δΈ", "η", "hello", "world"])
|
})
|
|
test("should handle complex mixed content", () => {
|
const result = encoder("γγγ―testζη« γ§γ")
|
assert.deepStrictEqual(result, ["γ", "γ", "γ―", "test", "ζ", "η« ", "γ§", "γ"])
|
})
|
|
test("should handle mixed Korean and English", () => {
|
const result = encoder("hello μλ
world")
|
assert.deepStrictEqual(result, ["hello", "μ", "λ
", "world"])
|
})
|
|
test("should handle mixed Chinese and English", () => {
|
const result = encoder("δ½ ε₯½ world")
|
assert.deepStrictEqual(result, ["δ½ ", "ε₯½", "world"])
|
})
|
})
|
|
describe("Edge cases", () => {
|
test("should handle empty string", () => {
|
const result = encoder("")
|
assert.deepStrictEqual(result, [])
|
})
|
|
test("should handle only whitespace", () => {
|
const result = encoder(" \t\n ")
|
assert.deepStrictEqual(result, [])
|
})
|
|
test("should handle single character", () => {
|
const result = encoder("a")
|
assert.deepStrictEqual(result, ["a"])
|
})
|
|
test("should handle single CJK character", () => {
|
const result = encoder("γ")
|
assert.deepStrictEqual(result, ["γ"])
|
})
|
|
test("should handle CJK with trailing whitespace", () => {
|
const result = encoder("ζ₯ζ¬θͺ ")
|
assert.deepStrictEqual(result, ["ζ₯", "ζ¬", "θͺ"])
|
})
|
|
test("should handle English with trailing whitespace", () => {
|
const result = encoder("hello ")
|
assert.deepStrictEqual(result, ["hello"])
|
})
|
})
|
})
|