// scapegoat/Docs2.git

import test, { describe } from "node:test"
import assert from "node:assert"
 
// Inlined copy of the encoder function from search.inline.ts, duplicated here for testing
// Splits `str` into search tokens. The input is lowercased first, then:
//   - every CJK code point (see the range table below) becomes its own
//     single-character token;
//   - space, tab, LF and CR separate tokens and are never emitted;
//   - any other run of characters is emitted as one token.
// Tokens are returned in input order; empty or whitespace-only input yields [].
const encoder = (str: string): string[] => {
  const text = str.toLowerCase()
  const result: string[] = []

  // Inclusive code point ranges that are tokenized one character at a time.
  const cjkRanges: ReadonlyArray<readonly [number, number]> = [
    [0x3040, 0x309f], // Hiragana
    [0x30a0, 0x30ff], // Katakana
    [0x4e00, 0x9fff], // CJK Unified Ideographs
    [0xac00, 0xd7af], // Hangul Syllables
    [0x20000, 0x2a6df], // CJK Unified Ideographs Extension B
  ]
  const isCjk = (cp: number): boolean => cjkRanges.some(([lo, hi]) => cp >= lo && cp <= hi)

  // Only space, tab, line feed and carriage return count as separators.
  const isSeparator = (cp: number): boolean =>
    cp === 0x20 || cp === 0x09 || cp === 0x0a || cp === 0x0d

  // [wordStart, wordEnd) is the pending non-CJK word, as UTF-16 offsets into
  // `text`; wordStart === -1 means no word is currently open.
  let wordStart = -1
  let wordEnd = -1
  const flushWord = (): void => {
    if (wordStart === -1) return
    result.push(text.slice(wordStart, wordEnd))
    wordStart = -1
  }

  // for...of walks code points, so advance the offset by each character's
  // UTF-16 length (2 for astral characters) to keep slice() indices aligned.
  let offset = 0
  for (const ch of text) {
    const cp = ch.codePointAt(0)!
    const next = offset + ch.length

    if (isCjk(cp)) {
      flushWord()
      result.push(ch)
    } else if (isSeparator(cp)) {
      flushWord()
    } else {
      if (wordStart === -1) wordStart = offset
      wordEnd = next
    }

    offset = next
  }

  // Emit a word that runs to the end of the input.
  flushWord()

  return result
}
 
// Table-driven suite for the search encoder: every case feeds `input` to
// encoder() and deep-compares the returned tokens against `expected`.
describe("search encoder", () => {
  type EncoderCase = [title: string, input: string, expected: string[]]
  type EncoderGroup = [groupTitle: string, cases: EncoderCase[]]

  // Groups and cases are registered in the order listed here.
  const groups: EncoderGroup[] = [
    [
      "English text",
      [
        ["should tokenize simple English words", "hello world", ["hello", "world"]],
        ["should handle multiple spaces", "hello   world", ["hello", "world"]],
        ["should handle tabs and newlines", "hello\tworld\ntest", ["hello", "world", "test"]],
        ["should lowercase all text", "Hello WORLD Test", ["hello", "world", "test"]],
      ],
    ],
    [
      "CJK text",
      [
        [
          "should tokenize Japanese Hiragana character by character",
          "こんにちは",
          ["こ", "ん", "に", "ち", "は"],
        ],
        [
          "should tokenize Japanese Katakana character by character",
          "コントロール",
          ["コ", "ン", "ト", "ロ", "ー", "ル"],
        ],
        ["should tokenize Japanese Kanji character by character", "日本語", ["日", "本", "語"]],
        [
          "should tokenize Korean Hangul character by character",
          "안녕하세요",
          ["안", "녕", "하", "세", "요"],
        ],
        [
          "should tokenize Chinese characters character by character",
          "你好世界",
          ["你", "好", "世", "界"],
        ],
        ["should handle mixed Hiragana/Katakana/Kanji", "て以来", ["て", "以", "来"]],
      ],
    ],
    [
      "Mixed CJK and English",
      [
        ["should handle Japanese with English words", "hello 世界", ["hello", "世", "界"]],
        [
          "should handle English with Japanese words",
          "世界 hello world",
          ["世", "界", "hello", "world"],
        ],
        [
          "should handle complex mixed content",
          "これはtest文章です",
          ["こ", "れ", "は", "test", "文", "章", "で", "す"],
        ],
        [
          "should handle mixed Korean and English",
          "hello 안녕 world",
          ["hello", "안", "녕", "world"],
        ],
        ["should handle mixed Chinese and English", "你好 world", ["你", "好", "world"]],
      ],
    ],
    [
      "Edge cases",
      [
        ["should handle empty string", "", []],
        ["should handle only whitespace", "   \t\n  ", []],
        ["should handle single character", "a", ["a"]],
        ["should handle single CJK character", "あ", ["あ"]],
        ["should handle CJK with trailing whitespace", "日本語  ", ["日", "本", "語"]],
        ["should handle English with trailing whitespace", "hello  ", ["hello"]],
      ],
    ],
  ]

  for (const [groupTitle, cases] of groups) {
    describe(groupTitle, () => {
      for (const [title, input, expected] of cases) {
        test(title, () => {
          const result = encoder(input)
          assert.deepStrictEqual(result, expected)
        })
      }
    })
  }
})