Spaces:
Running
Running
| import { describe, expect, it } from "vitest"; | |
| import { sanitizeUnicodeSurrogates } from "./rerankerService"; | |
| describe("sanitizeUnicodeSurrogates", () => { | |
| describe("valid input passthrough", () => { | |
| it("should return empty string unchanged", () => { | |
| expect(sanitizeUnicodeSurrogates("")).toBe(""); | |
| }); | |
| it("should return ASCII text unchanged", () => { | |
| const input = "Hello, World! 123"; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe(input); | |
| }); | |
| it("should return valid Unicode text unchanged", () => { | |
| const input = "HΓ©llo WΓΆrld ζ₯ζ¬θͺ π"; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe(input); | |
| }); | |
| it("should preserve valid surrogate pairs (emoji)", () => { | |
| const input = "Text with emoji πππ"; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe(input); | |
| }); | |
| it("should preserve valid surrogate pairs in complex text", () => { | |
| const input = "Start π middle π end"; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe(input); | |
| }); | |
| }); | |
| describe("unpaired high surrogate handling", () => { | |
| it("should replace lone high surrogate at end of string", () => { | |
| const highSurrogate = String.fromCharCode(0xd800); | |
| const input = `text${highSurrogate}`; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe("text\ufffd"); | |
| }); | |
| it("should replace high surrogate followed by non-surrogate", () => { | |
| const highSurrogate = String.fromCharCode(0xd800); | |
| const input = `${highSurrogate}A`; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe("\ufffdA"); | |
| }); | |
| it("should replace high surrogate followed by another high surrogate", () => { | |
| const high1 = String.fromCharCode(0xd800); | |
| const high2 = String.fromCharCode(0xd801); | |
| const input = `${high1}${high2}`; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe("\ufffd\ufffd"); | |
| }); | |
| it("should replace multiple consecutive unpaired high surrogates", () => { | |
| const high = String.fromCharCode(0xd800); | |
| const input = `${high}${high}${high}`; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe("\ufffd\ufffd\ufffd"); | |
| }); | |
| }); | |
| describe("unpaired low surrogate handling", () => { | |
| it("should replace lone low surrogate at start of string", () => { | |
| const lowSurrogate = String.fromCharCode(0xdc00); | |
| const input = `${lowSurrogate}text`; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe("\ufffdtext"); | |
| }); | |
| it("should replace lone low surrogate in middle of string", () => { | |
| const lowSurrogate = String.fromCharCode(0xdc00); | |
| const input = `before${lowSurrogate}after`; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe("before\ufffdafter"); | |
| }); | |
| it("should replace multiple consecutive unpaired low surrogates", () => { | |
| const low = String.fromCharCode(0xdc00); | |
| const input = `${low}${low}`; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe("\ufffd\ufffd"); | |
| }); | |
| }); | |
| describe("mixed surrogate scenarios", () => { | |
| it("should handle low surrogate followed by high surrogate (reversed pair)", () => { | |
| const low = String.fromCharCode(0xdc00); | |
| const high = String.fromCharCode(0xd800); | |
| const input = `${low}${high}`; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe("\ufffd\ufffd"); | |
| }); | |
| it("should handle valid pair followed by unpaired high", () => { | |
| const validEmoji = "π"; | |
| const unpairedHigh = String.fromCharCode(0xd83d); | |
| const input = `${validEmoji}${unpairedHigh}`; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe("π\ufffd"); | |
| }); | |
| it("should handle unpaired low followed by valid pair", () => { | |
| const unpairedLow = String.fromCharCode(0xdc00); | |
| const validEmoji = "π"; | |
| const input = `${unpairedLow}${validEmoji}`; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe("\ufffdπ"); | |
| }); | |
| it("should handle interleaved valid and invalid surrogates", () => { | |
| const high = String.fromCharCode(0xd800); | |
| const low = String.fromCharCode(0xdc00); | |
| const input = `A${high}B${low}C`; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe("A\ufffdB\ufffdC"); | |
| }); | |
| }); | |
| describe("edge cases from real-world scenarios", () => { | |
| it("should handle text that might come from corrupted web content", () => { | |
| const corruptedChar = String.fromCharCode(0xd834); | |
| const input = `Search result: ${corruptedChar} more text`; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe( | |
| "Search result: \ufffd more text", | |
| ); | |
| }); | |
| it("should preserve valid content around invalid surrogates", () => { | |
| const badHigh = String.fromCharCode(0xd83d); | |
| const input = `Valid text ζ₯ζ¬θͺ ${badHigh} more valid π end`; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe( | |
| "Valid text ζ₯ζ¬θͺ \ufffd more valid π end", | |
| ); | |
| }); | |
| it("should handle boundary surrogate values", () => { | |
| const minHigh = String.fromCharCode(0xd800); | |
| const maxHigh = String.fromCharCode(0xdbff); | |
| const minLow = String.fromCharCode(0xdc00); | |
| const maxLow = String.fromCharCode(0xdfff); | |
| expect(sanitizeUnicodeSurrogates(minHigh)).toBe("\ufffd"); | |
| expect(sanitizeUnicodeSurrogates(maxHigh)).toBe("\ufffd"); | |
| expect(sanitizeUnicodeSurrogates(minLow)).toBe("\ufffd"); | |
| expect(sanitizeUnicodeSurrogates(maxLow)).toBe("\ufffd"); | |
| expect(sanitizeUnicodeSurrogates(`${minHigh}${minLow}`)).toBe( | |
| `${minHigh}${minLow}`, | |
| ); | |
| expect(sanitizeUnicodeSurrogates(`${maxHigh}${maxLow}`)).toBe( | |
| `${maxHigh}${maxLow}`, | |
| ); | |
| }); | |
| it("should handle long strings with scattered invalid surrogates", () => { | |
| const unpairedHigh = String.fromCharCode(0xd800); | |
| const unpairedLow = String.fromCharCode(0xdc00); | |
| const chunks = [ | |
| "Start of document.", | |
| unpairedHigh, | |
| " Some middle content.", | |
| unpairedLow, | |
| " More content here.", | |
| unpairedHigh, | |
| " End of document.", | |
| ]; | |
| const input = chunks.join(""); | |
| const expected = | |
| "Start of document.\ufffd Some middle content.\ufffd More content here.\ufffd End of document."; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe(expected); | |
| }); | |
| it("should preserve adjacent high+low as valid pair even in mixed context", () => { | |
| const high = String.fromCharCode(0xd800); | |
| const low = String.fromCharCode(0xdc00); | |
| const validPair = `${high}${low}`; | |
| const input = `Text ${high} orphan, then valid pair: ${validPair} end`; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe( | |
| `Text \ufffd orphan, then valid pair: ${validPair} end`, | |
| ); | |
| }); | |
| }); | |
| describe("literal syntax and complex sequences", () => { | |
| it("should handle mixed valid and invalid surrogates using literals", () => { | |
| const input = "A\uD800B\uD83D\uDE00C\uDC00D"; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe( | |
| "A\uFFFDB\uD83D\uDE00C\uFFFDD", | |
| ); | |
| }); | |
| it("should handle surrogate pair followed by lone high surrogate", () => { | |
| const input = "π\uD800"; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe("π\uFFFD"); | |
| }); | |
| it("should handle lone high surrogate followed by valid surrogate pair", () => { | |
| const input = "\uD801\uD800\uDC00"; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe("\uFFFD\uD800\uDC00"); | |
| }); | |
| it("should handle multiple lone surrogates in a row", () => { | |
| const input = "\uD800\uDC00\uD801"; | |
| expect(sanitizeUnicodeSurrogates(input)).toBe("\uD800\uDC00\uFFFD"); | |
| }); | |
| }); | |
| }); | |