Spaces:

Felladrin
/

MiniSearch

Running

MiniSearch / server /rerankerService.test.ts

github-actions[bot]

Sync from https://github.com/felladrin/MiniSearch

f31a721 10 days ago

7.62 kB

	import { describe, expect, it } from "vitest";
	import { sanitizeUnicodeSurrogates } from "./rerankerService";

	describe("sanitizeUnicodeSurrogates", () => {
	// Well-formed input (including astral characters encoded as valid surrogate
	// pairs) is expected to come back exactly as it went in.
	describe("valid input passthrough", () => {
		// Each entry is [test title, text that must survive unchanged].
		const passthroughCases: ReadonlyArray<readonly [string, string]> = [
			["should return empty string unchanged", ""],
			["should return ASCII text unchanged", "Hello, World! 123"],
			["should return valid Unicode text unchanged", "Héllo Wörld 日本語 🎉"],
			["should preserve valid surrogate pairs (emoji)", "Text with emoji 😀🎊🚀"],
			[
				"should preserve valid surrogate pairs in complex text",
				"Start 🎉 middle 🚀 end",
			],
		];

		for (const [title, text] of passthroughCases) {
			it(title, () => {
				const sanitized = sanitizeUnicodeSurrogates(text);
				expect(sanitized).toBe(text);
			});
		}
	});

	// A high surrogate (U+D800–U+DBFF) that is not immediately followed by a low
	// surrogate is expected to be swapped for U+FFFD.
	describe("unpaired high surrogate handling", () => {
		const firstHigh = "\ud800";
		const secondHigh = "\ud801";

		const loneHighCases = [
			{
				title: "should replace lone high surrogate at end of string",
				input: `text${firstHigh}`,
				expected: "text\ufffd",
			},
			{
				title: "should replace high surrogate followed by non-surrogate",
				input: `${firstHigh}A`,
				expected: "\ufffdA",
			},
			{
				title: "should replace high surrogate followed by another high surrogate",
				input: firstHigh + secondHigh,
				expected: "\ufffd\ufffd",
			},
			{
				title: "should replace multiple consecutive unpaired high surrogates",
				input: firstHigh.repeat(3),
				expected: "\ufffd\ufffd\ufffd",
			},
		];

		for (const { title, input, expected } of loneHighCases) {
			it(title, () => {
				expect(sanitizeUnicodeSurrogates(input)).toBe(expected);
			});
		}
	});

	// A low surrogate (U+DC00–U+DFFF) with no high surrogate directly before it
	// is expected to be swapped for U+FFFD.
	describe("unpaired low surrogate handling", () => {
		// Shared fixture: the smallest low-surrogate code unit.
		const strayLow = "\udc00";

		it("should replace lone low surrogate at start of string", () => {
			const sanitized = sanitizeUnicodeSurrogates(strayLow + "text");
			expect(sanitized).toBe("\ufffdtext");
		});

		it("should replace lone low surrogate in middle of string", () => {
			const sanitized = sanitizeUnicodeSurrogates(
				["before", strayLow, "after"].join(""),
			);
			expect(sanitized).toBe("before\ufffdafter");
		});

		it("should replace multiple consecutive unpaired low surrogates", () => {
			const sanitized = sanitizeUnicodeSurrogates(strayLow + strayLow);
			expect(sanitized).toBe("\ufffd\ufffd");
		});
	});

	// Combinations of valid pairs and stray surrogates: only the strays are
	// expected to be replaced, and a low-then-high sequence is not a pair.
	describe("mixed surrogate scenarios", () => {
		it("should handle low surrogate followed by high surrogate (reversed pair)", () => {
			// Low first, high second: neither code unit has a valid partner.
			const reversedPair = "\udc00\ud800";
			expect(sanitizeUnicodeSurrogates(reversedPair)).toBe("\ufffd\ufffd");
		});

		it("should handle valid pair followed by unpaired high", () => {
			// The emoji is a complete pair; the trailing \ud83d has no low partner.
			const emojiThenStrayHigh = "😀" + "\ud83d";
			expect(sanitizeUnicodeSurrogates(emojiThenStrayHigh)).toBe("😀\ufffd");
		});

		it("should handle unpaired low followed by valid pair", () => {
			const strayLowThenEmoji = "\udc00" + "🎉";
			expect(sanitizeUnicodeSurrogates(strayLowThenEmoji)).toBe("\ufffd🎉");
		});

		it("should handle interleaved valid and invalid surrogates", () => {
			// The ASCII letters keep the high and the low from ever being adjacent.
			const interleaved = ["A", "\ud800", "B", "\udc00", "C"].join("");
			expect(sanitizeUnicodeSurrogates(interleaved)).toBe("A\ufffdB\ufffdC");
		});
	});

	// Longer, more realistic inputs plus the extreme values of both surrogate
	// ranges.
	describe("edge cases from real-world scenarios", () => {
		it("should handle text that might come from corrupted web content", () => {
			const strayHigh = "\ud834";
			const sanitized = sanitizeUnicodeSurrogates(
				"Search result: " + strayHigh + " more text",
			);
			expect(sanitized).toBe("Search result: \ufffd more text");
		});

		it("should preserve valid content around invalid surrogates", () => {
			const strayHigh = "\ud83d";
			const sanitized = sanitizeUnicodeSurrogates(
				"Valid text 日本語 " + strayHigh + " more valid 🎉 end",
			);
			expect(sanitized).toBe("Valid text 日本語 \ufffd more valid 🎉 end");
		});

		it("should handle boundary surrogate values", () => {
			// Smallest/largest high surrogate, then smallest/largest low surrogate.
			const loneBoundaries = ["\ud800", "\udbff", "\udc00", "\udfff"];
			for (const lone of loneBoundaries) {
				expect(sanitizeUnicodeSurrogates(lone)).toBe("\ufffd");
			}

			// min-high + min-low and max-high + max-low are valid pairs and must
			// be left alone.
			const boundaryPairs = ["\ud800\udc00", "\udbff\udfff"];
			for (const pair of boundaryPairs) {
				expect(sanitizeUnicodeSurrogates(pair)).toBe(pair);
			}
		});

		it("should handle long strings with scattered invalid surrogates", () => {
			const strayHigh = "\ud800";
			const strayLow = "\udc00";
			const document = `Start of document.${strayHigh} Some middle content.${strayLow} More content here.${strayHigh} End of document.`;
			const sanitized = sanitizeUnicodeSurrogates(document);
			expect(sanitized).toBe(
				"Start of document.\ufffd Some middle content.\ufffd More content here.\ufffd End of document.",
			);
		});

		it("should preserve adjacent high+low as valid pair even in mixed context", () => {
			const pair = "\ud800\udc00";
			// Everything after the orphaned high is identical in input and output.
			const sharedTail = ` orphan, then valid pair: ${pair} end`;
			const sanitized = sanitizeUnicodeSurrogates(`Text \ud800${sharedTail}`);
			expect(sanitized).toBe(`Text \ufffd${sharedTail}`);
		});
	});

	// Scenarios written with `\u` escape literals so the exact UTF-16 code-unit
	// sequence of every input is visible in the test itself.
	describe("literal syntax and complex sequences", () => {
		it("should handle mixed valid and invalid surrogates using literals", () => {
			// A, lone high, B, valid pair \uD83D\uDE00 (U+1F600), C, lone low, D.
			const input = "A\uD800B\uD83D\uDE00C\uDC00D";
			expect(sanitizeUnicodeSurrogates(input)).toBe(
				"A\uFFFDB\uD83D\uDE00C\uFFFDD",
			);
		});

		it("should handle surrogate pair followed by lone high surrogate", () => {
			const input = "😀\uD800";
			expect(sanitizeUnicodeSurrogates(input)).toBe("😀\uFFFD");
		});

		it("should handle lone high surrogate followed by valid surrogate pair", () => {
			// \uD801 is followed by another high, so it is lone; \uD800\uDC00 pairs up.
			const input = "\uD801\uD800\uDC00";
			expect(sanitizeUnicodeSurrogates(input)).toBe("\uFFFD\uD800\uDC00");
		});

		it("should handle minimum-boundary surrogate pair followed by lone high surrogate", () => {
			// \uD800\uDC00 is a valid pair (U+10000); only the trailing \uD801 is
			// lone, so exactly one replacement is expected.
			const input = "\uD800\uDC00\uD801";
			expect(sanitizeUnicodeSurrogates(input)).toBe("\uD800\uDC00\uFFFD");
		});

		it("should handle multiple lone surrogates in a row", () => {
			// low, low, high, high: no high surrogate is directly followed by a
			// low one, so all four code units are unpaired.
			const input = "\uDC00\uDC01\uD800\uD801";
			expect(sanitizeUnicodeSurrogates(input)).toBe(
				"\uFFFD\uFFFD\uFFFD\uFFFD",
			);
		});
	});
	});