Add Lichess PGN -> PAWN Parquet extraction pipeline (#4)

87cdae5 unverified about 1 hour ago

30.3 kB

	//! PGN game parsing: file → SAN moves → PAWN token sequences.
	//!
	//! Full pipeline in Rust: reads PGN files, extracts SAN move strings,
	//! converts to PAWN tokens via shakmaty. Uses rayon for parallel
	//! token conversion.
	//!
	//! Also provides enriched parsing that extracts clock annotations,
	//! eval annotations, and PGN headers for dataset construction.

	use std::collections::{HashMap, HashSet};
	use std::fs;
	use rayon::prelude::*;
	use shakmaty::{Chess, Position};
	use shakmaty::san::San;

	use crate::board::move_to_token;

	// ---------------------------------------------------------------------------
	// Enriched PGN parsing — extracts moves, clocks, evals, and headers
	// ---------------------------------------------------------------------------

	/// A fully parsed game with move tokens, annotations, and metadata.
	///
	/// `tokens`, `clocks` and `evals` are parallel vectors: index `i` describes
	/// ply `i`. None of them is padded.
	#[derive(Debug, Clone, PartialEq, Eq)]
	pub struct EnrichedGame {
	    /// PAWN token indices for each ply (not padded).
	    pub tokens: Vec<u16>,
	    /// Seconds remaining on clock after each ply.
	    /// No annotation: 0x8000 (32768); real values are capped at 0x7FFF.
	    pub clocks: Vec<u16>,
	    /// Centipawns from white's perspective after each ply.
	    /// Mate scores: ±(32767-N). No annotation: 0x8000 (-32768 as i16).
	    pub evals: Vec<i16>,
	    /// Number of valid plies.
	    pub game_length: usize,
	    /// PGN header fields (e.g., "White" -> "alice", "WhiteElo" -> "1873").
	    pub headers: HashMap<String, String>,
	}

	/// Parse a PGN string into enriched games.
	///
	/// Extracts SAN moves (tokenized), `[%clk h:mm:ss]` annotations,
	/// `[%eval ±N.NN]` / `[%eval #±N]` annotations, and all PGN headers.
	/// Tokenization uses shakmaty and is parallelized with rayon.
	pub fn parse_pgn_enriched(
	content: &str,
	max_ply: usize,
	max_games: usize,
	min_ply: usize,
	) -> Vec<EnrichedGame> {
	let raw_games = parse_raw_games(content, max_games, None, None);

	// Phase 2: parallel tokenization + annotation extraction
	raw_games
	.into_par_iter()
	.filter_map(\|raw\| {
	let (san_moves, clocks_raw, evals_raw) = extract_moves_and_annotations(&raw.movetext);
	if san_moves.len() < min_ply {
	return None;
	}

	// Tokenize SAN moves via shakmaty
	let refs: Vec<&str> = san_moves.iter().map(\|s\| s.as_str()).collect();
	let (tokens, n_valid) = san_moves_to_tokens(&refs, max_ply);
	if n_valid < min_ply {
	return None;
	}

	// Trim annotations to match token count (moves may have failed to parse).
	let clocks = clocks_raw.into_iter().take(n_valid).collect();
	let evals = evals_raw.into_iter().take(n_valid).collect();

	Some(EnrichedGame {
	tokens,
	clocks,
	evals,
	game_length: n_valid,
	headers: raw.headers,
	})
	})
	.collect()
	}

	/// Count games in a PGN string whose UTCDate falls within [start, end].
	///
	/// Header-only scan — no movetext parsing, no tokenization. Dates are
	/// compared as strings, and both bounds are inclusive. A game without a
	/// `UTCDate` header is never counted.
	///
	/// Returns the number of matching games in `content`.
	pub fn count_games_in_date_range(
	    content: &str,
	    date_start: &str,
	    date_end: &str,
	) -> usize {
	    let within = |date: &str| date >= date_start && date <= date_end;

	    let mut matched = 0;
	    let mut game_date: Option<String> = None;
	    let mut seen_movetext = false;

	    for raw_line in content.lines() {
	        let text = raw_line.trim();

	        if text.is_empty() {
	            // A blank line only closes a game once movetext has been seen;
	            // the blank line between headers and movetext is ignored.
	            if seen_movetext {
	                if game_date.take().map_or(false, |d| within(&d)) {
	                    matched += 1;
	                }
	                seen_movetext = false;
	            }
	        } else if text.starts_with('[') && text.ends_with(']') {
	            if let Some((key, value)) = parse_header_line(text) {
	                if key == "UTCDate" {
	                    game_date = Some(value);
	                }
	            }
	            seen_movetext = false;
	        } else {
	            seen_movetext = true;
	        }
	    }

	    // The final game may not be followed by a blank line.
	    if seen_movetext && game_date.as_deref().map_or(false, |d| within(d)) {
	        matched += 1;
	    }

	    matched
	}

	/// Parse a PGN string, tokenizing only the games selected by `indices`.
	///
	/// Intended for uniform random sampling: the caller counts the games in the
	/// date range (via `count_games_in_date_range`), draws a random index set,
	/// and then calls this function so that only the chosen games are parsed.
	///
	/// `indices` holds global indices over the date-matching games.
	/// `game_offset` is the number of date-matching games seen in previous
	/// chunks, so a game at local position `i` in this chunk has global index
	/// `game_offset + i`. Games shorter than `min_ply` are dropped and at most
	/// `max_ply` plies are tokenized per game.
	pub fn parse_pgn_enriched_sampled(
	    content: &str,
	    max_ply: usize,
	    min_ply: usize,
	    date_start: &str,
	    date_end: &str,
	    indices: &HashSet<usize>,
	    game_offset: usize,
	) -> Vec<EnrichedGame> {
	    let date_filter = Some((date_start, date_end));
	    let sample_filter = Some((indices, game_offset));
	    let selected = parse_raw_games(content, usize::MAX, date_filter, sample_filter);

	    selected
	        .into_par_iter()
	        .filter_map(|game| {
	            let (san_moves, clocks_raw, evals_raw) =
	                extract_moves_and_annotations(&game.movetext);
	            if san_moves.len() < min_ply {
	                return None;
	            }

	            let san_refs: Vec<&str> = san_moves.iter().map(String::as_str).collect();
	            let (tokens, n_valid) = san_moves_to_tokens(&san_refs, max_ply);

	            // Annotations are cut to the number of plies that tokenized.
	            (n_valid >= min_ply).then(|| EnrichedGame {
	                tokens,
	                clocks: clocks_raw.into_iter().take(n_valid).collect(),
	                evals: evals_raw.into_iter().take(n_valid).collect(),
	                game_length: n_valid,
	                headers: game.headers,
	            })
	        })
	        .collect()
	}

	/// Raw game data before tokenization.
	///
	/// Built by `parse_raw_games`; the enriched parsers turn each one into an
	/// `EnrichedGame`.
	struct RawGame {
	/// PGN header tags of the game, keyed by tag name with quotes stripped.
	headers: HashMap<String, String>,
	/// The game's movetext lines (trimmed) joined with single spaces.
	movetext: String,
	}

	/// Single-threaded PGN line scanner. Extracts headers and raw movetext.
	///
	/// If `date_range` is Some((start, end)), only games whose UTCDate falls
	/// within [start, end] are included. If `sample` is Some((indices, offset)),
	/// only games whose (offset + local_index) is in the index set are kept.
	fn parse_raw_games(
	content: &str,
	max_games: usize,
	date_range: Option<(&str, &str)>,
	sample: Option<(&HashSet<usize>, usize)>,
	) -> Vec<RawGame> {
	let mut games = Vec::new();
	let mut headers: HashMap<String, String> = HashMap::new();
	let mut movetext_lines: Vec<&str> = Vec::new();
	let mut in_movetext = false;
	let mut date_excluded = false; // UTCDate outside date_range
	let mut has_utc_date = false; // saw a UTCDate header for this game
	let mut date_matched_idx = 0usize; // count of date-matching games seen

	for line in content.lines() {
	let line = line.trim();
	if line.is_empty() {
	if in_movetext {
	// End of movetext — game boundary.
	// Exclude if date is out of range OR if date_range is active
	// but no UTCDate header was found (consistent with count_games_in_date_range).
	let excluded = date_excluded \|\| (date_range.is_some() && !has_utc_date);
	if !excluded && !movetext_lines.is_empty() {
	// Game passed date filter. Check sample if present.
	let keep = match sample {
	Some((indices, offset)) => indices.contains(&(offset + date_matched_idx)),
	None => true,
	};
	date_matched_idx += 1;

	if keep {
	games.push(RawGame {
	headers: std::mem::take(&mut headers),
	movetext: movetext_lines.join(" "),
	});
	if games.len() >= max_games {
	break;
	}
	}
	}
	movetext_lines.clear();
	headers.clear();
	in_movetext = false;
	date_excluded = false;
	has_utc_date = false;
	}
	// Blank line between headers and movetext: don't reset state
	continue;
	}

	// Header line: [Key "Value"]
	if line.starts_with('[') && line.ends_with(']') {
	if let Some((key, value)) = parse_header_line(line) {
	if key == "UTCDate" {
	has_utc_date = true;
	if let Some((start, end)) = date_range {
	if value.as_str() < start \|\| value.as_str() > end {
	date_excluded = true;
	}
	}
	}
	if !date_excluded {
	headers.insert(key, value);
	}
	}
	in_movetext = false;
	continue;
	}

	if !date_excluded {
	in_movetext = true;
	movetext_lines.push(line);
	}
	}

	// Handle last game
	let last_excluded = date_excluded \|\| (date_range.is_some() && !has_utc_date);
	if in_movetext && !last_excluded && !movetext_lines.is_empty() && games.len() < max_games {
	let keep = match sample {
	Some((indices, offset)) => indices.contains(&(offset + date_matched_idx)),
	None => true,
	};
	if keep {
	games.push(RawGame {
	headers: std::mem::take(&mut headers),
	movetext: movetext_lines.join(" "),
	});
	}
	}

	games
	}

	/// Split a PGN tag line such as `[White "alice"]` into `("White", "alice")`.
	///
	/// Returns `None` when the line is not wrapped in `[` … `]` or when it has
	/// no space separating the tag name from its value. A value that is not
	/// fully enclosed in double quotes is returned as written.
	fn parse_header_line(line: &str) -> Option<(String, String)> {
	    // Drop the enclosing brackets and any padding inside them.
	    let body = line.strip_prefix('[')?.strip_suffix(']')?.trim();

	    // Tag name runs up to the first space; the remainder is the value.
	    let (tag, remainder) = body.split_once(' ')?;
	    let remainder = remainder.trim();

	    // Remove the surrounding quotes only when both are present.
	    let unquoted = match remainder.strip_prefix('"').and_then(|r| r.strip_suffix('"')) {
	        Some(inner) => inner,
	        None => remainder,
	    };

	    Some((tag.to_string(), unquoted.to_string()))
	}

	/// Sentinel for "no clock annotation" (0x8000 as u16 = 32768).
	///
	/// `parse_clock` caps parsed values at 0x7FFF, so a real clock reading can
	/// never equal this sentinel.
	const CLOCK_NONE: u16 = 0x8000;
	/// Sentinel for "no eval annotation" (0x8000 as i16 = -32768).
	///
	/// This is `i16::MIN`, which lies outside the centipawn (±16383) and mate
	/// (±(32767-N)) ranges documented on `parse_eval`.
	const EVAL_NONE: i16 = -0x8000; // i16::MIN

	/// Split movetext into SAN moves plus per-move clock and eval annotations.
	///
	/// Returns `(san_moves, clocks, evals)`, three vectors of equal length.
	/// `clocks[i]` is the clock after move `i` (`CLOCK_NONE` if the move has no
	/// `[%clk]` annotation) and `evals[i]` is the evaluation in centipawns after
	/// move `i` (`EVAL_NONE` if it has no `[%eval]` annotation).
	///
	/// NAGs (`$N`) and move numbers are skipped; scanning stops at the first
	/// result marker (`1-0`, `0-1`, `1/2-1/2`, `*`).
	fn extract_moves_and_annotations(text: &str) -> (Vec<String>, Vec<u16>, Vec<i16>) {
	    let mut san: Vec<String> = Vec::new();
	    let mut clocks: Vec<u16> = Vec::new();
	    let mut evals: Vec<i16> = Vec::new();

	    let raw = text.as_bytes();
	    let total = raw.len();
	    let mut cursor = 0;

	    // Lichess layout: move { comment } move { comment } ...
	    // A comment always annotates the move that precedes it.
	    while let Some(&byte) = raw.get(cursor) {
	        if byte.is_ascii_whitespace() {
	            cursor += 1;
	            continue;
	        }

	        if byte == b'{' {
	            // Comment body runs to the next '}' or, if unterminated, to the end.
	            let body_start = cursor + 1;
	            let body_end = text[body_start..]
	                .find('}')
	                .map_or(total, |off| body_start + off);
	            cursor = (body_end + 1).min(total);

	            // A comment before the first move has nothing to attach to.
	            if let (Some(clock_slot), Some(eval_slot)) = (clocks.last_mut(), evals.last_mut()) {
	                let (mut clk, mut ev) = (CLOCK_NONE, EVAL_NONE);
	                parse_comment(&text[body_start..body_end], &mut clk, &mut ev);
	                if clk != CLOCK_NONE {
	                    *clock_slot = clk;
	                }
	                if ev != EVAL_NONE {
	                    *eval_slot = ev;
	                }
	            }
	            continue;
	        }

	        // Plain token: ends at whitespace, at a comment opener, or at the end.
	        let token_end = raw[cursor..]
	            .iter()
	            .position(|b| b.is_ascii_whitespace() || *b == b'{')
	            .map_or(total, |off| cursor + off);
	        let token = &text[cursor..token_end];
	        cursor = token_end;

	        if token.starts_with('$') {
	            continue;
	        }
	        if matches!(token, "1-0" | "0-1" | "1/2-1/2" | "*") {
	            break;
	        }

	        // Move numbers look like "1.", "1..." or "23".
	        let digits = token.trim_end_matches('.');
	        let is_move_number = !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit());
	        if is_move_number {
	            continue;
	        }

	        san.push(token.to_string());
	        clocks.push(CLOCK_NONE);
	        evals.push(EVAL_NONE);
	    }

	    (san, clocks, evals)
	}

	/// Parse a PGN comment body for clock and eval annotations.
	///
	/// Lichess format: `[%clk 0:03:00]` and `[%eval 1.23]` or `[%eval #-3]`.
	fn parse_comment(comment: &str, clock: &mut u16, eval: &mut i16) {
	// Clock: [%clk H:MM:SS]
	if let Some(pos) = comment.find("[%clk ") {
	let rest = &comment[pos + 6..];
	if let Some(end) = rest.find(']') {
	let clk_str = rest[..end].trim();
	if let Some(secs) = parse_clock(clk_str) {
	*clock = secs;
	}
	}
	}

	// Eval: [%eval 1.23] or [%eval #-3]
	if let Some(pos) = comment.find("[%eval ") {
	let rest = &comment[pos + 7..];
	if let Some(end) = rest.find(']') {
	let eval_str = rest[..end].trim();
	if let Some(cp) = parse_eval(eval_str) {
	*eval = cp;
	}
	}
	}
	}

	/// Parse "H:MM:SS" into total seconds as u16.
	///
	/// Returns `None` unless the input consists of exactly three
	/// colon-separated non-negative integers. The result is capped at 0x7FFF
	/// (32767 s, a little over nine hours) so that it can never collide with
	/// `CLOCK_NONE` (0x8000).
	fn parse_clock(s: &str) -> Option<u16> {
	    let mut fields = s.split(':');
	    let hours: u32 = fields.next()?.parse().ok()?;
	    let minutes: u32 = fields.next()?.parse().ok()?;
	    let seconds: u32 = fields.next()?.parse().ok()?;
	    if fields.next().is_some() {
	        return None;
	    }

	    // Saturating arithmetic: an absurd hour/minute field in a malformed PGN
	    // must hit the cap below rather than overflow u32.
	    let total = hours
	        .saturating_mul(3600)
	        .saturating_add(minutes.saturating_mul(60))
	        .saturating_add(seconds);

	    // Cap at 0x7FFF (32767) to avoid collision with CLOCK_NONE (0x8000)
	    Some(total.min(0x7FFF) as u16)
	}

	/// Parse eval string into centipawns (i16).
	/// "1.23" → 123, "-0.50" → -50.
	/// Mate scores: "#N" → 32767-N, "#-N" → -(32767-N).
	///
	/// The mate distance N is clamped to 1..=16383, so the magnitude of a mate
	/// score is always at least 16384 (bit 14 of the absolute value is set).
	/// Centipawn values are clamped to ±16383 to avoid overlap with the mate
	/// range, which makes `value.abs() >= 16384` a reliable mate test.
	///
	/// Returns `None` for text that is not a number, and for NaN.
	fn parse_eval(s: &str) -> Option<i16> {
	    /// Largest centipawn magnitude; also the largest encodable mate distance.
	    const MAX_CP: i32 = 16383;

	    if let Some(rest) = s.strip_prefix('#') {
	        let n: i32 = rest.parse().ok()?;
	        // Clamp before narrowing: a huge distance must neither truncate in the
	        // cast nor push the score out of the mate range.
	        let distance = n.unsigned_abs().clamp(1, MAX_CP as u32) as i16;
	        let mate_val = i16::MAX - distance;
	        Some(if n > 0 { mate_val } else { -mate_val })
	    } else {
	        let pawns: f64 = s.parse().ok()?;
	        // "nan" parses as a float; treat it as no evaluation rather than 0.00.
	        if pawns.is_nan() {
	            return None;
	        }
	        let cp = (pawns * 100.0).round() as i32;
	        Some(cp.clamp(-MAX_CP, MAX_CP) as i16)
	    }
	}

	/// Convert a sequence of SAN move strings to PAWN token indices.
	///
	/// Returns (tokens, n_valid) where tokens has length up to max_ply,
	/// and n_valid is how many moves were successfully parsed
	/// (`n_valid == tokens.len()`).
	/// Stops at the first parse error or illegal move.
	///
	/// Trailing move-quality glyphs (`!`, `?`, `!!`, `??`, `!?`, `?!`) are
	/// stripped before parsing. Lichess attaches them to moves in analysed
	/// games, and they are not part of SAN; without stripping, such a game
	/// would be cut off at its first annotated move.
	pub fn san_moves_to_tokens(
	    san_moves: &[&str],
	    max_ply: usize,
	) -> (Vec<u16>, usize) {
	    let mut pos = Chess::default();
	    let mut tokens = Vec::with_capacity(san_moves.len().min(max_ply));

	    for san_str in san_moves.iter().take(max_ply) {
	        // "Nf3?!" -> "Nf3", "Qxf7#??" -> "Qxf7#"
	        let bare = san_str.trim_end_matches(|c: char| c == '!' || c == '?');

	        let san = match San::from_ascii(bare.as_bytes()) {
	            Ok(s) => s,
	            Err(_) => break,
	        };

	        // Resolve the SAN against the current position; fails if illegal.
	        let m = match san.to_move(&pos) {
	            Ok(m) => m,
	            Err(_) => break,
	        };

	        let token = move_to_token(&m);
	        tokens.push(token);
	        // `to_move` succeeded above, so the unchecked play is applied to a
	        // move that was resolved against this position.
	        pos.play_unchecked(m);
	    }

	    let n = tokens.len();
	    (tokens, n)
	}

	/// Tokenize several games at once, each given as a list of SAN moves.
	///
	/// Returns `(flat, lengths)`. `flat` is a row-major
	/// `games.len() * max_ply` array of i16 tokens in which row `g` holds the
	/// tokens of game `g` followed by zero padding. `lengths[g]` is the number
	/// of plies of game `g` that were tokenized.
	pub fn batch_san_to_tokens(
	    games: &[Vec<&str>],
	    max_ply: usize,
	) -> (Vec<i16>, Vec<i16>) {
	    let game_count = games.len();
	    let mut flat = vec![0i16; game_count * max_ply];
	    let mut lengths = Vec::with_capacity(game_count);

	    for (row, san_moves) in games.iter().enumerate() {
	        let (tokens, n_valid) = san_moves_to_tokens(san_moves, max_ply);

	        // Copy this game's tokens into the front of its row; the rest stays 0.
	        let base = row * max_ply;
	        for (slot, &tok) in flat[base..base + tokens.len()].iter_mut().zip(&tokens) {
	            *slot = tok as i16;
	        }
	        lengths.push(n_valid as i16);
	    }

	    (flat, lengths)
	}

	/// Extract the SAN move list of each game from PGN text.
	///
	/// Handles standard PGN: header lines (`[...]`) are skipped, and move
	/// numbers, comments (`{...}`), NAGs (`$N`) and result markers are removed
	/// by `extract_san_moves`. Games that yield no moves are omitted. Scanning
	/// stops once `max_games` games have been collected.
	fn parse_pgn_to_san(content: &str, max_games: usize) -> Vec<Vec<String>> {
	    let mut parsed: Vec<Vec<String>> = Vec::new();
	    let mut pending: Vec<&str> = Vec::new();
	    let mut in_movetext = false;

	    for raw_line in content.lines() {
	        let trimmed = raw_line.trim();

	        if trimmed.is_empty() {
	            // A blank line ends the game only after movetext has been read.
	            if in_movetext && !pending.is_empty() {
	                match extract_san_moves(&pending.join(" ")) {
	                    Some(moves) if !moves.is_empty() => {
	                        parsed.push(moves);
	                        if parsed.len() >= max_games {
	                            break;
	                        }
	                    }
	                    _ => {}
	                }
	                pending.clear();
	                in_movetext = false;
	            }
	        } else if trimmed.starts_with('[') {
	            in_movetext = false;
	        } else {
	            in_movetext = true;
	            pending.push(trimmed);
	        }
	    }

	    // The last game may not be followed by a blank line.
	    if !pending.is_empty() && parsed.len() < max_games {
	        let tail = extract_san_moves(&pending.join(" ")).filter(|moves| !moves.is_empty());
	        if let Some(moves) = tail {
	            parsed.push(moves);
	        }
	    }

	    parsed
	}

	/// Extract SAN moves from a PGN movetext string.
	///
	/// Comments (`{ ... }`), NAGs (`$N`) and move numbers (`1.`, `1...`, `23.`)
	/// are dropped. Parsing stops at the first result marker (`1-0`, `0-1`,
	/// `1/2-1/2`, `*`).
	///
	/// A comment acts as a token separator, so `e4{note}e5` yields `e4` and
	/// `e5`, matching how `extract_moves_and_annotations` treats `{`.
	///
	/// Always returns `Some`; the `Option` is kept for the existing callers.
	fn extract_san_moves(text: &str) -> Option<Vec<String>> {
	    // Blank out comments { ... } (they can span multiple words). Each brace
	    // is replaced by a space so the tokens on either side stay separate.
	    let mut cleaned = String::with_capacity(text.len());
	    let mut in_comment = false;
	    for ch in text.chars() {
	        match ch {
	            '{' => {
	                in_comment = true;
	                cleaned.push(' ');
	            }
	            '}' => {
	                in_comment = false;
	                cleaned.push(' ');
	            }
	            _ if !in_comment => cleaned.push(ch),
	            _ => {}
	        }
	    }

	    let mut moves = Vec::new();
	    for token in cleaned.split_whitespace() {
	        // Skip NAGs: $1, $2, etc.
	        if token.starts_with('$') {
	            continue;
	        }

	        // Result markers — stop parsing
	        if matches!(token, "1-0" | "0-1" | "1/2-1/2" | "*") {
	            break;
	        }

	        // Skip move numbers: "1.", "1...", "23."
	        let stripped = token.trim_end_matches('.');
	        if !stripped.is_empty() && stripped.bytes().all(|b| b.is_ascii_digit()) {
	            continue;
	        }

	        moves.push(token.to_string());
	    }

	    Some(moves)
	}

	/// Full pipeline: read PGN file → parse → convert to tokens (parallel).
	///
	/// Returns `(flat_tokens, lengths, n_parsed)`:
	/// - `flat_tokens`: row-major i16 array of shape `n_kept * max_ply`,
	///   zero-padded, one row per game that reached `min_ply` plies;
	/// - `lengths`: number of tokenized plies for each kept game;
	/// - `n_parsed`: number of games read from the file before the `min_ply`
	///   filter (at most `max_games`).
	///
	/// # Panics
	///
	/// Panics if the file at `path` cannot be read as UTF-8 text.
	pub fn pgn_file_to_tokens(
	    path: &str,
	    max_ply: usize,
	    max_games: usize,
	    min_ply: usize,
	) -> (Vec<i16>, Vec<i16>, usize) {
	    let content = match fs::read_to_string(path) {
	        Ok(text) => text,
	        Err(e) => panic!("Failed to read PGN file {}: {}", path, e),
	    };

	    let san_games = parse_pgn_to_san(&content, max_games);
	    let n_parsed = san_games.len();

	    // Token conversion is independent per game, so run it on the rayon pool.
	    let converted: Vec<(Vec<u16>, usize)> = san_games
	        .par_iter()
	        .map(|moves| {
	            let refs: Vec<&str> = moves.iter().map(String::as_str).collect();
	            san_moves_to_tokens(&refs, max_ply)
	        })
	        .collect();

	    // Keep only games with at least min_ply tokenized plies.
	    let long_enough = |entry: &&(Vec<u16>, usize)| entry.1 >= min_ply;
	    let n_kept = converted.iter().filter(long_enough).count();

	    let mut flat = vec![0i16; n_kept * max_ply];
	    let mut lengths = Vec::with_capacity(n_kept);

	    // Pack each kept game into its own zero-padded row.
	    for (row, (tokens, n_valid)) in converted.iter().filter(long_enough).enumerate() {
	        let base = row * max_ply;
	        for (offset, &tok) in tokens.iter().enumerate() {
	            flat[base + offset] = tok as i16;
	        }
	        lengths.push(*n_valid as i16);
	    }

	    (flat, lengths, n_parsed)
	}

	/// Unit tests for the legacy SAN pipeline, the enriched parser, and the
	/// date-range counting / sampling helpers.
	#[cfg(test)]
	mod tests {
	    use super::*;

	    #[test]
	    fn test_san_to_tokens() {
	        let moves = vec!["e4", "e5", "Qh5", "Nc6", "Bc4", "Nf6", "Qxf7#"];
	        let (tokens, n) = san_moves_to_tokens(&moves, 256);
	        assert_eq!(n, 7);
	        assert_eq!(tokens.len(), 7);
	        let e2e4 = crate::vocab::base_grid_token(12, 28);
	        assert_eq!(tokens[0], e2e4);
	    }

	    #[test]
	    fn test_san_to_tokens_max_ply() {
	        let moves = vec!["e4", "e5", "Nf3", "Nc6"];
	        let (tokens, n) = san_moves_to_tokens(&moves, 2);
	        assert_eq!(n, 2);
	        assert_eq!(tokens.len(), 2);
	    }

	    #[test]
	    fn test_extract_san_moves() {
	        let text = "1. e4 e5 2. Nf3 Nc6 3. Bb5 {Spanish} a6 1-0";
	        let moves = extract_san_moves(text).unwrap();
	        assert_eq!(moves, vec!["e4", "e5", "Nf3", "Nc6", "Bb5", "a6"]);
	    }

	    #[test]
	    fn test_extract_san_with_nags() {
	        let text = "1. e4 $1 e5 2. Nf3 $2 Nc6 0-1";
	        let moves = extract_san_moves(text).unwrap();
	        assert_eq!(moves, vec!["e4", "e5", "Nf3", "Nc6"]);
	    }

	    #[test]
	    fn test_parse_pgn_to_san() {
	        let pgn = r#"[Event "Test"]
	[White "Alice"]
	[Black "Bob"]

	1. e4 e5 2. Nf3 Nc6 1-0

	[Event "Test2"]

	1. d4 d5 0-1
	"#;
	        let games = parse_pgn_to_san(pgn, 100);
	        assert_eq!(games.len(), 2);
	        assert_eq!(games[0], vec!["e4", "e5", "Nf3", "Nc6"]);
	        assert_eq!(games[1], vec!["d4", "d5"]);
	    }

	    #[test]
	    fn test_pgn_file_to_tokens_inline() {
	        // Test the full pipeline with a temp file. The process id keeps the
	        // name unique so concurrent test runs do not clobber each other.
	        let dir = std::env::temp_dir();
	        let path = dir.join(format!("test_pgn_{}.pgn", std::process::id()));
	        fs::write(&path, r#"[Event "Test"]

	1. e4 e5 2. Nf3 Nc6 1-0

	[Event "Test2"]

	1. d4 d5 0-1
	"#).unwrap();

	        let (flat, lengths, n_parsed) = pgn_file_to_tokens(
	            path.to_str().unwrap(), 256, 100, 2
	        );
	        assert_eq!(n_parsed, 2);
	        assert_eq!(lengths.len(), 2);
	        assert_eq!(lengths[0], 4);
	        assert_eq!(lengths[1], 2);
	        assert_eq!(flat.len(), 2 * 256);

	        fs::remove_file(path).ok();
	    }

	    // --- Enriched parsing tests ---

	    #[test]
	    fn test_parse_clock() {
	        assert_eq!(parse_clock("0:10:00"), Some(600));
	        assert_eq!(parse_clock("1:30:00"), Some(5400));
	        assert_eq!(parse_clock("0:00:05"), Some(5));
	        assert_eq!(parse_clock("0:03:00"), Some(180));
	        assert_eq!(parse_clock("bad"), None);
	    }

	    #[test]
	    fn test_parse_eval() {
	        assert_eq!(parse_eval("0.23"), Some(23));
	        assert_eq!(parse_eval("-1.50"), Some(-150));
	        assert_eq!(parse_eval("0.00"), Some(0));
	        // Mate scores: 32767 - N
	        assert_eq!(parse_eval("#1"), Some(32766));
	        assert_eq!(parse_eval("#-1"), Some(-32766));
	        assert_eq!(parse_eval("#3"), Some(32764));
	        assert_eq!(parse_eval("#-3"), Some(-32764));
	        assert_eq!(parse_eval("#10"), Some(32757));
	        // Bit 14 (0x4000 = 16384) is set for all positive mate values
	        assert!(parse_eval("#1").unwrap() & 0x4000 != 0);
	        assert!(parse_eval("#100").unwrap() & 0x4000 != 0);
	        // Centipawns clamped to ±16383 to avoid mate range
	        assert_eq!(parse_eval("200.00"), Some(16383));
	        assert_eq!(parse_eval("-200.00"), Some(-16383));
	    }

	    #[test]
	    fn test_parse_header_line() {
	        assert_eq!(
	            parse_header_line(r#"[White "alice"]"#),
	            Some(("White".to_string(), "alice".to_string()))
	        );
	        assert_eq!(
	            parse_header_line(r#"[WhiteElo "1873"]"#),
	            Some(("WhiteElo".to_string(), "1873".to_string()))
	        );
	        assert_eq!(
	            parse_header_line(r#"[Opening "Bird Opening: Dutch Variation"]"#),
	            Some(("Opening".to_string(), "Bird Opening: Dutch Variation".to_string()))
	        );
	    }

	    #[test]
	    fn test_extract_moves_and_annotations() {
	        let text = r#"1. e4 { [%clk 0:10:00] [%eval 0.23] } 1... e5 { [%clk 0:09:58] [%eval 0.31] } 2. Nf3 { [%clk 0:09:55] } 1-0"#;
	        let (moves, clocks, evals) = extract_moves_and_annotations(text);
	        assert_eq!(moves, vec!["e4", "e5", "Nf3"]);
	        assert_eq!(clocks, vec![600, 598, 595]);
	        assert_eq!(evals, vec![23, 31, EVAL_NONE]);
	    }

	    #[test]
	    fn test_extract_moves_no_annotations() {
	        let text = "1. e4 e5 2. Nf3 Nc6 1-0";
	        let (moves, clocks, evals) = extract_moves_and_annotations(text);
	        assert_eq!(moves, vec!["e4", "e5", "Nf3", "Nc6"]);
	        assert_eq!(clocks, vec![CLOCK_NONE, CLOCK_NONE, CLOCK_NONE, CLOCK_NONE]);
	        assert_eq!(evals, vec![EVAL_NONE; 4]);
	    }

	    #[test]
	    fn test_extract_moves_mate_eval() {
	        let text = r#"1. e4 { [%eval 0.23] } 1... e5 { [%eval #-3] } 1-0"#;
	        let (moves, _clocks, evals) = extract_moves_and_annotations(text);
	        assert_eq!(moves, vec!["e4", "e5"]);
	        assert_eq!(evals, vec![23, -32764]);
	    }

	    #[test]
	    fn test_enriched_full_game() {
	        let pgn = r#"[Event "Rated Rapid game"]
	[Site "https://lichess.org/abc123"]
	[White "alice"]
	[Black "bob"]
	[Result "1-0"]
	[WhiteElo "1873"]
	[BlackElo "1844"]
	[WhiteRatingDiff "+6"]
	[BlackRatingDiff "-26"]
	[ECO "C20"]
	[Opening "King's Pawn Game"]
	[TimeControl "600+0"]
	[Termination "Normal"]
	[UTCDate "2025.01.15"]
	[UTCTime "12:30:00"]

	1. e4 { [%clk 0:10:00] [%eval 0.23] } 1... e5 { [%clk 0:09:58] [%eval 0.31] } 2. Nf3 { [%clk 0:09:50] [%eval 0.25] } 2... Nc6 { [%clk 0:09:45] [%eval 0.30] } 1-0
	"#;
	        let games = parse_pgn_enriched(pgn, 256, 100, 2);
	        assert_eq!(games.len(), 1);
	        let g = &games[0];
	        assert_eq!(g.game_length, 4);
	        assert_eq!(g.clocks, vec![600, 598, 590, 585]);
	        assert_eq!(g.evals, vec![23, 31, 25, 30]);
	        assert_eq!(g.headers.get("White").unwrap(), "alice");
	        assert_eq!(g.headers.get("WhiteElo").unwrap(), "1873");
	        assert_eq!(g.headers.get("Site").unwrap(), "https://lichess.org/abc123");
	        assert_eq!(g.headers.get("ECO").unwrap(), "C20");
	        assert_eq!(g.headers.get("TimeControl").unwrap(), "600+0");
	    }

	    #[test]
	    fn test_enriched_tokens_match_legacy() {
	        // Enriched parsing should produce the same tokens as the legacy pipeline
	        let pgn = r#"[Event "Test"]

	1. e4 { [%clk 0:10:00] } 1... e5 { [%clk 0:09:58] } 2. Nf3 { [%clk 0:09:50] } 2... Nc6 { [%clk 0:09:45] } 1-0
	"#;
	        let enriched = parse_pgn_enriched(pgn, 256, 100, 2);
	        let legacy = parse_pgn_to_san(pgn, 100);

	        assert_eq!(enriched.len(), 1);
	        assert_eq!(legacy.len(), 1);

	        // Convert legacy SAN to tokens for comparison
	        let refs: Vec<&str> = legacy[0].iter().map(|s| s.as_str()).collect();
	        let (legacy_tokens, legacy_n) = san_moves_to_tokens(&refs, 256);

	        assert_eq!(enriched[0].tokens, legacy_tokens);
	        assert_eq!(enriched[0].game_length, legacy_n);
	    }

	    #[test]
	    fn test_count_games_in_date_range() {
	        let pgn = r#"[Event "Game 1"]
	[UTCDate "2023.12.05"]

	1. e4 e5 1-0

	[Event "Game 2"]
	[UTCDate "2023.12.20"]

	1. d4 d5 0-1

	[Event "Game 3"]
	[UTCDate "2025.01.15"]

	1. e4 c5 1-0
	"#;
	        assert_eq!(count_games_in_date_range(pgn, "2023.12.01", "2023.12.31"), 2);
	        assert_eq!(count_games_in_date_range(pgn, "2023.12.01", "2023.12.14"), 1);
	        assert_eq!(count_games_in_date_range(pgn, "2023.12.15", "2023.12.31"), 1);
	        assert_eq!(count_games_in_date_range(pgn, "2025.01.01", "2025.01.31"), 1);
	        assert_eq!(count_games_in_date_range(pgn, "2024.01.01", "2024.12.31"), 0);
	    }

	    #[test]
	    fn test_sampled_parsing() {
	        let pgn = r#"[Event "Game 1"]
	[UTCDate "2023.12.05"]

	1. e4 e5 1-0

	[Event "Game 2"]
	[UTCDate "2023.12.10"]

	1. d4 d5 0-1

	[Event "Game 3"]
	[UTCDate "2023.12.20"]

	1. e4 c5 1-0

	[Event "Game 4"]
	[UTCDate "2025.01.15"]

	1. Nf3 d5 1-0
	"#;
	        // 3 games match Dec 2023 (indices 0, 1, 2 within the date range)
	        assert_eq!(count_games_in_date_range(pgn, "2023.12.01", "2023.12.31"), 3);

	        // Sample only index 1 (Game 2)
	        let indices: HashSet<usize> = HashSet::from([1]);
	        let sampled = parse_pgn_enriched_sampled(
	            pgn, 256, 2, "2023.12.01", "2023.12.31", &indices, 0,
	        );
	        assert_eq!(sampled.len(), 1);
	        assert_eq!(sampled[0].headers.get("UTCDate").unwrap(), "2023.12.10");

	        // Sample indices 0 and 2 (Game 1 and Game 3)
	        let indices: HashSet<usize> = HashSet::from([0, 2]);
	        let sampled = parse_pgn_enriched_sampled(
	            pgn, 256, 2, "2023.12.01", "2023.12.31", &indices, 0,
	        );
	        assert_eq!(sampled.len(), 2);
	        assert_eq!(sampled[0].headers.get("UTCDate").unwrap(), "2023.12.05");
	        assert_eq!(sampled[1].headers.get("UTCDate").unwrap(), "2023.12.20");

	        // Sample with offset: simulating a second chunk where previous chunk had 1 match.
	        // Global index 2 = offset 1 + local index 1 => selects Game 2 (local idx 1).
	        let indices: HashSet<usize> = HashSet::from([2]);
	        let sampled = parse_pgn_enriched_sampled(
	            pgn, 256, 2, "2023.12.01", "2023.12.31", &indices, 1,
	        );
	        assert_eq!(sampled.len(), 1);
	        assert_eq!(sampled[0].headers.get("UTCDate").unwrap(), "2023.12.10");

	        // Offset that skips all local games: offset=3 means local indices are 3,4,5
	        // but we only ask for global index 0, which isn't in this chunk.
	        let indices: HashSet<usize> = HashSet::from([0]);
	        let sampled = parse_pgn_enriched_sampled(
	            pgn, 256, 2, "2023.12.01", "2023.12.31", &indices, 3,
	        );
	        assert_eq!(sampled.len(), 0);
	    }
	}