thomas-schweich commited on
Commit
87cdae5
·
unverified ·
1 Parent(s): a819b4b

Add Lichess PGN -> PAWN Parquet extraction pipeline (#4)

Browse files
.dockerignore CHANGED
@@ -14,6 +14,8 @@ deploy/
14
  !deploy/entrypoint-extract.sh
15
  !deploy/entrypoint-lc0.sh
16
  !deploy/entrypoint-rosa-sweep.sh
 
 
17
  *.so
18
  CLAUDE.md
19
  docs/
 
14
  !deploy/entrypoint-extract.sh
15
  !deploy/entrypoint-lc0.sh
16
  !deploy/entrypoint-rosa-sweep.sh
17
+ !deploy/entrypoint-lc0-selfplay.sh
18
+ !deploy/entrypoint-lichess-parquet.sh
19
  *.so
20
  CLAUDE.md
21
  docs/
Dockerfile CHANGED
@@ -92,6 +92,13 @@ COPY deploy/entrypoint-rosa-sweep.sh /entrypoint-rosa-sweep.sh
92
  RUN chmod +x /entrypoint-rosa-sweep.sh
93
  ENTRYPOINT ["/entrypoint-rosa-sweep.sh"]
94
 
 
 
 
 
 
 
 
95
  # ── Interactive (default) — SSH + Jupyter, stays alive ───────────────
96
  FROM runtime-base AS interactive
97
  # Inherits /start.sh entrypoint from Runpod base image
 
92
  RUN chmod +x /entrypoint-rosa-sweep.sh
93
  ENTRYPOINT ["/entrypoint-rosa-sweep.sh"]
94
 
95
+ # ── Lichess extract — downloads PGN, writes Parquet, pushes to HF ───
96
+ FROM runtime-base AS lichess-extract
97
+ RUN pip install --no-cache-dir zstandard
98
+ COPY deploy/entrypoint-lichess-parquet.sh /entrypoint-lichess-parquet.sh
99
+ RUN chmod +x /entrypoint-lichess-parquet.sh
100
+ ENTRYPOINT ["/entrypoint-lichess-parquet.sh"]
101
+
102
  # ── Interactive (default) — SSH + Jupyter, stays alive ───────────────
103
  FROM runtime-base AS interactive
104
  # Inherits /start.sh entrypoint from Runpod base image
deploy/entrypoint-lichess-parquet.sh ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Lichess PGN -> PAWN Parquet extraction entrypoint.
# Downloads monthly database dumps, parses via Rust engine, writes sharded
# Parquet with train/val/test splits, and optionally pushes to HuggingFace.
#
# Required env vars:
#   MONTHS        — space-separated training months (e.g., "2025-01 2025-02 2025-03")
#
# Optional env vars:
#   HF_TOKEN      — HuggingFace token (for pushing dataset)
#   HF_REPO       — HuggingFace dataset repo (e.g., "thomas-schweich/pawn-lichess-full")
#   HOLDOUT_MONTH — month for val/test (e.g., "2023-12")
#   HOLDOUT_GAMES — games per split from holdout month (default: 50000)
#   BATCH_SIZE    — games per parsing batch (default: 500000)
#   SHARD_SIZE    — games per output shard (default: 1000000)
#   MAX_GAMES     — stop after this many training games (for testing)
#   OUTPUT_DIR    — output directory (default: /workspace/lichess-parquet)
#   SEED          — random seed for holdout sampling (default: 42)
set -euo pipefail

echo "=== Lichess Parquet Extraction ==="
echo "  Training months:      ${MONTHS:?MONTHS env var is required}"
echo "  Holdout month:        ${HOLDOUT_MONTH:-none}"
echo "  Holdout games/split:  ${HOLDOUT_GAMES:-50000}"
echo "  HF Repo:              ${HF_REPO:-none}"
echo "  Batch size:           ${BATCH_SIZE:-500000}"
echo "  Shard size:           ${SHARD_SIZE:-1000000}"
echo ""

# Persist HF token if provided
if [ -n "${HF_TOKEN:-}" ]; then
    mkdir -p ~/.cache/huggingface
    echo -n "$HF_TOKEN" > ~/.cache/huggingface/token
    echo "HF token persisted"
fi

# Install zstandard if not available (needed for streaming decompression)
python3 -c "import zstandard" 2>/dev/null || pip install --no-cache-dir zstandard

# Split MONTHS on whitespace explicitly. An unquoted $MONTHS expansion would
# also undergo pathname (glob) expansion, so a stray `*` in the env var could
# inject arbitrary filenames into the argument list; `read -r -a` only splits.
read -r -a MONTH_ARGS <<< "${MONTHS}"

# Build the command as an array to avoid shell injection
CMD=(python3 /opt/pawn/scripts/extract_lichess_parquet.py
     --months "${MONTH_ARGS[@]}"
     --output "${OUTPUT_DIR:-/workspace/lichess-parquet}"
     --batch-size "${BATCH_SIZE:-500000}"
     --shard-size "${SHARD_SIZE:-1000000}"
     --seed "${SEED:-42}"
)

if [ -n "${HOLDOUT_MONTH:-}" ]; then
    CMD+=(--holdout-month "$HOLDOUT_MONTH")
    CMD+=(--holdout-games "${HOLDOUT_GAMES:-50000}")
fi
if [ -n "${HF_REPO:-}" ]; then
    CMD+=(--hf-repo "$HF_REPO")
fi
if [ -n "${MAX_GAMES:-}" ]; then
    CMD+=(--max-games "$MAX_GAMES")
fi

echo "Running: ${CMD[*]}"
echo ""
exec "${CMD[@]}"
engine/python/chess_engine/__init__.py CHANGED
@@ -26,6 +26,9 @@ from chess_engine._engine import (
26
  # PGN parsing
27
  parse_pgn_file,
28
  pgn_to_tokens,
 
 
 
29
  # UCI parsing
30
  parse_uci_file,
31
  uci_to_tokens,
@@ -60,6 +63,9 @@ __all__ = [
60
  "validate_games",
61
  "parse_pgn_file",
62
  "pgn_to_tokens",
 
 
 
63
  "parse_uci_file",
64
  "uci_to_tokens",
65
  "pgn_to_uci",
 
26
  # PGN parsing
27
  parse_pgn_file,
28
  pgn_to_tokens,
29
+ parse_pgn_enriched,
30
+ count_pgn_games_in_date_range,
31
+ parse_pgn_sampled,
32
  # UCI parsing
33
  parse_uci_file,
34
  uci_to_tokens,
 
63
  "validate_games",
64
  "parse_pgn_file",
65
  "pgn_to_tokens",
66
+ "parse_pgn_enriched",
67
+ "count_pgn_games_in_date_range",
68
+ "parse_pgn_sampled",
69
  "parse_uci_file",
70
  "uci_to_tokens",
71
  "pgn_to_uci",
engine/src/lib.rs CHANGED
@@ -809,6 +809,267 @@ fn pgn_to_uci(py: Python<'_>, games: Vec<Vec<String>>) -> PyResult<Vec<Vec<Strin
809
  Ok(results.into_iter().map(|(uci, _)| uci).collect())
810
  }
811
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
812
  // ---------------------------------------------------------------------------
813
  // UCI engine self-play generation
814
  // ---------------------------------------------------------------------------
@@ -1172,6 +1433,9 @@ fn _engine(m: &Bound<'_, PyModule>) -> PyResult<()> {
1172
  m.add_function(wrap_pyfunction!(parse_uci_file, m)?)?;
1173
  m.add_function(wrap_pyfunction!(uci_to_tokens, m)?)?;
1174
  m.add_function(wrap_pyfunction!(pgn_to_uci, m)?)?;
 
 
 
1175
  m.add_function(wrap_pyfunction!(generate_engine_games_py, m)?)?;
1176
  m.add_function(wrap_pyfunction!(compute_accuracy_ceiling_py, m)?)?;
1177
  Ok(())
 
809
  Ok(results.into_iter().map(|(uci, _)| uci).collect())
810
  }
811
 
812
+ // ---------------------------------------------------------------------------
813
+ // Enriched PGN parsing (for dataset construction)
814
+ // ---------------------------------------------------------------------------
815
+
816
+ /// Parse PGN text with full annotation extraction for dataset building.
817
+ ///
818
+ /// Extracts move tokens, clock annotations, eval annotations, and all PGN
819
+ /// headers in a single pass. Designed for streaming: Python passes chunks of
820
+ /// PGN text (containing complete games), Rust returns structured columns.
821
+ ///
822
+ /// Returns a dict with:
823
+ ///   tokens: ndarray[i16, (N, max_ply)] — PAWN token IDs, 0-padded
824
+ ///   clocks: ndarray[u16, (N, max_ply)] — seconds remaining; 0-padded past game end, 32768 (0x8000) where a move has no clock annotation
825
+ ///   evals: ndarray[i16, (N, max_ply)] — centipawns; 0-padded past game end, i16::MIN where a move has no eval annotation
826
+ /// game_lengths: ndarray[u16, (N,)] — number of plies per game
827
+ /// white_elo: ndarray[u16, (N,)] — white Elo (0 if missing)
828
+ /// black_elo: ndarray[u16, (N,)] — black Elo (0 if missing)
829
+ /// white_rating_diff: ndarray[i16, (N,)] — white rating change (0 if missing)
830
+ /// black_rating_diff: ndarray[i16, (N,)] — black rating change (0 if missing)
831
+ /// result: list[str] — "1-0", "0-1", "1/2-1/2", or ""
832
+ /// white: list[str] — white player name
833
+ /// black: list[str] — black player name
834
+ /// eco: list[str] — ECO code
835
+ /// opening: list[str] — opening name
836
+ /// time_control: list[str] — time control string
837
+ /// termination: list[str] — termination reason
838
+ /// date_time: list[str] — "YYYY.MM.DD HH:MM:SS" UTC
839
+ /// site: list[str] — game URL
840
+ #[pyfunction]
841
+ #[pyo3(signature = (content, max_ply=255, max_games=1_000_000, min_ply=1))]
842
+ fn parse_pgn_enriched<'py>(
843
+ py: Python<'py>,
844
+ content: &str,
845
+ max_ply: usize,
846
+ max_games: usize,
847
+ min_ply: usize,
848
+ ) -> PyResult<PyObject> {
849
+ let games = py.allow_threads(|| {
850
+ pgn::parse_pgn_enriched(content, max_ply, max_games, min_ply)
851
+ });
852
+
853
+ let n = games.len();
854
+ let dict = PyDict::new(py);
855
+
856
+ // Flat 0-padded arrays for tokens, clocks, evals (N * max_ply)
857
+ let mut flat_tokens = vec![0i16; n * max_ply];
858
+ let mut flat_clocks = vec![0u16; n * max_ply];
859
+ let mut flat_evals = vec![0i16; n * max_ply];
860
+
861
+ // Scalar arrays
862
+ let mut lengths_out = Vec::with_capacity(n);
863
+ let mut white_elo_out = Vec::with_capacity(n);
864
+ let mut black_elo_out = Vec::with_capacity(n);
865
+ let mut white_rd_out = Vec::with_capacity(n);
866
+ let mut black_rd_out = Vec::with_capacity(n);
867
+
868
+ // String lists
869
+ let mut result_out = Vec::with_capacity(n);
870
+ let mut white_out = Vec::with_capacity(n);
871
+ let mut black_out = Vec::with_capacity(n);
872
+ let mut eco_out = Vec::with_capacity(n);
873
+ let mut opening_out = Vec::with_capacity(n);
874
+ let mut tc_out = Vec::with_capacity(n);
875
+ let mut term_out = Vec::with_capacity(n);
876
+ let mut datetime_out = Vec::with_capacity(n);
877
+ let mut site_out = Vec::with_capacity(n);
878
+
879
+ for (gi, g) in games.iter().enumerate() {
880
+ let offset = gi * max_ply;
881
+ let len = g.game_length.min(max_ply);
882
+ for t in 0..len {
883
+ flat_tokens[offset + t] = g.tokens[t] as i16;
884
+ flat_clocks[offset + t] = g.clocks[t];
885
+ flat_evals[offset + t] = g.evals[t];
886
+ }
887
+
888
+ lengths_out.push(g.game_length as u16);
889
+
890
+ let h = &g.headers;
891
+ white_elo_out.push(h.get("WhiteElo").and_then(|s| s.parse::<u16>().ok()).unwrap_or(0));
892
+ black_elo_out.push(h.get("BlackElo").and_then(|s| s.parse::<u16>().ok()).unwrap_or(0));
893
+ white_rd_out.push(h.get("WhiteRatingDiff").and_then(|s| s.parse::<i16>().ok()).unwrap_or(0));
894
+ black_rd_out.push(h.get("BlackRatingDiff").and_then(|s| s.parse::<i16>().ok()).unwrap_or(0));
895
+
896
+ result_out.push(h.get("Result").cloned().unwrap_or_default());
897
+ white_out.push(h.get("White").cloned().unwrap_or_default());
898
+ black_out.push(h.get("Black").cloned().unwrap_or_default());
899
+ eco_out.push(h.get("ECO").cloned().unwrap_or_default());
900
+ opening_out.push(h.get("Opening").cloned().unwrap_or_default());
901
+ tc_out.push(h.get("TimeControl").cloned().unwrap_or_default());
902
+ term_out.push(h.get("Termination").cloned().unwrap_or_default());
903
+ site_out.push(h.get("Site").cloned().unwrap_or_default());
904
+
905
+ let date = h.get("UTCDate").cloned().unwrap_or_default();
906
+ let time = h.get("UTCTime").cloned().unwrap_or_default();
907
+ if !date.is_empty() && !time.is_empty() {
908
+ datetime_out.push(format!("{} {}", date, time));
909
+ } else {
910
+ datetime_out.push(date);
911
+ }
912
+ }
913
+
914
+ // 2D numpy arrays: (N, max_ply)
915
+ let tokens_arr = numpy::PyArray::from_vec(py, flat_tokens).reshape([n, max_ply])?;
916
+ let clocks_arr = numpy::PyArray::from_vec(py, flat_clocks).reshape([n, max_ply])?;
917
+ let evals_arr = numpy::PyArray::from_vec(py, flat_evals).reshape([n, max_ply])?;
918
+
919
+ // 1D numpy arrays
920
+ let lengths_arr = numpy::PyArray::from_vec(py, lengths_out);
921
+ let white_elo_arr = numpy::PyArray::from_vec(py, white_elo_out);
922
+ let black_elo_arr = numpy::PyArray::from_vec(py, black_elo_out);
923
+ let white_rd_arr = numpy::PyArray::from_vec(py, white_rd_out);
924
+ let black_rd_arr = numpy::PyArray::from_vec(py, black_rd_out);
925
+
926
+ dict.set_item("tokens", tokens_arr)?;
927
+ dict.set_item("clocks", clocks_arr)?;
928
+ dict.set_item("evals", evals_arr)?;
929
+ dict.set_item("game_lengths", lengths_arr)?;
930
+ dict.set_item("white_elo", white_elo_arr)?;
931
+ dict.set_item("black_elo", black_elo_arr)?;
932
+ dict.set_item("white_rating_diff", white_rd_arr)?;
933
+ dict.set_item("black_rating_diff", black_rd_arr)?;
934
+ dict.set_item("result", result_out)?;
935
+ dict.set_item("white", white_out)?;
936
+ dict.set_item("black", black_out)?;
937
+ dict.set_item("eco", eco_out)?;
938
+ dict.set_item("opening", opening_out)?;
939
+ dict.set_item("time_control", tc_out)?;
940
+ dict.set_item("termination", term_out)?;
941
+ dict.set_item("date_time", datetime_out)?;
942
+ dict.set_item("site", site_out)?;
943
+
944
+ Ok(dict.into())
945
+ }
946
+
947
+ /// Count games in a PGN string whose UTCDate falls within [date_start, date_end].
948
+ /// Header-only scan — no tokenization. Very fast.
949
+ #[pyfunction]
950
+ fn count_pgn_games_in_date_range(
951
+ py: Python<'_>,
952
+ content: &str,
953
+ date_start: &str,
954
+ date_end: &str,
955
+ ) -> PyResult<usize> {
956
+ let count = py.allow_threads(|| {
957
+ pgn::count_games_in_date_range(content, date_start, date_end)
958
+ });
959
+ Ok(count)
960
+ }
961
+
962
+ /// Parse only specific games (by index within a date range) from a PGN string.
963
+ ///
964
+ /// Used for uniform random sampling: call count_pgn_games_in_date_range first
965
+ /// to get the total, generate random indices in Python, then call this to
966
+ /// parse only those games. `game_offset` is the cumulative count of
967
+ /// date-matching games from previous chunks.
968
+ ///
969
+ /// Returns the same dict format as parse_pgn_enriched.
970
+ #[pyfunction]
971
+ #[pyo3(signature = (content, indices, date_start, date_end, game_offset=0, max_ply=255, min_ply=1))]
972
+ fn parse_pgn_sampled<'py>(
973
+ py: Python<'py>,
974
+ content: &str,
975
+ indices: Vec<usize>,
976
+ date_start: &str,
977
+ date_end: &str,
978
+ game_offset: usize,
979
+ max_ply: usize,
980
+ min_ply: usize,
981
+ ) -> PyResult<PyObject> {
982
+ let index_set: std::collections::HashSet<usize> = indices.into_iter().collect();
983
+
984
+ let games = py.allow_threads(|| {
985
+ pgn::parse_pgn_enriched_sampled(
986
+ content, max_ply, min_ply, date_start, date_end, &index_set, game_offset,
987
+ )
988
+ });
989
+
990
+ // Reuse the same dict-building logic as parse_pgn_enriched
991
+ let n = games.len();
992
+ let dict = PyDict::new(py);
993
+
994
+ let mut flat_tokens = vec![0i16; n * max_ply];
995
+ let mut flat_clocks = vec![0u16; n * max_ply];
996
+ let mut flat_evals = vec![0i16; n * max_ply];
997
+ let mut lengths_out = Vec::with_capacity(n);
998
+ let mut white_elo_out = Vec::with_capacity(n);
999
+ let mut black_elo_out = Vec::with_capacity(n);
1000
+ let mut white_rd_out = Vec::with_capacity(n);
1001
+ let mut black_rd_out = Vec::with_capacity(n);
1002
+ let mut result_out = Vec::with_capacity(n);
1003
+ let mut white_out = Vec::with_capacity(n);
1004
+ let mut black_out = Vec::with_capacity(n);
1005
+ let mut eco_out = Vec::with_capacity(n);
1006
+ let mut opening_out = Vec::with_capacity(n);
1007
+ let mut tc_out = Vec::with_capacity(n);
1008
+ let mut term_out = Vec::with_capacity(n);
1009
+ let mut datetime_out = Vec::with_capacity(n);
1010
+ let mut site_out = Vec::with_capacity(n);
1011
+
1012
+ for (gi, g) in games.iter().enumerate() {
1013
+ let offset = gi * max_ply;
1014
+ let len = g.game_length.min(max_ply);
1015
+ for t in 0..len {
1016
+ flat_tokens[offset + t] = g.tokens[t] as i16;
1017
+ flat_clocks[offset + t] = g.clocks[t];
1018
+ flat_evals[offset + t] = g.evals[t];
1019
+ }
1020
+ lengths_out.push(g.game_length as u16);
1021
+ let h = &g.headers;
1022
+ white_elo_out.push(h.get("WhiteElo").and_then(|s| s.parse::<u16>().ok()).unwrap_or(0));
1023
+ black_elo_out.push(h.get("BlackElo").and_then(|s| s.parse::<u16>().ok()).unwrap_or(0));
1024
+ white_rd_out.push(h.get("WhiteRatingDiff").and_then(|s| s.parse::<i16>().ok()).unwrap_or(0));
1025
+ black_rd_out.push(h.get("BlackRatingDiff").and_then(|s| s.parse::<i16>().ok()).unwrap_or(0));
1026
+ result_out.push(h.get("Result").cloned().unwrap_or_default());
1027
+ white_out.push(h.get("White").cloned().unwrap_or_default());
1028
+ black_out.push(h.get("Black").cloned().unwrap_or_default());
1029
+ eco_out.push(h.get("ECO").cloned().unwrap_or_default());
1030
+ opening_out.push(h.get("Opening").cloned().unwrap_or_default());
1031
+ tc_out.push(h.get("TimeControl").cloned().unwrap_or_default());
1032
+ term_out.push(h.get("Termination").cloned().unwrap_or_default());
1033
+ site_out.push(h.get("Site").cloned().unwrap_or_default());
1034
+ let date = h.get("UTCDate").cloned().unwrap_or_default();
1035
+ let time = h.get("UTCTime").cloned().unwrap_or_default();
1036
+ if !date.is_empty() && !time.is_empty() {
1037
+ datetime_out.push(format!("{} {}", date, time));
1038
+ } else {
1039
+ datetime_out.push(date);
1040
+ }
1041
+ }
1042
+
1043
+ let tokens_arr = numpy::PyArray::from_vec(py, flat_tokens).reshape([n, max_ply])?;
1044
+ let clocks_arr = numpy::PyArray::from_vec(py, flat_clocks).reshape([n, max_ply])?;
1045
+ let evals_arr = numpy::PyArray::from_vec(py, flat_evals).reshape([n, max_ply])?;
1046
+ let lengths_arr = numpy::PyArray::from_vec(py, lengths_out);
1047
+ let white_elo_arr = numpy::PyArray::from_vec(py, white_elo_out);
1048
+ let black_elo_arr = numpy::PyArray::from_vec(py, black_elo_out);
1049
+ let white_rd_arr = numpy::PyArray::from_vec(py, white_rd_out);
1050
+ let black_rd_arr = numpy::PyArray::from_vec(py, black_rd_out);
1051
+
1052
+ dict.set_item("tokens", tokens_arr)?;
1053
+ dict.set_item("clocks", clocks_arr)?;
1054
+ dict.set_item("evals", evals_arr)?;
1055
+ dict.set_item("game_lengths", lengths_arr)?;
1056
+ dict.set_item("white_elo", white_elo_arr)?;
1057
+ dict.set_item("black_elo", black_elo_arr)?;
1058
+ dict.set_item("white_rating_diff", white_rd_arr)?;
1059
+ dict.set_item("black_rating_diff", black_rd_arr)?;
1060
+ dict.set_item("result", result_out)?;
1061
+ dict.set_item("white", white_out)?;
1062
+ dict.set_item("black", black_out)?;
1063
+ dict.set_item("eco", eco_out)?;
1064
+ dict.set_item("opening", opening_out)?;
1065
+ dict.set_item("time_control", tc_out)?;
1066
+ dict.set_item("termination", term_out)?;
1067
+ dict.set_item("date_time", datetime_out)?;
1068
+ dict.set_item("site", site_out)?;
1069
+
1070
+ Ok(dict.into())
1071
+ }
1072
+
1073
  // ---------------------------------------------------------------------------
1074
  // UCI engine self-play generation
1075
  // ---------------------------------------------------------------------------
 
1433
  m.add_function(wrap_pyfunction!(parse_uci_file, m)?)?;
1434
  m.add_function(wrap_pyfunction!(uci_to_tokens, m)?)?;
1435
  m.add_function(wrap_pyfunction!(pgn_to_uci, m)?)?;
1436
+ m.add_function(wrap_pyfunction!(parse_pgn_enriched, m)?)?;
1437
+ m.add_function(wrap_pyfunction!(count_pgn_games_in_date_range, m)?)?;
1438
+ m.add_function(wrap_pyfunction!(parse_pgn_sampled, m)?)?;
1439
  m.add_function(wrap_pyfunction!(generate_engine_games_py, m)?)?;
1440
  m.add_function(wrap_pyfunction!(compute_accuracy_ceiling_py, m)?)?;
1441
  Ok(())
engine/src/pgn.rs CHANGED
@@ -3,7 +3,11 @@
3
  //! Full pipeline in Rust: reads PGN files, extracts SAN move strings,
4
  //! converts to PAWN tokens via shakmaty. Uses rayon for parallel
5
  //! token conversion.
 
 
 
6
 
 
7
  use std::fs;
8
  use rayon::prelude::*;
9
  use shakmaty::{Chess, Position};
@@ -11,6 +15,420 @@ use shakmaty::san::San;
11
 
12
  use crate::board::move_to_token;
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  /// Convert a sequence of SAN move strings to PAWN token indices.
15
  ///
16
  /// Returns (tokens, n_valid) where tokens has length up to max_ply,
@@ -277,4 +695,214 @@ mod tests {
277
 
278
  fs::remove_file(path).ok();
279
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  }
 
3
  //! Full pipeline in Rust: reads PGN files, extracts SAN move strings,
4
  //! converts to PAWN tokens via shakmaty. Uses rayon for parallel
5
  //! token conversion.
6
+ //!
7
+ //! Also provides enriched parsing that extracts clock annotations,
8
+ //! eval annotations, and PGN headers for dataset construction.
9
 
10
+ use std::collections::{HashMap, HashSet};
11
  use std::fs;
12
  use rayon::prelude::*;
13
  use shakmaty::{Chess, Position};
 
15
 
16
  use crate::board::move_to_token;
17
 
18
+ // ---------------------------------------------------------------------------
19
+ // Enriched PGN parsing — extracts moves, clocks, evals, and headers
20
+ // ---------------------------------------------------------------------------
21
+
22
+ /// A fully parsed game with move tokens, annotations, and metadata.
23
+ pub struct EnrichedGame {
24
+ /// PAWN token indices for each ply (not padded).
25
+ pub tokens: Vec<u16>,
26
+ /// Seconds remaining on clock after each ply (0 = no annotation).
27
+ pub clocks: Vec<u16>,
28
+ /// Centipawns from white's perspective after each ply.
29
+ /// Mate scores: ±(32767-N). No annotation: 0x8000 (-32768 as i16).
30
+ pub evals: Vec<i16>,
31
+ /// Number of valid plies.
32
+ pub game_length: usize,
33
+ /// PGN header fields (e.g., "White" -> "alice", "WhiteElo" -> "1873").
34
+ pub headers: HashMap<String, String>,
35
+ }
36
+
37
+ /// Parse a PGN string into enriched games.
38
+ ///
39
+ /// Extracts SAN moves (tokenized), `[%clk h:mm:ss]` annotations,
40
+ /// `[%eval ±N.NN]` / `[%eval #±N]` annotations, and all PGN headers.
41
+ /// Tokenization uses shakmaty and is parallelized with rayon.
42
+ pub fn parse_pgn_enriched(
43
+ content: &str,
44
+ max_ply: usize,
45
+ max_games: usize,
46
+ min_ply: usize,
47
+ ) -> Vec<EnrichedGame> {
48
+ let raw_games = parse_raw_games(content, max_games, None, None);
49
+
50
+ // Phase 2: parallel tokenization + annotation extraction
51
+ raw_games
52
+ .into_par_iter()
53
+ .filter_map(|raw| {
54
+ let (san_moves, clocks_raw, evals_raw) = extract_moves_and_annotations(&raw.movetext);
55
+ if san_moves.len() < min_ply {
56
+ return None;
57
+ }
58
+
59
+ // Tokenize SAN moves via shakmaty
60
+ let refs: Vec<&str> = san_moves.iter().map(|s| s.as_str()).collect();
61
+ let (tokens, n_valid) = san_moves_to_tokens(&refs, max_ply);
62
+ if n_valid < min_ply {
63
+ return None;
64
+ }
65
+
66
+ // Trim annotations to match token count (moves may have failed to parse).
67
+ let clocks = clocks_raw.into_iter().take(n_valid).collect();
68
+ let evals = evals_raw.into_iter().take(n_valid).collect();
69
+
70
+ Some(EnrichedGame {
71
+ tokens,
72
+ clocks,
73
+ evals,
74
+ game_length: n_valid,
75
+ headers: raw.headers,
76
+ })
77
+ })
78
+ .collect()
79
+ }
80
+
81
+ /// Count games in a PGN string whose UTCDate falls within [start, end].
82
+ ///
83
+ /// Header-only scan — no movetext parsing, no tokenization.
84
+ /// Returns (count_in_range, offset) where offset is the running game index
85
+ /// that should be passed to the next chunk for correct global indexing.
86
+ pub fn count_games_in_date_range(
87
+ content: &str,
88
+ date_start: &str,
89
+ date_end: &str,
90
+ ) -> usize {
91
+ let mut count = 0;
92
+ let mut current_date: Option<String> = None;
93
+ let mut in_movetext = false;
94
+
95
+ for line in content.lines() {
96
+ let line = line.trim();
97
+ if line.is_empty() {
98
+ if in_movetext {
99
+ // End of game — check if the date was in range
100
+ if let Some(ref d) = current_date {
101
+ if d.as_str() >= date_start && d.as_str() <= date_end {
102
+ count += 1;
103
+ }
104
+ }
105
+ current_date = None;
106
+ in_movetext = false;
107
+ }
108
+ continue;
109
+ }
110
+
111
+ if line.starts_with('[') && line.ends_with(']') {
112
+ if let Some((key, value)) = parse_header_line(line) {
113
+ if key == "UTCDate" {
114
+ current_date = Some(value);
115
+ }
116
+ }
117
+ in_movetext = false;
118
+ } else {
119
+ in_movetext = true;
120
+ }
121
+ }
122
+
123
+ // Handle last game
124
+ if in_movetext {
125
+ if let Some(ref d) = current_date {
126
+ if d.as_str() >= date_start && d.as_str() <= date_end {
127
+ count += 1;
128
+ }
129
+ }
130
+ }
131
+
132
+ count
133
+ }
134
+
135
+ /// Parse a PGN string, but only tokenize games at specific indices within a
136
+ /// date range. Used for uniform random sampling: Python counts games in the
137
+ /// date range (via `count_games_in_date_range`), generates a random index
138
+ /// set, then calls this to parse only those games.
139
+ ///
140
+ /// `indices` are 0-based within the date-range-matching games of this chunk.
141
+ /// `game_offset` is the number of date-matching games seen in previous chunks,
142
+ /// so global index = game_offset + local_index.
143
+ pub fn parse_pgn_enriched_sampled(
144
+ content: &str,
145
+ max_ply: usize,
146
+ min_ply: usize,
147
+ date_start: &str,
148
+ date_end: &str,
149
+ indices: &HashSet<usize>,
150
+ game_offset: usize,
151
+ ) -> Vec<EnrichedGame> {
152
+ let raw_games = parse_raw_games(content, usize::MAX, Some((date_start, date_end)), Some((indices, game_offset)));
153
+
154
+ raw_games
155
+ .into_par_iter()
156
+ .filter_map(|raw| {
157
+ let (san_moves, clocks_raw, evals_raw) = extract_moves_and_annotations(&raw.movetext);
158
+ if san_moves.len() < min_ply {
159
+ return None;
160
+ }
161
+
162
+ let refs: Vec<&str> = san_moves.iter().map(|s| s.as_str()).collect();
163
+ let (tokens, n_valid) = san_moves_to_tokens(&refs, max_ply);
164
+ if n_valid < min_ply {
165
+ return None;
166
+ }
167
+
168
+ let clocks = clocks_raw.into_iter().take(n_valid).collect();
169
+ let evals = evals_raw.into_iter().take(n_valid).collect();
170
+
171
+ Some(EnrichedGame {
172
+ tokens,
173
+ clocks,
174
+ evals,
175
+ game_length: n_valid,
176
+ headers: raw.headers,
177
+ })
178
+ })
179
+ .collect()
180
+ }
181
+
/// Raw game data before tokenization.
struct RawGame {
    headers: HashMap<String, String>,
    movetext: String,
}

/// Single-threaded PGN line scanner. Extracts headers and raw movetext.
///
/// If `date_range` is Some((start, end)), only games whose UTCDate falls
/// within [start, end] are included. If `sample` is Some((indices, offset)),
/// only games whose (offset + local_index) is in the index set are kept.
///
/// Bug fix: `in_movetext` is now set for every movetext line, including lines
/// of date-excluded games. Previously an excluded game never set the flag, so
/// the per-game reset at the following blank line never ran — `date_excluded`
/// (and stale headers) leaked into every later game in the chunk, silently
/// dropping all of them. `count_games_in_date_range` already set the flag
/// unconditionally; this makes the two scanners consistent.
fn parse_raw_games(
    content: &str,
    max_games: usize,
    date_range: Option<(&str, &str)>,
    sample: Option<(&HashSet<usize>, usize)>,
) -> Vec<RawGame> {
    let mut games = Vec::new();
    let mut headers: HashMap<String, String> = HashMap::new();
    let mut movetext_lines: Vec<&str> = Vec::new();
    let mut in_movetext = false;       // currently inside a game's movetext
    let mut date_excluded = false;     // UTCDate outside date_range
    let mut has_utc_date = false;      // saw a UTCDate header for this game
    let mut date_matched_idx = 0usize; // count of date-matching games seen

    for line in content.lines() {
        let line = line.trim();
        if line.is_empty() {
            if in_movetext {
                // End of movetext — game boundary.
                // Exclude if date is out of range OR if date_range is active
                // but no UTCDate header was found (consistent with
                // count_games_in_date_range).
                let excluded = date_excluded || (date_range.is_some() && !has_utc_date);
                if !excluded && !movetext_lines.is_empty() {
                    // Game passed the date filter. Check the sample if present.
                    let keep = match sample {
                        Some((indices, offset)) => indices.contains(&(offset + date_matched_idx)),
                        None => true,
                    };
                    date_matched_idx += 1;

                    if keep {
                        games.push(RawGame {
                            headers: std::mem::take(&mut headers),
                            movetext: movetext_lines.join(" "),
                        });
                        if games.len() >= max_games {
                            break;
                        }
                    }
                }
                // Reset per-game state for ALL games, excluded or not.
                movetext_lines.clear();
                headers.clear();
                in_movetext = false;
                date_excluded = false;
                has_utc_date = false;
            }
            // Blank line between headers and movetext: don't reset state
            continue;
        }

        // Header line: [Key "Value"]
        if line.starts_with('[') && line.ends_with(']') {
            if let Some((key, value)) = parse_header_line(line) {
                if key == "UTCDate" {
                    has_utc_date = true;
                    if let Some((start, end)) = date_range {
                        if value.as_str() < start || value.as_str() > end {
                            date_excluded = true;
                        }
                    }
                }
                if !date_excluded {
                    headers.insert(key, value);
                }
            }
            in_movetext = false;
            continue;
        }

        // Movetext line. Mark the game as "in movetext" even when it is
        // date-excluded so the blank-line boundary above still resets
        // per-game state; only buffer text for games we may keep.
        in_movetext = true;
        if !date_excluded {
            movetext_lines.push(line);
        }
    }

    // Handle a final game that is not followed by a trailing blank line.
    let last_excluded = date_excluded || (date_range.is_some() && !has_utc_date);
    if in_movetext && !last_excluded && !movetext_lines.is_empty() && games.len() < max_games {
        let keep = match sample {
            Some((indices, offset)) => indices.contains(&(offset + date_matched_idx)),
            None => true,
        };
        if keep {
            games.push(RawGame {
                headers: std::mem::take(&mut headers),
                movetext: movetext_lines.join(" "),
            });
        }
    }

    games
}

/// Parse a PGN header line like `[White "alice"]` into ("White", "alice").
fn parse_header_line(line: &str) -> Option<(String, String)> {
    // Strip surrounding brackets
    let inner = line.strip_prefix('[')?.strip_suffix(']')?.trim();
    let space = inner.find(' ')?;
    let key = inner[..space].to_string();
    let value_part = inner[space..].trim();
    // Strip surrounding quotes (left in place if unpaired)
    let value = value_part
        .strip_prefix('"')
        .and_then(|v| v.strip_suffix('"'))
        .unwrap_or(value_part)
        .to_string();
    Some((key, value))
}
+
302
+ /// Sentinel for "no clock annotation" (0x8000 as u16 = 32768).
303
+ const CLOCK_NONE: u16 = 0x8000;
304
+ /// Sentinel for "no eval annotation" (0x8000 as i16 = -32768).
305
+ const EVAL_NONE: i16 = -0x8000; // i16::MIN
306
+
307
/// Extract SAN moves, clock annotations, and eval annotations from movetext.
///
/// Returns (san_moves, clocks, evals) where clocks[i] is the clock after
/// move i (CLOCK_NONE if no annotation) and evals[i] is centipawns after
/// move i (EVAL_NONE if no annotation). All three vectors always have the
/// same length: sentinel slots are pushed alongside every move and only
/// overwritten when a following comment carries an annotation.
fn extract_moves_and_annotations(text: &str) -> (Vec<String>, Vec<u16>, Vec<i16>) {
    let mut moves = Vec::new();
    let mut clocks = Vec::new();
    let mut evals = Vec::new();

    // Byte-level scan: Lichess movetext is ASCII-delimited, so byte indices
    // are safe slice boundaries for the token/comment substrings taken below.
    let bytes = text.as_bytes();
    let len = bytes.len();

    // Lichess format: move { comment } move { comment } ...
    // The comment annotates the move immediately before it.
    let mut i = 0;
    while i < len {
        if bytes[i].is_ascii_whitespace() {
            i += 1;
            continue;
        }

        // Comment: { ... } — applies to the last pushed move
        if bytes[i] == b'{' {
            i += 1;
            let start = i;
            // Scan to the closing brace (or end of input if unterminated).
            while i < len && bytes[i] != b'}' {
                i += 1;
            }
            let comment = &text[start..i];
            // Skip the '}' itself unless the comment ran off the end.
            if i < len { i += 1; }

            // Apply to last move. A comment before any move (clocks empty)
            // is silently dropped.
            if let Some(last_clk) = clocks.last_mut() {
                let mut clk = CLOCK_NONE;
                let mut ev = EVAL_NONE;
                parse_comment(comment, &mut clk, &mut ev);
                if clk != CLOCK_NONE { *last_clk = clk; }
                if ev != EVAL_NONE {
                    if let Some(last_ev) = evals.last_mut() {
                        *last_ev = ev;
                    }
                }
            }
            continue;
        }

        // Plain token: runs until whitespace or the start of a comment.
        let start = i;
        while i < len && !bytes[i].is_ascii_whitespace() && bytes[i] != b'{' {
            i += 1;
        }
        let token = &text[start..i];

        // NAG annotations ($1, $2, ...) carry no move — skip.
        if token.starts_with('$') { continue; }
        // A result token terminates the movetext.
        if token == "1-0" || token == "0-1" || token == "1/2-1/2" || token == "*" { break; }

        // Move numbers ("1.", "1...", "12") are digits plus trailing dots — skip.
        let stripped = token.trim_end_matches('.');
        if !stripped.is_empty() && stripped.bytes().all(|b| b.is_ascii_digit()) { continue; }

        // A real SAN move: record it with sentinel annotation slots, to be
        // filled in by the comment (if any) that follows it.
        moves.push(token.to_string());
        clocks.push(CLOCK_NONE);
        evals.push(EVAL_NONE);
    }

    (moves, clocks, evals)
}
373
+
374
+ /// Parse a PGN comment body for clock and eval annotations.
375
+ ///
376
+ /// Lichess format: `[%clk 0:03:00]` and `[%eval 1.23]` or `[%eval #-3]`.
377
+ fn parse_comment(comment: &str, clock: &mut u16, eval: &mut i16) {
378
+ // Clock: [%clk H:MM:SS]
379
+ if let Some(pos) = comment.find("[%clk ") {
380
+ let rest = &comment[pos + 6..];
381
+ if let Some(end) = rest.find(']') {
382
+ let clk_str = rest[..end].trim();
383
+ if let Some(secs) = parse_clock(clk_str) {
384
+ *clock = secs;
385
+ }
386
+ }
387
+ }
388
+
389
+ // Eval: [%eval 1.23] or [%eval #-3]
390
+ if let Some(pos) = comment.find("[%eval ") {
391
+ let rest = &comment[pos + 7..];
392
+ if let Some(end) = rest.find(']') {
393
+ let eval_str = rest[..end].trim();
394
+ if let Some(cp) = parse_eval(eval_str) {
395
+ *eval = cp;
396
+ }
397
+ }
398
+ }
399
+ }
400
+
401
/// Parse a PGN clock string into total seconds as u16.
///
/// Accepts the Lichess `H:MM:SS` form and, as a backward-compatible
/// generalization, the bare `MM:SS` form used by some other PGN sources.
/// Arithmetic saturates so an absurd hours field cannot overflow u32
/// (the original `h * 3600` could panic in debug builds). The result is
/// capped at 0x7FFF (32767) to avoid collision with CLOCK_NONE (0x8000).
/// Returns `None` on any malformed input.
fn parse_clock(s: &str) -> Option<u16> {
    let parts: Vec<&str> = s.split(':').collect();
    let (h, m, sec): (u32, u32, u32) = match parts.as_slice() {
        [h, m, s] => (h.parse().ok()?, m.parse().ok()?, s.parse().ok()?),
        // MM:SS with no hours field.
        [m, s] => (0, m.parse().ok()?, s.parse().ok()?),
        _ => return None,
    };
    let total = h
        .saturating_mul(3600)
        .saturating_add(m.saturating_mul(60))
        .saturating_add(sec);
    // Cap at 0x7FFF (32767) to avoid collision with CLOCK_NONE (0x8000).
    Some(total.min(0x7FFF) as u16)
}
412
+
413
/// Convert a Lichess eval string to centipawns (i16).
///
/// Numeric evals: "1.23" → 123, "-0.50" → -50, clamped to ±16383.
/// Mate-in-N ("#N" / "#-N") maps to ±(32767 - N). Every mate value has
/// bit 14 (0x4000) set while clamped centipawns never do, so mates are
/// detectable with a single bitmask. Returns `None` on unparseable input.
fn parse_eval(s: &str) -> Option<i16> {
    match s.strip_prefix('#') {
        Some(mate_str) => {
            let n: i32 = mate_str.parse().ok()?;
            // "#0" is treated as mate-in-1 via max(1).
            let dist = n.unsigned_abs().max(1) as i16;
            let magnitude = 32767 - dist;
            Some(if n > 0 { magnitude } else { -magnitude })
        }
        None => {
            let pawns: f64 = s.parse().ok()?;
            let centipawns = (pawns * 100.0).round() as i32;
            // Clamp below the mate range so the two encodings never overlap.
            Some(centipawns.clamp(-16383, 16383) as i16)
        }
    }
}
431
+
432
  /// Convert a sequence of SAN move strings to PAWN token indices.
433
  ///
434
  /// Returns (tokens, n_valid) where tokens has length up to max_ply,
 
695
 
696
  fs::remove_file(path).ok();
697
  }
698
+
699
+ // --- Enriched parsing tests ---
700
+
701
    #[test]
    fn test_parse_clock() {
        // H:MM:SS → total seconds; malformed strings yield None.
        assert_eq!(parse_clock("0:10:00"), Some(600));
        assert_eq!(parse_clock("1:30:00"), Some(5400));
        assert_eq!(parse_clock("0:00:05"), Some(5));
        assert_eq!(parse_clock("0:03:00"), Some(180));
        assert_eq!(parse_clock("bad"), None);
    }
709
+
710
    #[test]
    fn test_parse_eval() {
        // Plain centipawn conversion.
        assert_eq!(parse_eval("0.23"), Some(23));
        assert_eq!(parse_eval("-1.50"), Some(-150));
        assert_eq!(parse_eval("0.00"), Some(0));
        // Mate scores: 32767 - N
        assert_eq!(parse_eval("#1"), Some(32766));
        assert_eq!(parse_eval("#-1"), Some(-32766));
        assert_eq!(parse_eval("#3"), Some(32764));
        assert_eq!(parse_eval("#-3"), Some(-32764));
        assert_eq!(parse_eval("#10"), Some(32757));
        // Bit 14 (0x4000 = 16384) is set for all mate values
        assert!(parse_eval("#1").unwrap() & 0x4000 != 0);
        assert!(parse_eval("#100").unwrap() & 0x4000 != 0);
        // Centipawns clamped to ±16383 to avoid mate range
        assert_eq!(parse_eval("200.00"), Some(16383));
        assert_eq!(parse_eval("-200.00"), Some(-16383));
    }
728
+
729
    #[test]
    fn test_parse_header_line() {
        // Tag pairs split at the first space; quotes are stripped from values.
        assert_eq!(
            parse_header_line(r#"[White "alice"]"#),
            Some(("White".to_string(), "alice".to_string()))
        );
        assert_eq!(
            parse_header_line(r#"[WhiteElo "1873"]"#),
            Some(("WhiteElo".to_string(), "1873".to_string()))
        );
        // Values may themselves contain spaces and colons.
        assert_eq!(
            parse_header_line(r#"[Opening "Bird Opening: Dutch Variation"]"#),
            Some(("Opening".to_string(), "Bird Opening: Dutch Variation".to_string()))
        );
    }
744
+
745
    #[test]
    fn test_extract_moves_and_annotations() {
        // Comments annotate the preceding move; a missing [%eval] leaves the
        // EVAL_NONE sentinel in place while the clock is still recorded.
        let text = r#"1. e4 { [%clk 0:10:00] [%eval 0.23] } 1... e5 { [%clk 0:09:58] [%eval 0.31] } 2. Nf3 { [%clk 0:09:55] } 1-0"#;
        let (moves, clocks, evals) = extract_moves_and_annotations(text);
        assert_eq!(moves, vec!["e4", "e5", "Nf3"]);
        assert_eq!(clocks, vec![600, 598, 595]);
        assert_eq!(evals, vec![23, 31, EVAL_NONE]);
    }
753
+
754
    #[test]
    fn test_extract_moves_no_annotations() {
        // Movetext without comments still yields full-length sentinel vectors.
        let text = "1. e4 e5 2. Nf3 Nc6 1-0";
        let (moves, clocks, evals) = extract_moves_and_annotations(text);
        assert_eq!(moves, vec!["e4", "e5", "Nf3", "Nc6"]);
        assert_eq!(clocks, vec![CLOCK_NONE, CLOCK_NONE, CLOCK_NONE, CLOCK_NONE]);
        assert_eq!(evals, vec![EVAL_NONE; 4]);
    }
762
+
763
    #[test]
    fn test_extract_moves_mate_eval() {
        // "#-3" encodes mate-against in 3: -(32767 - 3) = -32764.
        let text = r#"1. e4 { [%eval 0.23] } 1... e5 { [%eval #-3] } 1-0"#;
        let (moves, _clocks, evals) = extract_moves_and_annotations(text);
        assert_eq!(moves, vec!["e4", "e5"]);
        assert_eq!(evals, vec![23, -32764]);
    }
770
+
771
    #[test]
    fn test_enriched_full_game() {
        // End-to-end enriched parse: headers, per-move clocks, and evals all
        // survive into the returned game struct.
        let pgn = r#"[Event "Rated Rapid game"]
[Site "https://lichess.org/abc123"]
[White "alice"]
[Black "bob"]
[Result "1-0"]
[WhiteElo "1873"]
[BlackElo "1844"]
[WhiteRatingDiff "+6"]
[BlackRatingDiff "-26"]
[ECO "C20"]
[Opening "King's Pawn Game"]
[TimeControl "600+0"]
[Termination "Normal"]
[UTCDate "2025.01.15"]
[UTCTime "12:30:00"]

1. e4 { [%clk 0:10:00] [%eval 0.23] } 1... e5 { [%clk 0:09:58] [%eval 0.31] } 2. Nf3 { [%clk 0:09:50] [%eval 0.25] } 2... Nc6 { [%clk 0:09:45] [%eval 0.30] } 1-0
"#;
        let games = parse_pgn_enriched(pgn, 256, 100, 2);
        assert_eq!(games.len(), 1);
        let g = &games[0];
        assert_eq!(g.game_length, 4);
        assert_eq!(g.clocks, vec![600, 598, 590, 585]);
        assert_eq!(g.evals, vec![23, 31, 25, 30]);
        assert_eq!(g.headers.get("White").unwrap(), "alice");
        assert_eq!(g.headers.get("WhiteElo").unwrap(), "1873");
        assert_eq!(g.headers.get("Site").unwrap(), "https://lichess.org/abc123");
        assert_eq!(g.headers.get("ECO").unwrap(), "C20");
        assert_eq!(g.headers.get("TimeControl").unwrap(), "600+0");
    }
803
+
804
    #[test]
    fn test_enriched_tokens_match_legacy() {
        // Enriched parsing should produce the same tokens as the legacy pipeline
        let pgn = r#"[Event "Test"]

1. e4 { [%clk 0:10:00] } 1... e5 { [%clk 0:09:58] } 2. Nf3 { [%clk 0:09:50] } 2... Nc6 { [%clk 0:09:45] } 1-0
"#;
        let enriched = parse_pgn_enriched(pgn, 256, 100, 2);
        let legacy = parse_pgn_to_san(pgn, 100);

        assert_eq!(enriched.len(), 1);
        assert_eq!(legacy.len(), 1);

        // Convert legacy SAN to tokens for comparison
        let refs: Vec<&str> = legacy[0].iter().map(|s| s.as_str()).collect();
        let (legacy_tokens, legacy_n) = san_moves_to_tokens(&refs, 256);

        // Token arrays and valid-ply counts must agree exactly.
        assert_eq!(enriched[0].tokens, legacy_tokens);
        assert_eq!(enriched[0].game_length, legacy_n);
    }
824
+
825
    #[test]
    fn test_count_games_in_date_range() {
        // Three games across two months; the count is driven purely by the
        // inclusive [start, end] UTCDate range.
        let pgn = r#"[Event "Game 1"]
[UTCDate "2023.12.05"]

1. e4 e5 1-0

[Event "Game 2"]
[UTCDate "2023.12.20"]

1. d4 d5 0-1

[Event "Game 3"]
[UTCDate "2025.01.15"]

1. e4 c5 1-0
"#;
        assert_eq!(count_games_in_date_range(pgn, "2023.12.01", "2023.12.31"), 2);
        assert_eq!(count_games_in_date_range(pgn, "2023.12.01", "2023.12.14"), 1);
        assert_eq!(count_games_in_date_range(pgn, "2023.12.15", "2023.12.31"), 1);
        assert_eq!(count_games_in_date_range(pgn, "2025.01.01", "2025.01.31"), 1);
        assert_eq!(count_games_in_date_range(pgn, "2024.01.01", "2024.12.31"), 0);
    }
848
+
849
    #[test]
    fn test_sampled_parsing() {
        // Sampled parsing selects by GLOBAL index = chunk offset + local
        // index among date-matching games only.
        let pgn = r#"[Event "Game 1"]
[UTCDate "2023.12.05"]

1. e4 e5 1-0

[Event "Game 2"]
[UTCDate "2023.12.10"]

1. d4 d5 0-1

[Event "Game 3"]
[UTCDate "2023.12.20"]

1. e4 c5 1-0

[Event "Game 4"]
[UTCDate "2025.01.15"]

1. Nf3 d5 1-0
"#;
        // 3 games match Dec 2023 (indices 0, 1, 2 within the date range)
        assert_eq!(count_games_in_date_range(pgn, "2023.12.01", "2023.12.31"), 3);

        // Sample only index 1 (Game 2)
        let indices: HashSet<usize> = HashSet::from([1]);
        let sampled = parse_pgn_enriched_sampled(
            pgn, 256, 2, "2023.12.01", "2023.12.31", &indices, 0,
        );
        assert_eq!(sampled.len(), 1);
        assert_eq!(sampled[0].headers.get("UTCDate").unwrap(), "2023.12.10");

        // Sample indices 0 and 2 (Game 1 and Game 3)
        let indices: HashSet<usize> = HashSet::from([0, 2]);
        let sampled = parse_pgn_enriched_sampled(
            pgn, 256, 2, "2023.12.01", "2023.12.31", &indices, 0,
        );
        assert_eq!(sampled.len(), 2);
        assert_eq!(sampled[0].headers.get("UTCDate").unwrap(), "2023.12.05");
        assert_eq!(sampled[1].headers.get("UTCDate").unwrap(), "2023.12.20");

        // Sample with offset: simulating a second chunk where previous chunk had 1 match.
        // Global index 2 = offset 1 + local index 1 => selects Game 2 (local idx 1).
        let indices: HashSet<usize> = HashSet::from([2]);
        let sampled = parse_pgn_enriched_sampled(
            pgn, 256, 2, "2023.12.01", "2023.12.31", &indices, 1,
        );
        assert_eq!(sampled.len(), 1);
        assert_eq!(sampled[0].headers.get("UTCDate").unwrap(), "2023.12.10");

        // Offset that skips all local games: offset=3 means local indices are 3,4,5
        // but we only ask for global index 0, which isn't in this chunk.
        let indices: HashSet<usize> = HashSet::from([0]);
        let sampled = parse_pgn_enriched_sampled(
            pgn, 256, 2, "2023.12.01", "2023.12.31", &indices, 3,
        );
        assert_eq!(sampled.len(), 0);
    }
908
  }
scripts/extract_lichess_parquet.py ADDED
@@ -0,0 +1,603 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Extract Lichess monthly PGN database dumps into PAWN-compatible Parquet.
3
+
4
+ Downloads a zstd-compressed PGN from database.lichess.org, parses games via
5
+ the Rust chess engine (tokens, clocks, evals, headers), builds a Polars
6
+ DataFrame, and writes sharded Parquet to disk with train/val/test splits.
7
+
8
+ The output schema stores pre-tokenized move sequences as list[int16],
9
+ clock annotations as list[uint16] (seconds remaining, 0x8000=missing),
10
+ eval annotations as list[int16] (centipawns, mate=±(32767-N),
11
+ 0x8000=missing), and metadata columns. Player usernames are hashed to
12
+ uint64 via Polars xxHash64 (deterministic within a Polars version).
13
+
14
+ Training months are written as chronologically-ordered shards. Holdout
15
+ val/test data is uniformly sampled from a separate month via two-pass
16
+ Rust-side date filtering.
17
+
18
+ Designed to run on a CPU pod with the pawn Docker image.
19
+
20
+ Usage:
21
+ python scripts/extract_lichess_parquet.py \\
22
+ --months 2025-01 2025-02 2025-03 \\
23
+ --output /workspace/lichess-parquet \\
24
+ --hf-repo thomas-schweich/lichess-pawn \\
25
+ --batch-size 500000
26
+ """
27
+
28
+ import argparse
29
+ import io
30
+ import os
31
+ import sys
32
+ import time
33
+ import urllib.request
34
+ from datetime import datetime
35
+ from pathlib import Path
36
+
37
+ import numpy as np
38
+ import chess_engine
39
+ import polars as pl
40
+
41
+
42
+ # ---------------------------------------------------------------------------
43
+ # Constants
44
+ # ---------------------------------------------------------------------------
45
+
46
+ LICHESS_URL_TEMPLATE = (
47
+ "https://database.lichess.org/standard/"
48
+ "lichess_db_standard_rated_{year_month}.pgn.zst"
49
+ )
50
+ MAX_PLY = 255 # Max plies per game (token sequence = outcome + 255 plies)
51
+ EVAL_MISSING = -32768 # i16::MIN sentinel for missing eval
52
+ SHARD_TARGET_GAMES = 1_000_000 # Target games per shard
53
+
54
+
55
def log(msg: str) -> None:
    """Print ``msg`` prefixed with the current HH:MM:SS wall-clock time."""
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{stamp}] {msg}", flush=True)
58
+
59
+
60
+ # ---------------------------------------------------------------------------
61
+ # PGN streaming
62
+ # ---------------------------------------------------------------------------
63
+
64
def stream_pgn_games(fileobj, batch_size: int):
    """Yield batches of complete PGN game strings from a zstd-compressed stream.

    Args:
        fileobj: Binary file-like object positioned at the start of a
            zstd-compressed PGN dump.
        batch_size: Number of complete games per yielded batch.

    Yields:
        ``(text, n_games)`` tuples where ``text`` is raw PGN for up to
        ``batch_size`` complete games and ``n_games`` is the count actually
        contained. The final batch may hold fewer games (and, if the dump is
        truncated mid-game, trailing partial text with the last game
        uncounted).
    """
    import zstandard as zstd

    dctx = zstd.ZstdDecompressor()
    reader = dctx.stream_reader(fileobj)
    # latin-1 decodes every byte sequence, so odd player names can't abort the run.
    text_reader = io.TextIOWrapper(reader, encoding="latin-1", errors="replace")

    buf = []
    game_count = 0
    in_movetext = False  # True while inside a game's move section

    for line in text_reader:
        stripped = line.strip()

        if not stripped:
            if in_movetext:
                # Blank line after movetext — this game is complete.
                game_count += 1
                in_movetext = False
                buf.append(line)
                if game_count >= batch_size:
                    yield "".join(buf), game_count
                    buf.clear()
                    game_count = 0
                continue
            # Blank line between headers and movetext: keep, not a boundary.
            buf.append(line)
            continue

        # Header lines start with '['; anything else is movetext.
        if stripped.startswith("["):
            in_movetext = False
        else:
            in_movetext = True

        buf.append(line)

    # Final batch
    if buf:
        yield "".join(buf), game_count
107
+
108
+
109
def download_zst(year_month: str, output_dir: Path) -> Path:
    """Download a Lichess zstd PGN dump to disk. Returns path to .zst file.

    A previously downloaded file is reused as-is. The download streams into
    a ``.tmp`` file that is renamed only on completion, so an interrupted
    run never leaves a partial file under the final name.

    Fix over the original: the HTTP response is managed with ``with`` so it
    is closed on every path — previously it leaked when the read/write loop
    raised, since ``response.close()`` only ran after a successful download.
    """
    url = LICHESS_URL_TEMPLATE.format(year_month=year_month)
    zst_path = output_dir / f"lichess_{year_month}.pgn.zst"
    if zst_path.exists():
        log(f"  Using cached {zst_path} ({zst_path.stat().st_size / 1e9:.1f} GB)")
        return zst_path

    log(f"  Downloading {url}")
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "pawn-lichess-extract/1.0")

    # Write to a temp file and rename on completion to avoid partial downloads
    tmp_path = zst_path.with_suffix(".zst.tmp")
    t0 = time.monotonic()
    downloaded = 0
    try:
        # http.client responses are context managers: closed even on error.
        with urllib.request.urlopen(req) as response, open(tmp_path, "wb") as f:
            while True:
                chunk = response.read(8 * 1024 * 1024)  # 8 MB
                if not chunk:
                    break
                f.write(chunk)
                downloaded += len(chunk)
                elapsed = time.monotonic() - t0
                rate_mb = (downloaded / 1e6) / elapsed if elapsed > 0 else 0
                print(f"\r  Downloaded {downloaded / 1e9:.2f} GB ({rate_mb:.0f} MB/s)", end="", flush=True)
        print(flush=True)
        tmp_path.rename(zst_path)
    except BaseException:
        # Includes KeyboardInterrupt: never leave a half-written temp file.
        tmp_path.unlink(missing_ok=True)
        raise

    log(f"  Saved {zst_path} ({downloaded / 1e9:.2f} GB in {time.monotonic() - t0:.0f}s)")
    return zst_path
146
+
147
+
148
def stream_pgn_from_zst(zst_path: Path, batch_size: int):
    """Yield PGN text batches decoded from a local .zst dump file."""
    with zst_path.open("rb") as handle:
        for batch in stream_pgn_games(handle, batch_size):
            yield batch
152
+
153
+
154
def download_month(year_month: str, output_dir: Path, batch_size: int):
    """Download one month's PGN dump and yield parsed batches.

    Each yielded item is the dict of numpy arrays / lists produced by the
    Rust ``parse_pgn_enriched`` call for one text batch.
    """
    zst_path = download_zst(year_month, output_dir)
    total_games, batch_num = 0, 0

    for pgn_text, _ in stream_pgn_from_zst(zst_path, batch_size):
        if not pgn_text.strip():
            continue

        t0 = time.monotonic()
        parsed = chess_engine.parse_pgn_enriched(
            pgn_text, max_ply=MAX_PLY, max_games=batch_size * 2, min_ply=1
        )
        dt = time.monotonic() - t0

        n = parsed["tokens"].shape[0]
        total_games += n
        batch_num += 1
        rate = n / dt if dt > 0 else 0
        log(f"  [{year_month}] batch {batch_num}: {n:,} games parsed in {dt:.1f}s ({rate:,.0f} games/s) | total: {total_games:,}")

        yield parsed

    log(f"  [{year_month}] Done — {total_games:,} games total")
179
+
180
+
181
def sample_holdout(
    zst_path: Path,
    date_start: str,
    date_end: str,
    n_games: int,
    batch_size: int,
    seed: int,
) -> pl.DataFrame:
    """Uniformly sample n_games from a date range within a .zst PGN dump.

    Two-pass approach:
      1. Count games in the date range (header-only scan, no tokenization)
      2. Generate random indices, parse only those games

    Args:
        zst_path: Local zstd-compressed PGN dump.
        date_start: Inclusive UTCDate lower bound ("YYYY.MM.DD").
        date_end: Inclusive UTCDate upper bound ("YYYY.MM.DD").
        n_games: Desired sample size (reduced if fewer games match).
        batch_size: Games per streamed text chunk.
        seed: RNG seed for index selection (deterministic resampling).

    Returns:
        DataFrame of sampled games; empty DataFrame when nothing matches.
    """
    # Pass 1: count
    log(f"  Pass 1: counting games in [{date_start}, {date_end}]")
    total_in_range = 0
    t0 = time.monotonic()
    for pgn_text, _ in stream_pgn_from_zst(zst_path, batch_size):
        if not pgn_text.strip():
            continue
        total_in_range += chess_engine.count_pgn_games_in_date_range(
            pgn_text, date_start, date_end
        )
    dt = time.monotonic() - t0
    log(f"  Found {total_in_range:,} games in range ({dt:.1f}s)")

    if total_in_range == 0:
        return pl.DataFrame()

    # Generate random sample indices
    actual_n = min(n_games, total_in_range)
    rng = np.random.default_rng(seed)
    indices = set(rng.choice(total_in_range, size=actual_n, replace=False).tolist())
    log(f"  Sampling {actual_n:,} of {total_in_range:,} games (seed={seed})")

    # Pass 2: parse only sampled games. Both passes must stream chunks
    # identically so that game_offset reproduces the same global indexing.
    log(f"  Pass 2: parsing sampled games")
    frames = []
    game_offset = 0
    t0 = time.monotonic()
    for pgn_text, _ in stream_pgn_from_zst(zst_path, batch_size):
        if not pgn_text.strip():
            continue

        # Count how many date-matching games are in this chunk
        chunk_count = chess_engine.count_pgn_games_in_date_range(
            pgn_text, date_start, date_end
        )

        # Check if any of our target indices fall in this chunk's range
        chunk_indices = [
            i for i in range(game_offset, game_offset + chunk_count)
            if i in indices
        ]

        if chunk_indices:
            # Rust converts global indices back to chunk-local via game_offset.
            parsed = chess_engine.parse_pgn_sampled(
                pgn_text,
                chunk_indices,
                date_start,
                date_end,
                game_offset=game_offset,
                max_ply=MAX_PLY,
                min_ply=1,
            )
            n = parsed["tokens"].shape[0]
            if n > 0:
                frames.append(batch_to_dataframe(parsed))

        game_offset += chunk_count

    dt = time.monotonic() - t0
    if not frames:
        log(f"  No games parsed")
        return pl.DataFrame()

    result = pl.concat(frames)
    log(f"  Parsed {len(result):,} games ({dt:.1f}s)")
    return result
261
+
262
+
263
+ # ---------------------------------------------------------------------------
264
+ # DataFrame construction
265
+ # ---------------------------------------------------------------------------
266
+
267
def numpy_rows_to_list_series(
    arr: np.ndarray, lengths: np.ndarray, name: str, inner_dtype: pl.DataType
) -> pl.Series:
    """Convert a 0-padded (N, max_ply) numpy array to a Polars List series,
    trimming each row to its actual game length."""
    trimmed = [row[:n].tolist() for row, n in zip(arr, lengths)]
    return pl.Series(name, trimmed, dtype=pl.List(inner_dtype))
274
+
275
+
276
def batch_to_dataframe(parsed: dict) -> pl.DataFrame:
    """Convert a parsed batch dict from Rust into a Polars DataFrame.

    Rust returns numpy arrays: tokens/clocks/evals as (N, max_ply),
    scalar fields as (N,) arrays, and strings as Python lists.
    Per-ply list columns are trimmed to each game's actual length, and
    player usernames are replaced by xxHash64 hashes before returning.

    Returns an empty DataFrame when the batch contains no games.
    """
    tokens: np.ndarray = parsed["tokens"]  # (N, max_ply) i16
    n = tokens.shape[0]
    if n == 0:
        return pl.DataFrame()

    lengths: np.ndarray = parsed["game_lengths"]  # (N,) u16

    # Parse datetime strings -> proper datetime
    # Format: "YYYY.MM.DD HH:MM:SS"; falls back to date-only, then None.
    datetimes = []
    for dt_str in parsed["date_time"]:
        if dt_str and len(dt_str) >= 10:
            try:
                datetimes.append(datetime.strptime(dt_str, "%Y.%m.%d %H:%M:%S"))
            except ValueError:
                try:
                    datetimes.append(datetime.strptime(dt_str[:10], "%Y.%m.%d"))
                except ValueError:
                    datetimes.append(None)
        else:
            datetimes.append(None)

    # NOTE(review): column dtypes mirror the Rust-side encoding (e.g. eval
    # sentinels fit i16, elo fits u16) — confirm against the Rust extractor
    # if that encoding ever changes.
    df = pl.DataFrame({
        "tokens": numpy_rows_to_list_series(tokens, lengths, "tokens", pl.Int16),
        "clock": numpy_rows_to_list_series(parsed["clocks"], lengths, "clock", pl.UInt16),
        "eval": numpy_rows_to_list_series(parsed["evals"], lengths, "eval", pl.Int16),
        "game_length": pl.Series("game_length", parsed["game_lengths"], dtype=pl.UInt16),
        "result": pl.Series("result", parsed["result"], dtype=pl.Utf8),
        "white_player": pl.Series("white_player", parsed["white"], dtype=pl.Utf8),
        "black_player": pl.Series("black_player", parsed["black"], dtype=pl.Utf8),
        "white_elo": pl.Series("white_elo", parsed["white_elo"], dtype=pl.UInt16),
        "black_elo": pl.Series("black_elo", parsed["black_elo"], dtype=pl.UInt16),
        "white_rating_diff": pl.Series("white_rating_diff", parsed["white_rating_diff"], dtype=pl.Int16),
        "black_rating_diff": pl.Series("black_rating_diff", parsed["black_rating_diff"], dtype=pl.Int16),
        "eco": pl.Series("eco", parsed["eco"], dtype=pl.Utf8),
        "opening": pl.Series("opening", parsed["opening"], dtype=pl.Utf8),
        "time_control": pl.Series("time_control", parsed["time_control"], dtype=pl.Utf8),
        "termination": pl.Series("termination", parsed["termination"], dtype=pl.Utf8),
        "date": pl.Series("date", datetimes, dtype=pl.Datetime("ms")),
        "site": pl.Series("site", parsed["site"], dtype=pl.Utf8),
    })

    # Hash usernames: vectorized xxHash64 via Polars.
    # NOTE: hash() output is deterministic within a Polars version but the
    # algorithm is not guaranteed stable across major versions. Originally
    # recorded with Polars 1.39.3. Pin Polars version (via uv.lock) and
    # tag the repo to ensure reproducibility. See test_enriched_pgn.py
    # TestPlayerHashRegression for the snapshot test.
    df = df.with_columns(
        pl.col("white_player").hash().alias("white_player"),
        pl.col("black_player").hash().alias("black_player"),
    )

    return df
336
+
337
+
338
+ # ---------------------------------------------------------------------------
339
+ # Shard writing
340
+ # ---------------------------------------------------------------------------
341
+
342
def write_shard(
    df: pl.DataFrame,
    output_dir: Path,
    split: str,
    shard_idx: int,
    total_shards: int,
) -> Path:
    """Write a single Parquet shard with HF-compatible naming."""
    shard_name = f"{split}-{shard_idx:05d}-of-{total_shards:05d}.parquet"
    dest = output_dir / "data" / shard_name
    dest.parent.mkdir(parents=True, exist_ok=True)
    df.write_parquet(dest, compression="zstd", compression_level=3)
    size_mb = dest.stat().st_size / 1024 / 1024
    log(f"  Wrote {shard_name}: {len(df):,} games, {size_mb:.1f} MB")
    return dest
357
+
358
+
359
+ # ---------------------------------------------------------------------------
360
+ # HuggingFace upload
361
+ # ---------------------------------------------------------------------------
362
+
363
def upload_to_hf(output_dir: Path, hf_repo: str) -> None:
    """Push the extracted dataset directory to a HuggingFace dataset repo."""
    from huggingface_hub import HfApi

    client = HfApi()
    log(f"Uploading to HuggingFace: {hf_repo}")

    # Idempotent: exist_ok makes re-runs safe against an existing repo.
    client.create_repo(hf_repo, repo_type="dataset", exist_ok=True)

    # Push everything under output_dir (data shards plus any metadata).
    client.upload_folder(
        repo_id=hf_repo,
        folder_path=str(output_dir),
        repo_type="dataset",
    )
    log(f"Upload complete: https://huggingface.co/datasets/{hf_repo}")
380
+
381
+
382
+ # ---------------------------------------------------------------------------
383
+ # Main
384
+ # ---------------------------------------------------------------------------
385
+
386
class SplitBuffer:
    """Accumulates DataFrames for a single split and flushes to Parquet shards.

    Frames are buffered until at least ``shard_size`` rows are available,
    then written as fixed-size shards; any remainder is written by
    ``flush_remaining``. Shards get temporary names until ``rename_shards``
    assigns final ``{split}-{i:05d}-of-{n:05d}.parquet`` names once the
    total shard count is known.
    """

    def __init__(self, split: str, shard_size: int, output_dir: Path):
        self.split = split
        self.shard_size = shard_size
        self.output_dir = output_dir / "data"
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.frames: list[pl.DataFrame] = []   # pending frames, not yet sharded
        self.buffered = 0                       # rows currently in self.frames
        self.total_games = 0                    # rows ever added (for summary)
        self.shard_paths: list[Path] = []       # written shard files, in order
        self.shard_idx = 0                      # next temp shard number

    def add(self, df: pl.DataFrame) -> None:
        """Buffer a batch; write out full shards as soon as enough rows exist."""
        if df.is_empty():
            return
        self.frames.append(df)
        self.buffered += len(df)
        self.total_games += len(df)
        self._flush_full()

    def _flush_full(self) -> None:
        # Emit exactly shard_size-row shards while the buffer can fill one;
        # the tail rows roll over into the next shard.
        while self.buffered >= self.shard_size:
            combined = pl.concat(self.frames)
            shard_df = combined.head(self.shard_size)
            leftover = combined.slice(self.shard_size)
            self._write_shard(shard_df)
            self.frames = [leftover] if len(leftover) > 0 else []
            self.buffered = len(leftover) if len(leftover) > 0 else 0

    def flush_remaining(self) -> None:
        """Write any buffered rows as a final (possibly undersized) shard."""
        if self.frames:
            combined = pl.concat(self.frames)
            if len(combined) > 0:
                self._write_shard(combined)
            self.frames.clear()
            self.buffered = 0

    def _write_shard(self, df: pl.DataFrame) -> None:
        # Write with placeholder name; rename after all shards are counted
        path = self.output_dir / f"{self.split}-temp-{self.shard_idx:05d}.parquet"
        df.write_parquet(path, compression="zstd", compression_level=3)
        size_mb = path.stat().st_size / 1024 / 1024
        log(f"  [{self.split}] shard {self.shard_idx}: {len(df):,} games, {size_mb:.1f} MB")
        self.shard_paths.append(path)
        self.shard_idx += 1

    def rename_shards(self) -> list[Path]:
        """Rename temp shards to HF-compatible names with correct total count."""
        n = len(self.shard_paths)
        final = []
        for i, path in enumerate(self.shard_paths):
            new_name = f"{self.split}-{i:05d}-of-{n:05d}.parquet"
            new_path = path.parent / new_name
            path.rename(new_path)
            final.append(new_path)
            log(f"  {path.name} -> {new_name}")
        self.shard_paths = final
        return final
446
+
447
+
448
def main():
    """CLI entry point: download months, extract to Parquet, optionally push.

    Phase 1 samples val/test holdout games from a dedicated month (first
    half of the month -> validation, second half -> test); phase 2 streams
    the training months chronologically into shards.
    """
    parser = argparse.ArgumentParser(
        description="Extract Lichess PGN dumps to PAWN-compatible Parquet"
    )
    parser.add_argument(
        "--months", nargs="+", required=True,
        help="Training month(s) to download, e.g. 2025-01 2025-02 2025-03"
    )
    parser.add_argument(
        "--output", type=Path, default=Path("/workspace/lichess-parquet"),
        help="Output directory for Parquet shards"
    )
    parser.add_argument(
        "--hf-repo", type=str, default=None,
        help="HuggingFace dataset repo to push to (e.g. thomas-schweich/pawn-lichess-full)"
    )
    parser.add_argument(
        "--batch-size", type=int, default=500_000,
        help="Games per batch during parsing (controls memory usage)"
    )
    parser.add_argument(
        "--shard-size", type=int, default=SHARD_TARGET_GAMES,
        help="Target games per output shard"
    )
    parser.add_argument(
        "--holdout-month", type=str, default=None,
        help="Month to use for val/test (e.g. 2023-12). First half of month "
             "-> val, second half -> test. Randomly samples --holdout-games "
             "from each half."
    )
    parser.add_argument(
        "--holdout-games", type=int, default=50_000,
        help="Number of games to sample for each of val and test (default: 50000)"
    )
    parser.add_argument(
        "--max-games", type=int, default=None,
        help="Stop after this many training games (for testing)"
    )
    parser.add_argument(
        "--seed", type=int, default=42,
        help="Random seed for holdout sampling (default: 42)"
    )
    args = parser.parse_args()

    log("=== Lichess Parquet Extraction ===")
    log(f"Training months: {args.months}")
    log(f"Output: {args.output}")
    log(f"Batch size: {args.batch_size:,}")
    log(f"Shard size: {args.shard_size:,}")
    if args.holdout_month:
        log(f"Holdout month: {args.holdout_month}")
        log(f"Holdout games per split: {args.holdout_games:,}")
        log(f"Holdout seed: {args.seed}")
    if args.max_games:
        log(f"Max training games: {args.max_games:,}")
    log("")

    args.output.mkdir(parents=True, exist_ok=True)

    # ── Phase 1: Process holdout month (val/test) ──────────────────────
    buffers = {
        "train": SplitBuffer("train", args.shard_size, args.output),
        "validation": SplitBuffer("validation", args.shard_size, args.output),
        "test": SplitBuffer("test", args.shard_size, args.output),
    }

    if args.holdout_month:
        log(f"\n=== Processing holdout month {args.holdout_month} ===")

        # Download the zstd file (reused for both count and parse passes)
        zst_path = download_zst(args.holdout_month, args.output)

        # Date ranges: first half of month -> val, second half -> test
        # UTCDate format is "YYYY.MM.DD"
        year, mon = args.holdout_month.split("-")
        val_start = f"{year}.{mon}.01"
        val_end = f"{year}.{mon}.14"
        test_start = f"{year}.{mon}.15"
        test_end = f"{year}.{mon}.31"

        log(f"  Val date range: [{val_start}, {val_end}]")
        val_df = sample_holdout(
            zst_path, val_start, val_end,
            args.holdout_games, args.batch_size, args.seed,
        )
        if not val_df.is_empty():
            buffers["validation"].add(val_df)

        log(f"  Test date range: [{test_start}, {test_end}]")
        # seed + 1 keeps the test sample independent of the val sample.
        test_df = sample_holdout(
            zst_path, test_start, test_end,
            args.holdout_games, args.batch_size, args.seed + 1,
        )
        if not test_df.is_empty():
            buffers["test"].add(test_df)

    # ── Phase 2: Process training months ───────────────────────────────
    total_train = 0
    stop = False

    for month in args.months:
        if stop:
            break
        log(f"\n=== Processing {month} (train) ===")

        for parsed in download_month(month, args.output, args.batch_size):
            df = batch_to_dataframe(parsed)
            if df.is_empty():
                continue

            # Enforce the --max-games cap, truncating the final batch.
            if args.max_games:
                remaining = args.max_games - total_train
                if remaining <= 0:
                    stop = True
                    break
                if len(df) > remaining:
                    df = df.head(remaining)

            total_train += len(df)
            buffers["train"].add(df)

            if args.max_games and total_train >= args.max_games:
                stop = True
                break

    # Flush remaining data in each buffer
    for buf in buffers.values():
        buf.flush_remaining()

    log(f"\n=== Renaming shards ===")
    final_paths = []
    for buf in buffers.values():
        if buf.shard_paths:
            final_paths.extend(buf.rename_shards())

    # Summary
    log(f"\n=== Summary ===")
    total_games = sum(buf.total_games for buf in buffers.values())
    log(f"Total games: {total_games:,}")
    for name, buf in buffers.items():
        if buf.total_games > 0:
            log(f"  {name}: {buf.total_games:,} games, {len(buf.shard_paths)} shards")

    if final_paths:
        total_size = sum(p.stat().st_size for p in final_paths)
        log(f"Total size: {total_size / 1024 / 1024 / 1024:.2f} GB")

    # Upload to HuggingFace
    if args.hf_repo:
        upload_to_hf(args.output, args.hf_repo)

    log("\nDone!")
600
+
601
+
602
+ if __name__ == "__main__":
603
+ main()
tests/test_enriched_pgn.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for enriched PGN parsing and dataset extraction pipeline."""
2
+
3
+ import numpy as np
4
+ import polars as pl
5
+ import pytest
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ import chess_engine
10
+
11
+ # Import extraction helper
12
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "scripts"))
13
+ from extract_lichess_parquet import batch_to_dataframe
14
+
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Test PGN data — each game has distinct moves, metadata, and annotations
18
+ # ---------------------------------------------------------------------------
19
+
20
# PGN fixtures keyed by "<white>_v_<black>". The games deliberately differ in
# moves, Elo, rating diffs, time control, dates, and annotation coverage so
# the tests can detect any accidental coupling between player-name hashing
# and the surrounding row metadata. Headers mirror real Lichess dump fields.
PGNS = {
    # Rapid, 4 plies, both [%clk] and [%eval] comments on every move.
    "alice_v_bob": """\
[Event "Rated Rapid game"]
[Site "https://lichess.org/game001"]
[White "alice"]
[Black "bob"]
[Result "1-0"]
[WhiteElo "1873"]
[BlackElo "1844"]
[WhiteRatingDiff "+6"]
[BlackRatingDiff "-26"]
[ECO "C20"]
[Opening "King's Pawn Game"]
[TimeControl "600+0"]
[Termination "Normal"]
[UTCDate "2025.01.10"]
[UTCTime "10:00:00"]

1. e4 { [%clk 0:10:00] [%eval 0.23] } 1... e5 { [%clk 0:09:58] [%eval 0.31] } 2. Nf3 { [%clk 0:09:50] [%eval 0.25] } 2... Nc6 { [%clk 0:09:45] [%eval 0.30] } 1-0
""",
    # Blitz, 3 plies, game lost on time ("Time forfeit" termination).
    "bob_v_alice": """\
[Event "Rated Blitz game"]
[Site "https://lichess.org/game002"]
[White "bob"]
[Black "alice"]
[Result "0-1"]
[WhiteElo "1850"]
[BlackElo "1880"]
[WhiteRatingDiff "-5"]
[BlackRatingDiff "+5"]
[ECO "B20"]
[Opening "Sicilian Defense"]
[TimeControl "300+3"]
[Termination "Time forfeit"]
[UTCDate "2025.02.14"]
[UTCTime "20:00:00"]

1. e4 { [%clk 0:05:00] [%eval 0.20] } 1... c5 { [%clk 0:04:55] [%eval 0.25] } 2. d4 { [%clk 0:04:48] [%eval 0.40] } 0-1
""",
    # Classical, 5 plies, drawn result.
    "alice_v_xavier": """\
[Event "Rated Classical game"]
[Site "https://lichess.org/game003"]
[White "alice"]
[Black "xavier"]
[Result "1/2-1/2"]
[WhiteElo "1900"]
[BlackElo "2100"]
[WhiteRatingDiff "+3"]
[BlackRatingDiff "-1"]
[ECO "D30"]
[Opening "Queen's Gambit Declined"]
[TimeControl "1800+30"]
[Termination "Normal"]
[UTCDate "2025.03.01"]
[UTCTime "15:30:00"]

1. d4 { [%clk 0:30:00] [%eval 0.10] } 1... d5 { [%clk 0:29:50] [%eval 0.15] } 2. c4 { [%clk 0:29:40] [%eval 0.20] } 2... e6 { [%clk 0:29:30] [%eval 0.18] } 3. Nf3 { [%clk 0:29:20] [%eval 0.22] } 1/2-1/2
""",
    # Rapid, 3 plies, clock comments only — NO [%eval] annotations
    # (exercises the missing-eval sentinel path).
    "xavier_v_alice": """\
[Event "Rated Rapid game"]
[Site "https://lichess.org/game004"]
[White "xavier"]
[Black "alice"]
[Result "1-0"]
[WhiteElo "2105"]
[BlackElo "1895"]
[WhiteRatingDiff "+2"]
[BlackRatingDiff "-4"]
[ECO "A45"]
[Opening "Trompowsky Attack"]
[TimeControl "900+10"]
[Termination "Normal"]
[UTCDate "2025.01.20"]
[UTCTime "18:00:00"]

1. d4 { [%clk 0:15:00] } 1... Nf6 { [%clk 0:14:55] } 2. Bg5 { [%clk 0:14:48] } 1-0
""",
    # Bullet, 6 plies, full clock + eval annotations.
    "bob_v_xavier": """\
[Event "Rated Bullet game"]
[Site "https://lichess.org/game005"]
[White "bob"]
[Black "xavier"]
[Result "0-1"]
[WhiteElo "1840"]
[BlackElo "2110"]
[WhiteRatingDiff "-8"]
[BlackRatingDiff "+3"]
[ECO "C50"]
[Opening "Italian Game"]
[TimeControl "60+0"]
[Termination "Normal"]
[UTCDate "2025.02.28"]
[UTCTime "23:59:00"]

1. e4 { [%clk 0:01:00] [%eval 0.20] } 1... e5 { [%clk 0:00:59] [%eval 0.25] } 2. Nf3 { [%clk 0:00:55] [%eval 0.30] } 2... Nc6 { [%clk 0:00:53] [%eval 0.28] } 3. Bc4 { [%clk 0:00:50] [%eval 0.35] } 3... Bc5 { [%clk 0:00:48] [%eval 0.30] } 0-1
""",
    # Rapid, 7 plies (longest fixture), full annotations.
    "xavier_v_bob": """\
[Event "Rated Rapid game"]
[Site "https://lichess.org/game006"]
[White "xavier"]
[Black "bob"]
[Result "1-0"]
[WhiteElo "2115"]
[BlackElo "1835"]
[WhiteRatingDiff "+1"]
[BlackRatingDiff "-7"]
[ECO "E00"]
[Opening "Queen's Pawn Game"]
[TimeControl "600+5"]
[Termination "Normal"]
[UTCDate "2025.03.15"]
[UTCTime "09:00:00"]

1. d4 { [%clk 0:10:00] [%eval 0.15] } 1... Nf6 { [%clk 0:09:55] [%eval 0.20] } 2. c4 { [%clk 0:09:48] [%eval 0.25] } 2... e6 { [%clk 0:09:40] [%eval 0.22] } 3. Nc3 { [%clk 0:09:35] [%eval 0.28] } 3... Bb4 { [%clk 0:09:28] [%eval 0.30] } 4. Qc2 { [%clk 0:09:20] [%eval 0.32] } 1-0
""",
}
136
+
137
+
138
+ # ---------------------------------------------------------------------------
139
+ # Tests
140
+ # ---------------------------------------------------------------------------
141
+
142
+
143
class TestEnrichedParsing:
    """Test the Rust parse_pgn_enriched function.

    Covers array shapes and dtypes, [%clk]/[%eval] annotation extraction,
    the missing-eval sentinel, zero padding past the game length, header
    extraction, and token uniqueness across distinct games.
    """

    def test_basic_parsing(self):
        """Arrays are padded to 255 plies; game_lengths holds the true ply count."""
        pgn = PGNS["alice_v_bob"]
        r = chess_engine.parse_pgn_enriched(pgn)
        assert r["tokens"].shape == (1, 255)
        assert r["clocks"].shape == (1, 255)
        assert r["evals"].shape == (1, 255)
        assert r["game_lengths"].shape == (1,)
        assert r["game_lengths"][0] == 4

    def test_return_types(self):
        """Numeric fields come back as numpy arrays; string fields as lists."""
        pgn = PGNS["alice_v_bob"]
        r = chess_engine.parse_pgn_enriched(pgn)
        for key in ("tokens", "clocks", "evals"):
            assert isinstance(r[key], np.ndarray), f"{key} should be ndarray"
        for key in ("game_lengths", "white_elo", "black_elo",
                    "white_rating_diff", "black_rating_diff"):
            assert isinstance(r[key], np.ndarray), f"{key} should be ndarray"
        for key in ("result", "white", "black", "eco", "opening",
                    "time_control", "termination", "date_time", "site"):
            assert isinstance(r[key], list), f"{key} should be list"

    def test_clock_extraction(self):
        """[%clk H:MM:SS] annotations are converted to whole seconds per ply."""
        pgn = PGNS["alice_v_bob"]
        r = chess_engine.parse_pgn_enriched(pgn)
        clocks = r["clocks"][0, :4]
        # 0:10:00, 0:09:58, 0:09:50, 0:09:45 from the fixture movetext
        assert list(clocks) == [600, 598, 590, 585]

    def test_eval_extraction(self):
        """[%eval] pawn values are converted to integer centipawns."""
        pgn = PGNS["alice_v_bob"]
        r = chess_engine.parse_pgn_enriched(pgn)
        evals = r["evals"][0, :4]
        # 0.23, 0.31, 0.25, 0.30 pawns -> centipawns
        assert list(evals) == [23, 31, 25, 30]

    def test_missing_eval_sentinel(self):
        """Moves with no [%eval] annotation get the i16::MIN sentinel."""
        pgn = PGNS["xavier_v_alice"]  # no eval annotations
        r = chess_engine.parse_pgn_enriched(pgn)
        length = r["game_lengths"][0]
        evals = r["evals"][0, :length]
        # Rust uses i16::MIN (-32768) as the "no eval" sentinel
        assert all(e == -32768 for e in evals), (
            f"Missing evals should be -32768 (i16::MIN), got {list(evals)}"
        )

    def test_padding_is_zero(self):
        """Everything past game_length in each array row must be zero."""
        pgn = PGNS["alice_v_bob"]
        r = chess_engine.parse_pgn_enriched(pgn)
        length = r["game_lengths"][0]
        assert np.all(r["tokens"][0, length:] == 0)
        assert np.all(r["clocks"][0, length:] == 0)
        assert np.all(r["evals"][0, length:] == 0)

    def test_headers_extracted(self):
        """PGN tag-pair headers are surfaced in the parse result."""
        pgn = PGNS["alice_v_bob"]
        r = chess_engine.parse_pgn_enriched(pgn)
        assert r["white"][0] == "alice"
        assert r["black"][0] == "bob"
        assert r["result"][0] == "1-0"
        assert r["white_elo"][0] == 1873
        assert r["black_elo"][0] == 1844
        assert r["white_rating_diff"][0] == 6
        assert r["black_rating_diff"][0] == -26
        assert r["eco"][0] == "C20"
        assert r["time_control"][0] == "600+0"
        assert r["site"][0] == "https://lichess.org/game001"

    def test_different_games_produce_different_tokens(self):
        """Each test PGN has distinct moves — tokens must differ."""
        from itertools import combinations

        all_tokens = {}
        for name, pgn in PGNS.items():
            r = chess_engine.parse_pgn_enriched(pgn)
            length = r["game_lengths"][0]
            all_tokens[name] = tuple(r["tokens"][0, :length])

        # Compare every unordered pair of games exactly once.
        for a, b in combinations(all_tokens, 2):
            assert all_tokens[a] != all_tokens[b], (
                f"{a} and {b} should have different token sequences"
            )
225
+
226
+
227
class TestPlayerHashing:
    """Test that player username hashing is deterministic and independent of context.

    Each PGN has different moves, metadata, Elo, time control, dates, and
    game lengths — the only thing shared is the player name strings. If
    hashing accidentally depended on row context, these would diverge.
    """

    @pytest.fixture
    def player_hashes(self):
        """Parse all 6 PGNs separately and collect per-player hash values."""
        hashes = {}  # name -> list of observed uint64 hashes
        for name, pgn in PGNS.items():
            r = chess_engine.parse_pgn_enriched(pgn)
            df = batch_to_dataframe(r)
            w_name = r["white"][0]
            b_name = r["black"][0]
            w_hash = df["white_player"][0]
            b_hash = df["black_player"][0]
            hashes.setdefault(w_name, []).append(w_hash)
            hashes.setdefault(b_name, []).append(b_hash)
        return hashes

    def test_same_name_always_same_hash(self, player_hashes):
        """A player name must always produce the same hash regardless of
        which game it appears in, whether as white or black, and what
        the surrounding metadata looks like."""
        for name, vals in player_hashes.items():
            unique = set(vals)
            assert len(unique) == 1, (
                f"'{name}' produced {len(unique)} distinct hashes across "
                f"{len(vals)} appearances: {unique}"
            )

    def test_different_names_different_hashes(self, player_hashes):
        """alice, bob, and xavier must all have distinct hashes."""
        canonical = {name: vals[0] for name, vals in player_hashes.items()}
        hash_vals = list(canonical.values())
        assert len(set(hash_vals)) == len(hash_vals), (
            f"Hash collision among players: {canonical}"
        )

    def test_hash_dtype_is_uint64(self):
        """Both player-hash columns must be UInt64 in the DataFrame.

        Fix: previously this test requested the player_hashes fixture but
        never used it, re-parsing all six PGNs for nothing; the unused
        fixture argument has been removed.
        """
        # Parse one game and check the Polars column dtype
        r = chess_engine.parse_pgn_enriched(PGNS["alice_v_bob"])
        df = batch_to_dataframe(r)
        assert df["white_player"].dtype == pl.UInt64
        assert df["black_player"].dtype == pl.UInt64

    def test_hash_appears_in_both_columns(self):
        """alice appears as both white and black — hash must match in both columns.

        Fix: the unused player_hashes fixture argument was removed here
        too; the test constructs its own data from two specific games.
        """
        # alice is white in alice_v_bob and black in bob_v_alice
        r1 = chess_engine.parse_pgn_enriched(PGNS["alice_v_bob"])
        df1 = batch_to_dataframe(r1)
        r2 = chess_engine.parse_pgn_enriched(PGNS["bob_v_alice"])
        df2 = batch_to_dataframe(r2)

        alice_as_white = df1["white_player"][0]
        alice_as_black = df2["black_player"][0]
        assert alice_as_white == alice_as_black, (
            f"alice hash differs: white={alice_as_white}, black={alice_as_black}"
        )
289
+
290
+
291
class TestPlayerHashRegression:
    """Snapshot test: catch if a Polars update changes the hash algorithm.

    These exact values were recorded with Polars 1.39.3 using the default
    hash() seed (xxHash64). If this test fails after a Polars upgrade, the
    dataset must be regenerated to stay consistent (or the old Polars
    version must be pinned).
    """

    # Recorded uint64 outputs of pl.Series([name]).hash()[0] at dataset-build time.
    EXPECTED_HASHES = {
        "alice": 573680751236103438,
        "bob": 11376496890720967193,
        "xavier": 2453512920044318708,
    }

    def test_hash_values_match_snapshot(self):
        """Verify that pl.Series.hash() produces the exact same uint64
        values that were recorded when the dataset was built."""
        for name, expected in self.EXPECTED_HASHES.items():
            actual = pl.Series([name]).hash()[0]
            assert actual == expected, (
                f"Hash regression for '{name}': expected {expected}, got {actual}. "
                f"Polars hash algorithm may have changed — dataset must be regenerated."
            )

    def test_snapshot_matches_pipeline(self):
        """The snapshot values must agree with what batch_to_dataframe produces.

        Fix: the original duplicated the row-scanning loop verbatim for the
        white and black sides; the two copies are merged into one loop over
        (side, column) pairs with identical assertion messages.
        """
        combined = "\n".join(PGNS.values())
        r = chess_engine.parse_pgn_enriched(combined)
        df = batch_to_dataframe(r)

        for name, expected in self.EXPECTED_HASHES.items():
            for side, column in (("white", "white_player"), ("black", "black_player")):
                # Find every row where this player appears on this side
                for i, player in enumerate(r[side]):
                    if player != name:
                        continue
                    actual = df[column][i]
                    assert actual == expected, (
                        f"Pipeline hash for '{name}' ({side}, row {i}): "
                        f"expected {expected}, got {actual}"
                    )
344
+
345
+
346
class TestBatchToDataframe:
    """Test the full batch_to_dataframe pipeline."""

    def test_schema(self):
        """Every output column must carry its expected Polars dtype."""
        df = batch_to_dataframe(chess_engine.parse_pgn_enriched(PGNS["alice_v_bob"]))
        expected_dtypes = {
            "tokens": pl.List(pl.Int16),
            "clock": pl.List(pl.UInt16),
            "eval": pl.List(pl.Int16),
            "game_length": pl.UInt16,
            "white_elo": pl.UInt16,
            "black_elo": pl.UInt16,
            "white_rating_diff": pl.Int16,
            "black_rating_diff": pl.Int16,
            "white_player": pl.UInt64,
            "black_player": pl.UInt64,
        }
        for column, dtype in expected_dtypes.items():
            assert df[column].dtype == dtype

    def test_list_columns_trimmed_to_game_length(self):
        """List columns should contain exactly game_length elements (no padding)."""
        df = batch_to_dataframe(chess_engine.parse_pgn_enriched(PGNS["bob_v_xavier"]))
        expected_len = df["game_length"][0]
        for column in ("tokens", "clock", "eval"):
            assert len(df[column][0]) == expected_len

    def test_parquet_roundtrip(self, tmp_path):
        """Write to Parquet and read back — all values must survive."""
        df = batch_to_dataframe(chess_engine.parse_pgn_enriched(PGNS["xavier_v_bob"]))
        target = tmp_path / "test.parquet"
        df.write_parquet(target, compression="zstd")
        restored = pl.read_parquet(target)
        assert df.shape == restored.shape
        for column in ("tokens", "clock", "eval", "white_player"):
            assert df[column].to_list() == restored[column].to_list()

    def test_multi_game_batch(self):
        """Parse all 6 games in a single PGN string."""
        combined = "\n".join(PGNS.values())
        df = batch_to_dataframe(chess_engine.parse_pgn_enriched(combined))
        assert len(df) == 6
        # Each game should have a different game length
        lengths = df["game_length"].to_list()
        assert len(set(lengths)) > 1, "Games should have different lengths"