import csv
import io
import os

import chess
import chess.pgn


# Number of games already handled by an earlier run. With 0 the CSV header row
# is written and reading starts at the beginning of the PGN file; any other
# value makes process_pgn_file jump to a proportional byte offset instead.
start_at = 0

# Denominator used to turn start_at into a file offset and to print progress.
# NOTE(review): presumably the number of games in the input PGN dump — confirm
# whenever the input file changes.
total_games = 92_055_571


def _parse_elo(raw_value):
    """Return a rating header as an int, or None if it is missing or not numeric."""
    try:
        return int(raw_value)
    except (TypeError, ValueError):
        # Covers absent headers (None) and placeholders such as "?".
        return None


def _is_wanted_game(headers):
    """Return True for a rated game recorded as a White win inside the Elo bands.

    White must be rated strictly between 1500 and 2400 and Black strictly
    between 1400 and 2800. Games whose ratings are missing or unparsable are
    rejected instead of aborting the run.
    """
    if headers.get('Result') != '1-0':
        return False
    if 'Rated' not in headers.get('Event', ''):
        return False
    white_elo = _parse_elo(headers.get('WhiteElo'))
    black_elo = _parse_elo(headers.get('BlackElo'))
    if white_elo is None or black_elo is None:
        return False
    return 1500 < white_elo < 2400 and 1400 < black_elo < 2800


def _white_win_transcript(game):
    """Replay the mainline and return its transcript, or None if unusable.

    The transcript has the form ``1.e4 e5 2.Nf3 Nc6`` (move number glued to
    White's move, single spaces elsewhere). It is only returned when the
    final board position is itself a finished game with result "1-0" (in
    practice a mate delivered by White), so games that ended off the board,
    e.g. by resignation or on time, yield None.
    """
    board = chess.Board()
    parts = []
    for move in game.mainline_moves():
        if board.turn == chess.WHITE:
            parts.append(f"{board.fullmove_number}.")
        parts.append(board.san(move) + " ")
        # Push the move object directly; re-parsing its SAN is redundant work.
        board.push(move)

    if board.is_game_over() and board.result() == "1-0":
        return ''.join(parts).rstrip()
    return None


def process_pgn_file(input_file, output_file):
    """Append transcripts of qualifying games from a PGN file to a CSV file.

    Reads games from ``input_file`` one at a time, keeps those accepted by
    the header filter (rated, result 1-0, Elo bands) whose replayed final
    position is a White win on the board, and appends each transcript as a
    one-column row to ``output_file``. A progress line is printed after every
    100 rows written.

    The module globals ``start_at`` and ``total_games`` control resuming:
    when ``start_at`` is 0 a ``transcript`` header row is written first;
    otherwise reading begins at roughly ``start_at / total_games`` of the way
    through the file.

    Args:
        input_file: Path of the PGN file to read.
        output_file: Path of the CSV file to append to (created if absent).
    """
    with open(output_file, 'a', newline='') as csv_file, \
            open(input_file, 'rb') as raw_pgn:
        csv_writer = csv.writer(csv_file)
        if start_at == 0:
            csv_writer.writerow(['transcript'])

        file_size = os.stat(raw_pgn.fileno()).st_size
        resume_offset = int(file_size * (start_at / total_games))
        if resume_offset > 0:
            # Seeking a text-mode file to a computed offset is undefined
            # behaviour, so position the binary stream and discard the
            # partial line we landed in before decoding anything.
            raw_pgn.seek(resume_offset)
            raw_pgn.readline()

        # errors='replace' keeps one undecodable byte (only header text such
        # as player names can be non-ASCII) from aborting a very long run.
        with io.TextIOWrapper(raw_pgn, encoding='utf-8',
                              errors='replace') as pgn_file:
            games_seen = 0
            games_added = 0
            while True:
                game = chess.pgn.read_game(pgn_file)
                if game is None:
                    break
                games_seen += 1

                if not _is_wanted_game(game.headers):
                    continue
                transcript = _white_win_transcript(game)
                if transcript is None:
                    continue

                csv_writer.writerow([transcript])
                games_added += 1
                if games_added % 100 == 0:
                    print(f"Added {games_added} of {games_seen} games. {(games_seen+start_at)/float(total_games):.2%} complete.")


# Source PGN dump and the CSV file that collects the extracted transcripts.
input_file = './chess-mamba-vs-xformer/lichess_db_standard_rated_2022-07.pgn'
output_file = './chess-mamba-vs-xformer/lichess_transcripts_phase2_stable.csv'

# Start the long-running extraction only when executed as a script, so that
# importing this module does not kick off processing.
if __name__ == "__main__":
    process_pgn_file(input_file, output_file)