File size: 2,029 Bytes
80bc2b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0179a9
 
80bc2b3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import chess
import chess.pgn
import csv
import os

# Resume point, counted in games. process_pgn_file converts it into a
# proportional byte offset (start_at / total_games of the input file size)
# and only writes the CSV header row when it is 0.
start_at = 0
# Denominator for both the resume offset and the progress percentage.
# NOTE(review): presumably the number of games in the input dump - confirm.
total_games = 92_055_571
def _elo_in_range(headers, tag, low, high):
    """Return True when header ``tag`` is an integer strictly between the bounds.

    A missing tag or a non-numeric value (for example ``?``) counts as
    "not in range" instead of raising, so one bad record cannot abort a run.
    """
    try:
        return low < int(headers[tag]) < high
    except (KeyError, ValueError):
        return False


def _passes_filter(headers):
    """Return True for rated white wins whose ratings fall inside the target bands."""
    return (
        headers.get('Result') == '1-0'
        and 'Rated' in headers.get('Event', '')
        and _elo_in_range(headers, 'WhiteElo', 1500, 2400)
        and _elo_in_range(headers, 'BlackElo', 1400, 2800)
    )


def _mainline_transcript(game):
    """Replay the mainline and return it as ``1.e4 e5 2.Nf3 ...``.

    Returns None unless the final position is game-over with result "1-0",
    i.e. the moves themselves end in a win for White on the board.
    """
    # Moves are replayed from the standard starting position.
    board = chess.Board()
    tokens = []
    for move in game.mainline_moves():
        # SAN depends on the position, so it is computed before the push.
        san = board.san(move)
        if board.turn == chess.WHITE:
            tokens.append(f"{board.fullmove_number}.{san}")
        else:
            tokens.append(san)
        # Push the move object directly; re-parsing the SAN just generated
        # from it would yield the same move at extra cost.
        board.push(move)

    if board.is_game_over() and board.result() == "1-0":
        return ' '.join(tokens)
    return None


def process_pgn_file(input_file, output_file):
    """Append move transcripts of selected games from a PGN file to a CSV file.

    A game is kept when its headers say White won ('1-0'), the event name
    contains 'Rated', WhiteElo is strictly between 1500 and 2400, BlackElo is
    strictly between 1400 and 2800, and the replayed mainline ends in a
    game-over position whose result is "1-0". Each kept game becomes one
    single-column row such as ``1.e4 e5 2.Nf3``.

    Reads the module-level ``start_at`` and ``total_games``: the header row
    ``transcript`` is written only when ``start_at`` is 0, and reading starts
    at the byte offset ``file_size * start_at / total_games``. Progress is
    printed after every 100 games added.

    Args:
        input_file: Path of the PGN file to read (decoded as UTF-8).
        output_file: Path of the CSV file to append to (created if missing).
    """
    # Explicit UTF-8 keeps decoding independent of the platform locale.
    with open(input_file, 'r', encoding='utf-8') as pgn_file, \
            open(output_file, 'a', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        if start_at == 0:
            csv_writer.writerow(['transcript'])

        # Resume by jumping to a proportional byte offset in the input.
        # NOTE(review): for start_at > 0 this lands on an arbitrary byte,
        # most likely mid-game; the parser is relied on to resynchronise on
        # the next game - confirm.
        file_size = os.fstat(pgn_file.fileno()).st_size
        pgn_file.seek(int(file_size * (start_at / total_games)))

        games_seen = 0
        games_added = 0
        while True:
            game = chess.pgn.read_game(pgn_file)
            if game is None:  # end of input
                break
            games_seen += 1

            if not _passes_filter(game.headers):
                continue
            transcript = _mainline_transcript(game)
            if transcript is None:
                continue

            csv_writer.writerow([transcript])
            games_added += 1
            if games_added % 100 == 0:
                progress = (games_seen + start_at) / float(total_games)
                print(f"Added {games_added} of {games_seen} games. {progress:.2%} complete.")

# Usage example. The paths stay module-level names, but the conversion only
# runs when this file is executed as a script, so importing the module (for
# instance to reuse process_pgn_file) does not start the long-running job.
input_file = './chess-mamba-vs-xformer/lichess_db_standard_rated_2022-07.pgn'
output_file = './chess-mamba-vs-xformer/lichess_transcripts_phase2_stable.csv'
if __name__ == "__main__":
    process_pgn_file(input_file, output_file)