HaileyStorm committed on
Commit
22cfeab
1 Parent(s): d3da7b8

Delete filter_lichess_multi.py

Browse files
Files changed (1) hide show
  1. filter_lichess_multi.py +0 -77
filter_lichess_multi.py DELETED
@@ -1,77 +0,0 @@
1
- import chess
2
- import chess.pgn
3
- import csv
4
- import os
5
- import threading
6
- import mmap
7
-
8
# Byte offset into the input file at which chunking begins.
start_at = 0
# Denominator of the progress percentage printed by the worker threads.
total_games = 92_055_571
# Number of chunks the input is split into; one worker thread per chunk.
num_threads = 8
11
-
12
def process_pgn_chunk(pgn_data, output_file, start_index, end_index):
    """Append the transcripts of qualifying games in one PGN chunk to a CSV.

    A game qualifies when its headers describe a rated White win
    (``Result`` is ``1-0`` and ``Event`` contains ``Rated``) with
    ``1500 < WhiteElo < 2400`` and ``1400 < BlackElo < 2800``, and when
    replaying its mainline leaves a board that itself reports a finished
    game with result ``1-0``. Each qualifying game is written as a single
    CSV cell of the form ``1.e4 e5 2.Nf3 ...``.

    Args:
        pgn_data: Raw UTF-8 encoded PGN bytes for this chunk.
        output_file: Path of the CSV file that transcripts are appended to.
        start_index: Byte offset of the chunk within the source file; only
            used in the progress message.
        end_index: Byte offset one past the end of the chunk. Accepted for
            interface compatibility; the loop now simply ends when the
            chunk is exhausted.
    """
    import io  # Local import: the file's top-level imports do not include io.

    # Chunk edges are raw byte offsets, so a multi-byte character may be cut
    # at either end; substitute instead of aborting the whole chunk.
    pgn_text = pgn_data.decode('utf-8', errors='replace')

    # A single stream shared by every read_game() call. Building a new
    # stream per call would restart at the first game each time and the
    # parser would never advance.
    pgn_stream = io.StringIO(pgn_text)

    with open(output_file, 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        games_seen = 0
        games_added = 0

        while True:
            game = chess.pgn.read_game(pgn_stream)
            if game is None:
                break  # End of chunk.
            games_seen += 1

            headers = game.headers
            try:
                white_elo = int(headers.get('WhiteElo', ''))
                black_elo = int(headers.get('BlackElo', ''))
            except ValueError:
                # Missing or non-numeric rating (e.g. "?"): cannot qualify.
                continue

            # Filter games based on the specified criteria
            if not (
                headers.get('Result') == '1-0' and
                'Rated' in headers.get('Event', '') and
                1500 < white_elo < 2400 and
                1400 < black_elo < 2800
            ):
                continue

            board = chess.Board()
            moves = []
            move_number = 1
            for move in game.mainline_moves():
                if board.turn == chess.WHITE:
                    moves.append(f"{move_number}.")
                    move_number += 1
                # SAN must be computed before the move is played.
                moves.append(board.san(move) + " ")
                board.push(move)

            # Keep only games whose final position is decisive on the board.
            if board.is_game_over() and board.result() == "1-0":
                transcript = ''.join(moves)
                csv_writer.writerow([transcript.rstrip()])
                games_added += 1
                if games_added % 100 == 0:
                    # NOTE(review): this percentage adds a byte offset
                    # (start_index) to a game count and divides by a game
                    # total, so it is only a rough indicator - confirm the
                    # intended progress metric.
                    print(f"Thread {threading.current_thread().name} - Added {games_added} of {games_seen} games. {(games_seen+start_index)/float(total_games):.2%} complete.")
50
-
51
def _align_to_game_start(pgn_mmap, offset, file_size):
    """Return the first game start at or after *offset*, else *file_size*."""
    if offset <= 0:
        return 0
    # Search from one byte earlier so that an offset already sitting on a
    # game's "[Event " tag is kept rather than skipped.
    marker = pgn_mmap.find(b'\n[Event ', offset - 1)
    return file_size if marker == -1 else marker + 1


def process_pgn_file(input_file, output_file):
    """Split a PGN file into chunks and filter each chunk on its own thread.

    ``output_file`` is (re)created containing only the ``transcript`` header
    row. The region of ``input_file`` from ``start_at`` to the end of the
    file is divided into ``num_threads`` chunks, and chunk ``i`` is handed to
    ``process_pgn_chunk``, which appends to ``<output stem>_<i>.csv``. The
    call returns once every worker thread has finished.

    Args:
        input_file: Path of the PGN file to read.
        output_file: Path of the header CSV; its stem also names the
            per-thread output files.
    """
    with open(output_file, 'w', newline='') as csv_file:
        csv.writer(csv_file).writerow(['transcript'])

    file_size = os.path.getsize(input_file)
    if file_size <= start_at:
        # Nothing to process; mmap would also raise on an empty file.
        return

    chunk_size = (file_size - start_at) // num_threads
    # splitext instead of [:-4] so extensions of any length are handled.
    output_stem = os.path.splitext(output_file)[0]
    threads = []
    with open(input_file, 'rb') as pgn_file:
        with mmap.mmap(pgn_file.fileno(), 0, access=mmap.ACCESS_READ) as pgn_mmap:
            # Move every boundary forward to the next game start so that no
            # game is cut in half between two neighbouring chunks.
            boundaries = [
                _align_to_game_start(pgn_mmap, start_at + i * chunk_size, file_size)
                for i in range(num_threads)
            ]
            boundaries.append(file_size)  # The last chunk runs to end of file.

            for i in range(num_threads):
                start_index = boundaries[i]
                end_index = boundaries[i + 1]
                # Slicing copies the bytes, so the worker does not depend on
                # the mapping staying open.
                pgn_data = pgn_mmap[start_index:end_index]
                thread = threading.Thread(target=process_pgn_chunk, args=(pgn_data, f"{output_stem}_{i}.csv", start_index, end_index))
                threads.append(thread)
                thread.start()

            for thread in threads:
                thread.join()
73
-
74
# Usage example
input_file = './chess-mamba-vs-xformer/lichess_db_standard_rated_2022-07.pgn'
output_file = './chess-mamba-vs-xformer/lichess_transcripts_phase2_stable.csv'

# Guarded so that importing this module does not start the full filtering job.
if __name__ == "__main__":
    process_pgn_file(input_file, output_file)