HaileyStorm committed
Commit 80bc2b3
Parent(s): 062c52f

Upload 5 files

- csv2pqt_windraw.py +78 -0
- filter_csv.py +16 -0
- filter_lichess.py +53 -0
- merge_csv.py +14 -0
- sort_split.py +62 -0
csv2pqt_windraw.py
ADDED
@@ -0,0 +1,78 @@
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import tiktoken
import pickle
from sklearn.model_selection import train_test_split
import random
import os


move_num_in_gamestate = False

def tokenize_game(game, stoi):
    # Remove the prefix and tokenize the game
    game_cleaned = game.split('\n\n', 1)[1] if '\n\n' in game else game
    game_cleaned = ' '.join(['.' + m.split(".")[-1] if "." in m else m for m in game_cleaned.split()])
    return np.array(encode(game_cleaned), dtype=np.uint8)

if __name__ == "__main__":
    dataset_path = "/media/hailey/TVBox/csv_datasets/anneal.csv"
    meta_path = "data/chess/meta.pkl"

    # Load metadata for tokenization
    if move_num_in_gamestate:
        meta_path = os.path.join(os.path.join('data', 'chess'), 'meta.pkl')
        with open(meta_path, "rb") as f:
            meta = pickle.load(f)
        stoi, itos = meta["stoi"], meta["itos"]
        encode = lambda s: [stoi[c] for c in s]
        decode = lambda l: "".join([itos[i] for i in l])
    else:
        stoi = {' ': 0, '.': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'h': 9, '1': 10, '2': 11, '3': 12, '4': 13, '5': 14, '6': 15, '7': 16, '8': 17, 'B': 18, 'N': 19, 'R': 20, 'Q': 21, 'K': 22, 'O': 23, 'x': 24, '+': 25, '#': 26, '=': 27}
        itos = {0: ' ', 1: '.', 2: 'a', 3: 'b', 4: 'c', 5: 'd', 6: 'e', 7: 'f', 8: 'g', 9: 'h', 10: '1', 11: '2', 12: '3', 13: '4', 14: '5', 15: '6', 16: '7', 17: '8', 18: 'B', 19: 'N', 20: 'R', 21: 'Q', 22: 'K', 23: 'O', 24: 'x', 25: '+', 26: '#', 27: '='}
        for s in stoi:
            assert itos[stoi[s]] == s
        encode = lambda s: [stoi[c] for c in s.replace('-', '')]
        decode = lambda l: "".join([itos[i] for i in l]).replace("OOO", "O-O-O").replace("OO", "O-O")

    # Read CSV with headers
    print("Opening csv...")
    df = pd.read_csv(dataset_path)
    #print(df.iloc[random.randint(0, len(df) - 1)])

    # Report statistics
    total_games = len(df)
    #white_wins = len(df[df['Result'] == '1-0'])
    #white_draws = len(df[df['Result'] == '1/2-1/2'])
    #discarded_games = total_games - white_wins #- white_draws
    print(f"Total games: {total_games}. Tokenizing...")
    #print(f"White wins: {white_wins} ({white_wins/total_games*100:.2f}%)")
    #print(f"White draws: {white_draws} ({white_draws/total_games*100:.2f}%)")
    #print(f"Discarded games: {discarded_games} ({discarded_games/total_games*100:.2f}%)")

    # Filter out games where white loses
    #df = df[df['Result'].isin(['1-0', '1/2-1/2'])]
    #df = df[df['Result'] == '1-0']

    # Tokenize games in the 'transcript' column
    df['tokenized'] = df['transcript'].apply(lambda x: tokenize_game(x, stoi))
    print("Tokenized. Writing parquet file...")

    # Split dataset into training and validation
    #train_df, val_df = train_test_split(df, test_size=0.0, random_state=42)
    train_df = df
    val_df = None

    # Define a function to write the DataFrame to a Parquet file with multiple rows per row group
    def write_parquet_with_row_groups(df, file_name, rows_per_group=100):
        table = pa.Table.from_pandas(df[['tokenized']])
        writer = pq.ParquetWriter(file_name, table.schema)
        for i in range(0, len(df), rows_per_group):
            writer.write_table(table.slice(i, min(rows_per_group, len(df) - i)))
        writer.close()

    write_parquet_with_row_groups(train_df, '/media/hailey/TVBox/NEW_anneal.parquet')
    #write_parquet_with_row_groups(val_df, 'val_lich_windraw.parquet')
    print("Done.")
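To spot-check the output, a minimal sketch (the output path and the 100-row groups come from the script above; reading only the first row group avoids loading the whole file):

import pyarrow.parquet as pq

# Same fixed vocabulary as the script above.
itos = {0: ' ', 1: '.', 2: 'a', 3: 'b', 4: 'c', 5: 'd', 6: 'e', 7: 'f', 8: 'g', 9: 'h',
        10: '1', 11: '2', 12: '3', 13: '4', 14: '5', 15: '6', 16: '7', 17: '8',
        18: 'B', 19: 'N', 20: 'R', 21: 'Q', 22: 'K', 23: 'O', 24: 'x', 25: '+', 26: '#', 27: '='}
decode = lambda l: "".join(itos[i] for i in l).replace("OOO", "O-O-O").replace("OO", "O-O")

pf = pq.ParquetFile('/media/hailey/TVBox/NEW_anneal.parquet')
first_group = pf.read_row_group(0).to_pandas()   # one 100-row group
print(decode(first_group.iloc[0]['tokenized']))  # e.g. ".e4 e5 .Nf3 Nc6 ..."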
filter_csv.py
ADDED
@@ -0,0 +1,16 @@
import pandas as pd

# Read the CSV file (pandas decompresses a single-file zip archive transparently)
df = pd.read_csv('gt1_8kElo_all.zip')

# Filter the DataFrame based on the conditions
filtered_df = df[(df['Result'] == '1-0') &
                 (df['WhiteElo'] > 1900) &
                 (df['WhiteElo'] < 2300) &
                 (df['BlackElo'] < 2600)]

# Select only the 'transcript' column
transcript_df = filtered_df[['transcript']]

# Save the filtered 'transcript' column to a new CSV file
transcript_df.to_csv('NEW_lichess_filtered.csv', index=False)
filter_lichess.py
ADDED
@@ -0,0 +1,53 @@
import chess
import chess.pgn
import csv
import os

start_at = 0
total_games = 92055571

def process_pgn_file(input_file, output_file):
    with open(input_file, 'r') as pgn_file, open(output_file, 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        if start_at == 0:
            csv_writer.writerow(['transcript'])

        # Resume by seeking to a byte offset proportional to the games already processed
        file_size = os.stat(pgn_file.fileno()).st_size
        pgn_file.seek(int(file_size * (start_at / total_games)))

        games_seen = 0
        games_added = 0
        while True:
            game = chess.pgn.read_game(pgn_file)
            if game is None:
                break
            games_seen += 1

            # Filter games based on the specified criteria
            if (
                game.headers['Result'] == '1-0' and
                'Rated' in game.headers['Event'] and
                1500 < int(game.headers['WhiteElo']) < 2400 and
                1400 < int(game.headers['BlackElo']) < 2800
            ):
                board = chess.Board()
                moves = []
                move_number = 1
                for move in game.mainline_moves():
                    if board.turn == chess.WHITE:
                        moves.append(f"{move_number}.")
                        move_number += 1
                    san = board.san(move)
                    moves.append(san + " ")
                    board.push(board.parse_san(san))  # equivalent to board.push(move)

                if board.is_game_over() and board.result() == "1-0":
                    transcript = ''.join(moves)
                    csv_writer.writerow([transcript.rstrip()])
                    games_added += 1
                    if games_added % 100 == 0:
                        print(f"Added {games_added} of {games_seen} games. {(games_seen+start_at)/float(total_games):.2%} complete.")

# Usage example
input_file = './lichess_db_standard_rated_2022-07.pgn'
output_file = './lichess_transcripts_phase2_stable.csv'
process_pgn_file(input_file, output_file)
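A note on the resume logic above, as a standalone sketch (the skip_game call is an addition here, not part of the script): start_at is mapped to an approximate byte offset under the assumption that games are roughly uniform in byte size, and the PGN reader is left to resynchronize at the next game boundary.

import os
import chess.pgn

def seek_to_game_approx(pgn_file, games_done, games_total):
    # Jump to a byte offset proportional to how many games were already processed.
    # Assumes games are roughly uniform in size, so the landing point is near
    # (but almost never exactly at) the right game.
    size = os.stat(pgn_file.fileno()).st_size
    pgn_file.seek(int(size * (games_done / games_total)))
    # Discard the partial game we likely landed inside; the reader
    # resynchronizes on the next game's headers.
    chess.pgn.skip_game(pgn_file)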
merge_csv.py
ADDED
@@ -0,0 +1,14 @@
import pandas as pd

# Read the first CSV file
df1 = pd.read_csv('/media/hailey/TVBox/lichess_db_stable.csv')

# Read the second CSV file and filter it
df2 = pd.read_csv('/media/hailey/TVBox/lichess_transcripts_2.csv')
#df2 = df2[df2['Result'] == '1-0'][['transcript']]

# Concatenate the filtered DataFrames
merged_df = pd.concat([df1, df2], ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('/media/hailey/TVBox/lichess_db_stable2.csv', index=False)
sort_split.py
ADDED
@@ -0,0 +1,62 @@
import pandas as pd
import pyarrow.parquet as pq
import os
import numpy as np
import math

def sort_and_split_parquet(input_file, output_dir, n_splits, prefix, min_len, max_len):
    # Load the parquet file
    print("Loading parquet file...")
    df = pq.read_table(input_file).to_pandas()

    # Sort by the length of the 'tokenized' column
    print("Sorting games & filtering by length...")
    df['length'] = df['tokenized'].apply(len)
    df_sorted = df.sort_values(by='length').drop(columns=['length'])
    lenb4 = len(df_sorted)
    df_sorted = df_sorted[df_sorted['tokenized'].apply(len) <= max_len]
    df_sorted = df_sorted[df_sorted['tokenized'].apply(len) >= min_len]
    if len(df_sorted) < lenb4:
        removed = lenb4 - len(df_sorted)
        print(f"Removed {removed} ({float(removed)/lenb4:.2%}) short and long games.")

    # Calculate the number of rows per split
    total_rows = len(df_sorted)
    rows_per_split = math.ceil(total_rows / n_splits)

    print("Dataset sorted. Splitting...")
    games = 0
    # Split and save each part
    for i in range(n_splits):
        start_row = i * rows_per_split
        end_row = min(start_row + rows_per_split, total_rows)
        split_df = df_sorted.iloc[start_row:end_row]
        #lenb4 = len(split_df)
        #split_df = split_df[split_df['tokenized'].apply(len) <= max_len]
        #if len(split_df) < lenb4:
        #    print(f"\tRemoved {lenb4 - len(split_df)} long games.")
        games += len(split_df)

        first_game_length = len(split_df.iloc[0]['tokenized'])
        last_game_length = len(split_df.iloc[-1]['tokenized'])

        # Save the split DataFrame as a parquet file
        split_file_name = f"{prefix}_{i}.parquet"
        split_df.to_parquet(os.path.join(output_dir, split_file_name))

        print(f"Saved {split_file_name}... Game lengths: {first_game_length} - {last_game_length}")
    print(f"Saved {games} games total.")


input_file = '/media/hailey/TVBox/NEW_stable.parquet'
output_dir = '/media/hailey/More/AI/mamba.py/data/stable'
os.makedirs(output_dir, exist_ok=True)
n_splits = 360 #should be roughly input size / 10MB
prefix = "stable"
min_len = 200
max_len = 1536

sort_and_split_parquet(input_file, output_dir, n_splits, prefix, min_len, max_len)
print("Done.")
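A quick sanity check, sketched with the variables defined above: because the frame is sorted globally before slicing, each split's shortest game should be no shorter than the previous split's longest.

import os
import pyarrow.parquet as pq

# Verify the splits are length-ordered end to end (assumes no split is empty).
prev_max = 0
for i in range(n_splits):
    part = pq.read_table(os.path.join(output_dir, f"{prefix}_{i}.parquet")).to_pandas()
    lengths = part['tokenized'].apply(len)
    assert lengths.min() >= prev_max, f"split {i} out of order"
    prev_max = lengths.max()
print("All splits are in ascending length order.")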