# chess-mamba-vs-xformer / csv2pqt_windraw.py
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import tiktoken
import pickle
from sklearn.model_selection import train_test_split
import random
import os
# Selects the tokenizer vocabulary below: when True, load the learned
# stoi/itos tables from data/chess/meta.pkl; when False, use the fixed
# 28-symbol character vocabulary defined inline in __main__.
move_num_in_gamestate = False
def tokenize_game(game, stoi):
    """Tokenize one game transcript into a uint8 numpy array.

    Anything before the first blank line (a metadata header) is discarded,
    each move number is collapsed to a bare '.' (e.g. "12.e4" -> ".e4"),
    and the normalized text is encoded with the module-level encode()
    helper.  NOTE(review): the stoi argument is accepted but unused —
    encoding goes through the global encode built in __main__.
    """
    # Keep only the move text after the header, if a header is present.
    body = game.split('\n\n', 1)[1] if '\n\n' in game else game
    # Normalize each whitespace-separated token: strip leading move numbers.
    tokens = []
    for move in body.split():
        if "." in move:
            tokens.append('.' + move.split(".")[-1])
        else:
            tokens.append(move)
    return np.array(encode(' '.join(tokens)), dtype=np.uint8)
if __name__ == "__main__":
    dataset_path = "/media/hailey/TVBox/csv_datasets/anneal.csv"

    # Build the vocabulary (stoi/itos) and the encode/decode helpers that
    # tokenize_game() relies on via module globals.
    if move_num_in_gamestate:
        # Load the learned vocabulary shipped with the training data.
        meta_path = os.path.join('data', 'chess', 'meta.pkl')
        with open(meta_path, "rb") as f:
            meta = pickle.load(f)
        stoi, itos = meta["stoi"], meta["itos"]
        encode = lambda s: [stoi[c] for c in s]
        decode = lambda l: "".join([itos[i] for i in l])
    else:
        # Fixed 28-symbol vocabulary: space, '.', files a-h, ranks 1-8,
        # piece letters, castling 'O', and x/+/#/= punctuation.
        stoi = {' ': 0, '.': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'h': 9, '1': 10, '2': 11, '3': 12, '4': 13, '5': 14, '6': 15, '7': 16, '8': 17, 'B': 18, 'N': 19, 'R': 20, 'Q': 21, 'K': 22, 'O': 23, 'x': 24, '+': 25, '#': 26, '=': 27}
        itos = {0: ' ', 1: '.', 2: 'a', 3: 'b', 4: 'c', 5: 'd', 6: 'e', 7: 'f', 8: 'g', 9: 'h', 10: '1', 11: '2', 12: '3', 13: '4', 14: '5', 15: '6', 16: '7', 17: '8', 18: 'B', 19: 'N', 20: 'R', 21: 'Q', 22: 'K', 23: 'O', 24: 'x', 25: '+', 26: '#', 27: '='}
        # Sanity-check that the two tables are mutual inverses.
        for s in stoi:
            assert itos[stoi[s]] == s
        # '-' (castling) has no token of its own: strip it on encode and
        # restore it on decode ("OOO" -> "O-O-O", "OO" -> "O-O").
        encode = lambda s: [stoi[c] for c in s.replace('-', '')]
        decode = lambda l: "".join([itos[i] for i in l]).replace("OOO", "O-O-O").replace("OO", "O-O")

    print("Opening csv...")
    df = pd.read_csv(dataset_path)

    total_games = len(df)
    print(f"Total games: {total_games}. Tokenizing...")

    # Tokenize every transcript into a uint8 array (28-symbol vocab fits
    # comfortably in a byte).
    df['tokenized'] = df['transcript'].apply(lambda x: tokenize_game(x, stoi))
    print("Tokenized. Writing parquet file...")

    # No train/val split for this dataset: the whole CSV is training data.
    train_df = df

    def write_parquet_with_row_groups(df, file_name, rows_per_group=100):
        """Write the 'tokenized' column of df to file_name, emitting
        rows_per_group rows per Parquet row group."""
        table = pa.Table.from_pandas(df[['tokenized']])
        # Context manager guarantees the footer is written and the file
        # handle released even if a write fails (original leaked the
        # writer on error).
        with pq.ParquetWriter(file_name, table.schema) as writer:
            for i in range(0, len(df), rows_per_group):
                writer.write_table(table.slice(i, min(rows_per_group, len(df) - i)))

    write_parquet_with_row_groups(train_df, '/media/hailey/TVBox/NEW_anneal.parquet')
    print("Done.")