HaileyStorm committed
Commit 80bc2b3
Parent(s): 062c52f

Upload 5 files

- csv2pqt_windraw.py +78 -0
- filter_csv.py +16 -0
- filter_lichess.py +53 -0
- merge_csv.py +14 -0
- sort_split.py +62 -0
csv2pqt_windraw.py
ADDED
@@ -0,0 +1,78 @@
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import tiktoken
import pickle
from sklearn.model_selection import train_test_split
import random
import os


move_num_in_gamestate = False

def tokenize_game(game, stoi):
    # Remove the prefix and tokenize the game
    game_cleaned = game.split('\n\n', 1)[1] if '\n\n' in game else game
    game_cleaned = ' '.join(['.' + m.split(".")[-1] if "." in m else m for m in game_cleaned.split()])
    return np.array(encode(game_cleaned), dtype=np.uint8)

if __name__ == "__main__":
    dataset_path = "/media/hailey/TVBox/csv_datasets/anneal.csv"
    meta_path = "data/chess/meta.pkl"

    # Load metadata for tokenization
    if move_num_in_gamestate:
        meta_path = os.path.join(os.path.join('data', 'chess'), 'meta.pkl')
        with open(meta_path, "rb") as f:
            meta = pickle.load(f)
        stoi, itos = meta["stoi"], meta["itos"]
        encode = lambda s: [stoi[c] for c in s]
        decode = lambda l: "".join([itos[i] for i in l])
    else:
        stoi = {' ': 0, '.': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'h': 9, '1': 10, '2': 11, '3': 12, '4': 13, '5': 14, '6': 15, '7': 16, '8': 17, 'B': 18, 'N': 19, 'R': 20, 'Q': 21, 'K': 22, 'O': 23, 'x': 24, '+': 25, '#': 26, '=': 27}
        itos = {0: ' ', 1: '.', 2: 'a', 3: 'b', 4: 'c', 5: 'd', 6: 'e', 7: 'f', 8: 'g', 9: 'h', 10: '1', 11: '2', 12: '3', 13: '4', 14: '5', 15: '6', 16: '7', 17: '8', 18: 'B', 19: 'N', 20: 'R', 21: 'Q', 22: 'K', 23: 'O', 24: 'x', 25: '+', 26: '#', 27: '='}
        for s in stoi:
            assert itos[stoi[s]] == s
        encode = lambda s: [stoi[c] for c in s.replace('-', '')]
        decode = lambda l: "".join([itos[i] for i in l]).replace("OOO", "O-O-O").replace("OO", "O-O")

    # Read CSV with headers
    print("Opening csv...")
    df = pd.read_csv(dataset_path)
    #print(df.iloc[random.randint(0, len(df) - 1)])

    # Report statistics
    total_games = len(df)
    #white_wins = len(df[df['Result'] == '1-0'])
    #white_draws = len(df[df['Result'] == '1/2-1/2'])
    #discarded_games = total_games - white_wins #- white_draws
    print(f"Total games: {total_games}. Tokenizing...")
    #print(f"White wins: {white_wins} ({white_wins/total_games*100:.2f}%)")
    #print(f"White draws: {white_draws} ({white_draws/total_games*100:.2f}%)")
    #print(f"Discarded games: {discarded_games} ({discarded_games/total_games*100:.2f}%)")

    # Filter out games where white loses
    #df = df[df['Result'].isin(['1-0', '1/2-1/2'])]
    #df = df[df['Result'] == '1-0']

    # Tokenize games in the 'transcript' column
    df['tokenized'] = df['transcript'].apply(lambda x: tokenize_game(x, stoi))
    print("Tokenized. Writing parquet file...")

    # Split dataset into training and validation
    #train_df, val_df = train_test_split(df, test_size=0.0, random_state=42)
    train_df = df
    val_df = None

    # Define a function to write the DataFrame to a Parquet file with multiple rows per row group
    def write_parquet_with_row_groups(df, file_name, rows_per_group=100):
        table = pa.Table.from_pandas(df[['tokenized']])
        writer = pq.ParquetWriter(file_name, table.schema)
        for i in range(0, len(df), rows_per_group):
            writer.write_table(table.slice(i, min(rows_per_group, len(df) - i)))
        writer.close()

    write_parquet_with_row_groups(train_df, '/media/hailey/TVBox/NEW_anneal.parquet')
    #write_parquet_with_row_groups(val_df, 'val_lich_windraw.parquet')
    print("Done.")
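To spot-check the output, a minimal sketch (the output path and the 100-row groups come from the script above; reading only the first row group avoids loading the whole file):

import pyarrow.parquet as pq

# Same fixed vocabulary as the script above.
itos = {0: ' ', 1: '.', 2: 'a', 3: 'b', 4: 'c', 5: 'd', 6: 'e', 7: 'f', 8: 'g', 9: 'h',
        10: '1', 11: '2', 12: '3', 13: '4', 14: '5', 15: '6', 16: '7', 17: '8',
        18: 'B', 19: 'N', 20: 'R', 21: 'Q', 22: 'K', 23: 'O', 24: 'x', 25: '+', 26: '#', 27: '='}
decode = lambda l: "".join(itos[i] for i in l).replace("OOO", "O-O-O").replace("OO", "O-O")

pf = pq.ParquetFile('/media/hailey/TVBox/NEW_anneal.parquet')
first_group = pf.read_row_group(0).to_pandas()   # one 100-row group
print(decode(first_group.iloc[0]['tokenized']))  # e.g. ".e4 e5 .Nf3 Nc6 ..."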
filter_csv.py
ADDED
@@ -0,0 +1,16 @@
import pandas as pd

# Read the CSV file (pandas decompresses a single-file zip archive transparently)
df = pd.read_csv('gt1_8kElo_all.zip')

# Filter the DataFrame based on the conditions
filtered_df = df[(df['Result'] == '1-0') &
                 (df['WhiteElo'] > 1900) &
                 (df['WhiteElo'] < 2300) &
                 (df['BlackElo'] < 2600)]

# Select only the 'transcript' column
transcript_df = filtered_df[['transcript']]

# Save the filtered 'transcript' column to a new CSV file
transcript_df.to_csv('NEW_lichess_filtered.csv', index=False)
filter_lichess.py
ADDED
@@ -0,0 +1,53 @@
import chess
import chess.pgn
import csv
import os

start_at = 0
total_games = 92055571

def process_pgn_file(input_file, output_file):
    with open(input_file, 'r') as pgn_file, open(output_file, 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        if start_at == 0:
            csv_writer.writerow(['transcript'])

        # Resume by seeking to a byte offset proportional to the games already processed
        file_size = os.stat(pgn_file.fileno()).st_size
        pgn_file.seek(int(file_size * (start_at / total_games)))

        games_seen = 0
        games_added = 0
        while True:
            game = chess.pgn.read_game(pgn_file)
            if game is None:
                break
            games_seen += 1

            # Filter games based on the specified criteria
            if (
                game.headers['Result'] == '1-0' and
                'Rated' in game.headers['Event'] and
                1500 < int(game.headers['WhiteElo']) < 2400 and
                1400 < int(game.headers['BlackElo']) < 2800
            ):
                board = chess.Board()
                moves = []
                move_number = 1
                for move in game.mainline_moves():
                    if board.turn == chess.WHITE:
                        moves.append(f"{move_number}.")
                        move_number += 1
                    san = board.san(move)
                    moves.append(san + " ")
                    board.push(board.parse_san(san))  # equivalent to board.push(move)

                if board.is_game_over() and board.result() == "1-0":
                    transcript = ''.join(moves)
                    csv_writer.writerow([transcript.rstrip()])
                    games_added += 1
                    if games_added % 100 == 0:
                        print(f"Added {games_added} of {games_seen} games. {(games_seen+start_at)/float(total_games):.2%} complete.")

# Usage example
input_file = './lichess_db_standard_rated_2022-07.pgn'
output_file = './lichess_transcripts_phase2_stable.csv'
process_pgn_file(input_file, output_file)
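A note on the resume logic above, as a standalone sketch (the skip_game call is an addition here, not part of the script): start_at is mapped to an approximate byte offset under the assumption that games are roughly uniform in byte size, and the PGN reader is left to resynchronize at the next game boundary.

import os
import chess.pgn

def seek_to_game_approx(pgn_file, games_done, games_total):
    # Jump to a byte offset proportional to how many games were already processed.
    # Assumes games are roughly uniform in size, so the landing point is near
    # (but almost never exactly at) the right game.
    size = os.stat(pgn_file.fileno()).st_size
    pgn_file.seek(int(size * (games_done / games_total)))
    # Discard the partial game we likely landed inside; the reader
    # resynchronizes on the next game's headers.
    chess.pgn.skip_game(pgn_file)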
merge_csv.py
ADDED
@@ -0,0 +1,14 @@
import pandas as pd

# Read the first CSV file
df1 = pd.read_csv('/media/hailey/TVBox/lichess_db_stable.csv')

# Read the second CSV file and filter it
df2 = pd.read_csv('/media/hailey/TVBox/lichess_transcripts_2.csv')
#df2 = df2[df2['Result'] == '1-0'][['transcript']]

# Concatenate the filtered DataFrames
merged_df = pd.concat([df1, df2], ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('/media/hailey/TVBox/lichess_db_stable2.csv', index=False)
sort_split.py
ADDED
@@ -0,0 +1,62 @@
import pandas as pd
import pyarrow.parquet as pq
import os
import numpy as np
import math

def sort_and_split_parquet(input_file, output_dir, n_splits, prefix, min_len, max_len):
    # Load the parquet file
    print("Loading parquet file...")
    df = pq.read_table(input_file).to_pandas()

    # Sort by the length of the 'tokenized' column
    print("Sorting games & filtering by length...")
    df['length'] = df['tokenized'].apply(len)
    df_sorted = df.sort_values(by='length').drop(columns=['length'])
    lenb4 = len(df_sorted)
    df_sorted = df_sorted[df_sorted['tokenized'].apply(len) <= max_len]
    df_sorted = df_sorted[df_sorted['tokenized'].apply(len) >= min_len]
    if len(df_sorted) < lenb4:
        removed = lenb4 - len(df_sorted)
        print(f"Removed {removed} ({float(removed)/lenb4:.2%}) short and long games.")

    # Calculate the number of rows per split
    total_rows = len(df_sorted)
    rows_per_split = math.ceil(total_rows / n_splits)

    print("Dataset sorted. Splitting...")
    games = 0
    # Split and save each part
    for i in range(n_splits):
        start_row = i * rows_per_split
        end_row = min(start_row + rows_per_split, total_rows)
        split_df = df_sorted.iloc[start_row:end_row]
        #lenb4 = len(split_df)
        #split_df = split_df[split_df['tokenized'].apply(len) <= max_len]
        #if len(split_df) < lenb4:
        #    print(f"\tRemoved {lenb4 - len(split_df)} long games.")
        games += len(split_df)

        first_game_length = len(split_df.iloc[0]['tokenized'])
        last_game_length = len(split_df.iloc[-1]['tokenized'])

        # Save the split DataFrame as a parquet file
        split_file_name = f"{prefix}_{i}.parquet"
        split_df.to_parquet(os.path.join(output_dir, split_file_name))

        print(f"Saved {split_file_name}... Game lengths: {first_game_length} - {last_game_length}")
    print(f"Saved {games} games total.")


input_file = '/media/hailey/TVBox/NEW_stable.parquet'
output_dir = '/media/hailey/More/AI/mamba.py/data/stable'
os.makedirs(output_dir, exist_ok=True)
n_splits = 360 #should be roughly input size / 10MB
prefix = "stable"
min_len = 200
max_len = 1536

sort_and_split_parquet(input_file, output_dir, n_splits, prefix, min_len, max_len)
print("Done.")
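A quick sanity check, sketched with the variables defined above: because the frame is sorted globally before slicing, each split's shortest game should be no shorter than the previous split's longest.

import os
import pyarrow.parquet as pq

# Verify the splits are length-ordered end to end (assumes no split is empty).
prev_max = 0
for i in range(n_splits):
    part = pq.read_table(os.path.join(output_dir, f"{prefix}_{i}.parquet")).to_pandas()
    lengths = part['tokenized'].apply(len)
    assert lengths.min() >= prev_max, f"split {i} out of order"
    prev_max = lengths.max()
print("All splits are in ascending length order.")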