File size: 2,281 Bytes
80bc2b3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import pandas as pd
import pyarrow.parquet as pq
import os
import numpy as np
import math
def sort_and_split_parquet(input_file, output_dir, n_splits, prefix, min_len, max_len):
    """Sort a parquet dataset by game length and write it out in chunks.

    Reads ``input_file``, keeps only the rows whose ``'tokenized'`` entry has
    a length between ``min_len`` and ``max_len`` (both inclusive), orders the
    remaining rows from shortest to longest and writes them to
    ``output_dir`` as ``{prefix}_{i}.parquet`` files.

    Args:
        input_file: Path of the parquet file to read. It must contain a
            ``'tokenized'`` column whose values support ``len()``.
        output_dir: Directory that receives the split files; it is created
            if it does not exist yet.
        n_splits: Maximum number of output files. Fewer files are written
            when there are not enough rows to fill all of them.
        prefix: File-name prefix of every split file.
        min_len: Smallest ``'tokenized'`` length that is kept.
        max_len: Largest ``'tokenized'`` length that is kept.

    Raises:
        ValueError: If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be at least 1, got {n_splits}")

    # Load the parquet file
    print("Loading parquet file...")
    df = pq.read_table(input_file).to_pandas()

    # Measure every row once and keep the lengths outside the frame, so an
    # input column that happens to be called 'length' is left untouched.
    print("Sorting games & filtering by length...")
    lengths = df['tokenized'].apply(len)
    rows_before = len(df)

    keep = ((lengths >= min_len) & (lengths <= max_len)).to_numpy(dtype=bool)
    kept_lengths = lengths.to_numpy()[keep]
    # A stable sort keeps equal-length rows in input order, which makes the
    # content of every split reproducible between runs.
    order = kept_lengths.argsort(kind="stable")
    df_sorted = df[keep].iloc[order]
    sorted_lengths = kept_lengths[order]

    total_rows = len(df_sorted)
    if total_rows < rows_before:
        removed = rows_before - total_rows
        print(f"Removed {removed} ({float(removed)/rows_before:.2%}) short and long games.")

    # Calculate the number of rows per split
    rows_per_split = math.ceil(total_rows / n_splits)
    print("Dataset sorted. Splitting...")

    os.makedirs(output_dir, exist_ok=True)
    games = 0
    # Split and save each part
    for i in range(n_splits):
        start_row = i * rows_per_split
        if start_row >= total_rows:
            # Rounding the chunk size up can exhaust the rows early (and an
            # empty dataset has no rows at all): never write empty files.
            print(f"No games left after {i} file(s); skipping the remaining splits.")
            break
        end_row = min(start_row + rows_per_split, total_rows)
        split_df = df_sorted.iloc[start_row:end_row]
        games += len(split_df)

        first_game_length = int(sorted_lengths[start_row])
        last_game_length = int(sorted_lengths[end_row - 1])

        # Save the split DataFrame as a parquet file
        split_file_name = f"{prefix}_{i}.parquet"
        split_df.to_parquet(os.path.join(output_dir, split_file_name))
        print(f"Saved {split_file_name}... Game lengths: {first_game_length} - {last_game_length}")

    print(f"Saved {games} games total.")
# --- Job configuration -------------------------------------------------
# Games whose token count falls outside [min_len, max_len] are dropped.
min_len = 200
max_len = 1536
prefix = "stable"
n_splits = 360  # should be roughly input size / 10MB

# Source dataset and destination directory of the split files.
input_file = '/media/hailey/TVBox/NEW_stable.parquet'
output_dir = '/media/hailey/More/AI/mamba.py/data/stable'

# --- Run ---------------------------------------------------------------
# The destination must exist before the split files are written.
os.makedirs(output_dir, exist_ok=True)
sort_and_split_parquet(
    input_file=input_file,
    output_dir=output_dir,
    n_splits=n_splits,
    prefix=prefix,
    min_len=min_len,
    max_len=max_len,
)
print("Done.")
|