HaileyStorm committed on
Commit
1e37a7a
1 Parent(s): b086bf5

Upload dedupe.py

Browse files
Files changed (1) hide show
  1. chess-gpt-eval/dedupe.py +26 -0
chess-gpt-eval/dedupe.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Remove rows with a duplicated 'transcript' value from a CSV file, in place."""
import pandas as pd

filename = 'logs/Mamba/6_6M/1way_ckpt_2741760b_pt_vs_lc0_sweep.csv'
# Alternative inputs; swap the assignment above to process one of these instead.
#filename = 'logs/Mamba/11M/ckpt_1188012b_pt_vs_lc0_sweep.csv'
#filename = 'logs/11M/Round 1/ckpt_2608480_pt_vs_lc0_sweep.csv'


def dedupe_csv(path, column='transcript'):
    """Drop rows whose ``column`` value repeats an earlier row and rewrite the file.

    The file at ``path`` is overwritten with the filtered rows (no index
    column is written); no separate output file is created. A short
    summary of the row counts is printed.

    Args:
        path: Path of the CSV file to read and then overwrite.
        column: Column label (or list of labels) used to detect duplicates.

    Returns:
        A ``(original_total, removed, remaining)`` tuple of row counts.

    Raises:
        KeyError: If ``column`` is not present in the file (raised by pandas).
    """
    # Read in the CSV file
    df = pd.read_csv(path)

    # Count the original total
    original_total = df.shape[0]

    # Filter out duplicates in the chosen column; pandas keeps the first
    # occurrence of each value by default.
    df = df.drop_duplicates(subset=column)

    # Count the removed and remaining rows
    remaining = df.shape[0]
    removed = original_total - remaining

    # Print out the results
    print("Original total rows:", original_total)
    print("Removed rows:", removed)
    print("Remaining rows:", remaining)

    # Overwrite the input CSV file with the filtered data
    df.to_csv(path, index=False)

    return original_total, removed, remaining


if __name__ == "__main__":
    dedupe_csv(filename)