File size: 735 Bytes
1e37a7a
 
ccb4a44
1e37a7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import pandas as pd

# Sweep log to de-duplicate. Alternative inputs are kept here for quick switching:
#filename = 'logs/Mamba/11M/ckpt_1188012b_pt_vs_lc0_sweep.csv'
#filename = 'logs/11M/Round 1/ckpt_2608480_pt_vs_lc0_sweep.csv'
filename = 'logs/Mamba/50M/ckpt_2955050b_pt_vs_lc0_sweep.csv'

# Load the CSV and remember how many rows it had before any filtering.
df = pd.read_csv(filename)
original_total = len(df)

# Drop rows whose 'transcript' value repeats an earlier row
# (pandas keeps the first occurrence by default).
df = df.drop_duplicates(subset='transcript')

# Derive the surviving row count first, then how many rows were dropped.
remaining = len(df)
removed = original_total - remaining

# Report the before/after row counts.
print("Original total rows:", original_total)
print("Removed rows:", removed)
print("Remaining rows:", remaining)

# NOTE: this overwrites the input file in place with the de-duplicated rows;
# index=False keeps the DataFrame index out of the written CSV.
df.to_csv(filename, index=False)