bhimrazy committed on
Commit
45928a2
1 Parent(s): cdb0658

Add prepare_data.py script to split data into train and validation sets

Browse files
Files changed (1) hide show
  1. prepare_data.py +21 -0
prepare_data.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from sklearn.model_selection import train_test_split
4
+
5
DATA_DIR = "data/train"
CSV_PATH = "data/trainLabels.csv"
TEST_SIZE = 0.2
RANDOM_STATE = 42


def load_labels(csv_path=CSV_PATH, data_dir=DATA_DIR):
    """Read the label CSV and keep only rows whose image file exists.

    Args:
        csv_path: Path to a CSV whose first column is an image id and whose
            second column is the label.
        data_dir: Directory expected to contain ``<image id>.jpeg`` files.

    Returns:
        A DataFrame with columns ``image_path`` and ``label``, restricted to
        rows whose ``image_path`` is a regular file on disk.
    """
    # The converter rewrites each id into "<data_dir>/<id>.jpeg" while parsing.
    # NOTE(review): names= is given without header=, so a header line in the
    # CSV (if there is one) is parsed as a data row and is removed only by the
    # file filter below, when its synthesized path does not exist. Confirm the
    # CSV layout before relying on the dtype of the label column.
    df = pd.read_csv(
        csv_path,
        names=["image_path", "label"],
        converters={"image_path": lambda x: f"{data_dir}/{x}.jpeg"},
    )

    # isfile (rather than exists) so a directory that happens to be named
    # "<id>.jpeg" is not mistaken for an image.
    return df[df["image_path"].apply(os.path.isfile)]


def split_train_val(df, test_size=TEST_SIZE, random_state=RANDOM_STATE):
    """Split *df* into train and validation frames, stratified on ``label``.

    Args:
        df: DataFrame with a ``label`` column.
        test_size: Fraction of rows assigned to the validation set.
        random_state: Seed that makes the split reproducible.

    Returns:
        A ``(df_train, df_val)`` tuple.

    Raises:
        ValueError: If *df* has no rows, i.e. no labelled image was found.
    """
    if df.empty:
        # Fail with the real cause instead of an opaque splitter error.
        raise ValueError(
            "No labelled images found; check the CSV path and image directory."
        )
    return train_test_split(
        df,
        test_size=test_size,
        stratify=df["label"],
        random_state=random_state,
    )


def main():
    """Build the train/validation CSV files from the label file and images."""
    df = load_labels()
    df_train, df_val = split_train_val(df)

    # Save the train and validation sets to CSV files.
    df_train.to_csv("data/train.csv", index=False)
    df_val.to_csv("data/val.csv", index=False)


if __name__ == "__main__":
    main()