Updates scripts to prepare data
Browse files
- prepare_data.py +0 -21
- scripts/{merge-zip-parts.sh → merge_and_extract.sh} +13 -8
- scripts/split_dataset.py +33 -0
prepare_data.py
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import pandas as pd
|
3 |
-
from sklearn.model_selection import train_test_split
|
4 |
-
|
5 |
-
DATA_DIR = "data/train"
|
6 |
-
CSV_PATH = "data/trainLabels.csv"
|
7 |
-
TEST_SIZE = 0.2
|
8 |
-
RANDOM_STATE = 42
|
9 |
-
|
10 |
-
# Load the CSV file into a pandas DataFrame and add the image path
|
11 |
-
df = pd.read_csv(CSV_PATH, names=['image_path', 'label'], converters={'image_path': lambda x: f"{DATA_DIR}/{x}.jpeg"})
|
12 |
-
|
13 |
-
# drop row where image does not exist
|
14 |
-
df = df[df['image_path'].apply(lambda x: os.path.exists(x))]
|
15 |
-
|
16 |
-
# split the data into train and validation sets such that the class distribution is the same in both sets
|
17 |
-
df_train, df_val = train_test_split(df, test_size=TEST_SIZE, stratify=df['label'], random_state=RANDOM_STATE)
|
18 |
-
|
19 |
-
# Save the train and validation sets to CSV files
|
20 |
-
df_train.to_csv("data/train.csv", index=False)
|
21 |
-
df_val.to_csv("data/val.csv", index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/{merge-zip-parts.sh → merge_and_extract.sh}
RENAMED
@@ -8,26 +8,31 @@ log() {
|
|
8 |
echo "$(date +"%Y-%m-%d %H:%M:%S") $1"
|
9 |
}
|
10 |
|
11 |
-
# Function to merge zip
|
12 |
-
|
13 |
local zip_name="$1"
|
14 |
log "Merging $zip_name parts into a single zip file..."
|
15 |
cat "$DATASET_DIR/$zip_name".zip.* > "$DATASET_DIR/$zip_name.zip"
|
16 |
log "Merged $zip_name.zip created at $DATASET_DIR"
|
17 |
|
18 |
-
#
|
19 |
rm "$DATASET_DIR/$zip_name".zip.*
|
20 |
log "Removing $zip_name parts"
|
|
|
|
|
|
|
|
|
|
|
21 |
}
|
22 |
|
23 |
-
# Merge train.zip parts;
|
24 |
-
|
25 |
|
26 |
-
# Merge test.zip parts
|
27 |
-
|
28 |
|
29 |
# End of script
|
30 |
log "Script execution completed."
|
31 |
|
32 |
# Wait for all background processes to finish
|
33 |
-
wait
|
|
|
8 |
echo "$(date +"%Y-%m-%d %H:%M:%S") $1"
|
9 |
}
|
10 |
|
11 |
+
# Merge the split parts of a zip archive, delete the parts, and extract it.
#
# Arguments:
#   $1 - archive base name (e.g. "train" for train.zip.* parts)
# Globals:
#   DATASET_DIR - directory holding the parts; also the extraction target.
#                 Read here but not defined in this function (expected to be
#                 set earlier in the script).
# Returns:
#   0 on success, 1 if merging or extraction fails. On a failed merge the
#   parts are kept so the step can be retried.
merge_and_extract_zip() {
    local zip_name="$1"
    local merged_zip="$DATASET_DIR/$zip_name.zip"

    log "Merging $zip_name parts into a single zip file..."
    # The glob expands in sorted order, which is the order the parts must be
    # concatenated in. If it matches nothing, cat fails and we bail out.
    if ! cat "$DATASET_DIR/$zip_name".zip.* > "$merged_zip"; then
        log "Failed to merge $zip_name parts; keeping the parts in place"
        # Drop the incomplete output so it is not mistaken for a valid archive
        rm -f "$merged_zip"
        return 1
    fi
    log "Merged $zip_name.zip created at $DATASET_DIR"

    # Remove partition files (only reached after a successful merge)
    log "Removing $zip_name parts"
    rm "$DATASET_DIR/$zip_name".zip.*

    # Extract the merged file (-o overwrites existing files without prompting)
    log "Extracting $zip_name.zip..."
    if ! unzip -o "$merged_zip" -d "$DATASET_DIR"; then
        log "Failed to extract $zip_name.zip"
        return 1
    fi
    log "Extracted $zip_name.zip at $DATASET_DIR"
}
|
27 |
|
28 |
+
# Merge and extract train.zip parts; started in the background so the train
# and test archives are processed concurrently
merge_and_extract_zip "train" &

# Merge and extract test.zip parts
merge_and_extract_zip "test" &

# Wait for all background processes to finish
wait

# End of script: log completion only after the background jobs have finished
log "Script execution completed."
|
scripts/split_dataset.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import argparse
|
3 |
+
import pandas as pd
|
4 |
+
from sklearn.model_selection import train_test_split
|
5 |
+
|
6 |
+
def load_data(data_dir: str, csv_path: str) -> pd.DataFrame:
    """Load the label CSV and keep only rows whose image file exists.

    The CSV is read as two columns named ``image_path`` and ``label``. Every
    value in the first column is rewritten to ``"{data_dir}/{value}.jpeg"``,
    and rows whose resulting path does not exist on disk are dropped.

    Args:
        data_dir: Directory containing the ``.jpeg`` images.
        csv_path: Path to the CSV file with image identifiers and labels.

    Returns:
        A DataFrame with ``image_path`` and ``label`` columns, restricted to
        rows whose image path exists.
    """
    df = pd.read_csv(
        csv_path,
        names=["image_path", "label"],
        converters={"image_path": lambda x: f"{data_dir}/{x}.jpeg"},
    )
    # NOTE(review): because ``names`` is passed without ``header=0``, a header
    # row in the CSV is read as a data row; it would only be dropped below
    # because its derived path does not exist. Confirm this is intended.

    # Force a boolean mask so indexing stays a row filter even when the frame
    # has no rows (an empty object-dtype result is not a boolean indexer).
    exists_mask = df["image_path"].map(os.path.exists).astype(bool)
    return df[exists_mask]
|
10 |
+
|
11 |
+
def main(
    data_dir: str,
    csv_path: str,
    train_csv_path: str,
    val_csv_path: str,
    test_size: float = 0.2,
    random_state: int = 42,
) -> None:
    """Split the labelled images into train and validation CSV files.

    Args:
        data_dir: Directory containing the images.
        csv_path: Path to the CSV file with image identifiers and labels.
        train_csv_path: Destination path for the training split CSV.
        val_csv_path: Destination path for the validation split CSV.
        test_size: Passed to ``train_test_split`` as ``test_size``; the
            validation share of the data.
        random_state: Seed passed to ``train_test_split``.

    Raises:
        ValueError: If no row of the CSV refers to an existing image, so
            there is nothing to split.
    """
    # Load data from CSV (rows without an image on disk are already dropped)
    df = load_data(data_dir, csv_path)

    # Fail early with an actionable message rather than an opaque error from
    # the splitter when data_dir or csv_path is wrong.
    if df.empty:
        raise ValueError(
            f"No existing images found for the rows of {csv_path!r} under {data_dir!r}"
        )

    # Split the data into train and validation sets, stratified on the label
    # so both sets keep the same class distribution
    df_train, df_val = train_test_split(
        df,
        test_size=test_size,
        stratify=df["label"],
        random_state=random_state,
    )

    # Save the train and validation sets to CSV files (without the index)
    df_train.to_csv(train_csv_path, index=False)
    df_val.to_csv(val_csv_path, index=False)
|
21 |
+
|
22 |
+
if __name__ == "__main__":
    # Command-line entry point: every option mirrors a parameter of main().
    # Each entry is (flag, type, default, help text).
    cli_options = [
        ("--data_dir", str, "data/diabetic-retinopathy-dataset/train",
         "Directory containing images."),
        ("--csv_path", str, "data/diabetic-retinopathy-dataset/trainLabels.csv",
         "Path to CSV file containing image labels."),
        ("--train_csv_path", str, "data/diabetic-retinopathy-dataset/train.csv",
         "Path to save train CSV file."),
        ("--val_csv_path", str, "data/diabetic-retinopathy-dataset/val.csv",
         "Path to save validation CSV file."),
        ("--test_size", float, 0.2,
         "Proportion of the dataset to include in the validation split."),
        ("--random_state", int, 42,
         "Seed for random number generator."),
    ]

    parser = argparse.ArgumentParser(description="Split dataset into train and validation sets.")
    for flag, arg_type, default_value, help_text in cli_options:
        parser.add_argument(flag, type=arg_type, default=default_value, help=help_text)

    args = parser.parse_args()

    # The parsed attribute names match main()'s parameter names one-to-one
    main(
        data_dir=args.data_dir,
        csv_path=args.csv_path,
        train_csv_path=args.train_csv_path,
        val_csv_path=args.val_csv_path,
        test_size=args.test_size,
        random_state=args.random_state,
    )
|