bhimrazy committed on
Commit
94709b2
1 Parent(s): 7638f0a

Updates scripts to prepare data

Browse files
prepare_data.py DELETED
@@ -1,21 +0,0 @@
1
- import os
2
- import pandas as pd
3
- from sklearn.model_selection import train_test_split
4
-
5
- DATA_DIR = "data/train"
6
- CSV_PATH = "data/trainLabels.csv"
7
- TEST_SIZE = 0.2
8
- RANDOM_STATE = 42
9
-
10
- # Load the CSV file into a pandas DataFrame and add the image path
11
- df = pd.read_csv(CSV_PATH, names=['image_path', 'label'], converters={'image_path': lambda x: f"{DATA_DIR}/{x}.jpeg"})
12
-
13
- # drop row where image does not exist
14
- df = df[df['image_path'].apply(lambda x: os.path.exists(x))]
15
-
16
- # split the data into train and validation sets such that the class distribution is the same in both sets
17
- df_train, df_val = train_test_split(df, test_size=TEST_SIZE, stratify=df['label'], random_state=RANDOM_STATE)
18
-
19
- # Save the train and validation sets to CSV files
20
- df_train.to_csv("data/train.csv", index=False)
21
- df_val.to_csv("data/val.csv", index=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/{merge-zip-parts.sh → merge_and_extract.sh} RENAMED
@@ -8,26 +8,31 @@ log() {
8
  echo "$(date +"%Y-%m-%d %H:%M:%S") $1"
9
  }
10
 
11
- # Function to merge zip parts
12
- merge_zip() {
13
  local zip_name="$1"
14
  log "Merging $zip_name parts into a single zip file..."
15
  cat "$DATASET_DIR/$zip_name".zip.* > "$DATASET_DIR/$zip_name.zip"
16
  log "Merged $zip_name.zip created at $DATASET_DIR"
17
 
18
- # remove partition files
19
  rm "$DATASET_DIR/$zip_name".zip.*
20
  log "Removing $zip_name parts"
 
 
 
 
 
21
  }
22
 
23
- # Merge train.zip parts;
24
- merge_zip "train" &
25
 
26
- # Merge test.zip parts
27
- merge_zip "test" &
28
 
29
  # End of script
30
  log "Script execution completed."
31
 
32
  # Wait for all background processes to finish
33
- wait
 
8
  echo "$(date +"%Y-%m-%d %H:%M:%S") $1"
9
  }
10
 
11
+ # Function to merge and extract zip files
12
+ merge_and_extract_zip() {
13
  local zip_name="$1"
14
  log "Merging $zip_name parts into a single zip file..."
15
  cat "$DATASET_DIR/$zip_name".zip.* > "$DATASET_DIR/$zip_name.zip"
16
  log "Merged $zip_name.zip created at $DATASET_DIR"
17
 
18
+ # Remove partition files
19
  rm "$DATASET_DIR/$zip_name".zip.*
20
  log "Removing $zip_name parts"
21
+
22
+ # Extract the merged file
23
+ log "Extracting $zip_name.zip..."
24
+ unzip -o "$DATASET_DIR/$zip_name.zip" -d "$DATASET_DIR"
25
+ log "Extracted $zip_name.zip at $DATASET_DIR"
26
  }
27
 
28
+ # Merge and extract train.zip parts
29
+ merge_and_extract_zip "train" &
30
 
31
+ # Merge and extract test.zip parts
32
+ merge_and_extract_zip "test" &
33
 
34
  # End of script
35
  log "Script execution completed."
36
 
37
  # Wait for all background processes to finish
38
+ wait
scripts/split_dataset.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import pandas as pd
4
+ from sklearn.model_selection import train_test_split
5
+
6
+ def load_data(data_dir, csv_path):
7
+ df = pd.read_csv(csv_path, names=['image_path', 'label'], converters={'image_path': lambda x: f"{data_dir}/{x}.jpeg"})
8
+ df = df[df['image_path'].apply(lambda x: os.path.exists(x))]
9
+ return df
10
+
11
+ def main(data_dir, csv_path, train_csv_path, val_csv_path, test_size=0.2, random_state=42):
12
+ # Load data from CSV
13
+ df = load_data(data_dir, csv_path)
14
+
15
+ # Split the data into train and validation sets, stratified by label so both sets keep the same class distribution
16
+ df_train, df_val = train_test_split(df, test_size=test_size, stratify=df['label'], random_state=random_state)
17
+
18
+ # Save the train and validation sets to CSV files
19
+ df_train.to_csv(train_csv_path, index=False)
20
+ df_val.to_csv(val_csv_path, index=False)
21
+
22
+ if __name__ == "__main__":
23
+ parser = argparse.ArgumentParser(description="Split dataset into train and validation sets.")
24
+ parser.add_argument("--data_dir", type=str, default="data/diabetic-retinopathy-dataset/train", help="Directory containing images.")
25
+ parser.add_argument("--csv_path", type=str, default="data/diabetic-retinopathy-dataset/trainLabels.csv", help="Path to CSV file containing image labels.")
26
+ parser.add_argument("--train_csv_path", type=str, default="data/diabetic-retinopathy-dataset/train.csv", help="Path to save train CSV file.")
27
+ parser.add_argument("--val_csv_path", type=str, default="data/diabetic-retinopathy-dataset/val.csv", help="Path to save validation CSV file.")
28
+ parser.add_argument("--test_size", type=float, default=0.2, help="Proportion of the dataset to include in the validation split.")
29
+ parser.add_argument("--random_state", type=int, default=42, help="Seed for random number generator.")
30
+
31
+ args = parser.parse_args()
32
+
33
+ main(args.data_dir, args.csv_path, args.train_csv_path, args.val_csv_path, args.test_size, args.random_state)