"""
Script to split dataset into train, validation, and test sets

Proportions: 70% train, 15% validation, 15% test

Each sub-directory of SOURCE_DIR is treated as one class. The images of every
class are shuffled with a fixed seed and copied (never moved) into
DEST_DIR/<split>/<class_name>/ for the splits 'train', 'val' and 'test'.
"""

import os  # unused here; kept so existing file-level imports are not removed
import random
import shutil
import sys
from pathlib import Path

# Fixed seed so that repeated runs over the same files give the same split.
SEED = 42

SOURCE_DIR = Path('dataset')
DEST_DIR = Path('data')

TRAIN_RATIO = 0.70
VAL_RATIO = 0.15
TEST_RATIO = 0.15  # informational: the test split receives whatever remains

SPLITS = ('train', 'val', 'test')
# Compared against the lower-cased file name, so matching is case-insensitive.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

RULE = "=" * 80


def find_images(class_folder):
    """Return the image files directly inside *class_folder*, sorted by path.

    Every directory entry is visited exactly once, so a file can never be
    listed twice (which happens with one glob pattern per letter case on
    case-insensitive filesystems). Only regular files are returned, and the
    result is sorted so a later seeded shuffle does not depend on the order
    in which the filesystem happens to list entries.
    """
    return sorted(
        entry
        for entry in class_folder.iterdir()
        if entry.is_file() and entry.name.lower().endswith(IMAGE_EXTENSIONS)
    )


def compute_split_sizes(n_images, train_ratio=TRAIN_RATIO, val_ratio=VAL_RATIO):
    """Return ``(n_train, n_val, n_test)`` for a class with *n_images* files.

    Train and validation sizes are truncated towards zero; the test split
    takes the remainder, so the three sizes always sum to *n_images*.

    Raises:
        ValueError: if a ratio is negative or the two ratios exceed 1, which
            would otherwise produce a negative test size.
    """
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1:
        raise ValueError(
            f"Invalid split ratios: train={train_ratio}, val={val_ratio}"
        )
    n_train = int(n_images * train_ratio)
    n_val = int(n_images * val_ratio)
    return n_train, n_val, n_images - n_train - n_val


def prepare_split_dirs(dest_dir):
    """Create empty ``train``/``val``/``test`` directories under *dest_dir*.

    For each split directory that already exists the user is asked whether it
    may be deleted. Any answer other than ``yes`` terminates the program
    (exit status 0, as before) without touching that directory.
    """
    for split in SPLITS:
        split_dir = dest_dir / split
        if split_dir.exists():
            print(f"\nWARNING: {split_dir} already exists!")
            response = input("Delete and recreate? (yes/no): ")
            if response.strip().lower() != 'yes':
                print("Aborting. Please backup or rename existing data directory.")
                sys.exit()
            shutil.rmtree(split_dir)
            print(f"Deleted {split_dir}")
        split_dir.mkdir(parents=True, exist_ok=True)


def split_dataset(source_dir, dest_dir, train_ratio=TRAIN_RATIO,
                  val_ratio=VAL_RATIO, seed=SEED):
    """Copy every class folder of *source_dir* into per-split folders.

    Args:
        source_dir: directory whose sub-directories are the classes.
        dest_dir: directory that receives ``<split>/<class_name>/`` folders.
        train_ratio: fraction of each class copied to ``train``.
        val_ratio: fraction of each class copied to ``val``; the rest goes
            to ``test``.
        seed: seed of the private random generator used for shuffling.

    Returns:
        dict with the keys ``'images'``, ``'train'``, ``'val'`` and ``'test'``
        holding the number of files processed overall and per split.
    """
    # A private generator yields the same sequence as seeding the global one,
    # without disturbing other users of the ``random`` module.
    rng = random.Random(seed)

    class_folders = sorted(f for f in source_dir.iterdir() if f.is_dir())
    print(f"\nFound {len(class_folders)} classes")
    print(RULE)

    totals = {'images': 0, 'train': 0, 'val': 0, 'test': 0}

    for class_folder in class_folders:
        class_name = class_folder.name
        image_files = find_images(class_folder)

        if not image_files:
            print(f"WARNING: No images found in {class_name}")
            continue

        rng.shuffle(image_files)

        n_images = len(image_files)
        n_train, n_val, n_test = compute_split_sizes(
            n_images, train_ratio, val_ratio
        )

        # Consecutive, non-overlapping slices of the shuffled list.
        assignments = {
            'train': image_files[:n_train],
            'val': image_files[n_train:n_train + n_val],
            'test': image_files[n_train + n_val:],
        }

        for split, images in assignments.items():
            target_dir = dest_dir / split / class_name
            target_dir.mkdir(parents=True, exist_ok=True)
            for img in images:
                shutil.copy2(img, target_dir / img.name)

        totals['images'] += n_images
        totals['train'] += n_train
        totals['val'] += n_val
        totals['test'] += n_test

        print(f"{class_name:40s}: {n_images:4d} total -> Train: {n_train:3d}, Val: {n_val:3d}, Test: {n_test:3d}")

    return totals


def print_summary(totals, dest_dir):
    """Print overall counts and percentages for a finished split.

    *totals* is the dict returned by :func:`split_dataset`. Percentages are
    reported as 0.0 when no image was processed instead of dividing by zero.
    """
    total_images = totals['images']

    def percent(count):
        return count / total_images * 100 if total_images else 0.0

    print(RULE)
    print("SUMMARY")
    print(RULE)
    print(f"Total images processed: {total_images}")
    print(f"Train: {totals['train']} ({percent(totals['train']):.1f}%)")
    print(f"Val: {totals['val']} ({percent(totals['val']):.1f}%)")
    print(f"Test: {totals['test']} ({percent(totals['test']):.1f}%)")
    print(RULE)

    if not total_images:
        print("\nWARNING: No images were found; nothing was copied.")
        print(RULE)
        return

    print("\nDataset split completed successfully!")
    print("\nDirectory structure:")
    print(f" {dest_dir}/")
    print(f" train/ ({totals['train']} images)")
    print(f" val/ ({totals['val']} images)")
    print(f" test/ ({totals['test']} images)")
    print(RULE)


def main():
    """Run the interactive split from SOURCE_DIR into DEST_DIR."""
    print(RULE)
    print("DATASET SPLITTING TOOL")
    print(RULE)
    print(f"Source: {SOURCE_DIR}")
    print(f"Destination: {DEST_DIR}")
    print(f"Split ratio: Train={TRAIN_RATIO*100}%, Val={VAL_RATIO*100}%, Test={TEST_RATIO*100}%")
    print(RULE)

    # Validate the input before anything is deleted, so a wrong source path
    # cannot cost the user an existing split.
    if not SOURCE_DIR.is_dir():
        print(f"\nERROR: Source directory {SOURCE_DIR} does not exist.")
        sys.exit(1)

    prepare_split_dirs(DEST_DIR)
    totals = split_dataset(SOURCE_DIR, DEST_DIR)
    print_summary(totals, DEST_DIR)


if __name__ == "__main__":
    main()