Spaces:
Sleeping
Sleeping
| import os | |
| import shutil | |
| import random | |
| from collections import defaultdict | |
# --- CONFIGURATION ---

# Folder holding the unzipped source images.
SOURCE_IMG_DIR = "raw_data/img_align_celeba/img_align_celeba"

# Text file with one "<image filename> <celebrity id>" pair per line,
# e.g. "000001.jpg 2880".
IDENTITY_FILE = "raw_data/identity_CelebA.txt"

# Output folder for the selected images; prepare_dataset() deletes and
# recreates it on every run.
DEST_DIR = "stored_images"

# How many distinct people to sample.
NUM_CELEBS = 50

# Minimum photos a person must have to be eligible (to ensure good matching).
MIN_IMAGES_PER_CELEB = 20
def prepare_dataset(
    source_img_dir=None,
    identity_file=None,
    dest_dir=None,
    num_celebs=None,
    min_images_per_celeb=None,
    seed=None,
):
    """Copy a random, identity-labelled subset of images into a fresh folder.

    Reads the identity file (one ``"<filename> <celeb_id>"`` pair per line),
    keeps the celebrities that have at least ``min_images_per_celeb`` images,
    randomly picks up to ``num_celebs`` of them and copies all of their images
    into ``dest_dir`` as ``Celeb_<celeb_id>_<filename>``.

    ``dest_dir`` is deleted and recreated, but only after the inputs have been
    validated, so a missing source folder or an empty selection never destroys
    a previously prepared dataset.

    Args:
        source_img_dir: Folder containing the source images. Defaults to
            ``SOURCE_IMG_DIR``.
        identity_file: Path of the identity mapping file. Defaults to
            ``IDENTITY_FILE``.
        dest_dir: Output folder (wiped on each successful run). Defaults to
            ``DEST_DIR``.
        num_celebs: Number of distinct celebrities to sample. Defaults to
            ``NUM_CELEBS``.
        min_images_per_celeb: Minimum image count for a celebrity to be
            eligible. Defaults to ``MIN_IMAGES_PER_CELEB``.
        seed: Optional seed for a private random generator, making the
            selection reproducible. When ``None`` the module-level ``random``
            state is used.

    Returns:
        None. Progress, warnings and errors are reported on stdout.
    """
    # Resolve defaults lazily so the module-level constants are only read when
    # the caller did not supply an override.
    if source_img_dir is None:
        source_img_dir = SOURCE_IMG_DIR
    if identity_file is None:
        identity_file = IDENTITY_FILE
    if dest_dir is None:
        dest_dir = DEST_DIR
    if num_celebs is None:
        num_celebs = NUM_CELEBS
    if min_images_per_celeb is None:
        min_images_per_celeb = MIN_IMAGES_PER_CELEB

    # 1. Parse the identity file
    print("Parsing identity file...")
    celeb_map = defaultdict(list)
    try:
        with open(identity_file, "r", encoding="utf-8") as f:
            for line in f:
                # File format: "000001.jpg 2880"
                parts = line.split()
                if len(parts) >= 2:
                    filename, celeb_id = parts[0], parts[1]
                    celeb_map[celeb_id].append(filename)
    except FileNotFoundError:
        print(f"Error: Could not find {identity_file}. Did you download the dataset?")
        return

    print(f"Found {len(celeb_map)} total celebrities.")

    # 2. Filter: keep only celebs with enough images
    valid_celebs = {
        cid: imgs
        for cid, imgs in celeb_map.items()
        if len(imgs) >= min_images_per_celeb
    }
    print(f"Found {len(valid_celebs)} celebrities with >= {min_images_per_celeb} images.")

    # 3. Sample: pick random celebrities
    if len(valid_celebs) < num_celebs:
        print("Warning: Not enough celebs meet criteria. Using all available.")
        selected_ids = list(valid_celebs)
    else:
        # A private generator keeps a seeded run from disturbing global state;
        # without a seed the shared module-level generator is used.
        rng = random if seed is None else random.Random(seed)
        selected_ids = rng.sample(list(valid_celebs), num_celebs)

    # Validate everything BEFORE the destructive cleanup below, otherwise a bad
    # path or an empty selection would wipe an existing dataset for nothing.
    if not selected_ids:
        print(f"Error: No celebrities selected; leaving '{dest_dir}/' untouched.")
        return
    if not os.path.isdir(source_img_dir):
        print(f"Error: Could not find image folder {source_img_dir}. Did you download the dataset?")
        return

    # 4. Copy and rename images
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)  # Clean start
    os.makedirs(dest_dir)

    count = 0
    print(f"Copying images for {len(selected_ids)} selected celebrities...")
    for cid in selected_ids:
        for img_name in valid_celebs[cid]:
            src_path = os.path.join(source_img_dir, img_name)
            # RENAME STRATEGY: Celeb_<CelebID>_<OriginalName>
            # Embedding the ID in the filename lets downstream code tell
            # which person each image belongs to.
            dest_path = os.path.join(dest_dir, f"Celeb_{cid}_{img_name}")
            try:
                shutil.copy2(src_path, dest_path)
            except FileNotFoundError:
                # Best effort: a missing image is reported and skipped.
                print(f"Warning: Image {src_path} not found.")
            else:
                count += 1

    if count == 0:
        # Do not claim success when every listed image was missing.
        print(f"Error: No images could be copied from {source_img_dir}.")
        return

    print(f"Success! Copied {count} images to '{dest_dir}/'.")
    print("You can now run 'python ingestion.py'")
# Build the dataset only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    prepare_dataset()