Spaces:

Waseem-k
/

AI-Reverse_Face_Search

Sleeping

App Files Files Community

AI-Reverse_Face_Search / src /prepare_data.py

Waseem-k

Initial Commit

6b39516 verified 4 months ago

raw

history blame contribute delete

2.7 kB

	import os
	import shutil
	import random
	from collections import defaultdict

	# --- CONFIGURATION ---
	# Module-level settings read by prepare_dataset(). All paths are relative to
	# the current working directory.
	SOURCE_IMG_DIR: str = "raw_data/img_align_celeba/img_align_celeba" # Directory holding the unzipped images
	IDENTITY_FILE: str = "raw_data/identity_CelebA.txt" # Text file with one "<image name> <celebrity id>" pair per line
	DEST_DIR: str = "stored_images" # Output directory; deleted and recreated by prepare_dataset()
	NUM_CELEBS: int = 50 # How many distinct people to sample
	MIN_IMAGES_PER_CELEB: int = 20 # Minimum photos a person needs to be eligible (to ensure good matching)

	def prepare_dataset(
	    source_img_dir=None,
	    identity_file=None,
	    dest_dir=None,
	    num_celebs=None,
	    min_images_per_celeb=None,
	    *,
	    seed=None,
	):
	    """Copy the images of a random subset of celebrities into one flat folder.

	    The identity file is read line by line; each line is expected to hold an
	    image file name followed by a celebrity id (e.g. ``000001.jpg 2880``).
	    Celebrities with at least ``min_images_per_celeb`` images are eligible,
	    up to ``num_celebs`` of them are picked at random, and every image of a
	    picked celebrity is copied to ``dest_dir`` as
	    ``Celeb_<id>_<original name>``.

	    ``dest_dir`` is deleted and recreated, but only once it is known that
	    there is something to copy: if the identity file is missing, no
	    celebrity qualifies, or the image directory does not exist, an error is
	    printed and any existing ``dest_dir`` is left untouched.

	    Args:
	        source_img_dir: Directory containing the source images. Defaults to
	            the module-level ``SOURCE_IMG_DIR``.
	        identity_file: Path of the image-to-celebrity mapping file. Defaults
	            to ``IDENTITY_FILE``.
	        dest_dir: Output directory. Defaults to ``DEST_DIR``.
	        num_celebs: Number of celebrities to sample. Defaults to
	            ``NUM_CELEBS``.
	        min_images_per_celeb: Minimum image count for a celebrity to be
	            eligible. Defaults to ``MIN_IMAGES_PER_CELEB``.
	        seed: Optional seed for the sampling step. When given, the same
	            inputs always select the same celebrities.

	    Returns:
	        None. Progress and problems are reported on standard output.
	    """
	    # Anything not supplied falls back to the module-level configuration, so
	    # a bare prepare_dataset() call keeps working as before.
	    if source_img_dir is None:
	        source_img_dir = SOURCE_IMG_DIR
	    if identity_file is None:
	        identity_file = IDENTITY_FILE
	    if dest_dir is None:
	        dest_dir = DEST_DIR
	    if num_celebs is None:
	        num_celebs = NUM_CELEBS
	    if min_images_per_celeb is None:
	        min_images_per_celeb = MIN_IMAGES_PER_CELEB

	    # 1. Parse the identity file
	    print("Parsing identity file...")
	    celeb_map = defaultdict(list)

	    try:
	        # Explicit encoding so parsing does not depend on the platform locale.
	        with open(identity_file, "r", encoding="utf-8") as f:
	            for line in f:
	                # File format: "000001.jpg 2880"; blank or short lines are skipped.
	                parts = line.split()
	                if len(parts) >= 2:
	                    filename, celeb_id = parts[0], parts[1]
	                    celeb_map[celeb_id].append(filename)
	    except FileNotFoundError:
	        print(f"Error: Could not find {identity_file}. Did you download the dataset?")
	        return

	    print(f"Found {len(celeb_map)} total celebrities.")

	    # 2. Filter: keep only celebs with enough images
	    valid_celebs = {
	        cid: imgs
	        for cid, imgs in celeb_map.items()
	        if len(imgs) >= min_images_per_celeb
	    }
	    print(f"Found {len(valid_celebs)} celebrities with >= {min_images_per_celeb} images.")

	    # 3. Sample: pick random celebrities
	    if len(valid_celebs) < num_celebs:
	        print("Warning: Not enough celebs meet criteria. Using all available.")
	        selected_ids = list(valid_celebs)
	    else:
	        # A private generator keeps a seeded run from disturbing the global
	        # random state; without a seed the module-level generator is used.
	        rng = random if seed is None else random.Random(seed)
	        selected_ids = rng.sample(list(valid_celebs), num_celebs)

	    # Check everything that would make the copy pointless BEFORE deleting the
	    # previous output, so a misconfigured run cannot destroy good data.
	    if not selected_ids:
	        print(f"Error: No celebrities selected. Leaving '{dest_dir}/' untouched.")
	        return
	    if not os.path.isdir(source_img_dir):
	        print(
	            f"Error: Could not find image directory {source_img_dir}. "
	            f"Leaving '{dest_dir}/' untouched."
	        )
	        return

	    # 4. Copy and rename images (the source files are left in place)
	    if os.path.exists(dest_dir):
	        shutil.rmtree(dest_dir)  # Clean start
	    os.makedirs(dest_dir)

	    count = 0
	    print(f"Copying images for {len(selected_ids)} selected celebrities...")

	    for cid in selected_ids:
	        for img_name in valid_celebs[cid]:
	            src_path = os.path.join(source_img_dir, img_name)

	            # RENAME STRATEGY: Celeb_<CelebID>_<OriginalName>
	            # The id is embedded in the file name so every image of one
	            # person shares a common prefix.
	            dest_path = os.path.join(dest_dir, f"Celeb_{cid}_{img_name}")

	            try:
	                shutil.copy2(src_path, dest_path)
	            except FileNotFoundError:
	                # Best effort: a missing image is reported and skipped.
	                print(f"Warning: Image {src_path} not found.")
	            else:
	                count += 1

	    if count == 0:
	        # Do not claim success when every listed image was missing.
	        print(f"Error: No images were copied to '{dest_dir}/'. Check {source_img_dir}.")
	        return

	    print(f"Success! Copied {count} images to '{dest_dir}/'.")
	    print("You can now run 'python ingestion.py'")

	# Script entry point: build the dataset only when this file is executed
	# directly, so importing the module has no side effects.
	if __name__ == "__main__":
	    prepare_dataset()