| | |
| | """ |
| | Script to download all images from the dataset locally. |
| | This file downloads all images from URLs in the dataset CSV and saves them locally |
| | to speed up training by avoiding repeated downloads. It uses parallel processing |
| | to download multiple images simultaneously and updates the CSV with local paths |
| | of downloaded images. |
| | """ |
| |
|
| | import pandas as pd |
| | import requests |
| | from PIL import Image |
| | from io import BytesIO |
| | from tqdm import tqdm |
| | import hashlib |
| | from pathlib import Path |
| | import time |
| | import concurrent.futures |
| | from threading import Lock |
| | import config |
| |
|
class ImageDownloader:
    """Download dataset images in parallel and record their local paths.

    Every image URL is mapped to a deterministic file name (MD5 of the URL),
    fetched with ``requests``, re-encoded as an RGB JPEG and stored in
    ``images_dir``. Counters for downloaded / skipped / failed images live in
    ``self.stats`` and are updated under ``self.stats_lock`` because the
    downloads run on a thread pool.
    """

    def __init__(self, df, images_dir=config.images_dir, max_workers=8, timeout=10):
        """
        Initialize the image downloader.

        Args:
            df: DataFrame holding the image URLs (column ``config.column_url_image``)
            images_dir: Directory to save the images (created if missing)
            max_workers: Number of threads for parallel download
            timeout: Timeout for HTTP requests (seconds)
        """
        self.df = df
        self.images_dir = Path(images_dir)
        self.max_workers = max_workers
        self.timeout = timeout

        self.images_dir.mkdir(parents=True, exist_ok=True)

        self.stats = {
            'downloaded': 0,
            'skipped': 0,
            'failed': 0,
            'total': 0
        }
        self.stats_lock = Lock()

    def _bump(self, key):
        """Increment the ``self.stats[key]`` counter while holding the lock."""
        with self.stats_lock:
            self.stats[key] += 1

    def _save_as_jpeg(self, payload, filepath):
        """Decode ``payload`` and store it at ``filepath`` as an RGB JPEG.

        The image is written to a per-thread temporary file first and then
        moved into place, so ``filepath`` only ever exists as a complete
        image. Without this, a save that fails halfway leaves a truncated
        file that the next run would treat as "already exists".

        Raises:
            Exception: whatever PIL or the filesystem raises; the temporary
                file is removed before the exception propagates.
        """
        # Local import: only ``Lock`` is imported from threading at module level.
        import threading

        tmp_path = filepath.with_name(
            f"{filepath.name}.{threading.get_ident()}.part"
        )
        try:
            image = Image.open(BytesIO(payload)).convert("RGB")
            # Format is given explicitly, so the ".part" suffix is irrelevant.
            image.save(tmp_path, "JPEG", quality=85, optimize=True)
            tmp_path.replace(filepath)
        finally:
            # After a successful replace() the temp file no longer exists;
            # anything still there is a partial write and must not linger.
            if tmp_path.exists():
                tmp_path.unlink()

    def url_to_filename(self, url):
        """Convert a URL to a secure filename."""
        # MD5 is used only to derive a stable, filesystem-safe name.
        url_hash = hashlib.md5(url.encode()).hexdigest()
        return f"{url_hash}.jpg"

    def download_single_image(self, row):
        """
        Download a single image.

        Args:
            row: Tuple (index, pandas.Series) containing the row data

        Returns:
            tuple: (success, index, message)
        """
        idx, data = row
        url = data[config.column_url_image]

        filename = self.url_to_filename(url)
        filepath = self.images_dir / filename

        if filepath.exists():
            self._bump('skipped')
            return True, idx, f"Skipped (already exists): {filename}"

        try:
            # With stream=True the connection is only released once the body
            # is consumed or the response is closed; the context manager
            # guarantees the close on every path, including the early return.
            with requests.get(url, timeout=self.timeout, stream=True) as response:
                response.raise_for_status()

                content_type = response.headers.get('content-type', '')
                if not content_type.startswith('image/'):
                    self._bump('failed')
                    return False, idx, f"Not an image: {content_type}"

                payload = response.content

            try:
                self._save_as_jpeg(payload, filepath)
            except Exception as img_error:
                self._bump('failed')
                return False, idx, f"Image processing error: {str(img_error)}"

            self._bump('downloaded')
            return True, idx, f"Downloaded: {filename}"

        except requests.exceptions.RequestException as e:
            self._bump('failed')
            return False, idx, f"Download error: {str(e)}"
        except Exception as e:
            self._bump('failed')
            return False, idx, f"Unexpected error: {str(e)}"

    def download_all_images(self):
        """Download all images from the dataset.

        Submits one task per row of ``self.df`` to a thread pool, records the
        local path and a ``download_success`` flag for each row in a copy of
        the DataFrame, prints summary statistics and writes the copy to
        ``config.local_dataset_path``.

        Returns:
            pandas.DataFrame: copy of ``self.df`` with the
            ``config.column_local_image_path`` and ``download_success``
            columns filled in.
        """
        # NOTE(review): this interpolates the DataFrame itself, not a path.
        print(f"π Loading dataset from {self.df}")
        self.stats['total'] = len(self.df)

        print(f"π Found {len(self.df)} images to download")
        print(f"π Saving in: {self.images_dir}")
        print(f"π§ Using {self.max_workers} threads")

        df_local = self.df.copy()
        df_local[config.column_local_image_path] = ""
        df_local['download_success'] = False

        start_time = time.time()

        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_row = {
                executor.submit(self.download_single_image, row): row
                for row in self.df.iterrows()
            }

            with tqdm(total=len(self.df), desc="π₯ Downloading", unit="img") as pbar:
                for future in concurrent.futures.as_completed(future_to_row):
                    row = future_to_row[future]
                    idx = row[0]

                    try:
                        success, _, message = future.result()

                        if success:
                            filename = self.url_to_filename(row[1][config.column_url_image])
                            df_local.loc[idx, config.column_local_image_path] = str(self.images_dir / filename)
                            df_local.loc[idx, 'download_success'] = True

                        pbar.set_postfix({
                            'OK': self.stats['downloaded'],
                            'Skip': self.stats['skipped'],
                            'Fail': self.stats['failed']
                        })

                    except Exception as e:
                        # Raised before the worker's own handlers ran
                        # (e.g. a non-string URL in url_to_filename).
                        print(f"β Unexpected error for index {idx}: {e}")
                        self._bump('failed')
                    finally:
                        # Exactly one tick per finished task, success or not.
                        pbar.update(1)

        elapsed_time = time.time() - start_time

        print("\n" + "="*60)
        print("π DOWNLOAD STATISTICS")
        print("="*60)
        print(f"β Downloaded: {self.stats['downloaded']}")
        print(f"βοΈ Skipped (already present): {self.stats['skipped']}")
        print(f"β Failed: {self.stats['failed']}")
        print(f"π Total: {self.stats['total']}")
        print(f"β±οΈ Time elapsed: {elapsed_time:.1f}s")

        # An empty dataset would otherwise raise ZeroDivisionError here.
        total = self.stats['total']
        if total:
            success_rate = (self.stats['downloaded'] + self.stats['skipped']) / total * 100
        else:
            success_rate = 0.0
        print(f"π― Success rate: {success_rate:.1f}%")

        if self.stats['downloaded'] > 0:
            avg_time = elapsed_time / self.stats['downloaded']
            print(f"β‘ Average time per image: {avg_time:.2f}s")

        output_path = config.local_dataset_path
        df_local.to_csv(output_path, index=False)
        print(f"πΎ Updated dataset saved: {output_path}")

        return df_local
| |
|
def main():
    """Script entry point.

    Reads the dataset CSV at ``config.local_dataset_path``, drops the rows
    whose ``color`` is ``'unknown'``, and runs an ``ImageDownloader`` over
    the remaining rows with 8 worker threads and a 10 second HTTP timeout.
    """
    separator = "=" * 60
    print("π STARTING IMAGE DOWNLOADER")
    print(separator)

    dataset = pd.read_csv(config.local_dataset_path)
    has_known_color = dataset['color'] != 'unknown'
    dataset = dataset[has_known_color]

    downloader = ImageDownloader(
        df=dataset,
        images_dir=config.images_dir,
        max_workers=8,
        timeout=10,
    )
    # The returned DataFrame is not needed here.
    downloader.download_all_images()

    print("\nπ DOWNLOAD COMPLETED!")
    print("π‘ You can now use the local images for training.")
| |
|
# Run the downloader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
| |
|