""" PDF Downloader Module Handles downloading PDFs from URLs with retry logic and progress tracking. """ import os import asyncio import tempfile import aiohttp from typing import Optional class PDFDownloader: """Handles PDF downloading with enhanced error handling and retry logic.""" def __init__(self): """Initialize the PDF downloader.""" pass async def download_pdf(self, url: str, timeout: int = 300, max_retries: int = 3) -> str: """ Download PDF from URL to a temporary file with enhanced error handling. Args: url: URL of the PDF to download timeout: Download timeout in seconds (default: 300s/5min) max_retries: Maximum number of retry attempts Returns: str: Path to the downloaded temporary file Raises: Exception: If download fails after all retries """ print(f"📥 Downloading PDF from: {url[:50]}...") for attempt in range(max_retries): try: # Enhanced timeout settings for large files timeout_config = aiohttp.ClientTimeout( total=timeout, # Total timeout connect=30, # Connection timeout sock_read=120 # Socket read timeout ) async with aiohttp.ClientSession(timeout=timeout_config) as session: print(f" Attempt {attempt + 1}/{max_retries} (timeout: {timeout}s)") async with session.get(url) as response: if response.status != 200: raise Exception(f"Failed to download PDF: HTTP {response.status}") # Get content length for progress tracking content_length = response.headers.get('content-length') if content_length: total_size = int(content_length) print(f" File size: {total_size / (1024*1024):.1f} MB") # Create temporary file temp_file = tempfile.NamedTemporaryFile( delete=False, suffix=".pdf", prefix="preprocess_" ) # Write content to temporary file with progress tracking downloaded = 0 async for chunk in response.content.iter_chunked(16384): # Larger chunks temp_file.write(chunk) downloaded += len(chunk) # Show progress for large files if content_length and downloaded % (1024*1024) == 0: # Every MB progress = (downloaded / total_size) * 100 print(f" Progress: {progress:.1f}% ({downloaded/(1024*1024):.1f} MB)") temp_file.close() print(f"✅ PDF downloaded successfully: {temp_file.name}") return temp_file.name except asyncio.TimeoutError: print(f" ⏰ Timeout on attempt {attempt + 1}") if attempt < max_retries - 1: wait_time = (attempt + 1) * 30 # Increasing wait time print(f" ⏳ Waiting {wait_time}s before retry...") await asyncio.sleep(wait_time) continue except Exception as e: print(f" ❌ Error on attempt {attempt + 1}: {str(e)}") if attempt < max_retries - 1: wait_time = (attempt + 1) * 15 print(f" ⏳ Waiting {wait_time}s before retry...") await asyncio.sleep(wait_time) continue raise Exception(f"Failed to download PDF after {max_retries} attempts") def cleanup_temp_file(self, temp_path: str) -> None: """ Clean up temporary file. Args: temp_path: Path to the temporary file to delete """ if temp_path and os.path.exists(temp_path): try: os.unlink(temp_path) print(f"🗑️ Cleaned up temporary file: {temp_path}") except Exception as e: print(f"⚠️ Warning: Could not delete temporary file {temp_path}: {e}")