""" | |
PDF Downloader Module | |
Handles downloading PDFs from URLs with retry logic and progress tracking. | |
""" | |
import os | |
import asyncio | |
import tempfile | |
import aiohttp | |
from typing import Optional | |


class PDFDownloader:
    """Handles PDF downloading with enhanced error handling and retry logic."""

    def __init__(self):
        """Initialize the PDF downloader."""
        pass

    async def download_pdf(self, url: str, timeout: int = 300, max_retries: int = 3) -> str:
        """
        Download PDF from URL to a temporary file with enhanced error handling.

        Args:
            url: URL of the PDF to download
            timeout: Download timeout in seconds (default: 300s/5min)
            max_retries: Maximum number of retry attempts

        Returns:
            str: Path to the downloaded temporary file

        Raises:
            Exception: If download fails after all retries
        """
print(f"π₯ Downloading PDF from: {url[:50]}...") | |
for attempt in range(max_retries): | |
try: | |
# Enhanced timeout settings for large files | |
timeout_config = aiohttp.ClientTimeout( | |
total=timeout, # Total timeout | |
connect=30, # Connection timeout | |
sock_read=120 # Socket read timeout | |
) | |
async with aiohttp.ClientSession(timeout=timeout_config) as session: | |
print(f" Attempt {attempt + 1}/{max_retries} (timeout: {timeout}s)") | |
async with session.get(url) as response: | |
if response.status != 200: | |
raise Exception(f"Failed to download PDF: HTTP {response.status}") | |
# Get content length for progress tracking | |
content_length = response.headers.get('content-length') | |
if content_length: | |
total_size = int(content_length) | |
print(f" File size: {total_size / (1024*1024):.1f} MB") | |
# Create temporary file | |
temp_file = tempfile.NamedTemporaryFile( | |
delete=False, | |
suffix=".pdf", | |
prefix="preprocess_" | |
) | |
                        # Write content to temporary file with progress tracking
                        downloaded = 0
                        next_report = 1024 * 1024  # Report progress roughly every MB
                        async for chunk in response.content.iter_chunked(16384):  # Larger chunks
                            temp_file.write(chunk)
                            downloaded += len(chunk)

                            # Show progress for large files (threshold check, since chunk
                            # sizes may vary and an exact MB-multiple match can be skipped)
                            if content_length and downloaded >= next_report:
                                progress = (downloaded / total_size) * 100
                                print(f"   Progress: {progress:.1f}% ({downloaded/(1024*1024):.1f} MB)")
                                next_report += 1024 * 1024
                        temp_file.close()
                        print(f"✅ PDF downloaded successfully: {temp_file.name}")
                        return temp_file.name

            except asyncio.TimeoutError:
                print(f"   ⏰ Timeout on attempt {attempt + 1}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 30  # Increasing wait time
                    print(f"   ⏳ Waiting {wait_time}s before retry...")
                    await asyncio.sleep(wait_time)
                    continue
            except Exception as e:
                print(f"   ❌ Error on attempt {attempt + 1}: {str(e)}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 15
                    print(f"   ⏳ Waiting {wait_time}s before retry...")
                    await asyncio.sleep(wait_time)
                    continue

        raise Exception(f"Failed to download PDF after {max_retries} attempts")
    def cleanup_temp_file(self, temp_path: str) -> None:
        """
        Clean up temporary file.

        Args:
            temp_path: Path to the temporary file to delete
        """
        if temp_path and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
                print(f"🗑️ Cleaned up temporary file: {temp_path}")
            except Exception as e:
                print(f"⚠️ Warning: Could not delete temporary file {temp_path}: {e}")