"""
PDF Downloader Module
Handles downloading PDFs from URLs with retry logic and progress tracking.
"""
import os
import asyncio
import tempfile
import aiohttp
from typing import Optional

class PDFDownloader:
    """Handles PDF downloading with enhanced error handling and retry logic."""

    def __init__(self):
        """Initialize the PDF downloader."""
        pass

    async def download_pdf(self, url: str, timeout: int = 300, max_retries: int = 3) -> str:
        """
        Download PDF from URL to a temporary file with enhanced error handling.

        Args:
            url: URL of the PDF to download
            timeout: Download timeout in seconds (default: 300s / 5 min)
            max_retries: Maximum number of retry attempts

        Returns:
            str: Path to the downloaded temporary file

        Raises:
            Exception: If download fails after all retries
        """
print(f"πŸ“₯ Downloading PDF from: {url[:50]}...")
for attempt in range(max_retries):
try:
# Enhanced timeout settings for large files
timeout_config = aiohttp.ClientTimeout(
total=timeout, # Total timeout
connect=30, # Connection timeout
sock_read=120 # Socket read timeout
)
async with aiohttp.ClientSession(timeout=timeout_config) as session:
print(f" Attempt {attempt + 1}/{max_retries} (timeout: {timeout}s)")
async with session.get(url) as response:
if response.status != 200:
raise Exception(f"Failed to download PDF: HTTP {response.status}")
# Get content length for progress tracking
content_length = response.headers.get('content-length')
if content_length:
total_size = int(content_length)
print(f" File size: {total_size / (1024*1024):.1f} MB")
                        # Create temporary file
                        temp_file = tempfile.NamedTemporaryFile(
                            delete=False,
                            suffix=".pdf",
                            prefix="preprocess_"
                        )
                        try:
                            # Write content to temporary file with progress tracking
                            downloaded = 0
                            last_reported_mb = 0
                            async for chunk in response.content.iter_chunked(16384):  # Larger chunks
                                temp_file.write(chunk)
                                downloaded += len(chunk)
                                # Show progress for large files, roughly once per MB
                                if content_length and downloaded // (1024 * 1024) > last_reported_mb:
                                    last_reported_mb = downloaded // (1024 * 1024)
                                    progress = (downloaded / total_size) * 100
                                    print(f" Progress: {progress:.1f}% ({downloaded/(1024*1024):.1f} MB)")
                        except Exception:
                            # Remove the partially written file so failed attempts don't leak temp files
                            temp_file.close()
                            os.unlink(temp_file.name)
                            raise
                        temp_file.close()
                        print(f"βœ… PDF downloaded successfully: {temp_file.name}")
                        return temp_file.name
            except asyncio.TimeoutError:
                print(f" ⏰ Timeout on attempt {attempt + 1}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 30  # Increasing wait time
                    print(f" ⏳ Waiting {wait_time}s before retry...")
                    await asyncio.sleep(wait_time)
                    continue
            except Exception as e:
                print(f" ❌ Error on attempt {attempt + 1}: {str(e)}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 15
                    print(f" ⏳ Waiting {wait_time}s before retry...")
                    await asyncio.sleep(wait_time)
                    continue
        raise Exception(f"Failed to download PDF after {max_retries} attempts")

    def cleanup_temp_file(self, temp_path: str) -> None:
        """
        Clean up temporary file.

        Args:
            temp_path: Path to the temporary file to delete
        """
        if temp_path and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
                print(f"πŸ—‘οΈ Cleaned up temporary file: {temp_path}")
            except Exception as e:
                print(f"⚠️ Warning: Could not delete temporary file {temp_path}: {e}")