"""
PDF Downloader Module
Handles downloading PDFs from URLs with retry logic and progress tracking.
"""
import os
import asyncio
import tempfile
import aiohttp
from typing import Optional
class PDFDownloader:
    """Handles PDF downloading with enhanced error handling and retry logic."""

    def __init__(self):
        """Initialize the PDF downloader."""
        pass

    async def download_pdf(self, url: str, timeout: int = 300, max_retries: int = 3) -> str:
        """
        Download PDF from URL to a temporary file with enhanced error handling.

        Args:
            url: URL of the PDF to download
            timeout: Download timeout in seconds (default: 300s/5min)
            max_retries: Maximum number of retry attempts

        Returns:
            str: Path to the downloaded temporary file

        Raises:
            Exception: If download fails after all retries
        """
        print(f"📥 Downloading PDF from: {url[:50]}...")
        for attempt in range(max_retries):
            # Track the temp file path so a partial download can be removed
            # if this attempt fails — previously each failed attempt leaked
            # an unclosed, undeleted temp file.
            temp_path = None
            try:
                # Enhanced timeout settings for large files
                timeout_config = aiohttp.ClientTimeout(
                    total=timeout,   # Total timeout
                    connect=30,      # Connection timeout
                    sock_read=120,   # Socket read timeout
                )
                async with aiohttp.ClientSession(timeout=timeout_config) as session:
                    print(f" Attempt {attempt + 1}/{max_retries} (timeout: {timeout}s)")
                    async with session.get(url) as response:
                        if response.status != 200:
                            raise Exception(f"Failed to download PDF: HTTP {response.status}")
                        # Get content length for progress tracking
                        content_length = response.headers.get('content-length')
                        total_size = int(content_length) if content_length else None
                        if total_size is not None:
                            print(f" File size: {total_size / (1024*1024):.1f} MB")
                        # Create temporary file
                        temp_file = tempfile.NamedTemporaryFile(
                            delete=False,
                            suffix=".pdf",
                            prefix="preprocess_",
                        )
                        temp_path = temp_file.name
                        # Write content to temporary file with progress tracking
                        downloaded = 0
                        # Report once each time the running total crosses a MB
                        # boundary. The old `downloaded % (1024*1024) == 0` check
                        # only fired when the count landed exactly on a boundary,
                        # which short chunks make vanishingly unlikely.
                        next_report = 1024 * 1024
                        try:
                            async for chunk in response.content.iter_chunked(16384):  # Larger chunks
                                temp_file.write(chunk)
                                downloaded += len(chunk)
                                if total_size and downloaded >= next_report:
                                    progress = (downloaded / total_size) * 100
                                    print(f" Progress: {progress:.1f}% ({downloaded/(1024*1024):.1f} MB)")
                                    next_report += 1024 * 1024
                        finally:
                            # Always release the file handle, even on mid-stream errors
                            temp_file.close()
                        print(f"✅ PDF downloaded successfully: {temp_path}")
                        return temp_path
            except asyncio.TimeoutError:
                self._remove_partial(temp_path)
                print(f" ⏰ Timeout on attempt {attempt + 1}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 30  # Increasing wait time
                    print(f" ⏳ Waiting {wait_time}s before retry...")
                    await asyncio.sleep(wait_time)
                    continue
            except Exception as e:
                self._remove_partial(temp_path)
                print(f" ❌ Error on attempt {attempt + 1}: {str(e)}")
                if attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 15
                    print(f" ⏳ Waiting {wait_time}s before retry...")
                    await asyncio.sleep(wait_time)
                    continue
        raise Exception(f"Failed to download PDF after {max_retries} attempts")

    @staticmethod
    def _remove_partial(temp_path: Optional[str]) -> None:
        """Best-effort removal of a partially downloaded temp file after a failed attempt."""
        if temp_path and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
            except OSError:
                # Leaving the file behind is preferable to masking the original error
                pass

    def cleanup_temp_file(self, temp_path: str) -> None:
        """
        Clean up temporary file.

        Args:
            temp_path: Path to the temporary file to delete
        """
        if temp_path and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
                print(f"🗑️ Cleaned up temporary file: {temp_path}")
            except Exception as e:
                print(f"⚠️ Warning: Could not delete temporary file {temp_path}: {e}")
|