File size: 4,876 Bytes
e8051be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""

PDF Downloader Module



Handles downloading PDFs from URLs with retry logic and progress tracking.

"""

import os
import asyncio
import tempfile
import aiohttp
from typing import Optional


class PDFDownloader:
    """Download PDFs over HTTP into temporary files, with retries.

    The class keeps no per-instance state; every call to ``download_pdf``
    opens its own ``aiohttp`` session. The caller owns the returned file and
    should remove it with ``cleanup_temp_file`` when done.
    """

    # Upper bound on the size of each chunk read from the response stream.
    _CHUNK_SIZE = 16384
    # Emit a progress line each time this many additional bytes have arrived.
    _PROGRESS_STEP = 1024 * 1024

    def __init__(self):
        """Initialize the PDF downloader (there is no state to set up)."""
        pass

    async def download_pdf(self, url: str, timeout: int = 300, max_retries: int = 3) -> str:
        """Download a PDF from a URL into a temporary file.

        Each attempt uses a fresh session. A failed attempt is followed by a
        linearly growing pause (30s steps after a timeout, 15s steps after any
        other error) unless it was the last attempt.

        Args:
            url: URL of the PDF to download.
            timeout: Total per-attempt timeout in seconds (default: 300s/5min).
            max_retries: Maximum number of attempts.

        Returns:
            str: Path to the downloaded temporary file. The file is not
            deleted automatically.

        Raises:
            Exception: If the download fails on every attempt. The last
                underlying error, if any, is attached as ``__cause__``.
        """
        print(f"📥 Downloading PDF from: {url[:50]}...")

        last_error = None
        for attempt in range(max_retries):
            try:
                # Enhanced timeout settings for large files
                timeout_config = aiohttp.ClientTimeout(
                    total=timeout,          # Total timeout
                    connect=30,             # Connection timeout
                    sock_read=120           # Socket read timeout
                )

                async with aiohttp.ClientSession(timeout=timeout_config) as session:
                    print(f"   Attempt {attempt + 1}/{max_retries} (timeout: {timeout}s)")

                    async with session.get(url) as response:
                        if response.status != 200:
                            raise Exception(f"Failed to download PDF: HTTP {response.status}")

                        temp_path = await self._save_response_to_temp_file(response)
                        print(f"✅ PDF downloaded successfully: {temp_path}")
                        return temp_path

            except asyncio.TimeoutError as exc:
                last_error = exc
                print(f"   ⏰ Timeout on attempt {attempt + 1}")
                backoff_step = 30
            except Exception as exc:
                last_error = exc
                print(f"   ❌ Error on attempt {attempt + 1}: {str(exc)}")
                backoff_step = 15

            # Only pause when another attempt will actually follow.
            if attempt < max_retries - 1:
                wait_time = (attempt + 1) * backoff_step  # Increasing wait time
                print(f"   ⏳ Waiting {wait_time}s before retry...")
                await asyncio.sleep(wait_time)

        raise Exception(f"Failed to download PDF after {max_retries} attempts") from last_error

    async def _save_response_to_temp_file(self, response: "aiohttp.ClientResponse") -> str:
        """Stream a response body into a new temporary ``.pdf`` file.

        If anything interrupts the transfer (including task cancellation) the
        partially written file is closed and deleted before the error is
        re-raised, so failed attempts do not leave orphaned files behind.

        Args:
            response: An open response whose ``content`` stream is unread.

        Returns:
            str: Path to the fully written temporary file.
        """
        # Content-Length is optional and may be malformed; it is only used
        # for progress output, so a bad value must not fail the download.
        total_size = None
        content_length = response.headers.get('content-length')
        if content_length:
            try:
                total_size = int(content_length)
            except ValueError:
                total_size = None
        # A non-positive size would make the percentage meaningless (or
        # divide by zero), so treat it as unknown.
        if total_size is not None and total_size <= 0:
            total_size = None
        if total_size is not None:
            print(f"   File size: {total_size / (1024*1024):.1f} MB")

        temp_file = tempfile.NamedTemporaryFile(
            delete=False,
            suffix=".pdf",
            prefix="preprocess_"
        )
        try:
            # The context manager guarantees the handle is closed on every
            # path; delete=False keeps the file itself on disk.
            with temp_file:
                downloaded = 0
                next_report = self._PROGRESS_STEP
                async for chunk in response.content.iter_chunked(self._CHUNK_SIZE):
                    temp_file.write(chunk)
                    downloaded += len(chunk)

                    # Chunks can be shorter than the requested size, so report
                    # when a threshold is crossed rather than on exact multiples.
                    if total_size is not None and downloaded >= next_report:
                        progress = (downloaded / total_size) * 100
                        print(f"   Progress: {progress:.1f}% ({downloaded/(1024*1024):.1f} MB)")
                        while next_report <= downloaded:
                            next_report += self._PROGRESS_STEP
        except BaseException:
            # BaseException so that cancellation also removes the partial file.
            try:
                os.unlink(temp_file.name)
            except OSError as cleanup_error:
                print(f"⚠️ Warning: Could not delete temporary file {temp_file.name}: {cleanup_error}")
            raise

        return temp_file.name

    def cleanup_temp_file(self, temp_path: str) -> None:
        """Delete a temporary file if it exists.

        Deletion is best-effort: a failure is reported as a warning and never
        raised. Empty/``None`` paths and missing files are ignored silently.

        Args:
            temp_path: Path to the temporary file to delete.
        """
        if temp_path and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
                print(f"🗑️ Cleaned up temporary file: {temp_path}")
            except OSError as e:
                print(f"⚠️ Warning: Could not delete temporary file {temp_path}: {e}")