import asyncio
import time
from typing import Any, Dict, Optional

from playwright.async_api import Browser, BrowserContext, Playwright, async_playwright

from settings import settings


class HTMLLoader:
    """Load pages with headless Chromium; use as an async context manager."""

    def __init__(self) -> None:
        self.playwright: Optional[Playwright] = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        
    async def __aenter__(self):
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=settings.scraping.headless
        )
        self.context = await self.browser.new_context(
            user_agent=settings.scraping.user_agent
        )
        return self
    
    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()
    
    async def load_page(self, url: str) -> Dict[str, Any]:
        """Load HTML content from a URL, handling both static and dynamic sites."""
        if self.context is None:
            raise RuntimeError("HTMLLoader must be entered with 'async with' before use")

        last_error: Optional[Exception] = None
        for attempt in range(settings.scraping.max_retries):
            page = await self.context.new_page()
            try:
                await page.goto(url, timeout=settings.scraping.timeout)

                # Wait for the configured selector (e.g. "body") to appear.
                await page.wait_for_selector(
                    settings.scraping.wait_for_selector,
                    timeout=10000,
                )

                # Extra grace period for client-side rendering.
                await page.wait_for_timeout(2000)

                return {
                    "html": await page.content(),
                    "title": await page.title(),
                    "url": page.url,  # final URL after any redirects
                    "timestamp": int(time.time()),
                }

            except Exception as e:
                last_error = e
                if attempt < settings.scraping.max_retries - 1:
                    await asyncio.sleep(settings.scraping.delay_between_requests)
            finally:
                # Close the page even when the attempt fails, so retries don't leak pages.
                await page.close()

        raise RuntimeError(
            f"Failed to load {url} after {settings.scraping.max_retries} attempts"
        ) from last_error
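

if __name__ == "__main__":
    # Minimal usage sketch: assumes settings.scraping defines headless, user_agent,
    # timeout, max_retries, delay_between_requests, and wait_for_selector, and that
    # https://example.com stands in for a real target URL.
    async def main() -> None:
        async with HTMLLoader() as loader:
            result = await loader.load_page("https://example.com")
            print(result["title"], result["url"], len(result["html"]))

    asyncio.run(main())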