Spaces:
Sleeping
Sleeping
File size: 2,259 Bytes
69a077e 3bb1fa1 69a077e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import asyncio
from playwright.async_api import async_playwright
from typing import Dict, Optional
import time
from settings import settings
class HTMLLoader:
    """Load fully-rendered HTML from URLs using a headless Chromium browser.

    Intended for use as an async context manager::

        async with HTMLLoader() as loader:
            result = await loader.load_page(url)

    Handles both static and JavaScript-rendered sites by waiting for a
    configured selector, then allowing a short settle delay for late
    dynamic content.
    """

    def __init__(self):
        # Playwright resources are acquired lazily in __aenter__.
        self.playwright = None
        self.browser = None
        self.context = None

    async def __aenter__(self):
        """Start Playwright, launch Chromium, and open a browser context."""
        self.playwright = await async_playwright().start()
        try:
            self.browser = await self.playwright.chromium.launch(
                headless=settings.scraping.headless
            )
            self.context = await self.browser.new_context(
                user_agent=settings.scraping.user_agent
            )
        except Exception:
            # __aexit__ is not called when __aenter__ raises, so clean up
            # whatever was already started to avoid leaking the browser
            # process / driver.
            await self.__aexit__(None, None, None)
            raise
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the context, browser, and Playwright driver, innermost first."""
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    async def load_page(self, url: str) -> Dict[str, str]:
        """Load HTML content from URL handling both static and dynamic sites.

        Retries up to ``settings.scraping.max_retries`` times, sleeping
        ``settings.scraping.delay_between_requests`` seconds between attempts.

        Returns:
            Dict with keys ``html`` (page source after rendering), ``title``,
            ``url`` (final URL, which may differ after redirects) and
            ``timestamp`` (Unix time, int seconds).

        Raises:
            Exception: when every attempt fails; the last underlying error
            is chained as the cause.
        """
        for attempt in range(settings.scraping.max_retries):
            try:
                page = await self.context.new_page()
                try:
                    await page.goto(url, timeout=settings.scraping.timeout)
                    # Wait for the configured selector so the DOM is present.
                    await page.wait_for_selector(
                        settings.scraping.wait_for_selector,
                        timeout=10000
                    )
                    # Additional settle time for late-loading dynamic content.
                    await page.wait_for_timeout(2000)
                    html_content = await page.content()
                    title = await page.title()
                    url_final = page.url
                finally:
                    # Always close the page — even on a failed attempt —
                    # so retries do not accumulate open pages.
                    await page.close()
                return {
                    "html": html_content,
                    "title": title,
                    "url": url_final,
                    "timestamp": int(time.time())
                }
            except Exception as e:
                if attempt == settings.scraping.max_retries - 1:
                    # Chain the cause so the original traceback survives.
                    raise Exception(f"Failed to load {url}: {str(e)}") from e
                await asyncio.sleep(settings.scraping.delay_between_requests)
        # Only reachable when max_retries <= 0.
        return None