import random
import logging
import asyncio

from crawl4ai import AsyncWebCrawler


class SecureScraper:
    def __init__(self, proxy_list=None):
        # Initialize with AsyncWebCrawler from crawl4ai
        self.use_proxies = bool(proxy_list)
        self.proxy_list = proxy_list

        # Initialize async crawler
        self.crawler = AsyncWebCrawler(
            max_connections=10,
            timeout=30,
            proxies=self.proxy_list if self.use_proxies and self.proxy_list else None,
            follow_redirects=True,
            random_user_agent=True  # Enable random user agent rotation
        )

        logging.basicConfig(level=logging.INFO)

    async def async_scrape_url(self, url, css_selectors=None):
        """
        Asynchronously scrape a URL with privacy protection measures

        Args:
            url: URL to scrape
            css_selectors: Dict of elements to extract, e.g. {'title': 'h1', 'content': '.main-text'}
        """
        max_retries = 5
        current_retry = 0

        while current_retry < max_retries:
            try:
                # Log attempt details
                proxy_status = "using proxy" if self.use_proxies else "without proxy"
                logging.info(f"Scraping {url} (Attempt {current_retry+1}/{max_retries}) {proxy_status}")

                # Use AsyncWebCrawler to fetch the page
                response = await self.crawler.arun(url)

                # Process the response based on content type
                if response.is_html:
                    page_data = await response.parse_html()

                    # Create a basic result structure
                    if not css_selectors:
                        # Default extraction if no selectors provided
                        title = page_data.title or "Title extraction not supported"
                        text = page_data.text[:10000] if hasattr(page_data, 'text') else "Text extraction not supported"
                        links = page_data.links[:20] if hasattr(page_data, 'links') else []

                        result = {
                            'title': title,
                            'text': text,
                            'links': links
                        }
                    else:
                        # Extract requested elements using CSS selectors
                        result = {}
                        for key, selector in css_selectors.items():
                            elements = page_data.select(selector)
                            if elements:
                                # If multiple elements match, create a list
                                if len(elements) > 1:
                                    result[key] = [elem.text for elem in elements]
                                else:
                                    result[key] = elements[0].text
                            else:
                                result[key] = f"No match for selector: {selector}"
                else:
                    # Handle non-HTML responses
                    result = {
                        'content_type': response.content_type,
                        'content_length': len(response.content),
                        'summary': 'Non-HTML content'
                    }

                # Get user agent info
                user_agent = self.crawler.current_user_agent or "Unknown"
                # Truncate for privacy
                user_agent = user_agent[:30] + '...' if len(str(user_agent)) > 30 else user_agent

                return {
                    'status': 'success',
                    'data': result,
                    'privacy': {
                        'user_agent_type': user_agent,
                        'proxy_used': self.use_proxies
                    }
                }

            except Exception as e:
                logging.error(f"Request failed: {str(e)}")
                current_retry += 1
                await asyncio.sleep(random.uniform(2, 5))  # Async sleep for backoff

                # Try to rotate proxy if available
                if self.use_proxies and self.proxy_list and len(self.proxy_list) > 1:
                    self.proxy_list = self.proxy_list[1:] + [self.proxy_list[0]]  # Rotate proxies
                    # Update crawler's proxies
                    await self.crawler.update_proxies(self.proxy_list)

        # If we've exhausted retries
        return {'status': 'error', 'message': f"Failed after {max_retries} attempts"}

    def scrape_url(self, url, css_selectors=None):
        """
        Synchronous wrapper for async_scrape_url

        Args:
            url: URL to scrape
            css_selectors: Dict of elements to extract
        """
        return asyncio.run(self.async_scrape_url(url, css_selectors))
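

# Minimal usage sketch for the SecureScraper class above. The proxy URLs, target URL,
# and CSS selectors below are placeholder assumptions for illustration, not values
# from a real deployment.
if __name__ == "__main__":
    # Hypothetical proxy pool; pass None (or omit) to scrape without proxies
    proxies = [
        "http://proxy1.example.com:8080",
        "http://proxy2.example.com:8080",
    ]
    scraper = SecureScraper(proxy_list=proxies)

    # Extract the page title and main article body via CSS selectors
    result = scraper.scrape_url(
        "https://example.com/article",
        css_selectors={"title": "h1", "content": ".main-text"},
    )

    print(result["status"])
    if result["status"] == "success":
        print(result["data"].get("title"))
        print(result["privacy"])  # shows truncated user agent and whether a proxy was used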