import asyncio
import logging
import random

from bs4 import BeautifulSoup  # used to run CSS selectors over the fetched HTML
from crawl4ai import AsyncWebCrawler
class SecureScraper:
    def __init__(self, proxy_list=None):
        # Optional list of proxy URLs, e.g. ["http://user:pass@host:port", ...],
        # rotated whenever a request fails.
        self.use_proxies = bool(proxy_list)
        self.proxy_list = list(proxy_list) if proxy_list else []

        # Small pool of common desktop user agents to rotate through per request.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
            "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
        ]

        logging.basicConfig(level=logging.INFO)
    async def async_scrape_url(self, url, css_selectors=None):
        """
        Asynchronously scrape a URL with basic privacy-protection measures.

        Args:
            url: URL to scrape
            css_selectors: Dict of elements to extract, e.g. {'title': 'h1', 'content': '.main-text'}
        """
        max_retries = 5

        for attempt in range(1, max_retries + 1):
            proxy = self.proxy_list[0] if self.use_proxies and self.proxy_list else None
            user_agent = random.choice(self.user_agents)
            proxy_status = "using proxy" if proxy else "without proxy"
            logging.info(f"Scraping {url} (attempt {attempt}/{max_retries}) {proxy_status}")

            try:
                # Create the crawler per request. The exact keyword arguments for
                # proxy and user-agent configuration differ between crawl4ai
                # releases (newer versions move them into BrowserConfig), so adjust
                # these for the version you have installed.
                async with AsyncWebCrawler(proxy=proxy, user_agent=user_agent) as crawler:
                    response = await crawler.arun(url=url)

                if not response.success:
                    raise RuntimeError(response.error_message or "Crawl failed")

                # Detect non-HTML responses from the content-type header when available
                headers = getattr(response, "response_headers", None) or {}
                content_type = headers.get("content-type") or headers.get("Content-Type") or ""

                if content_type and "html" not in content_type.lower():
                    # Non-HTML content: report basic metadata only
                    result = {
                        'content_type': content_type,
                        'content_length': len(response.html or ""),
                        'summary': 'Non-HTML content'
                    }
                else:
                    soup = BeautifulSoup(response.html or "", "html.parser")

                    if not css_selectors:
                        # Default extraction if no selectors were provided
                        result = {
                            'title': soup.title.get_text(strip=True) if soup.title else "No title found",
                            'text': soup.get_text(" ", strip=True)[:10000],
                            'links': [a.get("href") for a in soup.select("a[href]")][:20]
                        }
                    else:
                        # Extract the requested elements using CSS selectors
                        result = {}
                        for key, selector in css_selectors.items():
                            elements = soup.select(selector)
                            if not elements:
                                result[key] = f"No match for selector: {selector}"
                            elif len(elements) > 1:
                                # If multiple elements match, return a list
                                result[key] = [el.get_text(strip=True) for el in elements]
                            else:
                                result[key] = elements[0].get_text(strip=True)

                # Truncate the user agent for privacy before reporting it
                ua_summary = user_agent[:30] + '...' if len(user_agent) > 30 else user_agent

                return {
                    'status': 'success',
                    'data': result,
                    'privacy': {
                        'user_agent_type': ua_summary,
                        'proxy_used': bool(proxy)
                    }
                }

            except Exception as e:
                logging.error(f"Request failed: {e}")
                await asyncio.sleep(random.uniform(2, 5))  # randomized backoff between retries

                # Rotate to the next proxy, if more than one is available
                if self.use_proxies and len(self.proxy_list) > 1:
                    self.proxy_list = self.proxy_list[1:] + [self.proxy_list[0]]

        # All retries exhausted
        return {'status': 'error', 'message': f"Failed after {max_retries} attempts"}
    def scrape_url(self, url, css_selectors=None):
        """
        Synchronous wrapper for async_scrape_url.

        Args:
            url: URL to scrape
            css_selectors: Dict of elements to extract
        """
        return asyncio.run(self.async_scrape_url(url, css_selectors))
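

# Example usage -- a minimal sketch assuming the SecureScraper class above is
# defined in this module; the proxy address and target URL below are placeholders.
if __name__ == "__main__":
    scraper = SecureScraper(proxy_list=["http://user:pass@proxy1:8080"])

    result = scraper.scrape_url(
        "https://example.com",
        css_selectors={"title": "h1", "content": ".main-text"},
    )

    if result["status"] == "success":
        print("Extracted data:", result["data"])
        print("Privacy info:", result["privacy"])
    else:
        print("Scrape failed:", result["message"])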