import random
import logging
import asyncio

from crawl4ai import AsyncWebCrawler


class SecureScraper:
    def __init__(self, proxy_list=None):
        self.use_proxies = bool(proxy_list)
        self.proxy_list = proxy_list
        # Initialize the async crawler from crawl4ai.
        # Note: these constructor options come from the original snippet and may
        # differ between crawl4ai versions; adapt them to the version you install.
        self.crawler = AsyncWebCrawler(
            max_connections=10,
            timeout=30,
            proxies=self.proxy_list if self.use_proxies else None,
            follow_redirects=True,
            random_user_agent=True,  # enable random user-agent rotation
        )
        logging.basicConfig(level=logging.INFO)
    async def async_scrape_url(self, url, css_selectors=None):
        """
        Asynchronously scrape a URL with basic privacy-protection measures.

        Args:
            url: URL to scrape.
            css_selectors: Dict of elements to extract,
                e.g. {'title': 'h1', 'content': '.main-text'}.
        """
        max_retries = 5
        current_retry = 0
        while current_retry < max_retries:
            try:
                # Log the attempt and whether a proxy is in use
                proxy_status = "using proxy" if self.use_proxies else "without proxy"
                logging.info(
                    f"Scraping {url} (Attempt {current_retry + 1}/{max_retries}) {proxy_status}"
                )

                # Fetch the page with the async crawler
                response = await self.crawler.arun(url)

                # Process the response based on content type
                if response.is_html:
                    page_data = await response.parse_html()

                    if not css_selectors:
                        # Default extraction when no selectors are provided
                        title = page_data.title or "Title extraction not supported"
                        text = (
                            page_data.text[:10000]
                            if hasattr(page_data, 'text')
                            else "Text extraction not supported"
                        )
                        links = page_data.links[:20] if hasattr(page_data, 'links') else []
                        result = {
                            'title': title,
                            'text': text,
                            'links': links,
                        }
                    else:
                        # Extract the requested elements using CSS selectors
                        result = {}
                        for key, selector in css_selectors.items():
                            elements = page_data.select(selector)
                            if elements:
                                # If multiple elements match, return a list
                                if len(elements) > 1:
                                    result[key] = [elem.text for elem in elements]
                                else:
                                    result[key] = elements[0].text
                            else:
                                result[key] = f"No match for selector: {selector}"
                else:
                    # Handle non-HTML responses
                    result = {
                        'content_type': response.content_type,
                        'content_length': len(response.content),
                        'summary': 'Non-HTML content',
                    }

                # Report which user agent was used, truncated for privacy
                user_agent = str(self.crawler.current_user_agent or "Unknown")
                if len(user_agent) > 30:
                    user_agent = user_agent[:30] + '...'

                return {
                    'status': 'success',
                    'data': result,
                    'privacy': {
                        'user_agent_type': user_agent,
                        'proxy_used': self.use_proxies,
                    },
                }
            except Exception as e:
                logging.error(f"Request failed: {e}")
                current_retry += 1
                # Randomized backoff before retrying
                await asyncio.sleep(random.uniform(2, 5))

                # Rotate to the next proxy if more than one is available
                if self.use_proxies and self.proxy_list and len(self.proxy_list) > 1:
                    self.proxy_list = self.proxy_list[1:] + [self.proxy_list[0]]
                    await self.crawler.update_proxies(self.proxy_list)

        # All retries exhausted
        return {'status': 'error', 'message': f"Failed after {max_retries} attempts"}

    def scrape_url(self, url, css_selectors=None):
        """
        Synchronous wrapper around async_scrape_url.

        Args:
            url: URL to scrape.
            css_selectors: Dict of elements to extract.
        """
        return asyncio.run(self.async_scrape_url(url, css_selectors))
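

# --- Usage sketch (illustrative only) ---
# A minimal example of how SecureScraper might be driven, using the synchronous
# wrapper above. The proxy endpoints and target URL are hypothetical placeholders,
# not part of the original file; substitute your own values.
if __name__ == "__main__":
    scraper = SecureScraper(
        proxy_list=[
            "http://proxy1.example.com:8080",  # hypothetical proxy endpoint
            "http://proxy2.example.com:8080",  # hypothetical proxy endpoint
        ]
    )
    result = scraper.scrape_url(
        "https://example.com",
        css_selectors={"title": "h1", "content": ".main-text"},
    )
    print(result.get("status"), result.get("data") or result.get("message"))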