LLm_Webscraper / secure_scraper.py
import random
import logging
import time
import json
import asyncio

from crawl4ai import AsyncWebCrawler


class SecureScraper:
    def __init__(self, proxy_list=None):
        # Initialize with AsyncWebCrawler from crawl4ai
        self.use_proxies = bool(proxy_list)
        self.proxy_list = proxy_list

        # Initialize async crawler
        self.crawler = AsyncWebCrawler(
            max_connections=10,
            timeout=30,
            proxies=self.proxy_list if self.use_proxies and self.proxy_list else None,
            follow_redirects=True,
            random_user_agent=True  # Enable random user agent rotation
        )
        logging.basicConfig(level=logging.INFO)
    async def async_scrape_url(self, url, css_selectors=None):
        """
        Asynchronously scrape a URL with privacy protection measures

        Args:
            url: URL to scrape
            css_selectors: Dict of elements to extract, e.g. {'title': 'h1', 'content': '.main-text'}
        """
        max_retries = 5
        current_retry = 0

        while current_retry < max_retries:
            try:
                # Log attempt details
                proxy_status = "using proxy" if self.use_proxies else "without proxy"
                logging.info(f"Scraping {url} (Attempt {current_retry+1}/{max_retries}) {proxy_status}")

                # Use AsyncWebCrawler to fetch the page
                response = await self.crawler.arun(url)

                # Process the response based on content type
                if response.is_html:
                    page_data = await response.parse_html()

                    # Build a basic result structure
                    if not css_selectors:
                        # Default extraction if no selectors provided
                        title = page_data.title or "Title extraction not supported"
                        text = page_data.text[:10000] if hasattr(page_data, 'text') else "Text extraction not supported"
                        links = page_data.links[:20] if hasattr(page_data, 'links') else []
                        result = {
                            'title': title,
                            'text': text,
                            'links': links
                        }
                    else:
                        # Extract requested elements using CSS selectors
                        result = {}
                        for key, selector in css_selectors.items():
                            elements = page_data.select(selector)
                            if elements:
                                # If multiple elements match, create a list
                                if len(elements) > 1:
                                    result[key] = [elem.text for elem in elements]
                                else:
                                    result[key] = elements[0].text
                            else:
                                result[key] = f"No match for selector: {selector}"
                else:
                    # Handle non-HTML responses
                    result = {
                        'content_type': response.content_type,
                        'content_length': len(response.content),
                        'summary': 'Non-HTML content'
                    }

                # Get user agent info
                user_agent = self.crawler.current_user_agent or "Unknown"
                # Truncate for privacy
                user_agent = user_agent[:30] + '...' if len(str(user_agent)) > 30 else user_agent

                return {
                    'status': 'success',
                    'data': result,
                    'privacy': {
                        'user_agent_type': user_agent,
                        'proxy_used': self.use_proxies
                    }
                }
            except Exception as e:
                logging.error(f"Request failed: {str(e)}")
                current_retry += 1
                await asyncio.sleep(random.uniform(2, 5))  # Async sleep for backoff

                # Try to rotate proxy if available
                if self.use_proxies and self.proxy_list and len(self.proxy_list) > 1:
                    self.proxy_list = self.proxy_list[1:] + [self.proxy_list[0]]  # Rotate proxies
                    # Update the crawler's proxies
                    await self.crawler.update_proxies(self.proxy_list)

        # If we've exhausted retries
        return {'status': 'error', 'message': f"Failed after {max_retries} attempts"}
    def scrape_url(self, url, css_selectors=None):
        """
        Synchronous wrapper for async_scrape_url

        Args:
            url: URL to scrape
            css_selectors: Dict of elements to extract
        """
        return asyncio.run(self.async_scrape_url(url, css_selectors))
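

# ---------------------------------------------------------------------------
# Example usage: a minimal sketch, not part of the original module. The URL,
# CSS selectors, and proxy address shown here are illustrative placeholders;
# substitute your own target site and, optionally, a list of proxy URLs to
# enable rotation.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # e.g. SecureScraper(proxy_list=["http://proxy1:8080"]) to route through a proxy
    scraper = SecureScraper(proxy_list=None)
    result = scraper.scrape_url(
        "https://example.com",
        css_selectors={'title': 'h1', 'content': '.main-text'}
    )
    # default=str guards against non-JSON-serializable values (e.g. link objects)
    print(json.dumps(result, indent=2, default=str))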