import json
import logging
import random
import time

import requests
from bs4 import BeautifulSoup

from privacy_manager import PrivacyManager
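
# Assumed interface of privacy_manager.PrivacyManager (module not shown here;
# inferred from how it is used below):
#   PrivacyManager(proxy_list)  -- rotates identities drawn from proxy_list
#   .get_request_params()       -- returns {'headers': {...}, 'proxies': {...}}
#   .handle_captcha(response)   -- True when the response looks like a CAPTCHA page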


class SecureScraper:
    """Scraper that rotates request identities (headers/proxies) between attempts."""

    def __init__(self, proxy_list=None):
        self.privacy_manager = PrivacyManager(proxy_list)
        self.session = requests.Session()
        logging.basicConfig(level=logging.INFO)

    def scrape_url(self, url, css_selectors=None):
        """
        Scrape a URL with privacy protection measures.

        Args:
            url: URL to scrape
            css_selectors: Dict of elements to extract,
                e.g. {'title': 'h1', 'content': '.main-text'}

        Returns:
            Dict with 'status' plus either extracted 'data' or an error 'message'.
        """
        # Get privacy parameters (rotating headers/proxies)
        params = self.privacy_manager.get_request_params()
        max_retries = 3
        current_retry = 0
        while current_retry < max_retries:
            try:
                # Configure the request with privacy measures
                headers = params.get('headers', {})
                proxies = params.get('proxies')

                # Log the attempt, but mask proxy details for security
                proxy_log = "using proxy" if proxies else "without proxy"
                logging.info(f"Scraping {url} (Attempt {current_retry + 1}/{max_retries}) {proxy_log}")

                # Make the request
                response = self.session.get(url, headers=headers, proxies=proxies, timeout=10)
                response.raise_for_status()
                # Check for a CAPTCHA page; rotate identity and retry if one is detected
                if self.privacy_manager.handle_captcha(response):
                    logging.warning("CAPTCHA detected, retrying with new identity")
                    params = self.privacy_manager.get_request_params()  # Get new privacy params
                    current_retry += 1
                    time.sleep(random.uniform(3, 7))  # Longer delay after a CAPTCHA
                    continue
                # Extract content
                soup = BeautifulSoup(response.content, 'html.parser')

                # If no selectors were provided, return general page info
                if not css_selectors:
                    result = {
                        'title': soup.title.string if soup.title else 'No title found',
                        'text': soup.get_text(strip=True)[:10000],  # Limit text size
                        'links': [a.get('href') for a in soup.find_all('a', href=True)][:20],  # Limit links
                    }
                else:
                    # Extract the requested elements
                    result = {}
                    for key, selector in css_selectors.items():
                        elements = soup.select(selector)
                        if elements:
                            # If multiple elements match, collect them in a list
                            if len(elements) > 1:
                                result[key] = [elem.get_text(strip=True) for elem in elements]
                            else:
                                result[key] = elements[0].get_text(strip=True)
                        else:
                            result[key] = f"No match for selector: {selector}"
                return {
                    'status': 'success',
                    'data': result,
                    'privacy': {
                        'user_agent_type': headers.get('User-Agent', 'Unknown')[:30] + '...'  # Truncate for privacy
                    }
                }
            except requests.exceptions.RequestException as e:
                logging.error(f"Request failed: {e}")
                current_retry += 1
                time.sleep(random.uniform(2, 5) * current_retry)  # Incremental backoff: delay grows each retry
                params = self.privacy_manager.get_request_params()  # Get new privacy params
        # All retries exhausted
        return {'status': 'error', 'message': f"Failed after {max_retries} attempts"}
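

# Usage sketch: the proxy URLs below are hypothetical placeholders, and the
# selectors mirror the docstring example; adjust both for real targets.
if __name__ == "__main__":
    proxy_list = [
        'http://proxy1.example.com:8080',  # hypothetical proxy endpoint
        'http://proxy2.example.com:8080',  # hypothetical proxy endpoint
    ]
    scraper = SecureScraper(proxy_list=proxy_list)
    result = scraper.scrape_url(
        'https://example.com',
        css_selectors={'title': 'h1', 'content': '.main-text'},
    )
    print(json.dumps(result, indent=2))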