import logging
import random
import time

import requests
from bs4 import BeautifulSoup

from privacy_manager import PrivacyManager


class SecureScraper:
    def __init__(self, proxy_list=None):
        self.privacy_manager = PrivacyManager(proxy_list)
        self.session = requests.Session()
        logging.basicConfig(level=logging.INFO)

    def scrape_url(self, url, css_selectors=None):
        """
        Scrape a URL with privacy protection measures.

        Args:
            url: URL to scrape
            css_selectors: Dict of elements to extract, e.g.
                {'title': 'h1', 'content': '.main-text'}
        """
        # Get privacy parameters (headers, proxies) for this attempt
        params = self.privacy_manager.get_request_params()
        max_retries = 3
        current_retry = 0

        while current_retry < max_retries:
            try:
                # Configure request with privacy measures
                headers = params.get('headers', {})
                proxies = params.get('proxies', None)

                # Log attempt details (but mask proxy details for security)
                proxy_log = "using proxy" if proxies else "without proxy"
                logging.info(
                    f"Scraping {url} (Attempt {current_retry + 1}/{max_retries}) {proxy_log}"
                )

                # Make the request
                response = self.session.get(
                    url, headers=headers, proxies=proxies, timeout=10
                )
                response.raise_for_status()

                # Check for CAPTCHA; rotate identity and retry if one is detected
                if self.privacy_manager.handle_captcha(response):
                    logging.warning("CAPTCHA detected, retrying with new identity")
                    params = self.privacy_manager.get_request_params()  # Get new privacy params
                    current_retry += 1
                    time.sleep(random.uniform(3, 7))  # Longer delay after CAPTCHA
                    continue

                # Extract content
                soup = BeautifulSoup(response.content, 'html.parser')

                if not css_selectors:
                    # No selectors provided: return general page info
                    result = {
                        'title': soup.title.string if soup.title else 'No title found',
                        'text': soup.get_text(strip=True)[:10000],  # Limit text size
                        'links': [a.get('href') for a in soup.find_all('a', href=True)][:20],  # Limit links
                    }
                else:
                    # Extract the requested elements
                    result = {}
                    for key, selector in css_selectors.items():
                        elements = soup.select(selector)
                        if elements:
                            # If multiple elements match, return a list
                            if len(elements) > 1:
                                result[key] = [elem.get_text(strip=True) for elem in elements]
                            else:
                                result[key] = elements[0].get_text(strip=True)
                        else:
                            result[key] = f"No match for selector: {selector}"

                return {
                    'status': 'success',
                    'data': result,
                    'privacy': {
                        # Truncate the user agent so logs don't expose the full fingerprint
                        'user_agent_type': headers.get('User-Agent', 'Unknown')[:30] + '...'
                    },
                }

            except requests.exceptions.RequestException as e:
                logging.error(f"Request failed: {e}")
                current_retry += 1
                time.sleep(random.uniform(2, 5))  # Random delay before retrying
                params = self.privacy_manager.get_request_params()  # Get new privacy params

        # If we've exhausted retries
        return {'status': 'error', 'message': f"Failed after {max_retries} attempts"}
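
# Example usage: a minimal sketch, assuming a privacy_manager module that
# provides PrivacyManager(proxy_list) with get_request_params() and
# handle_captcha() as used above. The proxy URLs and target site below are
# hypothetical placeholders, not real endpoints.
if __name__ == "__main__":
    proxies = [
        "http://proxy1.example.com:8080",  # placeholder proxy
        "http://proxy2.example.com:8080",  # placeholder proxy
    ]
    scraper = SecureScraper(proxy_list=proxies)

    # Scrape specific elements via CSS selectors
    result = scraper.scrape_url(
        "https://example.com/articles",
        css_selectors={'title': 'h1', 'content': '.main-text'},
    )
    if result['status'] == 'success':
        print(result['data'])
    else:
        logging.error(result['message'])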