# LLm_Webscraper/secure_scraper.py

import json
import logging
import random
import time

import requests
from bs4 import BeautifulSoup

from privacy_manager import PrivacyManager
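
# NOTE: `privacy_manager` is this project's own module. From its use in this
# file it is assumed to expose:
#   PrivacyManager(proxy_list)  -> constructor; proxy list is optional
#   .get_request_params()       -> dict with optional 'headers' and 'proxies' keys
#   .handle_captcha(response)   -> True if the response looks like a CAPTCHA page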


class SecureScraper:
    """Scraper that rotates request identity via PrivacyManager and retries on failure."""

    def __init__(self, proxy_list=None):
        self.privacy_manager = PrivacyManager(proxy_list)
        self.session = requests.Session()
        logging.basicConfig(level=logging.INFO)
    def scrape_url(self, url, css_selectors=None):
        """
        Scrape a URL with privacy protection measures.

        Args:
            url: URL to scrape.
            css_selectors: Dict of elements to extract,
                e.g. {'title': 'h1', 'content': '.main-text'}.

        Returns:
            Dict with 'status' plus scraped 'data' on success, or an error 'message'.
        """
        # Get privacy parameters
        params = self.privacy_manager.get_request_params()
        max_retries = 3
        current_retry = 0

        while current_retry < max_retries:
            try:
                # Configure request with privacy measures
                headers = params.get('headers', {})
                proxies = params.get('proxies', None)

                # Log attempt details (but mask proxy details for security)
                proxy_log = "using proxy" if proxies else "without proxy"
                logging.info(f"Scraping {url} (Attempt {current_retry + 1}/{max_retries}) {proxy_log}")

                # Make the request
                response = self.session.get(url, headers=headers, proxies=proxies, timeout=10)
                response.raise_for_status()

                # Check for CAPTCHA
                if self.privacy_manager.handle_captcha(response):
                    logging.warning("CAPTCHA detected, retrying with new identity")
                    params = self.privacy_manager.get_request_params()  # Get new privacy params
                    current_retry += 1
                    time.sleep(random.uniform(3, 7))  # Longer delay after CAPTCHA
                    continue
                # Extract content
                soup = BeautifulSoup(response.content, 'html.parser')

                # If no selectors provided, return general page info
                if not css_selectors:
                    result = {
                        'title': soup.title.string if soup.title and soup.title.string else 'No title found',
                        'text': soup.get_text(strip=True)[:10000],  # Limit text size
                        'links': [a.get('href') for a in soup.find_all('a', href=True)][:20],  # Limit links
                    }
                else:
                    # Extract requested elements
                    result = {}
                    for key, selector in css_selectors.items():
                        elements = soup.select(selector)
                        if elements:
                            # If multiple elements match, return a list
                            if len(elements) > 1:
                                result[key] = [elem.get_text(strip=True) for elem in elements]
                            else:
                                result[key] = elements[0].get_text(strip=True)
                        else:
                            result[key] = f"No match for selector: {selector}"
                return {
                    'status': 'success',
                    'data': result,
                    'privacy': {
                        # Truncate the User-Agent so results don't leak the full identity
                        'user_agent_type': headers.get('User-Agent', 'Unknown')[:30] + '...'
                    }
                }

            except requests.exceptions.RequestException as e:
                logging.error(f"Request failed: {e}")
                current_retry += 1
                time.sleep(random.uniform(2, 5))  # Randomized delay before the next attempt
                params = self.privacy_manager.get_request_params()  # Get new privacy params

        # If we've exhausted retries
        return {'status': 'error', 'message': f"Failed after {max_retries} attempts"}
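

if __name__ == "__main__":
    # Minimal usage sketch. The URL and selectors below are illustrative
    # assumptions, not part of the project; pass a list of proxy URLs to
    # SecureScraper to route requests through rotating proxies.
    scraper = SecureScraper(proxy_list=None)  # None -> scrape without proxies
    result = scraper.scrape_url(
        "https://example.com",
        css_selectors={"title": "h1", "paragraphs": "p"},
    )
    print(json.dumps(result, indent=2))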