# LLm_Webscraper / privacy_manager.py
# Author: Mokshith Salian
# Commit: initial commit (f937fdb)
import requests
from fake_useragent import UserAgent
import random
import time
import os
from bs4 import BeautifulSoup
import logging
class PrivacyManager:
    """Produce randomized request settings for a web scraper.

    Each call to :meth:`get_request_params` pauses for a random delay and
    returns keyword arguments (a random User-Agent header and, when proxies
    are configured, a random proxy) shaped for ``requests``-style calls.

    Attributes:
        ua: ``fake_useragent.UserAgent`` instance used to pick User-Agents.
        proxies: List of proxy URLs to choose from; may be empty.
        delay_range: ``(min_seconds, max_seconds)`` bounds of the random
            pause performed by :meth:`get_request_params`.
    """

    # Placeholder proxies used only when the caller passes no list at all.
    # They point at example.com and must be replaced for real use.
    DEFAULT_PROXIES = (
        "http://public-proxy1.example.com:8080",
        "http://public-proxy2.example.com:8080",
    )

    # Lower-case substrings whose presence in a page marks it as a CAPTCHA
    # or bot-check page. Matching is done case-insensitively.
    CAPTCHA_INDICATORS = ("captcha", "robot", "verify")

    # Default bounds (seconds) for the random delay between requests.
    delay_range = (1, 3)

    def __init__(self, proxy_list=None, delay_range=None):
        """Set up the User-Agent rotator and the proxy pool.

        Args:
            proxy_list: Iterable of proxy URLs. ``None`` selects
                ``DEFAULT_PROXIES``; an empty iterable disables proxying.
            delay_range: Optional ``(min_seconds, max_seconds)`` pair for the
                random delay. ``None`` keeps the class default of 1-3 s.
        """
        # Initialize User-Agent rotator
        self.ua = UserAgent()

        # Compare against None so that an explicit empty list means
        # "no proxies" instead of silently falling back to the defaults.
        # Copy so later changes to the caller's list do not leak in.
        if proxy_list is None:
            self.proxies = list(self.DEFAULT_PROXIES)
        else:
            self.proxies = list(proxy_list)

        if delay_range is not None:
            self.delay_range = tuple(delay_range)

        logging.info(
            "Initialized PrivacyManager with %d proxies", len(self.proxies)
        )

    def get_random_proxy(self):
        """Return a randomly chosen proxy URL, or ``None`` if none are set."""
        if not self.proxies:
            return None
        return random.choice(self.proxies)

    def get_random_user_agent(self):
        """Return a random User-Agent string from the rotator."""
        return self.ua.random

    def handle_captcha(self, response):
        """Report whether a response looks like a CAPTCHA / bot-check page.

        Basic detection only: a case-insensitive substring search of the
        response body. A real implementation would need more sophisticated
        handling or a dedicated service.

        Args:
            response: Object exposing the page body as a ``text`` string.

        Returns:
            ``True`` if any indicator occurs in the body, else ``False``.
        """
        # Lower-case once so "Captcha", "CAPTCHA" and "captcha" all match.
        body = response.text.lower()
        for indicator in self.CAPTCHA_INDICATORS:
            if indicator in body:
                logging.warning(
                    "CAPTCHA detected: %s found on page", indicator
                )
                return True
        return False

    def get_request_params(self):
        """Sleep for a random delay, then build randomized request kwargs.

        Returns:
            Dict with a ``'headers'`` entry holding a random User-Agent and,
            when a proxy is available, a ``'proxies'`` entry that maps both
            ``'http'`` and ``'https'`` to the same randomly chosen proxy.
        """
        # Random delay to avoid detection
        shortest, longest = self.delay_range
        time.sleep(random.uniform(shortest, longest))

        params = {
            'headers': {'User-Agent': self.get_random_user_agent()}
        }

        proxy = self.get_random_proxy()
        if proxy:
            params['proxies'] = {
                'http': proxy,
                'https': proxy
            }
        return params