Mokshith Salian committed
Commit f937fdb · 1 Parent(s): 644f29b

initial commit

Files changed (5)
  1. app.py +128 -0
  2. llm_processor.py +34 -0
  3. privacy_manager.py +65 -0
  4. requirements.txt +7 -0
  5. secure_scraper.py +88 -0
app.py ADDED
@@ -0,0 +1,128 @@
+ import streamlit as st
+ import json
+ import logging
+ import os
+ import time
+
+ # Import our custom classes
+ from secure_scraper import SecureScraper
+ from llm_processor import LLMProcessor
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+
+ def main():
+     st.set_page_config(
+         page_title="LLM Web Scraper",
+         page_icon="🕸️",
+         layout="wide",
+     )
+
+     st.title("🕸️ Free LLM Web Scraper")
+     st.write("Scrape web content with privacy protection and open-source LLM processing")
+
+     # Configuration section
+     with st.sidebar:
+         st.header("Configuration")
+
+         st.subheader("LLM Model Selection")
+         model_option = st.selectbox(
+             "Choose LLM Model",
+             [
+                 "microsoft/phi-2 (fastest, 2.7B)",
+                 "google/gemma-2b (balanced)",
+                 "mistralai/Mistral-7B-Instruct-v0.2 (best quality, slowest)"
+             ],
+             index=0
+         )
+
+         # Convert selection to model name
+         model_name = model_option.split(" ")[0]
+
+         st.subheader("Privacy Settings")
+         use_proxy = st.checkbox("Use Proxy Rotation", value=False)
+         use_user_agent = st.checkbox("Use User-Agent Rotation", value=True)
+
+     # Input section
+     st.header("Scraping Target")
+     url = st.text_input("Enter the URL to scrape", placeholder="https://oceanofgames.com/")
+
+     with st.expander("Advanced Scraping Options"):
+         css_selectors_text = st.text_area(
+             "CSS Selectors (JSON format)",
+             placeholder='{"title": "h1", "price": ".product-price", "description": ".product-description"}'
+         )
+
+         # Parse CSS selectors
+         css_selectors = None
+         if css_selectors_text:
+             try:
+                 css_selectors = json.loads(css_selectors_text)
+             except json.JSONDecodeError:
+                 st.error("Invalid JSON for CSS selectors")
+
+     st.header("LLM Processing")
+     llm_instruction = st.text_area(
+         "What do you want the LLM to do with the scraped data?",
+         placeholder="Extract the main product features and summarize them in bullet points"
+     )
+
+     # Initialize on button click
+     if st.button("Scrape and Process"):
+         if not url:
+             st.error("Please enter a URL to scrape")
+             return
+
+         # Show progress
+         with st.spinner("Initializing scraper..."):
+             proxies = None
+             if use_proxy:
+                 st.warning("Using public proxies - in a production system, you'd want to use paid proxies")
+                 # In a real app, we'd use better proxies or load from a file
+                 proxies = [
+                     "http://public-proxy1.example.com:8080",
+                     "http://public-proxy2.example.com:8080"
+                 ]
+
+             scraper = SecureScraper(proxy_list=proxies if use_proxy else None)
+
+         # Perform scraping
+         with st.spinner("Scraping website with privacy protection..."):
+             result = scraper.scrape_url(url, css_selectors)
+
+         if result['status'] == 'error':
+             st.error(f"Scraping failed: {result['message']}")
+             return
+
+         st.success("Scraping completed successfully!")
+
+         # Display privacy measures used
+         st.subheader("Privacy Measures Used")
+         st.json(result['privacy'])
+
+         # Display raw scraped data
+         with st.expander("Raw Scraped Data"):
+             st.json(result['data'])
+
+         # Process with LLM
+         with st.spinner(f"Processing with {model_name}..."):
+             try:
+                 llm = LLMProcessor(model_name=model_name)
+
+                 # Prepare data for LLM (convert to string if it's a dict)
+                 scraped_data_str = json.dumps(result['data'], indent=2) if isinstance(result['data'], dict) else result['data']
+
+                 processed_result = llm.process_data(
+                     scraped_data_str,
+                     llm_instruction if llm_instruction else "Summarize this information"
+                 )
+
+                 st.subheader("LLM Processing Result")
+                 st.write(processed_result)
+
+             except Exception as e:
+                 st.error(f"Error in LLM processing: {str(e)}")
+                 st.info("Try using a smaller model like microsoft/phi-2 if you're facing memory issues")
+
+ if __name__ == "__main__":
+     main()
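
For reference, a minimal sketch (not part of this commit) of the same scrape-then-summarize flow that main() wires up, without the Streamlit UI; the URL and selector are placeholders:

import json

from secure_scraper import SecureScraper
from llm_processor import LLMProcessor

# No proxy list: requests go out directly with a rotated User-Agent
scraper = SecureScraper()
result = scraper.scrape_url("https://example.com", {"title": "h1"})

if result["status"] == "success":
    # Smallest model offered in the app's dropdown
    llm = LLMProcessor(model_name="microsoft/phi-2")
    summary = llm.process_data(json.dumps(result["data"], indent=2), "Summarize this information")
    print(summary)
else:
    print(result["message"])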
llm_processor.py ADDED
@@ -0,0 +1,34 @@
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ import torch
+
+ class LLMProcessor:
+     def __init__(self, model_name="mistralai/Mistral-7B-Instruct-v0.2"):
+         # Option 1: Use HuggingFace pipeline for simplicity.
+         # The default points at the standard HF checkpoint; a GGUF-only repo
+         # (e.g. TheBloke/Mistral-7B-Instruct-v0.2-GGUF) cannot be loaded by this pipeline.
+         self.pipe = pipeline(
+             "text-generation",
+             model=model_name,
+             torch_dtype=torch.float16,
+             device_map="auto"
+         )
+
+     def process_data(self, scraped_data, task_instruction):
+         # Create prompt
+         prompt = f"""
+         Task: {task_instruction}
+
+         Data:
+         {scraped_data}
+
+         Please process the above data according to the task instruction.
+         """
+
+         # Generate response
+         response = self.pipe(
+             prompt,
+             max_new_tokens=512,  # cap the completion; max_length would also count the (potentially long) prompt
+             temperature=0.7,
+             top_p=0.9,
+             do_sample=True
+         )
+
+         return response[0]['generated_text']
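
The "Option 1" comment above implies an alternative; a hedged sketch of that second option (not in the commit), driving the already-imported AutoTokenizer and AutoModelForCausalLM directly instead of the pipeline wrapper:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "microsoft/phi-2"  # smallest of the models offered in app.py
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)

# Tokenize a prompt and generate a bounded completion
inputs = tokenizer("Task: summarize the data below.\nData: ...", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7, top_p=0.9)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))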
privacy_manager.py ADDED
@@ -0,0 +1,65 @@
+ import requests
+ from fake_useragent import UserAgent
+ import random
+ import time
+ import os
+ from bs4 import BeautifulSoup
+ import logging
+
+ class PrivacyManager:
+     def __init__(self, proxy_list=None):
+         # Initialize User-Agent rotator
+         self.ua = UserAgent()
+
+         # Initialize proxies. If no list is supplied, stay proxy-free rather than
+         # falling back to placeholder proxies (e.g. public-proxy1.example.com),
+         # which would make every request fail.
+         self.proxies = list(proxy_list) if proxy_list else []
+
+         logging.info(f"Initialized PrivacyManager with {len(self.proxies)} proxies")
+
+     def get_random_proxy(self):
+         if not self.proxies:
+             return None
+         return random.choice(self.proxies)
+
+     def get_random_user_agent(self):
+         return self.ua.random
+
+     def handle_captcha(self, response):
+         """
+         Basic CAPTCHA detection - in a real implementation, you'd need
+         more sophisticated handling or a dedicated service
+         """
+         soup = BeautifulSoup(response.text, 'html.parser')
+         captcha_indicators = ['captcha', 'CAPTCHA', 'robot', 'verify']
+
+         # Search the visible page text for any of the indicator strings
+         page_text = soup.get_text()
+         for indicator in captcha_indicators:
+             if indicator in page_text:
+                 logging.warning(f"CAPTCHA detected: {indicator} found on page")
+                 return True
+
+         return False
+
+     def get_request_params(self):
+         # Random delay to avoid detection
+         time.sleep(random.uniform(1, 3))
+
+         params = {
+             'headers': {'User-Agent': self.get_random_user_agent()}
+         }
+
+         proxy = self.get_random_proxy()
+         if proxy:
+             params['proxies'] = {
+                 'http': proxy,
+                 'https': proxy
+             }
+
+         return params
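
A short usage sketch (not part of the commit) showing how the parameters produced by get_request_params() plug into requests; the proxy URL is a placeholder:

import requests
from privacy_manager import PrivacyManager

pm = PrivacyManager(proxy_list=["http://127.0.0.1:8080"])  # placeholder proxy

# Each call sleeps 1-3 seconds, rotates the User-Agent and picks a random proxy
params = pm.get_request_params()
resp = requests.get("https://example.com", timeout=10, **params)

print(pm.handle_captcha(resp))  # True if a CAPTCHA indicator shows up in the page text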
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ streamlit==1.30.0
+ torch==2.1.1
+ transformers==4.36.2
+ requests==2.31.0
+ fake-useragent==1.4.0
+ beautifulsoup4==4.12.2
+ accelerate==0.25.0
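
These pins can typically be installed into a fresh virtual environment with pip install -r requirements.txt; note that on CPU-only machines the Mistral-7B option in app.py will be slow, and torch may need a platform-specific build for GPU acceleration.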
secure_scraper.py ADDED
@@ -0,0 +1,88 @@
+ import random
+ import requests
+ from bs4 import BeautifulSoup
+ import logging
+ from privacy_manager import PrivacyManager
+ import time
+ import json
+
+ class SecureScraper:
+     def __init__(self, proxy_list=None):
+         self.privacy_manager = PrivacyManager(proxy_list)
+         self.session = requests.Session()
+         logging.basicConfig(level=logging.INFO)
+
+     def scrape_url(self, url, css_selectors=None):
+         """
+         Scrape a URL with privacy protection measures
+
+         Args:
+             url: URL to scrape
+             css_selectors: Dict of elements to extract, e.g. {'title': 'h1', 'content': '.main-text'}
+         """
+         # Get privacy parameters
+         params = self.privacy_manager.get_request_params()
+         max_retries = 3
+         current_retry = 0
+
+         while current_retry < max_retries:
+             try:
+                 # Configure request with privacy measures
+                 headers = params.get('headers', {})
+                 proxies = params.get('proxies', None)
+
+                 # Log attempt details (but mask proxy details for security)
+                 proxy_log = "using proxy" if proxies else "without proxy"
+                 logging.info(f"Scraping {url} (Attempt {current_retry+1}/{max_retries}) {proxy_log}")
+
+                 # Make the request
+                 response = self.session.get(url, headers=headers, proxies=proxies, timeout=10)
+                 response.raise_for_status()
+
+                 # Check for CAPTCHA
+                 if self.privacy_manager.handle_captcha(response):
+                     logging.warning("CAPTCHA detected, retrying with new identity")
+                     params = self.privacy_manager.get_request_params()  # Get new privacy params
+                     current_retry += 1
+                     time.sleep(random.uniform(3, 7))  # Longer delay after CAPTCHA
+                     continue
+
+                 # Extract content
+                 soup = BeautifulSoup(response.content, 'html.parser')
+
+                 # If no selectors provided, return general page info
+                 if not css_selectors:
+                     result = {
+                         'title': soup.title.string if soup.title else 'No title found',
+                         'text': soup.get_text(strip=True)[:10000],  # Limit text size
+                         'links': [a.get('href') for a in soup.find_all('a', href=True)][:20]  # Limit links
+                     }
+                 else:
+                     # Extract requested elements
+                     result = {}
+                     for key, selector in css_selectors.items():
+                         elements = soup.select(selector)
+                         if elements:
+                             # If multiple elements match, create a list
+                             if len(elements) > 1:
+                                 result[key] = [elem.get_text(strip=True) for elem in elements]
+                             else:
+                                 result[key] = elements[0].get_text(strip=True)
+                         else:
+                             result[key] = f"No match for selector: {selector}"
+
+                 return {
+                     'status': 'success',
+                     'data': result,
+                     'privacy': {
+                         'user_agent_type': headers.get('User-Agent', 'Unknown')[:30] + '...'  # Truncate for privacy
+                     }
+                 }
+             except requests.exceptions.RequestException as e:
+                 logging.error(f"Request failed: {str(e)}")
+                 current_retry += 1
+                 time.sleep(random.uniform(2, 5))  # Random delay before retrying
+                 params = self.privacy_manager.get_request_params()  # Get new privacy params
+
+         # If we've exhausted retries
+         return {'status': 'error', 'message': f"Failed after {max_retries} attempts"}
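
A quick call sketch (not in the commit) illustrating the result shape scrape_url returns; the URL and selectors are placeholders:

from secure_scraper import SecureScraper

scraper = SecureScraper()
result = scraper.scrape_url(
    "https://example.com",
    css_selectors={"title": "h1", "links": "a"},  # keys become keys of result['data']
)

# Success: {'status': 'success', 'data': {...}, 'privacy': {'user_agent_type': '...'}}
# Failure: {'status': 'error', 'message': 'Failed after 3 attempts'}
print(result["status"], result.get("data"))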