Spaces: Running

Mokshith Salian committed
Commit · f937fdb
Parent(s): 644f29b
initial commit

Browse files
- app.py +128 -0
- llm_processor.py +34 -0
- privacy_manager.py +65 -0
- requirements.txt +7 -0
- secure_scraper.py +88 -0
app.py
ADDED
@@ -0,0 +1,128 @@
import streamlit as st
import json
import logging
import os
import time

# Import our custom classes
from secure_scraper import SecureScraper
from llm_processor import LLMProcessor

# Set up logging
logging.basicConfig(level=logging.INFO)

def main():
    st.set_page_config(
        page_title="LLM Web Scraper",
        page_icon="🕸️",
        layout="wide",
    )

    st.title("🕸️ Free LLM Web Scraper")
    st.write("Scrape web content with privacy protection and open-source LLM processing")

    # Configuration section
    with st.sidebar:
        st.header("Configuration")

        st.subheader("LLM Model Selection")
        model_option = st.selectbox(
            "Choose LLM Model",
            [
                "microsoft/phi-2 (fastest, 2.7B)",
                "google/gemma-2b (balanced)",
                "mistralai/Mistral-7B-Instruct-v0.2 (best quality, slowest)"
            ],
            index=0
        )

        # Convert selection to model name
        model_name = model_option.split(" ")[0]

        st.subheader("Privacy Settings")
        use_proxy = st.checkbox("Use Proxy Rotation", value=False)
        use_user_agent = st.checkbox("Use User-Agent Rotation", value=True)

    # Input section
    st.header("Scraping Target")
    url = st.text_input("Enter the URL to scrape", placeholder="https://oceanofgames.com/")

    with st.expander("Advanced Scraping Options"):
        css_selectors_text = st.text_area(
            "CSS Selectors (JSON format)",
            placeholder='{"title": "h1", "price": ".product-price", "description": ".product-description"}'
        )

    # Parse CSS selectors
    css_selectors = None
    if css_selectors_text:
        try:
            css_selectors = json.loads(css_selectors_text)
        except json.JSONDecodeError:
            st.error("Invalid JSON for CSS selectors")

    st.header("LLM Processing")
    llm_instruction = st.text_area(
        "What do you want the LLM to do with the scraped data?",
        placeholder="Extract the main product features and summarize them in bullet points"
    )

    # Initialize on button click
    if st.button("Scrape and Process"):
        if not url:
            st.error("Please enter a URL to scrape")
            return

        # Show progress
        with st.spinner("Initializing scraper..."):
            proxies = None
            if use_proxy:
                st.warning("Using public proxies - in a production system, you'd want to use paid proxies")
                # In a real app, we'd use better proxies or load from a file
                proxies = [
                    "http://public-proxy1.example.com:8080",
                    "http://public-proxy2.example.com:8080"
                ]

            scraper = SecureScraper(proxy_list=proxies if use_proxy else None)

        # Perform scraping
        with st.spinner("Scraping website with privacy protection..."):
            result = scraper.scrape_url(url, css_selectors)

            if result['status'] == 'error':
                st.error(f"Scraping failed: {result['message']}")
                return

            st.success("Scraping completed successfully!")

            # Display privacy measures used
            st.subheader("Privacy Measures Used")
            st.json(result['privacy'])

            # Display raw scraped data
            with st.expander("Raw Scraped Data"):
                st.json(result['data'])

        # Process with LLM
        with st.spinner(f"Processing with {model_name}..."):
            try:
                llm = LLMProcessor(model_name=model_name)

                # Prepare data for LLM (convert to string if it's a dict)
                scraped_data_str = json.dumps(result['data'], indent=2) if isinstance(result['data'], dict) else result['data']

                processed_result = llm.process_data(
                    scraped_data_str,
                    llm_instruction if llm_instruction else "Summarize this information"
                )

                st.subheader("LLM Processing Result")
                st.write(processed_result)

            except Exception as e:
                st.error(f"Error in LLM processing: {str(e)}")
                st.info("Try using a smaller model like microsoft/phi-2 if you're facing memory issues")

if __name__ == "__main__":
    main()
llm_processor.py
ADDED
@@ -0,0 +1,34 @@
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

class LLMProcessor:
    # Default to the standard Hugging Face weights; GGUF checkpoints (e.g. the
    # TheBloke/...-GGUF repos) cannot be loaded through this pipeline call.
    def __init__(self, model_name="mistralai/Mistral-7B-Instruct-v0.2"):
        # Option 1: Use HuggingFace pipeline for simplicity
        self.pipe = pipeline(
            "text-generation",
            model=model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )

    def process_data(self, scraped_data, task_instruction):
        # Create prompt
        prompt = f"""
Task: {task_instruction}

Data:
{scraped_data}

Please process the above data according to the task instruction.
"""

        # Generate response (max_length caps prompt + completion tokens)
        response = self.pipe(
            prompt,
            max_length=2048,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

        return response[0]['generated_text']
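For a quick sanity check outside Streamlit, LLMProcessor can be driven directly. The sketch below is illustrative rather than part of the commit: the sample payload and instruction are made up, and it assumes the requirements.txt dependencies are installed and the chosen model fits in memory.

# Hypothetical usage sketch for LLMProcessor (not part of the commit).
import json

from llm_processor import LLMProcessor

if __name__ == "__main__":
    # A small model keeps the memory footprint manageable
    llm = LLMProcessor(model_name="microsoft/phi-2")

    # Fabricated payload standing in for scraped data
    sample_data = json.dumps({
        "title": "Example Product",
        "description": "A lightweight widget with a two-year warranty."
    }, indent=2)

    summary = llm.process_data(sample_data, "Summarize this information in one sentence.")
    print(summary)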
privacy_manager.py
ADDED
@@ -0,0 +1,65 @@
import requests
from fake_useragent import UserAgent
import random
import time
import os
from bs4 import BeautifulSoup
import logging

class PrivacyManager:
    def __init__(self, proxy_list=None):
        # Initialize User-Agent rotator
        self.ua = UserAgent()

        # Initialize proxies
        self.proxies = []
        if proxy_list:
            self.proxies = proxy_list
        else:
            # No proxies by default; requests go direct. Placeholder entries such as
            # "http://public-proxy1.example.com:8080" would make every request fail,
            # so pass real proxies via proxy_list when proxy rotation is needed.
            self.proxies = []

        logging.info(f"Initialized PrivacyManager with {len(self.proxies)} proxies")

    def get_random_proxy(self):
        if not self.proxies:
            return None
        return random.choice(self.proxies)

    def get_random_user_agent(self):
        return self.ua.random

    def handle_captcha(self, response):
        """
        Basic CAPTCHA detection - in a real implementation, you'd need
        more sophisticated handling or a dedicated service
        """
        soup = BeautifulSoup(response.text, 'html.parser')
        captcha_indicators = ['captcha', 'CAPTCHA', 'robot', 'verify']

        for indicator in captcha_indicators:
            if indicator in response.text:
                logging.warning(f"CAPTCHA detected: {indicator} found on page")
                return True

        return False

    def get_request_params(self):
        # Random delay to avoid detection
        time.sleep(random.uniform(1, 3))

        params = {
            'headers': {'User-Agent': self.get_random_user_agent()}
        }

        proxy = self.get_random_proxy()
        if proxy:
            params['proxies'] = {
                'http': proxy,
                'https': proxy
            }

        return params
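The dict returned by get_request_params() is shaped for direct use with requests. The following is a minimal sketch, not part of the commit; the target URL is a placeholder, and with no proxies configured the params carry only a rotated User-Agent header.

# Illustrative sketch of PrivacyManager in isolation (not part of the commit).
import requests

from privacy_manager import PrivacyManager

pm = PrivacyManager(proxy_list=None)   # no proxies: requests go direct
params = pm.get_request_params()       # rotated User-Agent plus a small random delay

resp = requests.get(
    "https://example.com/",
    headers=params["headers"],
    proxies=params.get("proxies"),     # None when no proxy is configured
    timeout=10,
)
print(resp.status_code, params["headers"]["User-Agent"][:40])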
requirements.txt
ADDED
@@ -0,0 +1,7 @@
streamlit==1.30.0
torch==2.1.1
transformers==4.36.2
requests==2.31.0
fake-useragent==1.4.0
beautifulsoup4==4.12.2
accelerate==0.25.0
secure_scraper.py
ADDED
@@ -0,0 +1,88 @@
import random
import requests
from bs4 import BeautifulSoup
import logging
from privacy_manager import PrivacyManager
import time
import json

class SecureScraper:
    def __init__(self, proxy_list=None):
        self.privacy_manager = PrivacyManager(proxy_list)
        self.session = requests.Session()
        logging.basicConfig(level=logging.INFO)

    def scrape_url(self, url, css_selectors=None):
        """
        Scrape a URL with privacy protection measures

        Args:
            url: URL to scrape
            css_selectors: Dict of elements to extract, e.g. {'title': 'h1', 'content': '.main-text'}
        """
        # Get privacy parameters
        params = self.privacy_manager.get_request_params()
        max_retries = 3
        current_retry = 0

        while current_retry < max_retries:
            try:
                # Configure request with privacy measures
                headers = params.get('headers', {})
                proxies = params.get('proxies', None)

                # Log attempt details (but mask proxy details for security)
                proxy_log = "using proxy" if proxies else "without proxy"
                logging.info(f"Scraping {url} (Attempt {current_retry+1}/{max_retries}) {proxy_log}")

                # Make the request
                response = self.session.get(url, headers=headers, proxies=proxies, timeout=10)
                response.raise_for_status()

                # Check for CAPTCHA
                if self.privacy_manager.handle_captcha(response):
                    logging.warning("CAPTCHA detected, retrying with new identity")
                    params = self.privacy_manager.get_request_params()  # Get new privacy params
                    current_retry += 1
                    time.sleep(random.uniform(3, 7))  # Longer delay after CAPTCHA
                    continue

                # Extract content
                soup = BeautifulSoup(response.content, 'html.parser')

                # If no selectors provided, return general page info
                if not css_selectors:
                    result = {
                        'title': soup.title.string if soup.title else 'No title found',
                        'text': soup.get_text(strip=True)[:10000],  # Limit text size
                        'links': [a.get('href') for a in soup.find_all('a', href=True)][:20]  # Limit links
                    }
                else:
                    # Extract requested elements
                    result = {}
                    for key, selector in css_selectors.items():
                        elements = soup.select(selector)
                        if elements:
                            # If multiple elements match, create a list
                            if len(elements) > 1:
                                result[key] = [elem.get_text(strip=True) for elem in elements]
                            else:
                                result[key] = elements[0].get_text(strip=True)
                        else:
                            result[key] = f"No match for selector: {selector}"

                return {
                    'status': 'success',
                    'data': result,
                    'privacy': {
                        'user_agent_type': headers.get('User-Agent', 'Unknown')[:30] + '...'  # Truncate for privacy
                    }
                }
            except requests.exceptions.RequestException as e:
                logging.error(f"Request failed: {str(e)}")
                current_retry += 1
                time.sleep(random.uniform(2, 5))  # Random backoff before retrying
                params = self.privacy_manager.get_request_params()  # Get new privacy params

        # If we've exhausted retries
        return {'status': 'error', 'message': f"Failed after {max_retries} attempts"}
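Putting the pieces together outside the Streamlit UI, a hypothetical end-to-end run might look like the sketch below. It is not part of the commit: the URL and CSS selectors are examples, and microsoft/phi-2 is chosen only because it is the smallest model offered in app.py.

# Hypothetical end-to-end sketch combining SecureScraper and LLMProcessor.
import json

from secure_scraper import SecureScraper
from llm_processor import LLMProcessor

scraper = SecureScraper(proxy_list=None)   # direct requests with a rotated User-Agent
result = scraper.scrape_url(
    "https://example.com/",
    css_selectors={"title": "h1", "paragraphs": "p"},
)

if result["status"] == "success":
    data_str = json.dumps(result["data"], indent=2)
    llm = LLMProcessor(model_name="microsoft/phi-2")
    print(llm.process_data(data_str, "List the key points as bullets."))
else:
    print("Scraping failed:", result["message"])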