import streamlit as st
import json
import logging

# Import our custom classes
from secure_scraper import SecureScraper
from llm_processor import LLMProcessor

# Set up logging
logging.basicConfig(level=logging.INFO)
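
# ---------------------------------------------------------------------------
# Illustrative sketch only: the source of the secure_scraper module is not
# shown in this file. SecureScraperSketch below is a hypothetical stand-in,
# inferred purely from how main() calls SecureScraper (a proxy_list
# constructor argument and a scrape_url(url, css_selectors) method returning
# a dict with 'status', 'message', 'privacy', and 'data' keys). The real
# secure_scraper.py will differ; treat every detail here as an assumption.
# ---------------------------------------------------------------------------
import random

import requests
from bs4 import BeautifulSoup

# A couple of User-Agent strings to rotate through (assumed; the real module
# likely ships a longer list or generates them with a library).
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15",
]


class SecureScraperSketch:
    """Hypothetical stand-in showing the interface main() relies on."""

    def __init__(self, proxy_list=None):
        self.proxy_list = proxy_list

    def scrape_url(self, url, css_selectors=None):
        # Pick a random User-Agent (and proxy, if configured) per request.
        user_agent = random.choice(USER_AGENTS)
        proxy = random.choice(self.proxy_list) if self.proxy_list else None
        proxies = {"http": proxy, "https": proxy} if proxy else None
        try:
            response = requests.get(
                url,
                headers={"User-Agent": user_agent},
                proxies=proxies,
                timeout=30,
            )
            response.raise_for_status()
        except requests.RequestException as e:
            return {"status": "error", "message": str(e)}

        soup = BeautifulSoup(response.text, "html.parser")
        if css_selectors:
            # Extract only the elements the caller asked for.
            data = {}
            for name, selector in css_selectors.items():
                element = soup.select_one(selector)
                data[name] = element.get_text(strip=True) if element else None
        else:
            data = soup.get_text(separator="\n", strip=True)

        return {
            "status": "success",
            "privacy": {"user_agent": user_agent, "proxy": proxy or "none"},
            "data": data,
        }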

def main():
    st.set_page_config(
        page_title="LLM Web Scraper",
        page_icon="🕸️",
        layout="wide",
    )

    st.title("🕸️ LLM Web Scraper")
    st.write("Scrape web content with privacy protection and open-source LLM processing - by Mokshith Salian")
    # Configuration sidebar
    with st.sidebar:
        st.header("Configuration")

        st.subheader("LLM Model Selection")
        model_option = st.selectbox(
            "Choose LLM Model",
            [
                "microsoft/phi-2 (fastest, 2.7B)",
                "google/gemma-2b (balanced)",
                "mistralai/Mistral-7B-Instruct-v0.2 (best quality, slowest)"
            ],
            index=0
        )
        # Convert the selection to a model name: everything before the
        # first space, e.g. "microsoft/phi-2"
        model_name = model_option.split(" ")[0]

        st.subheader("Privacy Settings")
        use_proxy = st.checkbox("Use Proxy Rotation", value=False)
        use_user_agent = st.checkbox("Use User-Agent Rotation", value=True)
    # Input section
    st.header("Scraping Target")
    url = st.text_input("Enter the URL to scrape", placeholder="https://oceanofgames.com/")

    with st.expander("Advanced Scraping Options"):
        css_selectors_text = st.text_area(
            "CSS Selectors (JSON format)",
            placeholder='{"title": "h1", "price": ".product-price", "description": ".product-description"}'
        )

    # Parse CSS selectors
    css_selectors = None
    if css_selectors_text:
        try:
            css_selectors = json.loads(css_selectors_text)
        except json.JSONDecodeError:
            st.error("Invalid JSON for CSS selectors")

    st.header("LLM Processing")
    llm_instruction = st.text_area(
        "What do you want the LLM to do with the scraped data?",
        placeholder="Extract the main product features and summarize them in bullet points"
    )
    # Run the pipeline on button click
    if st.button("Scrape and Process"):
        if not url:
            st.error("Please enter a URL to scrape")
            return

        # Show progress
        with st.spinner("Initializing scraper..."):
            proxies = None
            if use_proxy:
                st.warning("Using public proxies - in a production system, you'd want to use paid proxies")
                # In a real app, we'd use better proxies or load them from a file
                proxies = [
                    "http://public-proxy1.example.com:8080",
                    "http://public-proxy2.example.com:8080"
                ]
            # proxies is already None when proxy rotation is off
            scraper = SecureScraper(proxy_list=proxies)
        # Perform scraping
        with st.spinner("Scraping website with privacy protection..."):
            result = scraper.scrape_url(url, css_selectors)

        if result['status'] == 'error':
            st.error(f"Scraping failed: {result['message']}")
            return

        st.success("Scraping completed successfully!")

        # Display privacy measures used
        st.subheader("Privacy Measures Used")
        st.json(result['privacy'])

        # Display raw scraped data
        with st.expander("Raw Scraped Data"):
            st.json(result['data'])
        # Process with LLM
        with st.spinner(f"Processing with {model_name}..."):
            try:
                llm = LLMProcessor(model_name=model_name)
                # Prepare data for the LLM (convert to a JSON string if it's a dict)
                scraped_data_str = json.dumps(result['data'], indent=2) if isinstance(result['data'], dict) else result['data']
                processed_result = llm.process_data(
                    scraped_data_str,
                    llm_instruction if llm_instruction else "Summarize this information"
                )
                st.subheader("LLM Processing Result")
                st.write(processed_result)
            except Exception as e:
                st.error(f"Error in LLM processing: {str(e)}")
                st.info("Try using a smaller model like microsoft/phi-2 if you're facing memory issues")


if __name__ == "__main__":
    main()
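
# ---------------------------------------------------------------------------
# Illustrative sketch only: the real llm_processor module is not shown here
# either. LLMProcessorSketch is a hypothetical stand-in inferred from the
# call sites above (a model_name constructor argument and a
# process_data(data, instruction) method). It assumes a plain Hugging Face
# text-generation pipeline; the real llm_processor.py may use quantization,
# chat templates, or a different loading path entirely.
# ---------------------------------------------------------------------------
from transformers import pipeline


class LLMProcessorSketch:
    """Hypothetical stand-in showing the interface main() relies on."""

    def __init__(self, model_name="microsoft/phi-2"):
        # Downloads the model on first use; the 7B option realistically
        # needs a GPU, which is why the app suggests phi-2 on memory errors.
        self.generator = pipeline("text-generation", model=model_name)

    def process_data(self, data, instruction):
        # Combine the user's instruction with the scraped data into one prompt.
        prompt = f"{instruction}\n\nData:\n{data}\n\nAnswer:"
        output = self.generator(prompt, max_new_tokens=256, do_sample=False)
        # The pipeline returns a list of dicts whose 'generated_text' field
        # includes the prompt, so strip the prompt off the front.
        return output[0]["generated_text"][len(prompt):].strip()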