# LLm_Webscraper / app.py
# Author: Mokshith Salian
import streamlit as st
import json
import logging
import os
import time
# Import our custom classes
from secure_scraper import SecureScraper
from llm_processor import LLMProcessor
# Set up root logging at INFO so the scraper/LLM modules' log output is visible.
logging.basicConfig(level=logging.INFO)
def main():
    """Streamlit entry point: render the UI, scrape the requested URL with
    privacy protections, then post-process the scraped data with an LLM.

    Side effects only (Streamlit widget/page calls); returns None.
    """
    st.set_page_config(
        page_title="LLM Web Scraper",
        page_icon="🕸️",
        layout="wide",
    )

    st.title("🕸️ LLM Web Scraper")
    st.write("Scrape web content with privacy protection and open-source LLM processing - by Mokshith salian")

    # Configuration section (sidebar)
    with st.sidebar:
        st.header("Configuration")

        st.subheader("LLM Model Selection")
        model_option = st.selectbox(
            "Choose LLM Model",
            [
                "microsoft/phi-2 (fastest, 2.7B)",
                "google/gemma-2b (balanced)",
                "mistralai/Mistral-7B-Instruct-v0.2 (best quality, slowest)"
            ],
            index=0
        )
        # Option strings are "<hf-model-id> (<note>)"; the first token is the model id.
        model_name = model_option.split(" ")[0]

        st.subheader("Privacy Settings")
        use_proxy = st.checkbox("Use Proxy Rotation", value=False)
        # NOTE(review): use_user_agent is collected but never passed to
        # SecureScraper below — TODO wire it up or drop the checkbox.
        use_user_agent = st.checkbox("Use User-Agent Rotation", value=True)

    # Input section
    st.header("Scraping Target")
    url = st.text_input("Enter the URL to scrape", placeholder="https://oceanofgames.com/")

    with st.expander("Advanced Scraping Options"):
        css_selectors_text = st.text_area(
            "CSS Selectors (JSON format)",
            placeholder='{"title": "h1", "price": ".product-price", "description": ".product-description"}'
        )

        # Parse the optional CSS-selector mapping. On invalid JSON we surface an
        # error but continue with css_selectors=None (whole-page scrape).
        css_selectors = None
        if css_selectors_text:
            try:
                css_selectors = json.loads(css_selectors_text)
            except json.JSONDecodeError:
                st.error("Invalid JSON for CSS selectors")

    st.header("LLM Processing")
    llm_instruction = st.text_area(
        "What do you want the LLM to do with the scraped data?",
        placeholder="Extract the main product features and summarize them in bullet points"
    )

    # Initialize on button click
    if st.button("Scrape and Process"):
        if not url:
            st.error("Please enter a URL to scrape")
            return

        # Show progress while building the scraper.
        with st.spinner("Initializing scraper..."):
            proxies = None
            if use_proxy:
                st.warning("Using public proxies - in a production system, you'd want to use paid proxies")
                # In a real app, we'd use better proxies or load from a file
                proxies = [
                    "http://public-proxy1.example.com:8080",
                    "http://public-proxy2.example.com:8080"
                ]
            # proxies is already None unless proxy rotation was requested, so no
            # extra conditional is needed here.
            scraper = SecureScraper(proxy_list=proxies)

        # Perform scraping
        with st.spinner("Scraping website with privacy protection..."):
            result = scraper.scrape_url(url, css_selectors)

        if result['status'] == 'error':
            st.error(f"Scraping failed: {result['message']}")
            return

        st.success("Scraping completed successfully!")

        # Display privacy measures used
        st.subheader("Privacy Measures Used")
        st.json(result['privacy'])

        # Display raw scraped data
        with st.expander("Raw Scraped Data"):
            st.json(result['data'])

        # Process with LLM
        with st.spinner(f"Processing with {model_name}..."):
            try:
                llm = LLMProcessor(model_name=model_name)
                # Prepare data for LLM (convert to string if it's a dict)
                scraped_data_str = (
                    json.dumps(result['data'], indent=2)
                    if isinstance(result['data'], dict)
                    else result['data']
                )
                processed_result = llm.process_data(
                    scraped_data_str,
                    llm_instruction if llm_instruction else "Summarize this information"
                )
                st.subheader("LLM Processing Result")
                st.write(processed_result)
            except Exception as e:
                # Model loading/inference can fail (e.g. OOM); keep the app alive
                # and suggest a smaller model instead of crashing.
                st.error(f"Error in LLM processing: {str(e)}")
                st.info("Try using a smaller model like microsoft/phi-2 if you're facing memory issues")
# Script entry point (e.g. `streamlit run app.py`); re-indented — the pasted
# original had `main()` at column 0, which is a SyntaxError under the `if`.
if __name__ == "__main__":
    main()