import streamlit as st
import json
import logging

# Import our custom classes
from secure_scraper import SecureScraper
from llm_processor import LLMProcessor

# Set up logging
logging.basicConfig(level=logging.INFO)
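
# ---------------------------------------------------------------------------
# Illustrative sketch only: the source of the secure_scraper module is not
# shown in this file. SecureScraperSketch below is a hypothetical stand-in,
# inferred purely from how main() calls SecureScraper (a proxy_list
# constructor argument and a scrape_url(url, css_selectors) method returning
# a dict with 'status', 'message', 'privacy', and 'data' keys). The real
# secure_scraper.py will differ; treat every detail here as an assumption.
# ---------------------------------------------------------------------------
import random

import requests
from bs4 import BeautifulSoup

# A couple of User-Agent strings to rotate through (assumed; the real module
# likely ships a longer list or generates them with a library).
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15",
]


class SecureScraperSketch:
    """Hypothetical stand-in showing the interface main() relies on."""

    def __init__(self, proxy_list=None):
        self.proxy_list = proxy_list

    def scrape_url(self, url, css_selectors=None):
        # Pick a random User-Agent (and proxy, if configured) per request.
        user_agent = random.choice(USER_AGENTS)
        proxy = random.choice(self.proxy_list) if self.proxy_list else None
        proxies = {"http": proxy, "https": proxy} if proxy else None
        try:
            response = requests.get(
                url,
                headers={"User-Agent": user_agent},
                proxies=proxies,
                timeout=30,
            )
            response.raise_for_status()
        except requests.RequestException as e:
            return {"status": "error", "message": str(e)}

        soup = BeautifulSoup(response.text, "html.parser")
        if css_selectors:
            # Extract only the elements the caller asked for.
            data = {}
            for name, selector in css_selectors.items():
                element = soup.select_one(selector)
                data[name] = element.get_text(strip=True) if element else None
        else:
            data = soup.get_text(separator="\n", strip=True)

        return {
            "status": "success",
            "privacy": {"user_agent": user_agent, "proxy": proxy or "none"},
            "data": data,
        }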

def main():
    st.set_page_config(
        page_title="LLM Web Scraper",
        page_icon="🕸️",
        layout="wide",
    )

    st.title("🕸️ LLM Web Scraper")
    st.write("Scrape web content with privacy protection and open-source LLM processing - by Mokshith Salian")
    # Configuration sidebar
    with st.sidebar:
        st.header("Configuration")

        st.subheader("LLM Model Selection")
        model_option = st.selectbox(
            "Choose LLM Model",
            [
                "microsoft/phi-2 (fastest, 2.7B)",
                "google/gemma-2b (balanced)",
                "mistralai/Mistral-7B-Instruct-v0.2 (best quality, slowest)"
            ],
            index=0
        )
        # Convert the selection to a model name: everything before the
        # first space, e.g. "microsoft/phi-2"
        model_name = model_option.split(" ")[0]

        st.subheader("Privacy Settings")
        use_proxy = st.checkbox("Use Proxy Rotation", value=False)
        use_user_agent = st.checkbox("Use User-Agent Rotation", value=True)
    # Input section
    st.header("Scraping Target")
    url = st.text_input("Enter the URL to scrape", placeholder="https://oceanofgames.com/")

    with st.expander("Advanced Scraping Options"):
        css_selectors_text = st.text_area(
            "CSS Selectors (JSON format)",
            placeholder='{"title": "h1", "price": ".product-price", "description": ".product-description"}'
        )

    # Parse CSS selectors
    css_selectors = None
    if css_selectors_text:
        try:
            css_selectors = json.loads(css_selectors_text)
        except json.JSONDecodeError:
            st.error("Invalid JSON for CSS selectors")

    st.header("LLM Processing")
    llm_instruction = st.text_area(
        "What do you want the LLM to do with the scraped data?",
        placeholder="Extract the main product features and summarize them in bullet points"
    )
    # Run the pipeline on button click
    if st.button("Scrape and Process"):
        if not url:
            st.error("Please enter a URL to scrape")
            return

        # Show progress
        with st.spinner("Initializing scraper..."):
            proxies = None
            if use_proxy:
                st.warning("Using public proxies - in a production system, you'd want to use paid proxies")
                # In a real app, we'd use better proxies or load them from a file
                proxies = [
                    "http://public-proxy1.example.com:8080",
                    "http://public-proxy2.example.com:8080"
                ]
            # proxies is already None when proxy rotation is off
            scraper = SecureScraper(proxy_list=proxies)
        # Perform scraping
        with st.spinner("Scraping website with privacy protection..."):
            result = scraper.scrape_url(url, css_selectors)

        if result['status'] == 'error':
            st.error(f"Scraping failed: {result['message']}")
            return

        st.success("Scraping completed successfully!")

        # Display privacy measures used
        st.subheader("Privacy Measures Used")
        st.json(result['privacy'])

        # Display raw scraped data
        with st.expander("Raw Scraped Data"):
            st.json(result['data'])
        # Process with LLM
        with st.spinner(f"Processing with {model_name}..."):
            try:
                llm = LLMProcessor(model_name=model_name)
                # Prepare data for the LLM (convert to a JSON string if it's a dict)
                scraped_data_str = json.dumps(result['data'], indent=2) if isinstance(result['data'], dict) else result['data']
                processed_result = llm.process_data(
                    scraped_data_str,
                    llm_instruction if llm_instruction else "Summarize this information"
                )
                st.subheader("LLM Processing Result")
                st.write(processed_result)
            except Exception as e:
                st.error(f"Error in LLM processing: {str(e)}")
                st.info("Try using a smaller model like microsoft/phi-2 if you're facing memory issues")


if __name__ == "__main__":
    main()
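
# ---------------------------------------------------------------------------
# Illustrative sketch only: the real llm_processor module is not shown here
# either. LLMProcessorSketch is a hypothetical stand-in inferred from the
# call sites above (a model_name constructor argument and a
# process_data(data, instruction) method). It assumes a plain Hugging Face
# text-generation pipeline; the real llm_processor.py may use quantization,
# chat templates, or a different loading path entirely.
# ---------------------------------------------------------------------------
from transformers import pipeline


class LLMProcessorSketch:
    """Hypothetical stand-in showing the interface main() relies on."""

    def __init__(self, model_name="microsoft/phi-2"):
        # Downloads the model on first use; the 7B option realistically
        # needs a GPU, which is why the app suggests phi-2 on memory errors.
        self.generator = pipeline("text-generation", model=model_name)

    def process_data(self, data, instruction):
        # Combine the user's instruction with the scraped data into one prompt.
        prompt = f"{instruction}\n\nData:\n{data}\n\nAnswer:"
        output = self.generator(prompt, max_new_tokens=256, do_sample=False)
        # The pipeline returns a list of dicts whose 'generated_text' field
        # includes the prompt, so strip the prompt off the front.
        return output[0]["generated_text"][len(prompt):].strip()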