File size: 4,598 Bytes
f937fdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5305d6
 
f937fdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import streamlit as st
import json
import logging
import os
import time

# Import our custom classes
from secure_scraper import SecureScraper
from llm_processor import LLMProcessor

# Configure root logging at INFO so scraper/LLM library output is visible on the console.
logging.basicConfig(level=logging.INFO)

def main():
    """Streamlit entry point for the LLM Web Scraper.

    Renders the page and sidebar configuration, collects a target URL,
    optional CSS selectors, and an LLM instruction, then — on button
    click — scrapes the URL with SecureScraper and post-processes the
    scraped data with the selected open-source LLM via LLMProcessor.
    """
    st.set_page_config(
        page_title="LLM Web Scraper",
        page_icon="🕸️",
        layout="wide",
    )
    
    st.title("🕸️ LLM Web Scraper")
    st.write("Scrape web content with privacy protection and open-source LLM processing - by Mokshith salian")
    
    # --- Sidebar: model + privacy configuration ---
    with st.sidebar:
        st.header("Configuration")
        
        st.subheader("LLM Model Selection")
        model_option = st.selectbox(
            "Choose LLM Model",
            [
                "microsoft/phi-2 (fastest, 2.7B)",
                "google/gemma-2b (balanced)",
                "mistralai/Mistral-7B-Instruct-v0.2 (best quality, slowest)"
            ],
            index=0
        )
        
        # The HF model id is everything before the first space in the label,
        # e.g. "microsoft/phi-2 (fastest, 2.7B)" -> "microsoft/phi-2".
        # maxsplit=1 avoids splitting the rest of the label needlessly.
        model_name = model_option.split(" ", 1)[0]
        
        st.subheader("Privacy Settings")
        use_proxy = st.checkbox("Use Proxy Rotation", value=False)
        # NOTE(review): this flag is collected but never passed to
        # SecureScraper below — confirm whether the scraper constructor
        # should receive it, or the checkbox is a no-op.
        use_user_agent = st.checkbox("Use User-Agent Rotation", value=True)
    
    # --- Scraping target input ---
    st.header("Scraping Target")
    url = st.text_input("Enter the URL to scrape", placeholder="https://oceanofgames.com/")
    
    with st.expander("Advanced Scraping Options"):
        css_selectors_text = st.text_area(
            "CSS Selectors (JSON format)",
            placeholder='{"title": "h1", "price": ".product-price", "description": ".product-description"}'
        )
        
        # Parse optional selectors; on invalid JSON we show an inline error
        # and fall back to None (scrape_url receives no selectors).
        css_selectors = None
        if css_selectors_text:
            try:
                css_selectors = json.loads(css_selectors_text)
            except json.JSONDecodeError:
                st.error("Invalid JSON for CSS selectors")
    
    st.header("LLM Processing")
    llm_instruction = st.text_area(
        "What do you want the LLM to do with the scraped data?",
        placeholder="Extract the main product features and summarize them in bullet points"
    )
    
    # --- Run the scrape + LLM pipeline on demand ---
    if st.button("Scrape and Process"):
        if not url:
            st.error("Please enter a URL to scrape")
            return
        
        with st.spinner("Initializing scraper..."):
            proxies = None
            if use_proxy:
                st.warning("Using public proxies - in a production system, you'd want to use paid proxies")
                # In a real app, we'd use better proxies or load from a file
                proxies = [
                    "http://public-proxy1.example.com:8080",
                    "http://public-proxy2.example.com:8080"
                ]
            
            # `proxies` is already None when use_proxy is False, so pass it
            # directly (the former `proxies if use_proxy else None` was
            # redundant).
            scraper = SecureScraper(proxy_list=proxies)
        
        # Perform scraping; result is a dict with 'status', and on success
        # 'privacy' and 'data' keys (on error, a 'message' key).
        with st.spinner("Scraping website with privacy protection..."):
            result = scraper.scrape_url(url, css_selectors)
            
            if result['status'] == 'error':
                st.error(f"Scraping failed: {result['message']}")
                return
                
            st.success("Scraping completed successfully!")
            
            # Surface which privacy measures the scraper actually applied.
            st.subheader("Privacy Measures Used")
            st.json(result['privacy'])
            
            with st.expander("Raw Scraped Data"):
                st.json(result['data'])
        
        # Post-process the scraped payload with the selected LLM.
        with st.spinner(f"Processing with {model_name}..."):
            try:
                llm = LLMProcessor(model_name=model_name)
                
                # The LLM expects text: serialize dict payloads to pretty JSON,
                # pass strings through unchanged.
                scraped_data_str = json.dumps(result['data'], indent=2) if isinstance(result['data'], dict) else result['data']
                
                processed_result = llm.process_data(
                    scraped_data_str, 
                    llm_instruction if llm_instruction else "Summarize this information"
                )
                
                st.subheader("LLM Processing Result")
                st.write(processed_result)
                
            except Exception as e:
                # Broad catch is deliberate at this UI boundary: model loading
                # can fail many ways (OOM, download errors); show the message
                # instead of crashing the app.
                st.error(f"Error in LLM processing: {str(e)}")
                st.info("Try using a smaller model like microsoft/phi-2 if you're facing memory issues")

# Entry point when executed directly (e.g. `streamlit run <this file>`).
if __name__ == "__main__":
    main()