File size: 4,598 Bytes
f937fdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5305d6
 
f937fdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import streamlit as st
import json
import logging
import os
import time

# Import our custom classes
from secure_scraper import SecureScraper
from llm_processor import LLMProcessor

# Configure root logging at INFO so scraper/LLM library output is visible on the console.
logging.basicConfig(level=logging.INFO)

def main():
    """Streamlit entry point for the LLM Web Scraper.

    Renders the page and sidebar configuration, collects a target URL,
    optional CSS selectors, and an LLM instruction, then — on button
    click — scrapes the URL with SecureScraper and post-processes the
    scraped data with the selected open-source LLM via LLMProcessor.
    """
    st.set_page_config(
        page_title="LLM Web Scraper",
        page_icon="🕸️",
        layout="wide",
    )
    
    st.title("🕸️ LLM Web Scraper")
    st.write("Scrape web content with privacy protection and open-source LLM processing - by Mokshith salian")
    
    # --- Sidebar: model + privacy configuration ---
    with st.sidebar:
        st.header("Configuration")
        
        st.subheader("LLM Model Selection")
        model_option = st.selectbox(
            "Choose LLM Model",
            [
                "microsoft/phi-2 (fastest, 2.7B)",
                "google/gemma-2b (balanced)",
                "mistralai/Mistral-7B-Instruct-v0.2 (best quality, slowest)"
            ],
            index=0
        )
        
        # The HF model id is everything before the first space in the label,
        # e.g. "microsoft/phi-2 (fastest, 2.7B)" -> "microsoft/phi-2".
        # maxsplit=1 avoids splitting the rest of the label needlessly.
        model_name = model_option.split(" ", 1)[0]
        
        st.subheader("Privacy Settings")
        use_proxy = st.checkbox("Use Proxy Rotation", value=False)
        # NOTE(review): this flag is collected but never passed to
        # SecureScraper below — confirm whether the scraper constructor
        # should receive it, or the checkbox is a no-op.
        use_user_agent = st.checkbox("Use User-Agent Rotation", value=True)
    
    # --- Scraping target input ---
    st.header("Scraping Target")
    url = st.text_input("Enter the URL to scrape", placeholder="https://oceanofgames.com/")
    
    with st.expander("Advanced Scraping Options"):
        css_selectors_text = st.text_area(
            "CSS Selectors (JSON format)",
            placeholder='{"title": "h1", "price": ".product-price", "description": ".product-description"}'
        )
        
        # Parse optional selectors; on invalid JSON we show an inline error
        # and fall back to None (scrape_url receives no selectors).
        css_selectors = None
        if css_selectors_text:
            try:
                css_selectors = json.loads(css_selectors_text)
            except json.JSONDecodeError:
                st.error("Invalid JSON for CSS selectors")
    
    st.header("LLM Processing")
    llm_instruction = st.text_area(
        "What do you want the LLM to do with the scraped data?",
        placeholder="Extract the main product features and summarize them in bullet points"
    )
    
    # --- Run the scrape + LLM pipeline on demand ---
    if st.button("Scrape and Process"):
        if not url:
            st.error("Please enter a URL to scrape")
            return
        
        with st.spinner("Initializing scraper..."):
            proxies = None
            if use_proxy:
                st.warning("Using public proxies - in a production system, you'd want to use paid proxies")
                # In a real app, we'd use better proxies or load from a file
                proxies = [
                    "http://public-proxy1.example.com:8080",
                    "http://public-proxy2.example.com:8080"
                ]
            
            # `proxies` is already None when use_proxy is False, so pass it
            # directly (the former `proxies if use_proxy else None` was
            # redundant).
            scraper = SecureScraper(proxy_list=proxies)
        
        # Perform scraping; result is a dict with 'status', and on success
        # 'privacy' and 'data' keys (on error, a 'message' key).
        with st.spinner("Scraping website with privacy protection..."):
            result = scraper.scrape_url(url, css_selectors)
            
            if result['status'] == 'error':
                st.error(f"Scraping failed: {result['message']}")
                return
                
            st.success("Scraping completed successfully!")
            
            # Surface which privacy measures the scraper actually applied.
            st.subheader("Privacy Measures Used")
            st.json(result['privacy'])
            
            with st.expander("Raw Scraped Data"):
                st.json(result['data'])
        
        # Post-process the scraped payload with the selected LLM.
        with st.spinner(f"Processing with {model_name}..."):
            try:
                llm = LLMProcessor(model_name=model_name)
                
                # The LLM expects text: serialize dict payloads to pretty JSON,
                # pass strings through unchanged.
                scraped_data_str = json.dumps(result['data'], indent=2) if isinstance(result['data'], dict) else result['data']
                
                processed_result = llm.process_data(
                    scraped_data_str, 
                    llm_instruction if llm_instruction else "Summarize this information"
                )
                
                st.subheader("LLM Processing Result")
                st.write(processed_result)
                
            except Exception as e:
                # Broad catch is deliberate at this UI boundary: model loading
                # can fail many ways (OOM, download errors); show the message
                # instead of crashing the app.
                st.error(f"Error in LLM processing: {str(e)}")
                st.info("Try using a smaller model like microsoft/phi-2 if you're facing memory issues")

# Entry point when executed directly (e.g. `streamlit run <this file>`).
if __name__ == "__main__":
    main()