import traceback
import streamlit as st
import json
import logging
import os
import time
import sys
import requests
import asyncio
import subprocess

# Import our custom classes
from secure_scraper import SecureScraper
from llm_processor import LLMProcessor

# Try to import crawl4ai for debug information
try:
    import crawl4ai
    CRAWL4AI_IMPORTED = True
except ImportError:
    CRAWL4AI_IMPORTED = False

# Try to import playwright for browser check
try:
    import playwright
    from playwright.sync_api import sync_playwright
    PLAYWRIGHT_IMPORTED = True
except ImportError:
    PLAYWRIGHT_IMPORTED = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler("scraper_debug.log")
    ]
)

def check_playwright_browsers():
    """Check if Playwright browsers are installed and provide instructions if not."""
    if not PLAYWRIGHT_IMPORTED:
        return False, "Playwright is not installed. Install with: pip install playwright"
    
    try:
        # Verify that the Playwright CLI is runnable with the current interpreter
        result = subprocess.run(
            [sys.executable, "-m", "playwright", "install", "--help"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=5
        )
        if result.returncode != 0:
            return False, "Playwright CLI is not available. Reinstall with: pip install playwright"
        
        # Check whether any browser executable exists at its expected install location
        with sync_playwright() as p:
            chromium_path = p.chromium.executable_path
            firefox_path = p.firefox.executable_path
            webkit_path = p.webkit.executable_path
        
        browser_exists = any(os.path.exists(path) for path in [chromium_path, firefox_path, webkit_path])
        
        if not browser_exists:
            return False, "Playwright browsers are not installed. Run: playwright install"
        
        return True, "Playwright browsers appear to be installed"
    except Exception as e:
        return False, f"Error checking Playwright: {str(e)}"

def main():
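    """Build the Streamlit UI: environment checks, sidebar configuration, and the scrape-then-LLM pipeline."""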
    st.set_page_config(
        page_title="LLM Web Scraper",
        page_icon="🕸️",
        layout="wide",
    )
    
    st.title("🕸️ LLM Web Scraper")
    st.write("Scrape web content with privacy protection and open-source LLM processing - by Mokshith salian")
    
    # Check for Playwright browsers
    browsers_ok, browsers_message = check_playwright_browsers()
    if not browsers_ok:
        st.warning(f"⚠️ {browsers_message}")
        st.info("To install the required browsers, run this command in your terminal:")
        st.code("playwright install")
        # Optional: add a button to try installing 
        if st.button("Try automatic installation"):
            try:
                with st.spinner("Installing Playwright browsers..."):
                    result = subprocess.run(
                        ["python", "-m", "playwright", "install"],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                        text=True,
                        timeout=120
                    )
                if result.returncode == 0:
                    st.success("Installation successful! Please refresh the page.")
                else:
                    st.error(f"Installation failed: {result.stderr}")
                    st.code(result.stdout)
            except Exception as e:
                st.error(f"Error during installation: {str(e)}")
                st.info("Please run the command manually in your terminal.")

    # Debug expander at the top
    with st.expander("Debug Information", expanded=False):
        st.write("Python version:", sys.version)
        
        try:
            import requests
            st.write("Requests version:", requests.__version__)
        except ImportError:
            st.error("Requests not installed!")
        
        # crawl4ai debug information
        if CRAWL4AI_IMPORTED:
            try:
                st.write("crawl4ai version:", getattr(crawl4ai, "__version__", "Unknown"))
                st.write("crawl4ai available methods:", [method for method in dir(crawl4ai) if not method.startswith("_")])
            except Exception:
                st.write("crawl4ai is installed but version information is not available")
        else:
            st.error("crawl4ai not installed!")
        
        # Playwright debug information
        try:
            import playwright
            # Playwright package doesn't have __version__ directly accessible
            try:
                # Try to get version from package metadata if available
                from importlib.metadata import version
                playwright_version = version("playwright")
            except Exception:
                # Fallback to getting version via pip subprocess
                try:
                    result = subprocess.run(
                        [sys.executable, "-m", "pip", "show", "playwright"],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                        text=True,
                        timeout=5
                    )
                    for line in result.stdout.split("\n"):
                        if line.startswith("Version:"):
                            playwright_version = line.split("Version:")[1].strip()
                            break
                    else:
                        playwright_version = "Unknown"
                except Exception:
                    playwright_version = "Unknown"
            
            st.write("Playwright version:", playwright_version)
            # Check if browsers are installed
            browsers_ok, browsers_message = check_playwright_browsers()
            st.write(f"Playwright browsers: {browsers_message}")
        except ImportError:
            st.error("Playwright not installed!")
        
        try:
            import transformers
            st.write("Transformers version:", transformers.__version__)
        except ImportError:
            st.error("Transformers not installed!")
        
        try:
            import torch
            st.write("PyTorch version:", torch.__version__)
            st.write("CUDA available:", torch.cuda.is_available())
            if torch.cuda.is_available():
                st.write("CUDA device:", torch.cuda.get_device_name(0))
        except ImportError:
            st.error("PyTorch not installed!")

    # Configuration section
    with st.sidebar:
        st.header("Configuration")
        
        st.subheader("LLM Model Selection")
        model_option = st.selectbox(
            "Choose LLM Model",
            [
                "microsoft/phi-2 (fastest, 2.7B)",
                "google/gemma-2b (balanced)",
                "mistralai/Mistral-7B-Instruct-v0.2 (best quality, slowest)"
            ],
            index=0
        )
        
        # Convert selection to model name
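        # e.g. "microsoft/phi-2 (fastest, 2.7B)" -> "microsoft/phi-2"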
        model_name = model_option.split(" ")[0]
        
        st.subheader("Privacy Settings")
        use_proxy = st.checkbox("Use Proxy Rotation", value=False)
        use_user_agent = st.checkbox("Use User-Agent Rotation", value=True)
        
        # Add AsyncWebCrawler specific settings
        st.subheader("Crawler Settings")
        max_connections = st.slider("Max Connections", min_value=1, max_value=20, value=10)
        timeout_seconds = st.slider("Request Timeout (seconds)", min_value=5, max_value=60, value=30)
        max_retries = st.slider("Max Retries", min_value=1, max_value=10, value=5)
    
    test_mode = st.sidebar.checkbox("Enable Test Mode", value=False)

    # If in test mode, show a simplified test interface
    if test_mode:
        st.header("🔍 Test Mode")
        st.info("This mode lets you test basic web connectivity without the full pipeline")
        
        test_url = st.text_input("Test URL", "https://www.example.com")
        
        if st.button("Test Connection"):
            try:
                with st.spinner("Testing connection..."):
                    # First try with requests for basic connectivity
                    basic_response = requests.get(test_url, timeout=10)
                    st.success(f"Basic HTTP connection successful: Status {basic_response.status_code}")
                    
                    # Then try with our crawler
                    st.info("Now testing with crawl4ai integration...")
                    
                    # Configure proxy list based on user selection
                    proxy_list = None
                    if use_proxy:
                        # Example proxy list - in production you'd load from a secured source
                        proxy_list = [
                            "http://example-proxy1.com:8080",
                            "http://example-proxy2.com:8080"
                        ]
                        
                    # Initialize the scraper with the configured settings
                    test_scraper = SecureScraper(proxy_list=proxy_list)
                    test_scraper.crawler.max_connections = max_connections
                    test_scraper.crawler.timeout = timeout_seconds
                    
                    result = test_scraper.scrape_url(test_url)
                    
                    if result['status'] == 'success':
                        st.success("crawl4ai connection successful")
                        st.write("Privacy settings used:")
                        st.json(result['privacy'])
                        
                        with st.expander("Response Preview"):
                            st.write(result['data']['title'])
                            st.write(result['data']['text'][:1000] + "..." if len(result['data']['text']) > 1000 else result['data']['text'])
                    else:
                        st.error(f"crawl4ai connection failed: {result['message']}")
                        
            except Exception as e:
                st.error(f"Connection failed: {str(e)}")
                st.code(traceback.format_exc())
    
    # Input section
    st.header("Scraping Target")
    url = st.text_input("Enter the URL to scrape", placeholder="https://example.com/")
    
    with st.expander("Advanced Scraping Options"):
        css_selectors_text = st.text_area(
            "CSS Selectors (JSON format)",
            placeholder='{"title": "h1", "price": ".product-price", "description": ".product-description"}'
        )
        
        # Parse CSS selectors
        css_selectors = None
        if css_selectors_text:
            try:
                css_selectors = json.loads(css_selectors_text)
            except json.JSONDecodeError:
                st.error("Invalid JSON for CSS selectors")
    
    st.header("LLM Processing")
    llm_instruction = st.text_area(
        "What do you want the LLM to do with the scraped data?",
        placeholder="Extract the main product features and summarize them in bullet points"
    )
    
    # Initialize on button click
    if st.button("Scrape and Process"):
        if not url:
            st.error("Please enter a URL to scrape")
            return
        
        # Show progress
        with st.spinner("Initializing scraper..."):
            # Configure proxy list based on user selection
            proxy_list = None
            if use_proxy:
                st.warning("Using proxy rotation - in a production system, you'd want to use paid proxies")
                # Example proxy list - in production you'd load from a secured source
                proxy_list = [
                    "http://example-proxy1.com:8080",
                    "http://example-proxy2.com:8080"
                ]
            
            # Initialize the scraper with updated parameters
            scraper = SecureScraper(proxy_list=proxy_list)
            
            # Update AsyncWebCrawler settings based on user input
            scraper.crawler.max_connections = max_connections
            scraper.crawler.timeout = timeout_seconds
            scraper.crawler.random_user_agent = use_user_agent
            
        error_placeholder = st.empty()
        
        # Perform scraping
        with st.spinner("Scraping website"):
            # First, test basic connectivity with a direct request
            st.info(f"Testing basic connectivity to {url}")
            try:
                test_response = requests.get(url, timeout=10)
                st.success(f"Basic connection successful: HTTP {test_response.status_code}")
            except Exception as e:
                st.warning(f"Basic connection test failed: {str(e)}. Trying with crawl4ai anyway...")
                
            # Check if Playwright browsers are installed before scraping
            browsers_ok, _ = check_playwright_browsers()
            if not browsers_ok:
                st.error("Cannot scrape: Playwright browsers are not installed. Please install them first.")
                return
                
            try:
                # Now perform the actual scraping with our scraper
                result = scraper.scrape_url(url, css_selectors)
                
                if result['status'] == 'error':
                    st.error(f"Scraping failed: {result['message']}")
                    return
            except Exception as e:
                if "Executable doesn't exist" in str(e):
                    st.error("Error: Playwright browser not found. Please install using the button at the top of the page.")
                    return
                else:
                    st.error(f"Scraping error: {str(e)}")
                    st.code(traceback.format_exc())
                    return
                
            st.success("Scraping completed successfully!")
            
            # Display privacy measures used
            st.subheader("Privacy Measures Used")
            st.json(result['privacy'])
            
            # Display raw scraped data
            with st.expander("Raw Scraped Data"):
                st.json(result['data'])
        
        # Don't attempt LLM processing if scraping failed
        if 'result' not in locals() or result['status'] == 'error':
            return
        
        # Process with LLM
        with st.spinner(f"Processing with {model_name}..."):
            try:
                llm = LLMProcessor(model_name=model_name)
                
                # Prepare data for LLM (convert to string if it's a dict)
                scraped_data_str = json.dumps(result['data'], indent=2) if isinstance(result['data'], dict) else result['data']
                
                processed_result = llm.process_data(
                    scraped_data_str, 
                    llm_instruction if llm_instruction else "Summarize this information"
                )
                
                st.subheader("LLM Processing Result")
                st.write(processed_result)
                
            except Exception as e:
                st.error(f"Error in LLM processing: {str(e)}")
                st.info("Try using a smaller model like microsoft/phi-2 if you're facing memory issues")
                logging.error(f"LLM processing error: {str(e)}")
                logging.error(traceback.format_exc())

# Create a utility for running async code in Streamlit
def run_async_code(coro):
    """Run an async coroutine in a Streamlit app."""
    try:
        loop = asyncio.new_event_loop()
        return loop.run_until_complete(coro)
    finally:
        loop.close()
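
# Illustrative usage (a sketch only; `fetch_title` is a hypothetical coroutine, not part of this app):
#
#     async def fetch_title(url: str) -> str:
#         ...  # e.g. an async crawl4ai or aiohttp call would go here
#
#     title = run_async_code(fetch_title("https://example.com"))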

if __name__ == "__main__":
    main()