import traceback import streamlit as st import json import logging import os import time import sys import requests import asyncio import subprocess import playwright.sync_api as sync_api # Import our custom classes from secure_scraper import SecureScraper from llm_processor import LLMProcessor # Try to import crawl4ai for debug information try: import crawl4ai CRAWL4AI_IMPORTED = True except ImportError: CRAWL4AI_IMPORTED = False # Try to import playwright for browser check try: import playwright PLAYWRIGHT_IMPORTED = True except ImportError: PLAYWRIGHT_IMPORTED = False # Set up logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', handlers=[ logging.StreamHandler(sys.stdout), logging.FileHandler("scraper_debug.log") ] ) def check_playwright_browsers(): """Check if Playwright browsers are installed and provide instructions if not.""" if not PLAYWRIGHT_IMPORTED: return False, "Playwright is not installed. Install with: pip install playwright" try: # Try to run playwright installation check command result = subprocess.run( ["python", "-m", "playwright", "install", "--help"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=5 ) # Check if chromium browser exists at common locations chromium_path = sync_api.chromium.executable_path firefox_path = sync_api.firefox.executable_path webkit_path = sync_api.webkit.executable_path browser_exists = any(os.path.exists(path) for path in [chromium_path, firefox_path, webkit_path]) if not browser_exists: return False, "Playwright browsers are not installed. Run: playwright install" return True, "Playwright browsers appear to be installed" except Exception as e: return False, f"Error checking Playwright: {str(e)}" def main(): st.set_page_config( page_title="LLM Web Scraper", page_icon="🕸️", layout="wide", ) st.title("🕸️ LLM Web Scraper") st.write("Scrape web content with privacy protection and open-source LLM processing - by Mokshith salian") # Check for Playwright browsers browsers_ok, browsers_message = check_playwright_browsers() if not browsers_ok: st.warning(f"⚠️ {browsers_message}") st.info("To install the required browsers, run this command in your terminal:") st.code("playwright install") # Optional: add a button to try installing if st.button("Try automatic installation"): try: with st.spinner("Installing Playwright browsers..."): result = subprocess.run( ["python", "-m", "playwright", "install"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=120 ) if result.returncode == 0: st.success("Installation successful! Please refresh the page.") else: st.error(f"Installation failed: {result.stderr}") st.code(result.stdout) except Exception as e: st.error(f"Error during installation: {str(e)}") st.info("Please run the command manually in your terminal.") # Debug expander at the top with st.expander("Debug Information", expanded=False): st.write("Python version:", sys.version) try: import requests st.write("Requests version:", requests.__version__) except ImportError: st.error("Requests not installed!") # crawl4ai debug information if CRAWL4AI_IMPORTED: try: st.write("crawl4ai version:", getattr(crawl4ai, "__version__", "Unknown")) st.write("crawl4ai available methods:", [method for method in dir(crawl4ai) if not method.startswith("_")]) except: st.write("crawl4ai is installed but version information is not available") else: st.error("crawl4ai not installed!") # Playwright debug information try: import playwright # Playwright package doesn't have __version__ directly accessible try: # Try to get version from package metadata if available from importlib.metadata import version playwright_version = version("playwright") except: # Fallback to getting version via pip subprocess try: result = subprocess.run( [sys.executable, "-m", "pip", "show", "playwright"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=5 ) for line in result.stdout.split("\n"): if line.startswith("Version:"): playwright_version = line.split("Version:")[1].strip() break else: playwright_version = "Unknown" except: playwright_version = "Unknown" st.write("Playwright version:", playwright_version) # Check if browsers are installed browsers_ok, browsers_message = check_playwright_browsers() st.write(f"Playwright browsers: {browsers_message}") except ImportError: st.error("Playwright not installed!") try: import transformers st.write("Transformers version:", transformers.__version__) except ImportError: st.error("Transformers not installed!") try: import torch st.write("PyTorch version:", torch.__version__) st.write("CUDA available:", torch.cuda.is_available()) if torch.cuda.is_available(): st.write("CUDA device:", torch.cuda.get_device_name(0)) except ImportError: st.error("PyTorch not installed!") # Configuration section with st.sidebar: st.header("Configuration") st.subheader("LLM Model Selection") model_option = st.selectbox( "Choose LLM Model", [ "microsoft/phi-2 (fastest, 2.7B)", "google/gemma-2b (balanced)", "mistralai/Mistral-7B-Instruct-v0.2 (best quality, slowest)" ], index=0 ) # Convert selection to model name model_name = model_option.split(" ")[0] st.subheader("Privacy Settings") use_proxy = st.checkbox("Use Proxy Rotation", value=False) use_user_agent = st.checkbox("Use User-Agent Rotation", value=True) # Add AsyncWebCrawler specific settings st.subheader("Crawler Settings") max_connections = st.slider("Max Connections", min_value=1, max_value=20, value=10) timeout_seconds = st.slider("Request Timeout (seconds)", min_value=5, max_value=60, value=30) max_retries = st.slider("Max Retries", min_value=1, max_value=10, value=5) test_mode = st.sidebar.checkbox("Enable Test Mode", value=False) # If in test mode, show a simplified test interface if test_mode: st.header("🔍 Test Mode") st.info("This mode lets you test basic web connectivity without the full pipeline") test_url = st.text_input("Test URL", "https://www.example.com") if st.button("Test Connection"): try: with st.spinner("Testing connection..."): # First try with requests for basic connectivity basic_response = requests.get(test_url, timeout=10) st.success(f"Basic HTTP connection successful: Status {basic_response.status_code}") # Then try with our crawler st.info("Now testing with crawl4ai integration...") # Configure proxy list based on user selection proxy_list = None if use_proxy: # Example proxy list - in production you'd load from a secured source proxy_list = [ "http://example-proxy1.com:8080", "http://example-proxy2.com:8080" ] # Initialize the scraper with the configured settings test_scraper = SecureScraper(proxy_list=proxy_list) test_scraper.crawler.max_connections = max_connections test_scraper.crawler.timeout = timeout_seconds result = test_scraper.scrape_url(test_url) if result['status'] == 'success': st.success(f"crawl4ai connection successful") st.write("Privacy settings used:") st.json(result['privacy']) with st.expander("Response Preview"): st.write(result['data']['title']) st.write(result['data']['text'][:1000] + "..." if len(result['data']['text']) > 1000 else result['data']['text']) else: st.error(f"crawl4ai connection failed: {result['message']}") except Exception as e: st.error(f"Connection failed: {str(e)}") st.code(traceback.format_exc()) # Input section st.header("Scraping Target") url = st.text_input("Enter the URL to scrape", placeholder="https://example.com/") with st.expander("Advanced Scraping Options"): css_selectors_text = st.text_area( "CSS Selectors (JSON format)", placeholder='{"title": "h1", "price": ".product-price", "description": ".product-description"}' ) # Parse CSS selectors css_selectors = None if css_selectors_text: try: css_selectors = json.loads(css_selectors_text) except json.JSONDecodeError: st.error("Invalid JSON for CSS selectors") st.header("LLM Processing") llm_instruction = st.text_area( "What do you want the LLM to do with the scraped data?", placeholder="Extract the main product features and summarize them in bullet points" ) # Initialize on button click if st.button("Scrape and Process"): if not url: st.error("Please enter a URL to scrape") return # Show progress with st.spinner("Initializing scraper..."): # Configure proxy list based on user selection proxy_list = None if use_proxy: st.warning("Using proxy rotation - in a production system, you'd want to use paid proxies") # Example proxy list - in production you'd load from a secured source proxy_list = [ "http://example-proxy1.com:8080", "http://example-proxy2.com:8080" ] # Initialize the scraper with updated parameters scraper = SecureScraper(proxy_list=proxy_list) # Update AsyncWebCrawler settings based on user input scraper.crawler.max_connections = max_connections scraper.crawler.timeout = timeout_seconds scraper.crawler.random_user_agent = use_user_agent error_placeholder = st.empty() # Perform scraping with st.spinner("Scraping website"): # First, test basic connectivity with a direct request st.info(f"Testing basic connectivity to {url}") try: test_response = requests.get(url, timeout=10) st.success(f"Basic connection successful: HTTP {test_response.status_code}") except Exception as e: st.warning(f"Basic connection test failed: {str(e)}. Trying with crawl4ai anyway...") # Check if Playwright browsers are installed before scraping browsers_ok, _ = check_playwright_browsers() if not browsers_ok: st.error("Cannot scrape: Playwright browsers are not installed. Please install them first.") return try: # Now perform the actual scraping with our scraper result = scraper.scrape_url(url, css_selectors) if result['status'] == 'error': st.error(f"Scraping failed: {result['message']}") return except Exception as e: if "Executable doesn't exist" in str(e): st.error("Error: Playwright browser not found. Please install using the button at the top of the page.") return else: st.error(f"Scraping error: {str(e)}") st.code(traceback.format_exc()) return st.success("Scraping completed successfully!") # Display privacy measures used st.subheader("Privacy Measures Used") st.json(result['privacy']) # Display raw scraped data with st.expander("Raw Scraped Data"): st.json(result['data']) # Don't attempt LLM processing if scraping failed if 'result' not in locals() or result['status'] == 'error': return # Process with LLM with st.spinner(f"Processing with {model_name}..."): try: llm = LLMProcessor(model_name=model_name) # Prepare data for LLM (convert to string if it's a dict) scraped_data_str = json.dumps(result['data'], indent=2) if isinstance(result['data'], dict) else result['data'] processed_result = llm.process_data( scraped_data_str, llm_instruction if llm_instruction else "Summarize this information" ) st.subheader("LLM Processing Result") st.write(processed_result) except Exception as e: st.error(f"Error in LLM processing: {str(e)}") st.info("Try using a smaller model like microsoft/phi-2 if you're facing memory issues") logging.error(f"LLM processing error: {str(e)}") logging.error(traceback.format_exc()) # Create a utility for running async code in Streamlit def run_async_code(coro): """Run an async coroutine in a Streamlit app.""" try: loop = asyncio.new_event_loop() return loop.run_until_complete(coro) finally: loop.close() if __name__ == "__main__": main()