# LLm_Webscraper/app.py
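"""Streamlit entry point for the LLM Web Scraper.

Scrapes a user-supplied URL with privacy protections (SecureScraper, built on
crawl4ai and Playwright) and post-processes the scraped content with an
open-source LLM via LLMProcessor.
"""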
import traceback
import streamlit as st
import json
import logging
import os
import time
import sys
import requests
import asyncio
import subprocess
import playwright.sync_api as sync_api
# Import our custom classes
from secure_scraper import SecureScraper
from llm_processor import LLMProcessor
# Try to import crawl4ai for debug information
try:
import crawl4ai
CRAWL4AI_IMPORTED = True
except ImportError:
CRAWL4AI_IMPORTED = False
# Try to import playwright for browser check
try:
import playwright
PLAYWRIGHT_IMPORTED = True
except ImportError:
PLAYWRIGHT_IMPORTED = False
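# These flags feed the debug panel and the pre-scrape environment checks below.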
# Set up logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler(sys.stdout),
logging.FileHandler("scraper_debug.log")
]
)
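# Logs go both to stdout and to scraper_debug.log in the working directory.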
def check_playwright_browsers():
    """Check if Playwright browsers are installed and provide instructions if not.

    Returns a (browsers_ok, message) tuple.
    """
    if not PLAYWRIGHT_IMPORTED:
        return False, "Playwright is not installed. Install with: pip install playwright"
    try:
        # Verify the Playwright CLI is runnable (output is intentionally ignored)
        subprocess.run(
            [sys.executable, "-m", "playwright", "install", "--help"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=5
        )
        # Check whether at least one browser binary exists at its expected location
        with sync_api.sync_playwright() as p:
            browser_paths = [
                p.chromium.executable_path,
                p.firefox.executable_path,
                p.webkit.executable_path,
            ]
        browser_exists = any(os.path.exists(path) for path in browser_paths)
        if not browser_exists:
            return False, "Playwright browsers are not installed. Run: playwright install"
        return True, "Playwright browsers appear to be installed"
    except Exception as e:
        return False, f"Error checking Playwright: {str(e)}"
def main():
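    """Render the Streamlit UI.

    Covers environment checks, sidebar configuration, an optional test mode,
    and the full scrape-and-LLM-process pipeline.
    """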
st.set_page_config(
page_title="LLM Web Scraper",
page_icon="🕸️",
layout="wide",
)
st.title("🕸️ LLM Web Scraper")
st.write("Scrape web content with privacy protection and open-source LLM processing - by Mokshith salian")
# Check for Playwright browsers
browsers_ok, browsers_message = check_playwright_browsers()
if not browsers_ok:
st.warning(f"⚠️ {browsers_message}")
st.info("To install the required browsers, run this command in your terminal:")
st.code("playwright install")
# Optional: add a button to try installing
if st.button("Try automatic installation"):
try:
with st.spinner("Installing Playwright browsers..."):
result = subprocess.run(
                        [sys.executable, "-m", "playwright", "install"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
timeout=120
)
if result.returncode == 0:
st.success("Installation successful! Please refresh the page.")
else:
st.error(f"Installation failed: {result.stderr}")
st.code(result.stdout)
except Exception as e:
st.error(f"Error during installation: {str(e)}")
st.info("Please run the command manually in your terminal.")
# Debug expander at the top
with st.expander("Debug Information", expanded=False):
st.write("Python version:", sys.version)
try:
import requests
st.write("Requests version:", requests.__version__)
except ImportError:
st.error("Requests not installed!")
# crawl4ai debug information
if CRAWL4AI_IMPORTED:
try:
st.write("crawl4ai version:", getattr(crawl4ai, "__version__", "Unknown"))
st.write("crawl4ai available methods:", [method for method in dir(crawl4ai) if not method.startswith("_")])
            except Exception:
st.write("crawl4ai is installed but version information is not available")
else:
st.error("crawl4ai not installed!")
# Playwright debug information
try:
import playwright
# Playwright package doesn't have __version__ directly accessible
try:
# Try to get version from package metadata if available
from importlib.metadata import version
playwright_version = version("playwright")
            except Exception:
# Fallback to getting version via pip subprocess
try:
result = subprocess.run(
[sys.executable, "-m", "pip", "show", "playwright"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
timeout=5
)
for line in result.stdout.split("\n"):
if line.startswith("Version:"):
playwright_version = line.split("Version:")[1].strip()
break
else:
playwright_version = "Unknown"
                except Exception:
playwright_version = "Unknown"
st.write("Playwright version:", playwright_version)
# Check if browsers are installed
browsers_ok, browsers_message = check_playwright_browsers()
st.write(f"Playwright browsers: {browsers_message}")
except ImportError:
st.error("Playwright not installed!")
try:
import transformers
st.write("Transformers version:", transformers.__version__)
except ImportError:
st.error("Transformers not installed!")
try:
import torch
st.write("PyTorch version:", torch.__version__)
st.write("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
st.write("CUDA device:", torch.cuda.get_device_name(0))
except ImportError:
st.error("PyTorch not installed!")
# Configuration section
with st.sidebar:
st.header("Configuration")
st.subheader("LLM Model Selection")
model_option = st.selectbox(
"Choose LLM Model",
[
"microsoft/phi-2 (fastest, 2.7B)",
"google/gemma-2b (balanced)",
"mistralai/Mistral-7B-Instruct-v0.2 (best quality, slowest)"
],
index=0
)
# Convert selection to model name
model_name = model_option.split(" ")[0]
st.subheader("Privacy Settings")
use_proxy = st.checkbox("Use Proxy Rotation", value=False)
use_user_agent = st.checkbox("Use User-Agent Rotation", value=True)
# Add AsyncWebCrawler specific settings
st.subheader("Crawler Settings")
max_connections = st.slider("Max Connections", min_value=1, max_value=20, value=10)
timeout_seconds = st.slider("Request Timeout (seconds)", min_value=5, max_value=60, value=30)
max_retries = st.slider("Max Retries", min_value=1, max_value=10, value=5)
test_mode = st.sidebar.checkbox("Enable Test Mode", value=False)
# If in test mode, show a simplified test interface
if test_mode:
st.header("🔍 Test Mode")
st.info("This mode lets you test basic web connectivity without the full pipeline")
test_url = st.text_input("Test URL", "https://www.example.com")
if st.button("Test Connection"):
try:
with st.spinner("Testing connection..."):
# First try with requests for basic connectivity
basic_response = requests.get(test_url, timeout=10)
st.success(f"Basic HTTP connection successful: Status {basic_response.status_code}")
# Then try with our crawler
st.info("Now testing with crawl4ai integration...")
# Configure proxy list based on user selection
proxy_list = None
if use_proxy:
# Example proxy list - in production you'd load from a secured source
proxy_list = [
"http://example-proxy1.com:8080",
"http://example-proxy2.com:8080"
]
# Initialize the scraper with the configured settings
test_scraper = SecureScraper(proxy_list=proxy_list)
test_scraper.crawler.max_connections = max_connections
test_scraper.crawler.timeout = timeout_seconds
result = test_scraper.scrape_url(test_url)
if result['status'] == 'success':
st.success(f"crawl4ai connection successful")
st.write("Privacy settings used:")
st.json(result['privacy'])
with st.expander("Response Preview"):
st.write(result['data']['title'])
st.write(result['data']['text'][:1000] + "..." if len(result['data']['text']) > 1000 else result['data']['text'])
else:
st.error(f"crawl4ai connection failed: {result['message']}")
except Exception as e:
st.error(f"Connection failed: {str(e)}")
st.code(traceback.format_exc())
# Input section
st.header("Scraping Target")
url = st.text_input("Enter the URL to scrape", placeholder="https://example.com/")
with st.expander("Advanced Scraping Options"):
css_selectors_text = st.text_area(
"CSS Selectors (JSON format)",
placeholder='{"title": "h1", "price": ".product-price", "description": ".product-description"}'
)
# Parse CSS selectors
css_selectors = None
if css_selectors_text:
try:
css_selectors = json.loads(css_selectors_text)
except json.JSONDecodeError:
st.error("Invalid JSON for CSS selectors")
st.header("LLM Processing")
llm_instruction = st.text_area(
"What do you want the LLM to do with the scraped data?",
placeholder="Extract the main product features and summarize them in bullet points"
)
# Initialize on button click
if st.button("Scrape and Process"):
if not url:
st.error("Please enter a URL to scrape")
return
# Show progress
with st.spinner("Initializing scraper..."):
# Configure proxy list based on user selection
proxy_list = None
if use_proxy:
st.warning("Using proxy rotation - in a production system, you'd want to use paid proxies")
# Example proxy list - in production you'd load from a secured source
proxy_list = [
"http://example-proxy1.com:8080",
"http://example-proxy2.com:8080"
]
# Initialize the scraper with updated parameters
scraper = SecureScraper(proxy_list=proxy_list)
# Update AsyncWebCrawler settings based on user input
scraper.crawler.max_connections = max_connections
scraper.crawler.timeout = timeout_seconds
scraper.crawler.random_user_agent = use_user_agent
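        # Placeholder reserved for surfacing errors later in the flow (currently unused).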
error_placeholder = st.empty()
# Perform scraping
with st.spinner("Scraping website"):
# First, test basic connectivity with a direct request
st.info(f"Testing basic connectivity to {url}")
try:
test_response = requests.get(url, timeout=10)
st.success(f"Basic connection successful: HTTP {test_response.status_code}")
except Exception as e:
st.warning(f"Basic connection test failed: {str(e)}. Trying with crawl4ai anyway...")
# Check if Playwright browsers are installed before scraping
browsers_ok, _ = check_playwright_browsers()
if not browsers_ok:
st.error("Cannot scrape: Playwright browsers are not installed. Please install them first.")
return
try:
# Now perform the actual scraping with our scraper
result = scraper.scrape_url(url, css_selectors)
if result['status'] == 'error':
st.error(f"Scraping failed: {result['message']}")
return
except Exception as e:
if "Executable doesn't exist" in str(e):
st.error("Error: Playwright browser not found. Please install using the button at the top of the page.")
return
else:
st.error(f"Scraping error: {str(e)}")
st.code(traceback.format_exc())
return
st.success("Scraping completed successfully!")
# Display privacy measures used
st.subheader("Privacy Measures Used")
st.json(result['privacy'])
# Display raw scraped data
with st.expander("Raw Scraped Data"):
st.json(result['data'])
# Don't attempt LLM processing if scraping failed
if 'result' not in locals() or result['status'] == 'error':
return
# Process with LLM
with st.spinner(f"Processing with {model_name}..."):
try:
llm = LLMProcessor(model_name=model_name)
# Prepare data for LLM (convert to string if it's a dict)
scraped_data_str = json.dumps(result['data'], indent=2) if isinstance(result['data'], dict) else result['data']
processed_result = llm.process_data(
scraped_data_str,
llm_instruction if llm_instruction else "Summarize this information"
)
st.subheader("LLM Processing Result")
st.write(processed_result)
except Exception as e:
st.error(f"Error in LLM processing: {str(e)}")
st.info("Try using a smaller model like microsoft/phi-2 if you're facing memory issues")
logging.error(f"LLM processing error: {str(e)}")
logging.error(traceback.format_exc())
# Create a utility for running async code in Streamlit
def run_async_code(coro):
    """Run an async coroutine to completion from synchronous Streamlit code."""
    loop = asyncio.new_event_loop()
    try:
        asyncio.set_event_loop(loop)
        return loop.run_until_complete(coro)
    finally:
        asyncio.set_event_loop(None)
        loop.close()
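# Illustrative usage (hypothetical call; adapt to SecureScraper's actual async API):
#     result = run_async_code(scraper.crawler.arun(url=url))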
if __name__ == "__main__":
main()