Spaces:

moksh9591
/

LLm_Webscraper

Running

App Files Files Community

Mokshith Salian committed 14 days ago

Commit

6541332

1 Parent(s): 5ff752e

added browser config-playwright

Browse files

Files changed (1) hide show

app.py +144 -5

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import time
 import sys
 import requests
 import asyncio
 # Import our custom classes
 from secure_scraper import SecureScraper
@@ -19,6 +20,13 @@ try:
 except ImportError:
     CRAWL4AI_IMPORTED = False
 # Set up logging
 logging.basicConfig(
     level=logging.INFO,
@@ -29,6 +37,37 @@ logging.basicConfig(
     ]
     )
 def main():
     st.set_page_config(
         page_title="LLM Web Scraper",
@@ -38,6 +77,32 @@ def main():
     st.title("🕸️ LLM Web Scraper")
     st.write("Scrape web content with privacy protection and open-source LLM processing - by Mokshith salian")
     # Debug expander at the top
     with st.expander("Debug Information", expanded=False):
@@ -59,6 +124,16 @@ def main():
         else:
             st.error("crawl4ai not installed!")
         try:
             import transformers
             st.write("Transformers version:", transformers.__version__)
@@ -102,6 +177,55 @@ def main():
         timeout_seconds = st.slider("Request Timeout (seconds)", min_value=5, max_value=60, value=30)
         max_retries = st.slider("Max Retries", min_value=1, max_value=10, value=5)
     # Input section
     st.header("Scraping Target")
@@ -165,13 +289,28 @@ def main():
             except Exception as e:
                 st.warning(f"Basic connection test failed: {str(e)}. Trying with crawl4ai anyway...")
-            # Now perform the actual scraping with our scraper
-            result = scraper.scrape_url(url, css_selectors)
-            if result['status'] == 'error':
-                st.error(f"Scraping failed: {result['message']}")
                 return
             st.success("Scraping completed successfully!")
             # Display privacy measures used

 import sys
 import requests
 import asyncio
+import subprocess
 # Import our custom classes
 from secure_scraper import SecureScraper
 except ImportError:
     CRAWL4AI_IMPORTED = False
+# Try to import playwright for browser check
+try:
+    import playwright
+    PLAYWRIGHT_IMPORTED = True
+except ImportError:
+    PLAYWRIGHT_IMPORTED = False
 # Set up logging
 logging.basicConfig(
     level=logging.INFO,
     ]
     )
+def check_playwright_browsers():
+    """Check if Playwright browsers are installed and provide instructions if not."""
+    if not PLAYWRIGHT_IMPORTED:
+        return False, "Playwright is not installed. Install with: pip install playwright"
+    try:
+        # Try to run playwright installation check command
+        result = subprocess.run(
+            ["python", "-m", "playwright", "install", "--help"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            timeout=5
+        )
+        # Check if chromium browser exists at common locations
+        chrome_paths = [
+            os.path.expanduser("~/.cache/ms-playwright/chromium-*/chrome-linux/chrome"),
+            os.path.expanduser("~/.cache/ms-playwright/chromium-*/chrome.exe"),
+            os.path.expanduser("~/.cache/ms-playwright/chromium-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium")
+        ]
+        browser_exists = any(os.path.exists(path.split("*")[0]) for path in chrome_paths)
+        if not browser_exists:
+            return False, "Playwright browsers are not installed. Run: playwright install"
+        return True, "Playwright browsers appear to be installed"
+    except Exception as e:
+        return False, f"Error checking Playwright: {str(e)}"
 def main():
     st.set_page_config(
         page_title="LLM Web Scraper",
     st.title("🕸️ LLM Web Scraper")
     st.write("Scrape web content with privacy protection and open-source LLM processing - by Mokshith salian")
+    # Check for Playwright browsers
+    browsers_ok, browsers_message = check_playwright_browsers()
+    if not browsers_ok:
+        st.warning(f"⚠️ {browsers_message}")
+        st.info("To install the required browsers, run this command in your terminal:")
+        st.code("playwright install")
+        # Optional: add a button to try installing
+        if st.button("Try automatic installation"):
+            try:
+                with st.spinner("Installing Playwright browsers..."):
+                    result = subprocess.run(
+                        ["python", "-m", "playwright", "install"],
+                        stdout=subprocess.PIPE,
+                        stderr=subprocess.PIPE,
+                        text=True,
+                        timeout=120
+                    )
+                if result.returncode == 0:
+                    st.success("Installation successful! Please refresh the page.")
+                else:
+                    st.error(f"Installation failed: {result.stderr}")
+                    st.code(result.stdout)
+            except Exception as e:
+                st.error(f"Error during installation: {str(e)}")
+                st.info("Please run the command manually in your terminal.")
     # Debug expander at the top
     with st.expander("Debug Information", expanded=False):
         else:
             st.error("crawl4ai not installed!")
+        # Playwright debug information
+        try:
+            import playwright
+            st.write("Playwright version:", playwright.__version__)
+            # Check if browsers are installed
+            browsers_ok, browsers_message = check_playwright_browsers()
+            st.write(f"Playwright browsers: {browsers_message}")
+        except ImportError:
+            st.error("Playwright not installed!")
         try:
             import transformers
             st.write("Transformers version:", transformers.__version__)
         timeout_seconds = st.slider("Request Timeout (seconds)", min_value=5, max_value=60, value=30)
         max_retries = st.slider("Max Retries", min_value=1, max_value=10, value=5)
+    test_mode = st.sidebar.checkbox("Enable Test Mode", value=False)
+    # If in test mode, show a simplified test interface
+    if test_mode:
+        st.header("🔍 Test Mode")
+        st.info("This mode lets you test basic web connectivity without the full pipeline")
+        test_url = st.text_input("Test URL", "https://www.example.com")
+        if st.button("Test Connection"):
+            try:
+                with st.spinner("Testing connection..."):
+                    # First try with requests for basic connectivity
+                    basic_response = requests.get(test_url, timeout=10)
+                    st.success(f"Basic HTTP connection successful: Status {basic_response.status_code}")
+                    # Then try with our crawler
+                    st.info("Now testing with crawl4ai integration...")
+                    # Configure proxy list based on user selection
+                    proxy_list = None
+                    if use_proxy:
+                        # Example proxy list - in production you'd load from a secured source
+                        proxy_list = [
+                            "http://example-proxy1.com:8080",
+                            "http://example-proxy2.com:8080"
+                        ]
+                    # Initialize the scraper with the configured settings
+                    test_scraper = SecureScraper(proxy_list=proxy_list)
+                    test_scraper.crawler.max_connections = max_connections
+                    test_scraper.crawler.timeout = timeout_seconds
+                    result = test_scraper.scrape_url(test_url)
+                    if result['status'] == 'success':
+                        st.success(f"crawl4ai connection successful")
+                        st.write("Privacy settings used:")
+                        st.json(result['privacy'])
+                        with st.expander("Response Preview"):
+                            st.write(result['data']['title'])
+                            st.write(result['data']['text'][:1000] + "..." if len(result['data']['text']) > 1000 else result['data']['text'])
+                    else:
+                        st.error(f"crawl4ai connection failed: {result['message']}")
+            except Exception as e:
+                st.error(f"Connection failed: {str(e)}")
+                st.code(traceback.format_exc())
     # Input section
     st.header("Scraping Target")
             except Exception as e:
                 st.warning(f"Basic connection test failed: {str(e)}. Trying with crawl4ai anyway...")
+            # Check if Playwright browsers are installed before scraping
+            browsers_ok, _ = check_playwright_browsers()
+            if not browsers_ok:
+                st.error("Cannot scrape: Playwright browsers are not installed. Please install them first.")
                 return
+            try:
+                # Now perform the actual scraping with our scraper
+                result = scraper.scrape_url(url, css_selectors)
+                if result['status'] == 'error':
+                    st.error(f"Scraping failed: {result['message']}")
+                    return
+            except Exception as e:
+                if "Executable doesn't exist" in str(e):
+                    st.error("Error: Playwright browser not found. Please install using the button at the top of the page.")
+                    return
+                else:
+                    st.error(f"Scraping error: {str(e)}")
+                    st.code(traceback.format_exc())
+                    return
             st.success("Scraping completed successfully!")
             # Display privacy measures used