Mokshith Salian committed on
Commit
6541332
·
1 Parent(s): 5ff752e

Added browser configuration for Playwright

Browse files
Files changed (1) hide show
  1. app.py +144 -5
app.py CHANGED
@@ -7,6 +7,7 @@ import time
7
  import sys
8
  import requests
9
  import asyncio
 
10
 
11
  # Import our custom classes
12
  from secure_scraper import SecureScraper
@@ -19,6 +20,13 @@ try:
19
  except ImportError:
20
  CRAWL4AI_IMPORTED = False
21
 
 
 
 
 
 
 
 
22
  # Set up logging
23
  logging.basicConfig(
24
  level=logging.INFO,
@@ -29,6 +37,37 @@ logging.basicConfig(
29
  ]
30
  )
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  def main():
33
  st.set_page_config(
34
  page_title="LLM Web Scraper",
@@ -38,6 +77,32 @@ def main():
38
 
39
  st.title("🕸️ LLM Web Scraper")
40
  st.write("Scrape web content with privacy protection and open-source LLM processing - by Mokshith salian")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  # Debug expander at the top
43
  with st.expander("Debug Information", expanded=False):
@@ -59,6 +124,16 @@ def main():
59
  else:
60
  st.error("crawl4ai not installed!")
61
 
 
 
 
 
 
 
 
 
 
 
62
  try:
63
  import transformers
64
  st.write("Transformers version:", transformers.__version__)
@@ -102,6 +177,55 @@ def main():
102
  timeout_seconds = st.slider("Request Timeout (seconds)", min_value=5, max_value=60, value=30)
103
  max_retries = st.slider("Max Retries", min_value=1, max_value=10, value=5)
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  # Input section
107
  st.header("Scraping Target")
@@ -165,13 +289,28 @@ def main():
165
  except Exception as e:
166
  st.warning(f"Basic connection test failed: {str(e)}. Trying with crawl4ai anyway...")
167
 
168
- # Now perform the actual scraping with our scraper
169
- result = scraper.scrape_url(url, css_selectors)
170
-
171
- if result['status'] == 'error':
172
- st.error(f"Scraping failed: {result['message']}")
173
  return
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  st.success("Scraping completed successfully!")
176
 
177
  # Display privacy measures used
 
7
  import sys
8
  import requests
9
  import asyncio
10
+ import subprocess
11
 
12
  # Import our custom classes
13
  from secure_scraper import SecureScraper
 
20
  except ImportError:
21
  CRAWL4AI_IMPORTED = False
22
 
23
+ # Try to import playwright for browser check
24
+ try:
25
+ import playwright
26
+ PLAYWRIGHT_IMPORTED = True
27
+ except ImportError:
28
+ PLAYWRIGHT_IMPORTED = False
29
+
30
  # Set up logging
31
  logging.basicConfig(
32
  level=logging.INFO,
 
37
  ]
38
  )
39
 
40
+ def check_playwright_browsers():
41
+ """Check if Playwright browsers are installed and provide instructions if not."""
42
+ if not PLAYWRIGHT_IMPORTED:
43
+ return False, "Playwright is not installed. Install with: pip install playwright"
44
+
45
+ try:
46
+ # Try to run playwright installation check command
47
+ result = subprocess.run(
48
+ ["python", "-m", "playwright", "install", "--help"],
49
+ stdout=subprocess.PIPE,
50
+ stderr=subprocess.PIPE,
51
+ text=True,
52
+ timeout=5
53
+ )
54
+
55
+ # Check if chromium browser exists at common locations
56
+ chrome_paths = [
57
+ os.path.expanduser("~/.cache/ms-playwright/chromium-*/chrome-linux/chrome"),
58
+ os.path.expanduser("~/.cache/ms-playwright/chromium-*/chrome.exe"),
59
+ os.path.expanduser("~/.cache/ms-playwright/chromium-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium")
60
+ ]
61
+
62
+ browser_exists = any(os.path.exists(path.split("*")[0]) for path in chrome_paths)
63
+
64
+ if not browser_exists:
65
+ return False, "Playwright browsers are not installed. Run: playwright install"
66
+
67
+ return True, "Playwright browsers appear to be installed"
68
+ except Exception as e:
69
+ return False, f"Error checking Playwright: {str(e)}"
70
+
71
  def main():
72
  st.set_page_config(
73
  page_title="LLM Web Scraper",
 
77
 
78
  st.title("🕸️ LLM Web Scraper")
79
  st.write("Scrape web content with privacy protection and open-source LLM processing - by Mokshith salian")
80
+
81
+ # Check for Playwright browsers
82
+ browsers_ok, browsers_message = check_playwright_browsers()
83
+ if not browsers_ok:
84
+ st.warning(f"⚠️ {browsers_message}")
85
+ st.info("To install the required browsers, run this command in your terminal:")
86
+ st.code("playwright install")
87
+ # Optional: add a button to try installing
88
+ if st.button("Try automatic installation"):
89
+ try:
90
+ with st.spinner("Installing Playwright browsers..."):
91
+ result = subprocess.run(
92
+ ["python", "-m", "playwright", "install"],
93
+ stdout=subprocess.PIPE,
94
+ stderr=subprocess.PIPE,
95
+ text=True,
96
+ timeout=120
97
+ )
98
+ if result.returncode == 0:
99
+ st.success("Installation successful! Please refresh the page.")
100
+ else:
101
+ st.error(f"Installation failed: {result.stderr}")
102
+ st.code(result.stdout)
103
+ except Exception as e:
104
+ st.error(f"Error during installation: {str(e)}")
105
+ st.info("Please run the command manually in your terminal.")
106
 
107
  # Debug expander at the top
108
  with st.expander("Debug Information", expanded=False):
 
124
  else:
125
  st.error("crawl4ai not installed!")
126
 
127
+ # Playwright debug information
128
+ try:
129
+ import playwright
130
+ st.write("Playwright version:", playwright.__version__)
131
+ # Check if browsers are installed
132
+ browsers_ok, browsers_message = check_playwright_browsers()
133
+ st.write(f"Playwright browsers: {browsers_message}")
134
+ except ImportError:
135
+ st.error("Playwright not installed!")
136
+
137
  try:
138
  import transformers
139
  st.write("Transformers version:", transformers.__version__)
 
177
  timeout_seconds = st.slider("Request Timeout (seconds)", min_value=5, max_value=60, value=30)
178
  max_retries = st.slider("Max Retries", min_value=1, max_value=10, value=5)
179
 
180
+ test_mode = st.sidebar.checkbox("Enable Test Mode", value=False)
181
+
182
+ # If in test mode, show a simplified test interface
183
+ if test_mode:
184
+ st.header("🔍 Test Mode")
185
+ st.info("This mode lets you test basic web connectivity without the full pipeline")
186
+
187
+ test_url = st.text_input("Test URL", "https://www.example.com")
188
+
189
+ if st.button("Test Connection"):
190
+ try:
191
+ with st.spinner("Testing connection..."):
192
+ # First try with requests for basic connectivity
193
+ basic_response = requests.get(test_url, timeout=10)
194
+ st.success(f"Basic HTTP connection successful: Status {basic_response.status_code}")
195
+
196
+ # Then try with our crawler
197
+ st.info("Now testing with crawl4ai integration...")
198
+
199
+ # Configure proxy list based on user selection
200
+ proxy_list = None
201
+ if use_proxy:
202
+ # Example proxy list - in production you'd load from a secured source
203
+ proxy_list = [
204
+ "http://example-proxy1.com:8080",
205
+ "http://example-proxy2.com:8080"
206
+ ]
207
+
208
+ # Initialize the scraper with the configured settings
209
+ test_scraper = SecureScraper(proxy_list=proxy_list)
210
+ test_scraper.crawler.max_connections = max_connections
211
+ test_scraper.crawler.timeout = timeout_seconds
212
+
213
+ result = test_scraper.scrape_url(test_url)
214
+
215
+ if result['status'] == 'success':
216
+ st.success(f"crawl4ai connection successful")
217
+ st.write("Privacy settings used:")
218
+ st.json(result['privacy'])
219
+
220
+ with st.expander("Response Preview"):
221
+ st.write(result['data']['title'])
222
+ st.write(result['data']['text'][:1000] + "..." if len(result['data']['text']) > 1000 else result['data']['text'])
223
+ else:
224
+ st.error(f"crawl4ai connection failed: {result['message']}")
225
+
226
+ except Exception as e:
227
+ st.error(f"Connection failed: {str(e)}")
228
+ st.code(traceback.format_exc())
229
 
230
  # Input section
231
  st.header("Scraping Target")
 
289
  except Exception as e:
290
  st.warning(f"Basic connection test failed: {str(e)}. Trying with crawl4ai anyway...")
291
 
292
+ # Check if Playwright browsers are installed before scraping
293
+ browsers_ok, _ = check_playwright_browsers()
294
+ if not browsers_ok:
295
+ st.error("Cannot scrape: Playwright browsers are not installed. Please install them first.")
 
296
  return
297
 
298
+ try:
299
+ # Now perform the actual scraping with our scraper
300
+ result = scraper.scrape_url(url, css_selectors)
301
+
302
+ if result['status'] == 'error':
303
+ st.error(f"Scraping failed: {result['message']}")
304
+ return
305
+ except Exception as e:
306
+ if "Executable doesn't exist" in str(e):
307
+ st.error("Error: Playwright browser not found. Please install using the button at the top of the page.")
308
+ return
309
+ else:
310
+ st.error(f"Scraping error: {str(e)}")
311
+ st.code(traceback.format_exc())
312
+ return
313
+
314
  st.success("Scraping completed successfully!")
315
 
316
  # Display privacy measures used