Spaces:
Running
Running
Mokshith Salian
committed on
Commit
·
6541332
1
Parent(s):
5ff752e
added browser config-playwright
Browse files
app.py
CHANGED
@@ -7,6 +7,7 @@ import time
|
|
7 |
import sys
|
8 |
import requests
|
9 |
import asyncio
|
|
|
10 |
|
11 |
# Import our custom classes
|
12 |
from secure_scraper import SecureScraper
|
@@ -19,6 +20,13 @@ try:
|
|
19 |
except ImportError:
|
20 |
CRAWL4AI_IMPORTED = False
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
# Set up logging
|
23 |
logging.basicConfig(
|
24 |
level=logging.INFO,
|
@@ -29,6 +37,37 @@ logging.basicConfig(
|
|
29 |
]
|
30 |
)
|
31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
def main():
|
33 |
st.set_page_config(
|
34 |
page_title="LLM Web Scraper",
|
@@ -38,6 +77,32 @@ def main():
|
|
38 |
|
39 |
st.title("🕸️ LLM Web Scraper")
|
40 |
st.write("Scrape web content with privacy protection and open-source LLM processing - by Mokshith salian")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
# Debug expander at the top
|
43 |
with st.expander("Debug Information", expanded=False):
|
@@ -59,6 +124,16 @@ def main():
|
|
59 |
else:
|
60 |
st.error("crawl4ai not installed!")
|
61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
try:
|
63 |
import transformers
|
64 |
st.write("Transformers version:", transformers.__version__)
|
@@ -102,6 +177,55 @@ def main():
|
|
102 |
timeout_seconds = st.slider("Request Timeout (seconds)", min_value=5, max_value=60, value=30)
|
103 |
max_retries = st.slider("Max Retries", min_value=1, max_value=10, value=5)
|
104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
# Input section
|
107 |
st.header("Scraping Target")
|
@@ -165,13 +289,28 @@ def main():
|
|
165 |
except Exception as e:
|
166 |
st.warning(f"Basic connection test failed: {str(e)}. Trying with crawl4ai anyway...")
|
167 |
|
168 |
-
#
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
st.error(f"Scraping failed: {result['message']}")
|
173 |
return
|
174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
st.success("Scraping completed successfully!")
|
176 |
|
177 |
# Display privacy measures used
|
|
|
7 |
import sys
|
8 |
import requests
|
9 |
import asyncio
|
10 |
+
import subprocess
|
11 |
|
12 |
# Import our custom classes
|
13 |
from secure_scraper import SecureScraper
|
|
|
20 |
except ImportError:
|
21 |
CRAWL4AI_IMPORTED = False
|
22 |
|
23 |
+
# Try to import playwright for browser check
|
24 |
+
try:
|
25 |
+
import playwright
|
26 |
+
PLAYWRIGHT_IMPORTED = True
|
27 |
+
except ImportError:
|
28 |
+
PLAYWRIGHT_IMPORTED = False
|
29 |
+
|
30 |
# Set up logging
|
31 |
logging.basicConfig(
|
32 |
level=logging.INFO,
|
|
|
37 |
]
|
38 |
)
|
39 |
|
40 |
+
def _find_playwright_chromium():
    """Return the Chromium executables found in Playwright's browser cache.

    Looks in the directory named by the ``PLAYWRIGHT_BROWSERS_PATH``
    environment variable when it is set to a real path, otherwise in the
    per-platform default cache directories. Returns a (possibly empty) list
    of matching file paths.
    """
    import glob  # local import: only this helper needs it

    override = os.environ.get("PLAYWRIGHT_BROWSERS_PATH")
    # NOTE(review): the special value "0" asks Playwright to keep browsers
    # inside the installed package; that location is not probed here, so the
    # default cache directories are searched instead.
    if override and override != "0":
        roots = [os.path.expanduser(override)]
    else:
        roots = [
            os.path.expanduser("~/.cache/ms-playwright"),           # Linux
            os.path.expanduser("~/Library/Caches/ms-playwright"),   # macOS
            os.path.expanduser("~/AppData/Local/ms-playwright"),    # Windows
        ]

    # Executable locations relative to a cache root; "*" stands for the
    # browser revision number that Playwright appends to the directory name.
    layouts = [
        os.path.join("chromium-*", "chrome-linux", "chrome"),
        os.path.join("chromium-*", "chrome-win", "chrome.exe"),
        os.path.join("chromium-*", "chrome.exe"),
        os.path.join(
            "chromium-*", "chrome-mac", "Chromium.app",
            "Contents", "MacOS", "Chromium",
        ),
    ]

    found = []
    for root in roots:
        for layout in layouts:
            found.extend(glob.glob(os.path.join(root, layout)))
    return found


def check_playwright_browsers():
    """Check if Playwright browsers are installed and provide instructions if not.

    The check is a pure filesystem lookup: the wildcard in each candidate
    path is expanded with ``glob`` so that a real, revision-numbered
    Chromium directory is matched. No subprocess is spawned, so the check
    is cheap enough to run several times per page render.

    Returns:
        A ``(ok, message)`` tuple. ``ok`` is ``True`` only when the
        ``playwright`` package was importable and a Chromium executable was
        found; ``message`` is a human-readable status or remediation hint.
        The function never raises: unexpected errors are reported through
        the returned message.
    """
    if not PLAYWRIGHT_IMPORTED:
        return False, "Playwright is not installed. Install with: pip install playwright"

    try:
        browser_exists = bool(_find_playwright_chromium())
    except Exception as e:
        # UI status check: report the problem instead of crashing the page.
        return False, f"Error checking Playwright: {str(e)}"

    if not browser_exists:
        return False, "Playwright browsers are not installed. Run: playwright install"

    return True, "Playwright browsers appear to be installed"
|
70 |
+
|
71 |
def main():
|
72 |
st.set_page_config(
|
73 |
page_title="LLM Web Scraper",
|
|
|
77 |
|
78 |
st.title("🕸️ LLM Web Scraper")
|
79 |
st.write("Scrape web content with privacy protection and open-source LLM processing - by Mokshith salian")
|
80 |
+
|
81 |
+
# Check for Playwright browsers
|
82 |
+
browsers_ok, browsers_message = check_playwright_browsers()
|
83 |
+
if not browsers_ok:
|
84 |
+
st.warning(f"⚠️ {browsers_message}")
|
85 |
+
st.info("To install the required browsers, run this command in your terminal:")
|
86 |
+
st.code("playwright install")
|
87 |
+
# Optional: add a button to try installing
|
88 |
+
if st.button("Try automatic installation"):
|
89 |
+
try:
|
90 |
+
with st.spinner("Installing Playwright browsers..."):
|
91 |
+
result = subprocess.run(
|
92 |
+
["python", "-m", "playwright", "install"],
|
93 |
+
stdout=subprocess.PIPE,
|
94 |
+
stderr=subprocess.PIPE,
|
95 |
+
text=True,
|
96 |
+
timeout=120
|
97 |
+
)
|
98 |
+
if result.returncode == 0:
|
99 |
+
st.success("Installation successful! Please refresh the page.")
|
100 |
+
else:
|
101 |
+
st.error(f"Installation failed: {result.stderr}")
|
102 |
+
st.code(result.stdout)
|
103 |
+
except Exception as e:
|
104 |
+
st.error(f"Error during installation: {str(e)}")
|
105 |
+
st.info("Please run the command manually in your terminal.")
|
106 |
|
107 |
# Debug expander at the top
|
108 |
with st.expander("Debug Information", expanded=False):
|
|
|
124 |
else:
|
125 |
st.error("crawl4ai not installed!")
|
126 |
|
127 |
+
# Playwright debug information
|
128 |
+
try:
|
129 |
+
import playwright
|
130 |
+
st.write("Playwright version:", playwright.__version__)
|
131 |
+
# Check if browsers are installed
|
132 |
+
browsers_ok, browsers_message = check_playwright_browsers()
|
133 |
+
st.write(f"Playwright browsers: {browsers_message}")
|
134 |
+
except ImportError:
|
135 |
+
st.error("Playwright not installed!")
|
136 |
+
|
137 |
try:
|
138 |
import transformers
|
139 |
st.write("Transformers version:", transformers.__version__)
|
|
|
177 |
timeout_seconds = st.slider("Request Timeout (seconds)", min_value=5, max_value=60, value=30)
|
178 |
max_retries = st.slider("Max Retries", min_value=1, max_value=10, value=5)
|
179 |
|
180 |
+
test_mode = st.sidebar.checkbox("Enable Test Mode", value=False)
|
181 |
+
|
182 |
+
# If in test mode, show a simplified test interface
|
183 |
+
if test_mode:
|
184 |
+
st.header("🔍 Test Mode")
|
185 |
+
st.info("This mode lets you test basic web connectivity without the full pipeline")
|
186 |
+
|
187 |
+
test_url = st.text_input("Test URL", "https://www.example.com")
|
188 |
+
|
189 |
+
if st.button("Test Connection"):
|
190 |
+
try:
|
191 |
+
with st.spinner("Testing connection..."):
|
192 |
+
# First try with requests for basic connectivity
|
193 |
+
basic_response = requests.get(test_url, timeout=10)
|
194 |
+
st.success(f"Basic HTTP connection successful: Status {basic_response.status_code}")
|
195 |
+
|
196 |
+
# Then try with our crawler
|
197 |
+
st.info("Now testing with crawl4ai integration...")
|
198 |
+
|
199 |
+
# Configure proxy list based on user selection
|
200 |
+
proxy_list = None
|
201 |
+
if use_proxy:
|
202 |
+
# Example proxy list - in production you'd load from a secured source
|
203 |
+
proxy_list = [
|
204 |
+
"http://example-proxy1.com:8080",
|
205 |
+
"http://example-proxy2.com:8080"
|
206 |
+
]
|
207 |
+
|
208 |
+
# Initialize the scraper with the configured settings
|
209 |
+
test_scraper = SecureScraper(proxy_list=proxy_list)
|
210 |
+
test_scraper.crawler.max_connections = max_connections
|
211 |
+
test_scraper.crawler.timeout = timeout_seconds
|
212 |
+
|
213 |
+
result = test_scraper.scrape_url(test_url)
|
214 |
+
|
215 |
+
if result['status'] == 'success':
|
216 |
+
st.success(f"crawl4ai connection successful")
|
217 |
+
st.write("Privacy settings used:")
|
218 |
+
st.json(result['privacy'])
|
219 |
+
|
220 |
+
with st.expander("Response Preview"):
|
221 |
+
st.write(result['data']['title'])
|
222 |
+
st.write(result['data']['text'][:1000] + "..." if len(result['data']['text']) > 1000 else result['data']['text'])
|
223 |
+
else:
|
224 |
+
st.error(f"crawl4ai connection failed: {result['message']}")
|
225 |
+
|
226 |
+
except Exception as e:
|
227 |
+
st.error(f"Connection failed: {str(e)}")
|
228 |
+
st.code(traceback.format_exc())
|
229 |
|
230 |
# Input section
|
231 |
st.header("Scraping Target")
|
|
|
289 |
except Exception as e:
|
290 |
st.warning(f"Basic connection test failed: {str(e)}. Trying with crawl4ai anyway...")
|
291 |
|
292 |
+
# Check if Playwright browsers are installed before scraping
|
293 |
+
browsers_ok, _ = check_playwright_browsers()
|
294 |
+
if not browsers_ok:
|
295 |
+
st.error("Cannot scrape: Playwright browsers are not installed. Please install them first.")
|
|
|
296 |
return
|
297 |
|
298 |
+
try:
|
299 |
+
# Now perform the actual scraping with our scraper
|
300 |
+
result = scraper.scrape_url(url, css_selectors)
|
301 |
+
|
302 |
+
if result['status'] == 'error':
|
303 |
+
st.error(f"Scraping failed: {result['message']}")
|
304 |
+
return
|
305 |
+
except Exception as e:
|
306 |
+
if "Executable doesn't exist" in str(e):
|
307 |
+
st.error("Error: Playwright browser not found. Please install using the button at the top of the page.")
|
308 |
+
return
|
309 |
+
else:
|
310 |
+
st.error(f"Scraping error: {str(e)}")
|
311 |
+
st.code(traceback.format_exc())
|
312 |
+
return
|
313 |
+
|
314 |
st.success("Scraping completed successfully!")
|
315 |
|
316 |
# Display privacy measures used
|