Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -45,7 +45,7 @@ PLAYWRIGHT_STATE: Dict = {}
|
|
45 |
REVOLVER = CredentialRevolver(os.getenv("PROXY_LIST", ""))
|
46 |
|
47 |
SEARCH_ENGINES = {
|
48 |
-
"Google": "https://www.google.com/search?q={query}",
|
49 |
"DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
|
50 |
"Bing": "https://www.bing.com/search?q={query}",
|
51 |
"Brave": "https://search.brave.com/search?q={query}",
|
@@ -151,9 +151,7 @@ async def perform_web_browse(query: str, browser_name: str, search_engine: str):
|
|
151 |
|
152 |
context_args = {
|
153 |
'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
|
154 |
-
'java_script_enabled': True,
|
155 |
-
'ignore_https_errors': True,
|
156 |
-
'bypass_csp': True
|
157 |
}
|
158 |
if proxy_config:
|
159 |
context_args['proxy'] = proxy_config
|
@@ -162,27 +160,31 @@ async def perform_web_browse(query: str, browser_name: str, search_engine: str):
|
|
162 |
page = await context.new_page()
|
163 |
|
164 |
try:
|
165 |
-
response = await page.goto(url, wait_until='
|
166 |
-
|
167 |
final_url = page.url
|
168 |
-
|
169 |
html_content = await page.content()
|
170 |
soup = BeautifulSoup(html_content, 'lxml')
|
171 |
-
|
172 |
converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
|
173 |
markdown_text = converter.convert()
|
174 |
-
status_code = response.status if response else 0
|
175 |
|
176 |
-
|
177 |
-
|
178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
|
180 |
return {
|
181 |
"status": "success", "query": query, "final_url": final_url, "page_title": title,
|
182 |
"http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text,
|
183 |
}
|
184 |
except PlaywrightTimeoutError:
|
185 |
-
return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"Navigation timed out after
|
186 |
except Exception as e:
|
187 |
return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": str(e).splitlines()[0]}
|
188 |
finally:
|
|
|
45 |
REVOLVER = CredentialRevolver(os.getenv("PROXY_LIST", ""))
|
46 |
|
47 |
SEARCH_ENGINES = {
|
48 |
+
"Google": "https://www.google.com/search?q={query}&hl=en",
|
49 |
"DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
|
50 |
"Bing": "https://www.bing.com/search?q={query}",
|
51 |
"Brave": "https://search.brave.com/search?q={query}",
|
|
|
151 |
|
152 |
context_args = {
|
153 |
'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
|
154 |
+
'java_script_enabled': True, 'ignore_https_errors': True, 'bypass_csp': True
|
|
|
|
|
155 |
}
|
156 |
if proxy_config:
|
157 |
context_args['proxy'] = proxy_config
|
|
|
160 |
page = await context.new_page()
|
161 |
|
162 |
try:
|
163 |
+
response = await page.goto(url, wait_until='domcontentloaded', timeout=25000)
|
|
|
164 |
final_url = page.url
|
165 |
+
|
166 |
html_content = await page.content()
|
167 |
soup = BeautifulSoup(html_content, 'lxml')
|
|
|
168 |
converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
|
169 |
markdown_text = converter.convert()
|
|
|
170 |
|
171 |
+
# HYBRID STRATEGY: If content is empty/trivial, wait briefly for JS to render.
|
172 |
+
if len(markdown_text.split()) < 20:
|
173 |
+
await page.wait_for_timeout(3000)
|
174 |
+
html_content = await page.content()
|
175 |
+
soup = BeautifulSoup(html_content, 'lxml')
|
176 |
+
converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
|
177 |
+
markdown_text = converter.convert()
|
178 |
+
|
179 |
+
title = await page.title() or "No Title"
|
180 |
+
status_code = response.status if response else 0
|
181 |
|
182 |
return {
|
183 |
"status": "success", "query": query, "final_url": final_url, "page_title": title,
|
184 |
"http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text,
|
185 |
}
|
186 |
except PlaywrightTimeoutError:
|
187 |
+
return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"Navigation timed out after 25s. Site is likely too slow or blocking requests."}
|
188 |
except Exception as e:
|
189 |
return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": str(e).splitlines()[0]}
|
190 |
finally:
|