Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -8,7 +8,7 @@ from itertools import cycle
|
|
8 |
|
9 |
import gradio as gr
|
10 |
from bs4 import BeautifulSoup, NavigableString
|
11 |
-
from playwright.async_api import async_playwright
|
12 |
|
13 |
class CredentialRevolver:
|
14 |
def __init__(self, proxy_string: str):
|
@@ -27,9 +27,9 @@ class CredentialRevolver:
|
|
27 |
server = f"http://{parsed.hostname}:{parsed.port}"
|
28 |
proxy_dict = {"server": server}
|
29 |
if parsed.username:
|
30 |
-
proxy_dict["username"] = parsed.username
|
31 |
if parsed.password:
|
32 |
-
proxy_dict["password"] = parsed.password
|
33 |
proxies.append(proxy_dict)
|
34 |
except Exception:
|
35 |
pass
|
@@ -150,9 +150,10 @@ async def perform_web_browse(query: str, browser_name: str, search_engine: str):
|
|
150 |
proxy_server_used = proxy_config["server"] if proxy_config else "Direct Connection"
|
151 |
|
152 |
context_args = {
|
153 |
-
'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
|
154 |
'java_script_enabled': True,
|
155 |
-
'ignore_https_errors': True
|
|
|
156 |
}
|
157 |
if proxy_config:
|
158 |
context_args['proxy'] = proxy_config
|
@@ -161,27 +162,27 @@ async def perform_web_browse(query: str, browser_name: str, search_engine: str):
|
|
161 |
page = await context.new_page()
|
162 |
|
163 |
try:
|
164 |
-
response = await page.goto(url, wait_until='
|
165 |
-
|
166 |
-
current_url = page.url
|
167 |
-
if "google.com" in current_url:
|
168 |
-
await page.wait_for_selector('div#rso, div#search, body[jsmodel]', timeout=15000)
|
169 |
-
elif "perplexity.ai" in current_url or "you.com" in current_url:
|
170 |
-
await page.wait_for_timeout(4000)
|
171 |
-
|
172 |
-
final_url, title = page.url, await page.title() or "No Title"
|
173 |
|
|
|
|
|
174 |
html_content = await page.content()
|
175 |
soup = BeautifulSoup(html_content, 'lxml')
|
176 |
|
177 |
converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
|
178 |
markdown_text = converter.convert()
|
179 |
-
status_code = response.status if response else
|
180 |
-
|
|
|
|
|
|
|
|
|
181 |
return {
|
182 |
"status": "success", "query": query, "final_url": final_url, "page_title": title,
|
183 |
"http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text,
|
184 |
}
|
|
|
|
|
185 |
except Exception as e:
|
186 |
return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": str(e).splitlines()[0]}
|
187 |
finally:
|
|
|
8 |
|
9 |
import gradio as gr
|
10 |
from bs4 import BeautifulSoup, NavigableString
|
11 |
+
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
|
12 |
|
13 |
class CredentialRevolver:
|
14 |
def __init__(self, proxy_string: str):
|
|
|
27 |
server = f"http://{parsed.hostname}:{parsed.port}"
|
28 |
proxy_dict = {"server": server}
|
29 |
if parsed.username:
|
30 |
+
proxy_dict["username"] = urllib.parse.unquote(parsed.username)
|
31 |
if parsed.password:
|
32 |
+
proxy_dict["password"] = urllib.parse.unquote(parsed.password)
|
33 |
proxies.append(proxy_dict)
|
34 |
except Exception:
|
35 |
pass
|
|
|
150 |
proxy_server_used = proxy_config["server"] if proxy_config else "Direct Connection"
|
151 |
|
152 |
context_args = {
|
153 |
+
'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
|
154 |
'java_script_enabled': True,
|
155 |
+
'ignore_https_errors': True,
|
156 |
+
'bypass_csp': True
|
157 |
}
|
158 |
if proxy_config:
|
159 |
context_args['proxy'] = proxy_config
|
|
|
162 |
page = await context.new_page()
|
163 |
|
164 |
try:
|
165 |
+
response = await page.goto(url, wait_until='networkidle', timeout=35000)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
|
167 |
+
final_url = page.url
|
168 |
+
title = await page.title() or "No Title"
|
169 |
html_content = await page.content()
|
170 |
soup = BeautifulSoup(html_content, 'lxml')
|
171 |
|
172 |
converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
|
173 |
markdown_text = converter.convert()
|
174 |
+
status_code = response.status if response else 0
|
175 |
+
|
176 |
+
if status_code not in {200, 204} and not markdown_text:
|
177 |
+
error_info = f"Page loaded with non-2xx status code: {status_code}. Content may be empty or an error page."
|
178 |
+
return {"status": "error", "query": query, "final_url": final_url, "http_status": status_code, "error_message": error_info}
|
179 |
+
|
180 |
return {
|
181 |
"status": "success", "query": query, "final_url": final_url, "page_title": title,
|
182 |
"http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text,
|
183 |
}
|
184 |
+
except PlaywrightTimeoutError:
|
185 |
+
return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"Navigation timed out after 35s. The site may be slow, blocking automation, or the proxy ({proxy_server_used}) may have failed."}
|
186 |
except Exception as e:
|
187 |
return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": str(e).splitlines()[0]}
|
188 |
finally:
|