broadfield-dev committed on
Commit
5e1f980
·
verified ·
1 Parent(s): 224e219

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -16
app.py CHANGED
@@ -8,7 +8,7 @@ from itertools import cycle
8
 
9
  import gradio as gr
10
  from bs4 import BeautifulSoup, NavigableString
11
- from playwright.async_api import async_playwright
12
 
13
  class CredentialRevolver:
14
  def __init__(self, proxy_string: str):
@@ -27,9 +27,9 @@ class CredentialRevolver:
27
  server = f"http://{parsed.hostname}:{parsed.port}"
28
  proxy_dict = {"server": server}
29
  if parsed.username:
30
- proxy_dict["username"] = parsed.username
31
  if parsed.password:
32
- proxy_dict["password"] = parsed.password
33
  proxies.append(proxy_dict)
34
  except Exception:
35
  pass
@@ -150,9 +150,10 @@ async def perform_web_browse(query: str, browser_name: str, search_engine: str):
150
  proxy_server_used = proxy_config["server"] if proxy_config else "Direct Connection"
151
 
152
  context_args = {
153
- 'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
154
  'java_script_enabled': True,
155
- 'ignore_https_errors': True
 
156
  }
157
  if proxy_config:
158
  context_args['proxy'] = proxy_config
@@ -161,27 +162,27 @@ async def perform_web_browse(query: str, browser_name: str, search_engine: str):
161
  page = await context.new_page()
162
 
163
  try:
164
- response = await page.goto(url, wait_until='domcontentloaded', timeout=45000)
165
-
166
- current_url = page.url
167
- if "google.com" in current_url:
168
- await page.wait_for_selector('div#rso, div#search, body[jsmodel]', timeout=15000)
169
- elif "perplexity.ai" in current_url or "you.com" in current_url:
170
- await page.wait_for_timeout(4000)
171
-
172
- final_url, title = page.url, await page.title() or "No Title"
173
 
 
 
174
  html_content = await page.content()
175
  soup = BeautifulSoup(html_content, 'lxml')
176
 
177
  converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
178
  markdown_text = converter.convert()
179
- status_code = response.status if response else "N/A"
180
-
 
 
 
 
181
  return {
182
  "status": "success", "query": query, "final_url": final_url, "page_title": title,
183
  "http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text,
184
  }
 
 
185
  except Exception as e:
186
  return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": str(e).splitlines()[0]}
187
  finally:
 
8
 
9
  import gradio as gr
10
  from bs4 import BeautifulSoup, NavigableString
11
+ from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
12
 
13
  class CredentialRevolver:
14
  def __init__(self, proxy_string: str):
 
27
  server = f"http://{parsed.hostname}:{parsed.port}"
28
  proxy_dict = {"server": server}
29
  if parsed.username:
30
+ proxy_dict["username"] = urllib.parse.unquote(parsed.username)
31
  if parsed.password:
32
+ proxy_dict["password"] = urllib.parse.unquote(parsed.password)
33
  proxies.append(proxy_dict)
34
  except Exception:
35
  pass
 
150
  proxy_server_used = proxy_config["server"] if proxy_config else "Direct Connection"
151
 
152
  context_args = {
153
+ 'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
154
  'java_script_enabled': True,
155
+ 'ignore_https_errors': True,
156
+ 'bypass_csp': True
157
  }
158
  if proxy_config:
159
  context_args['proxy'] = proxy_config
 
162
  page = await context.new_page()
163
 
164
  try:
165
+ response = await page.goto(url, wait_until='networkidle', timeout=35000)
 
 
 
 
 
 
 
 
166
 
167
+ final_url = page.url
168
+ title = await page.title() or "No Title"
169
  html_content = await page.content()
170
  soup = BeautifulSoup(html_content, 'lxml')
171
 
172
  converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
173
  markdown_text = converter.convert()
174
+ status_code = response.status if response else 0
175
+
176
+ if status_code not in {200, 204} and not markdown_text:
177
+ error_info = f"Page loaded with non-2xx status code: {status_code}. Content may be empty or an error page."
178
+ return {"status": "error", "query": query, "final_url": final_url, "http_status": status_code, "error_message": error_info}
179
+
180
  return {
181
  "status": "success", "query": query, "final_url": final_url, "page_title": title,
182
  "http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text,
183
  }
184
+ except PlaywrightTimeoutError:
185
+ return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"Navigation timed out after 35s. The site may be slow, blocking automation, or the proxy ({proxy_server_used}) may have failed."}
186
  except Exception as e:
187
  return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": str(e).splitlines()[0]}
188
  finally: