broadfield-dev committed on
Commit
3887cc3
·
verified ·
1 Parent(s): 5e1f980

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -13
app.py CHANGED
@@ -45,7 +45,7 @@ PLAYWRIGHT_STATE: Dict = {}
45
  REVOLVER = CredentialRevolver(os.getenv("PROXY_LIST", ""))
46
 
47
  SEARCH_ENGINES = {
48
- "Google": "https://www.google.com/search?q={query}",
49
  "DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
50
  "Bing": "https://www.bing.com/search?q={query}",
51
  "Brave": "https://search.brave.com/search?q={query}",
@@ -151,9 +151,7 @@ async def perform_web_browse(query: str, browser_name: str, search_engine: str):
151
 
152
  context_args = {
153
  'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
154
- 'java_script_enabled': True,
155
- 'ignore_https_errors': True,
156
- 'bypass_csp': True
157
  }
158
  if proxy_config:
159
  context_args['proxy'] = proxy_config
@@ -162,27 +160,31 @@ async def perform_web_browse(query: str, browser_name: str, search_engine: str):
162
  page = await context.new_page()
163
 
164
  try:
165
- response = await page.goto(url, wait_until='networkidle', timeout=35000)
166
-
167
  final_url = page.url
168
- title = await page.title() or "No Title"
169
  html_content = await page.content()
170
  soup = BeautifulSoup(html_content, 'lxml')
171
-
172
  converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
173
  markdown_text = converter.convert()
174
- status_code = response.status if response else 0
175
 
176
- if status_code not in {200, 204} and not markdown_text:
177
- error_info = f"Page loaded with non-2xx status code: {status_code}. Content may be empty or an error page."
178
- return {"status": "error", "query": query, "final_url": final_url, "http_status": status_code, "error_message": error_info}
 
 
 
 
 
 
 
179
 
180
  return {
181
  "status": "success", "query": query, "final_url": final_url, "page_title": title,
182
  "http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text,
183
  }
184
  except PlaywrightTimeoutError:
185
- return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"Navigation timed out after 35s. The site may be slow, blocking automation, or the proxy ({proxy_server_used}) may have failed."}
186
  except Exception as e:
187
  return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": str(e).splitlines()[0]}
188
  finally:
 
45
  REVOLVER = CredentialRevolver(os.getenv("PROXY_LIST", ""))
46
 
47
  SEARCH_ENGINES = {
48
+ "Google": "https://www.google.com/search?q={query}&hl=en",
49
  "DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
50
  "Bing": "https://www.bing.com/search?q={query}",
51
  "Brave": "https://search.brave.com/search?q={query}",
 
151
 
152
  context_args = {
153
  'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
154
+ 'java_script_enabled': True, 'ignore_https_errors': True, 'bypass_csp': True
 
 
155
  }
156
  if proxy_config:
157
  context_args['proxy'] = proxy_config
 
160
  page = await context.new_page()
161
 
162
  try:
163
+ response = await page.goto(url, wait_until='domcontentloaded', timeout=25000)
 
164
  final_url = page.url
165
+
166
  html_content = await page.content()
167
  soup = BeautifulSoup(html_content, 'lxml')
 
168
  converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
169
  markdown_text = converter.convert()
 
170
 
171
+ # HYBRID STRATEGY: If content is empty/trivial, wait briefly for JS to render.
172
+ if len(markdown_text.split()) < 20:
173
+ await page.wait_for_timeout(3000)
174
+ html_content = await page.content()
175
+ soup = BeautifulSoup(html_content, 'lxml')
176
+ converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
177
+ markdown_text = converter.convert()
178
+
179
+ title = await page.title() or "No Title"
180
+ status_code = response.status if response else 0
181
 
182
  return {
183
  "status": "success", "query": query, "final_url": final_url, "page_title": title,
184
  "http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text,
185
  }
186
  except PlaywrightTimeoutError:
187
+ return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"Navigation timed out after 25s. Site is likely too slow or blocking requests."}
188
  except Exception as e:
189
  return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": str(e).splitlines()[0]}
190
  finally: