broadfield-dev committed
Commit 3880e24 · verified · 1 Parent(s): 3887cc3

Update app.py

Files changed (1)
  1. app.py +50 -65
app.py CHANGED
@@ -17,22 +17,17 @@ class CredentialRevolver:
 
     def _parse_proxies(self, proxy_string: str):
         proxies = []
-        if not proxy_string:
-            return proxies
+        if not proxy_string: return proxies
         for line in proxy_string.strip().splitlines():
             try:
                 parsed = urllib.parse.urlparse(f"//{line.strip()}")
-                if not parsed.hostname or not parsed.port:
-                    continue
+                if not parsed.hostname or not parsed.port: continue
                 server = f"http://{parsed.hostname}:{parsed.port}"
                 proxy_dict = {"server": server}
-                if parsed.username:
-                    proxy_dict["username"] = urllib.parse.unquote(parsed.username)
-                if parsed.password:
-                    proxy_dict["password"] = urllib.parse.unquote(parsed.password)
+                if parsed.username: proxy_dict["username"] = urllib.parse.unquote(parsed.username)
+                if parsed.password: proxy_dict["password"] = urllib.parse.unquote(parsed.password)
                 proxies.append(proxy_dict)
-            except Exception:
-                pass
+            except Exception: pass
         return proxies
 
     def get_next(self) -> Optional[Dict]:
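
For reference, a minimal standalone sketch of the proxy line format _parse_proxies accepts: one user:pass@host:port entry per line, with credentials optional and percent-decoded. The host and credentials here are hypothetical:

    import urllib.parse

    line = "alice:s3cret%21@proxy.example.com:8080"  # hypothetical PROXY_LIST entry
    parsed = urllib.parse.urlparse(f"//{line}")      # same trick the parser uses
    assert parsed.hostname == "proxy.example.com" and parsed.port == 8080
    assert urllib.parse.unquote(parsed.password) == "s3cret!"  # %21 decodes to !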
@@ -45,21 +40,21 @@ PLAYWRIGHT_STATE: Dict = {}
 REVOLVER = CredentialRevolver(os.getenv("PROXY_LIST", ""))
 
 SEARCH_ENGINES = {
-    "Google": "https://www.google.com/search?q={query}&hl=en",
-    "DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
-    "Bing": "https://www.bing.com/search?q={query}",
-    "Brave": "https://search.brave.com/search?q={query}",
-    "Ecosia": "https://www.ecosia.org/search?q={query}",
-    "Yahoo": "https://search.yahoo.com/search?p={query}",
-    "Startpage": "https://www.startpage.com/sp/search?q={query}",
-    "Qwant": "https://www.qwant.com/?q={query}",
-    "Swisscows": "https://swisscows.com/web?query={query}",
-    "You.com": "https://you.com/search?q={query}",
-    "SearXNG": "https://searx.be/search?q={query}",
-    "MetaGer": "https://metager.org/meta/meta.ger-en?eingabe={query}",
-    "Yandex": "https://yandex.com/search/?text={query}",
-    "Baidu": "https://www.baidu.com/s?wd={query}",
-    "Perplexity": "https://www.perplexity.ai/search?q={query}"
+    "Google": ("https://www.google.com/search?q={query}&hl=en", '#rso, #search, #botstuff'),
+    "DuckDuckGo": ("https://duckduckgo.com/html/?q={query}", '#links'),
+    "Bing": ("https://www.bing.com/search?q={query}", '#b_results'),
+    "Brave": ("https://search.brave.com/search?q={query}", '#results'),
+    "Ecosia": ("https://www.ecosia.org/search?q={query}", 'main[role="main"]'),
+    "Yahoo": ("https://search.yahoo.com/search?p={query}", '#web'),
+    "Startpage": ("https://www.startpage.com/sp/search?q={query}", '#main'),
+    "Qwant": ("https://www.qwant.com/?q={query}", '[data-testid="web-results"]'),
+    "Swisscows": ("https://swisscows.com/web?query={query}", '.web-results'),
+    "You.com": ("https://you.com/search?q={query}", '#search-results'),
+    "SearXNG": ("https://searx.be/search?q={query}", '#results'),
+    "MetaGer": ("https://metager.org/meta/meta.ger-en?eingabe={query}", '#results'),
+    "Yandex": ("https://yandex.com/search/?text={query}", '#search-result'),
+    "Baidu": ("https://www.baidu.com/s?wd={query}", '#content_left'),
+    "Perplexity": ("https://www.perplexity.ai/search?q={query}", 'div[class*="prose"]'),
 }
 
 class HTML_TO_MARKDOWN_CONVERTER:
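
Each engine now maps to a (url_template, content_selector) pair rather than a bare URL, so lookups unpack two values. A short sketch of the lookup as the updated perform_web_browse does it, with a hypothetical query:

    import urllib.parse

    url_template, content_selector = SEARCH_ENGINES["DuckDuckGo"]
    url = url_template.format(query=urllib.parse.quote_plus("hello world"))
    # url              -> "https://duckduckgo.com/html/?q=hello+world"
    # content_selector -> "#links"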
@@ -68,13 +63,7 @@ class HTML_TO_MARKDOWN_CONVERTER:
         self.base_url = base_url
 
     def _cleanup_html(self):
-        selectors_to_remove = [
-            'nav', 'footer', 'header', 'aside', 'form', 'script', 'style', 'svg', 'button', 'input', 'textarea',
-            '[role="navigation"]', '[role="search"]', '[id*="comment"]', '[class*="comment-"]',
-            '[id*="sidebar"]', '[class*="sidebar"]', '[id*="related"]', '[class*="related"]',
-            '[id*="share"]', '[class*="share"]', '[id*="social"]', '[class*="social"]',
-            '[id*="cookie"]', '[class*="cookie"]'
-        ]
+        selectors_to_remove = ['nav', 'footer', 'header', 'aside', 'form', 'script', 'style', 'svg', 'button', 'input', 'textarea', '[role="navigation"]', '[role="search"]', '[id*="comment"]', '[class*="comment-"]', '[id*="sidebar"]', '[class*="sidebar"]', '[id*="related"]', '[class*="related"]', '[id*="share"]', '[class*="share"]', '[id*="social"]', '[class*="social"]', '[id*="cookie"]', '[class*="cookie"]', '[aria-hidden="true"]']
         for selector in selectors_to_remove:
             for element in self.soup.select(selector):
                 element.decompose()
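
The cleanup list gains '[aria-hidden="true"]'. A self-contained sketch of the same select-and-decompose pattern on a toy document, assuming beautifulsoup4 and lxml are installed as app.py already requires:

    from bs4 import BeautifulSoup

    html = '<body><nav>menu</nav><span aria-hidden="true">x</span><p>kept</p></body>'
    soup = BeautifulSoup(html, 'lxml')
    for selector in ('nav', '[aria-hidden="true"]'):
        for element in soup.select(selector):
            element.decompose()  # drop the node and its whole subtree
    print(soup.body.get_text(strip=True))  # prints: kept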
@@ -82,8 +71,7 @@ class HTML_TO_MARKDOWN_CONVERTER:
     def convert(self):
         self._cleanup_html()
         content_node = self.soup.find('main') or self.soup.find('article') or self.soup.find('body')
-        if not content_node:
-            return ""
+        if not content_node: return ""
         md = self._process_node(content_node)
         return re.sub(r'\n{3,}', '\n\n', md).strip()
 
@@ -120,7 +108,7 @@ class HTML_TO_MARKDOWN_CONVERTER:
             return f"\n\n![{alt}]({full_src})\n\n"
         return inner_md
 
-async def perform_web_browse(query: str, browser_name: str, search_engine: str):
+async def perform_web_browse(query: str, browser_name: str, search_engine_name: str):
     browser_key = browser_name.lower()
     if "playwright" not in PLAYWRIGHT_STATE:
         PLAYWRIGHT_STATE["playwright"] = await async_playwright().start()
@@ -138,55 +126,52 @@ async def perform_web_browse(query: str, browser_name: str, search_engine: str):
 
     browser_instance = PLAYWRIGHT_STATE[browser_key]
 
-    if urllib.parse.urlparse(query).scheme in ['http', 'https'] and '.' in urllib.parse.urlparse(query).netloc:
+    is_direct_url = urllib.parse.urlparse(query).scheme in ['http', 'https'] and '.' in urllib.parse.urlparse(query).netloc
+
+    if is_direct_url:
         url = query
+        content_selector = 'body'
     else:
-        search_url_template = SEARCH_ENGINES.get(search_engine)
-        if not search_url_template:
-            return {"status": "error", "query": query, "error_message": f"Invalid search engine: '{search_engine}'."}
-        url = search_url_template.format(query=urllib.parse.quote_plus(query))
+        engine_data = SEARCH_ENGINES.get(search_engine_name)
+        if not engine_data:
+            return {"status": "error", "query": query, "error_message": f"Invalid search engine: '{search_engine_name}'."}
+        url, content_selector = engine_data
+        url = url.format(query=urllib.parse.quote_plus(query))
 
     proxy_config = REVOLVER.get_next()
     proxy_server_used = proxy_config["server"] if proxy_config else "Direct Connection"
 
-    context_args = {
-        'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
-        'java_script_enabled': True, 'ignore_https_errors': True, 'bypass_csp': True
-    }
-    if proxy_config:
-        context_args['proxy'] = proxy_config
+    context_args = {'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36', 'java_script_enabled': True, 'ignore_https_errors': True, 'bypass_csp': True, 'accept_downloads': False}
+    if proxy_config: context_args['proxy'] = proxy_config
 
     context = await browser_instance.new_context(**context_args)
     page = await context.new_page()
 
     try:
-        response = await page.goto(url, wait_until='domcontentloaded', timeout=25000)
-        final_url = page.url
+        response = await page.goto(url, wait_until='commit', timeout=20000)
+
+        await asyncio.wait([
+            page.wait_for_load_state('domcontentloaded', timeout=15000),
+            page.wait_for_selector(content_selector, timeout=15000),
+        ], return_when=asyncio.FIRST_COMPLETED)
 
         html_content = await page.content()
+
+        if any(phrase in html_content for phrase in ["unusual traffic", "CAPTCHA", "prove you are human", "before you continue"]):
+            raise Exception("Anti-bot measure detected. Try another search engine or proxy.")
+
+        final_url, title = page.url, await page.title() or "No Title"
         soup = BeautifulSoup(html_content, 'lxml')
        converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
         markdown_text = converter.convert()
-
-        # HYBRID STRATEGY: If content is empty/trivial, wait briefly for JS to render.
-        if len(markdown_text.split()) < 20:
-            await page.wait_for_timeout(3000)
-            html_content = await page.content()
-            soup = BeautifulSoup(html_content, 'lxml')
-            converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
-            markdown_text = converter.convert()
-
-        title = await page.title() or "No Title"
         status_code = response.status if response else 0
 
-        return {
-            "status": "success", "query": query, "final_url": final_url, "page_title": title,
-            "http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text,
-        }
-    except PlaywrightTimeoutError:
-        return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"Navigation timed out after 25s. Site is likely too slow or blocking requests."}
+        return {"status": "success", "query": query, "final_url": final_url, "page_title": title, "http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text}
     except Exception as e:
-        return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": str(e).splitlines()[0]}
+        error_message = str(e).splitlines()[0]
+        if "Timeout" in error_message:
+            return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"Page failed to load or find content for '{query}'. The site may be slow, blocking automation, or the content selector '{content_selector}' was not found."}
+        return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": error_message}
     finally:
         if 'page' in locals() and not page.is_closed(): await page.close()
         if 'context' in locals(): await context.close()
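
The rewritten try block commits navigation early (wait_until='commit'), then races DOMContentLoaded against the engine's result selector and proceeds on whichever fires first. A hedged sketch of the same race, written for Python 3.11+ where asyncio.wait() accepts only Task objects, so the coroutines are wrapped and the losing waiter cancelled (race_first_signal is a hypothetical helper name):

    import asyncio

    async def race_first_signal(page, content_selector: str) -> None:
        # Race DOMContentLoaded against the appearance of the result container;
        # whichever resolves first unblocks scraping.
        tasks = [
            asyncio.ensure_future(page.wait_for_load_state('domcontentloaded', timeout=15000)),
            asyncio.ensure_future(page.wait_for_selector(content_selector, timeout=15000)),
        ]
        done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
        for task in pending:
            task.cancel()  # do not leave the losing waiter running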
 
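And a hypothetical end-to-end call, assuming the named browser was already launched into PLAYWRIGHT_STATE earlier in app.py (code outside this diff) and that asyncio is imported at module level:

    # Hypothetical usage; "firefox" and the query are placeholders.
    result = await perform_web_browse("latest python release", "firefox", "DuckDuckGo")
    if result["status"] == "success":
        print(result["page_title"], result["final_url"])
    else:
        print(result["error_message"])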