broadfield-dev committed on
Commit
0cde5ea
·
verified ·
1 Parent(s): 2f01972

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -34
app.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import os
2
  import re
3
  import urllib.parse
@@ -6,12 +8,10 @@ from typing import Dict, Optional
6
  from itertools import cycle
7
 
8
  # Install playwright if not present
9
- # This is often needed in environments like HF Spaces
10
  if os.getenv("PLAYWRIGHT_INSTALL_RUN", "false").lower() != "true":
11
  os.system("playwright install --with-deps")
12
  os.environ["PLAYWRIGHT_INSTALL_RUN"] = "true"
13
 
14
-
15
  from flask import Flask, request, jsonify
16
  from bs4 import BeautifulSoup, NavigableString
17
  from playwright.async_api import async_playwright
@@ -47,7 +47,6 @@ class CredentialRevolver:
47
  return len(self.proxies)
48
 
49
  PLAYWRIGHT_STATE: Dict = {}
50
- # For Hugging Face Spaces, it's better to use Space secrets for PROXY_LIST
51
  REVOLVER = CredentialRevolver(os.getenv("PROXY_LIST", ""))
52
 
53
  SEARCH_ENGINES = {
@@ -120,40 +119,39 @@ class HTML_TO_MARKDOWN_CONVERTER:
120
  return f"\n\n![{alt}]({full_src})\n\n"
121
  return inner_md
122
 
123
- # --- Core Web Browsing Logic (unchanged) ---
124
- async def perform_web_browse(action: str, query: str, browser_name: str, search_engine_name: str):
125
- browser_key = browser_name.lower()
 
126
  if "playwright" not in PLAYWRIGHT_STATE:
127
  PLAYWRIGHT_STATE["playwright"] = await async_playwright().start()
128
 
129
- # Use a persistent browser instance to avoid re-launching on every call
130
  if browser_key not in PLAYWRIGHT_STATE:
131
  try:
132
  p = PLAYWRIGHT_STATE["playwright"]
133
  browser_map = {'firefox': p.firefox, 'chromium': p.chromium, 'webkit': p.webkit}
134
  browser_launcher = browser_map.get(browser_key)
135
  if not browser_launcher:
136
- raise ValueError(f"Invalid browser name: {browser_name}")
137
- # In some containerized environments, --no-sandbox is needed for chromium
138
  launch_args = ['--no-sandbox'] if browser_key == 'chromium' else []
139
  browser_instance = await browser_launcher.launch(headless=True, args=launch_args)
140
  PLAYWRIGHT_STATE[browser_key] = browser_instance
141
  except Exception as e:
142
  return {"status": "error", "query": query, "error_message": f"Failed to launch '{browser_key}'. Error: {str(e).splitlines()[0]}"}
143
-
144
  browser_instance = PLAYWRIGHT_STATE[browser_key]
145
 
146
  if action == "Scrape URL":
147
  url = query if query.startswith(('http://', 'https://')) else f"http://{query}"
148
  else: # action == "Search"
149
- url_template = SEARCH_ENGINES.get(search_engine_name)
150
  if not url_template:
151
- return {"status": "error", "query": query, "error_message": f"Invalid search engine: '{search_engine_name}'."}
152
  url = url_template.format(query=urllib.parse.quote_plus(query))
153
 
154
  proxy_config = REVOLVER.get_next()
155
  proxy_server_used = proxy_config["server"] if proxy_config else "Direct Connection"
156
-
157
  context_args = {'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36', 'java_script_enabled': True, 'ignore_https_errors': True, 'bypass_csp': True, 'accept_downloads': False}
158
  if proxy_config: context_args['proxy'] = proxy_config
159
 
@@ -163,16 +161,13 @@ async def perform_web_browse(action: str, query: str, browser_name: str, search_
163
  try:
164
  response = await page.goto(url, wait_until='domcontentloaded', timeout=25000)
165
  html_content = await page.content()
166
-
167
  if any(phrase in html_content for phrase in ["unusual traffic", "CAPTCHA", "are you human", "not a robot"]):
168
  raise Exception(f"Anti-bot measure detected on {page.url}. Try another search engine or proxy.")
169
-
170
  final_url, title = page.url, await page.title() or "No Title"
171
  soup = BeautifulSoup(html_content, 'lxml')
172
  converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
173
  markdown_text = converter.convert()
174
  status_code = response.status if response else 0
175
-
176
  return {"status": "success", "query": query, "action": action, "final_url": final_url, "page_title": title, "http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text}
177
  except Exception as e:
178
  error_message = str(e).splitlines()[0]
@@ -187,9 +182,7 @@ async def perform_web_browse(action: str, query: str, browser_name: str, search_
187
 
188
  @app.route('/', methods=['GET'])
189
  def index():
190
- """
191
- Root endpoint to provide API status and usage instructions.
192
- """
193
  return jsonify({
194
  "status": "online",
195
  "message": "Welcome to the Web Browse API!",
@@ -198,46 +191,41 @@ def index():
198
  "payload_format": {
199
  "action": "string (required: 'Search' or 'Scrape URL')",
200
  "query": "string (required: a search term or a full URL)",
201
- "browser_name": "string (optional, default: 'firefox'; options: 'firefox', 'chromium', 'webkit')",
202
- "search_engine_name": "string (optional, default: 'DuckDuckGo'; see code for all options)"
203
  },
204
- "example_curl": """curl -X POST YOUR_SPACE_URL/web_browse -H "Content-Type: application/json" -d '{"action": "Search", "query": "latest news on AI"}'"""
205
  })
206
 
207
  @app.route('/web_browse', methods=['POST'])
208
  def web_browse():
209
- """
210
- API endpoint to perform a web search or scrape a URL.
211
- """
212
  if not request.is_json:
213
  return jsonify({"status": "error", "error_message": "Invalid input: payload must be JSON"}), 400
214
 
215
  data = request.get_json()
216
  action = data.get('action')
217
  query = data.get('query')
218
- browser_name = data.get('browser_name', 'firefox')
219
- search_engine_name = data.get('search_engine_name', 'DuckDuckGo')
 
220
 
221
  if not action or not query:
222
  return jsonify({"status": "error", "error_message": "Missing required parameters: 'action' and 'query' are mandatory."}), 400
223
-
224
  if action not in ["Search", "Scrape URL"]:
225
  return jsonify({"status": "error", "error_message": "Invalid 'action'. Must be 'Search' or 'Scrape URL'."}), 400
226
 
227
  try:
228
- # Use asyncio.run() to execute the async function within the sync Flask route
229
- result = asyncio.run(perform_web_browse(action, query, browser_name, search_engine_name))
230
  response_status_code = 200 if result.get("status") == "success" else 500
231
  return jsonify(result), response_status_code
232
  except Exception as e:
233
- app.logger.error(f"An unexpected server error occurred: {str(e)}")
234
  return jsonify({"status": "error", "query": query, "error_message": f"An unexpected server error occurred: {str(e)}"}), 500
235
 
236
  # --- Main Application Runner ---
237
  if __name__ == "__main__":
238
  port = int(os.environ.get("PORT", 7860))
239
  print(f"Flask server starting on port {port}... {REVOLVER.count()} proxies loaded.")
240
- print(f"API instructions available at GET http://0.0.0.0:{port}/")
241
- print(f"API endpoint available at POST http://0.0.0.0:{port}/web_browse")
242
- # For production, consider using a more robust WSGI server like Gunicorn or Waitress
243
  app.run(host='0.0.0.0', port=port)
 
1
+ # app.py - Your Flask API Server
2
+
3
  import os
4
  import re
5
  import urllib.parse
 
8
  from itertools import cycle
9
 
10
  # Install playwright if not present
 
11
  if os.getenv("PLAYWRIGHT_INSTALL_RUN", "false").lower() != "true":
12
  os.system("playwright install --with-deps")
13
  os.environ["PLAYWRIGHT_INSTALL_RUN"] = "true"
14
 
 
15
  from flask import Flask, request, jsonify
16
  from bs4 import BeautifulSoup, NavigableString
17
  from playwright.async_api import async_playwright
 
47
  return len(self.proxies)
48
 
49
  PLAYWRIGHT_STATE: Dict = {}
 
50
  REVOLVER = CredentialRevolver(os.getenv("PROXY_LIST", ""))
51
 
52
  SEARCH_ENGINES = {
 
119
  return f"\n\n![{alt}]({full_src})\n\n"
120
  return inner_md
121
 
122
+ # --- Core Web Browsing Logic ---
123
+ # UPDATED: The function signature now uses `browser` and `search_engine` for consistency.
124
+ async def perform_web_browse(action: str, query: str, browser: str, search_engine: str):
125
+ browser_key = browser.lower()
126
  if "playwright" not in PLAYWRIGHT_STATE:
127
  PLAYWRIGHT_STATE["playwright"] = await async_playwright().start()
128
 
 
129
  if browser_key not in PLAYWRIGHT_STATE:
130
  try:
131
  p = PLAYWRIGHT_STATE["playwright"]
132
  browser_map = {'firefox': p.firefox, 'chromium': p.chromium, 'webkit': p.webkit}
133
  browser_launcher = browser_map.get(browser_key)
134
  if not browser_launcher:
135
+ raise ValueError(f"Invalid browser name: {browser}")
 
136
  launch_args = ['--no-sandbox'] if browser_key == 'chromium' else []
137
  browser_instance = await browser_launcher.launch(headless=True, args=launch_args)
138
  PLAYWRIGHT_STATE[browser_key] = browser_instance
139
  except Exception as e:
140
  return {"status": "error", "query": query, "error_message": f"Failed to launch '{browser_key}'. Error: {str(e).splitlines()[0]}"}
141
+
142
  browser_instance = PLAYWRIGHT_STATE[browser_key]
143
 
144
  if action == "Scrape URL":
145
  url = query if query.startswith(('http://', 'https://')) else f"http://{query}"
146
  else: # action == "Search"
147
+ url_template = SEARCH_ENGINES.get(search_engine)
148
  if not url_template:
149
+ return {"status": "error", "query": query, "error_message": f"Invalid search engine: '{search_engine}'."}
150
  url = url_template.format(query=urllib.parse.quote_plus(query))
151
 
152
  proxy_config = REVOLVER.get_next()
153
  proxy_server_used = proxy_config["server"] if proxy_config else "Direct Connection"
154
+
155
  context_args = {'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36', 'java_script_enabled': True, 'ignore_https_errors': True, 'bypass_csp': True, 'accept_downloads': False}
156
  if proxy_config: context_args['proxy'] = proxy_config
157
 
 
161
  try:
162
  response = await page.goto(url, wait_until='domcontentloaded', timeout=25000)
163
  html_content = await page.content()
 
164
  if any(phrase in html_content for phrase in ["unusual traffic", "CAPTCHA", "are you human", "not a robot"]):
165
  raise Exception(f"Anti-bot measure detected on {page.url}. Try another search engine or proxy.")
 
166
  final_url, title = page.url, await page.title() or "No Title"
167
  soup = BeautifulSoup(html_content, 'lxml')
168
  converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
169
  markdown_text = converter.convert()
170
  status_code = response.status if response else 0
 
171
  return {"status": "success", "query": query, "action": action, "final_url": final_url, "page_title": title, "http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text}
172
  except Exception as e:
173
  error_message = str(e).splitlines()[0]
 
182
 
183
  @app.route('/', methods=['GET'])
184
  def index():
185
+ """Root endpoint to provide API status and usage instructions."""
 
 
186
  return jsonify({
187
  "status": "online",
188
  "message": "Welcome to the Web Browse API!",
 
191
  "payload_format": {
192
  "action": "string (required: 'Search' or 'Scrape URL')",
193
  "query": "string (required: a search term or a full URL)",
194
+ "browser": "string (optional, default: 'firefox'; options: 'firefox', 'chromium', 'webkit')",
195
+ "search_engine": "string (optional, default: 'DuckDuckGo'; see code for all options)"
196
  },
197
+ "example_curl": """curl -X POST YOUR_SPACE_URL/web_browse -H "Content-Type: application/json" -d '{"action": "Search", "query": "latest news on AI", "browser": "webkit"}'"""
198
  })
199
 
200
  @app.route('/web_browse', methods=['POST'])
201
  def web_browse():
202
+ """API endpoint to perform a web search or scrape a URL."""
 
 
203
  if not request.is_json:
204
  return jsonify({"status": "error", "error_message": "Invalid input: payload must be JSON"}), 400
205
 
206
  data = request.get_json()
207
  action = data.get('action')
208
  query = data.get('query')
209
+ # UPDATED: Reading `browser` and `search_engine` from the payload.
210
+ browser = data.get('browser', 'firefox')
211
+ search_engine = data.get('search_engine', 'DuckDuckGo')
212
 
213
  if not action or not query:
214
  return jsonify({"status": "error", "error_message": "Missing required parameters: 'action' and 'query' are mandatory."}), 400
 
215
  if action not in ["Search", "Scrape URL"]:
216
  return jsonify({"status": "error", "error_message": "Invalid 'action'. Must be 'Search' or 'Scrape URL'."}), 400
217
 
218
  try:
219
+ # UPDATED: Passing the new variable names to the function.
220
+ result = asyncio.run(perform_web_browse(action, query, browser, search_engine))
221
  response_status_code = 200 if result.get("status") == "success" else 500
222
  return jsonify(result), response_status_code
223
  except Exception as e:
224
+ app.logger.error(f"An unexpected server error occurred: {e}", exc_info=True)
225
  return jsonify({"status": "error", "query": query, "error_message": f"An unexpected server error occurred: {str(e)}"}), 500
226
 
227
  # --- Main Application Runner ---
228
  if __name__ == "__main__":
229
  port = int(os.environ.get("PORT", 7860))
230
  print(f"Flask server starting on port {port}... {REVOLVER.count()} proxies loaded.")
 
 
 
231
  app.run(host='0.0.0.0', port=port)