KingNish committed on
Commit
c5ecf2c
1 Parent(s): b6dd7da

Update chatbot.py

Browse files
Files changed (1) hide show
  1. chatbot.py +16 -37
chatbot.py CHANGED
@@ -226,11 +226,10 @@ def extract_images_from_msg_list(msg_list):
226
 
227
  from duckduckgo_search import DDGS
228
  from threading import Thread
229
- from queue import Queue
230
  import random
231
-
232
- def get_useragent():
233
- return random.choice(_useragent_list)
234
 
235
  _useragent_list = [
236
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
@@ -239,65 +238,45 @@ _useragent_list = [
239
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
240
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
241
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
242
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
243
- 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/605.1.15',
244
- 'Mozilla/5.0 (iPad; CPU OS 16_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/605.1.15',
245
- 'Mozilla/5.0 (Android 13; Mobile; rv:109.0) Gecko/109.0 Firefox/109.0',
246
- 'Mozilla/5.0 (Linux; Android 13; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36',
247
- 'Mozilla/5.0 (Linux; U; Android 11; en-us; SM-G991U) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/89.0.4387.119 Mobile Safari/537.36',
248
- 'Mozilla/5.0 (Linux; Android 12; SM-G998U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36',
249
- 'Mozilla/5.0 (Linux; Android 13; Pixel 7 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36',
250
- 'Mozilla/5.0 (Linux; Android 12; LM-G900V) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36',
251
- 'Mozilla/5.0 (Linux; Android 11; SM-G975U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36',
252
- 'Mozilla/5.0 (Linux; Android 11; SM-N975U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36',
253
- 'Mozilla/5.0 (Linux; Android 13; SM-S918U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36',
254
- 'Mozilla/5.0 (Linux; Android 13; SM-F936U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36'
255
  ]
256
 
257
- @lru_cache(maxsize=512)
 
 
 
258
  def extract_text_from_webpage(html_content):
259
- """Extracts visible text from HTML content using BeautifulSoup."""
260
  soup = BeautifulSoup(html_content, "html.parser")
261
- for tag in soup(["script", "style", "header", "footer", "nav", "aside",
262
- "figure", "figcaption", "template", "form", "input",
263
- "svg", "canvas", "video", "audio", "head", "meta",
264
- "link", "img", "iframe", "noscript"]):
265
  tag.extract()
266
  return soup.get_text(strip=True)
267
 
268
- def fetch_and_extract(link, max_chars_per_page, queue):
269
- """Fetches webpage content and extracts text in a separate thread."""
270
  try:
271
  webpage = requests.get(link, headers={"User-Agent": get_useragent()})
272
  webpage.raise_for_status()
273
  visible_text = extract_text_from_webpage(webpage.text)
274
  if len(visible_text) > max_chars_per_page:
275
  visible_text = visible_text[:max_chars_per_page] + "..."
276
- queue.put({"link": link, "text": visible_text})
277
  except requests.exceptions.RequestException as e:
278
- queue.put({"link": link, "text": None})
279
  print(f"Error fetching or processing {link}: {e}")
 
280
 
281
- def search(term, max_results=2, max_chars_per_page=8000, max_threads=5):
282
- """Performs a DuckDuckGo search and extracts text from webpages using threads."""
283
  all_results = []
284
  result_block = DDGS().text(term, max_results=max_results)
285
- # Use a queue to store results from threads
286
- queue = Queue()
287
- # Create and start threads for each link
288
  threads = []
289
  for result in result_block:
290
  if 'href' in result:
291
  link = result["href"]
292
- thread = Thread(target=fetch_and_extract, args=(link, max_chars_per_page, queue))
293
  threads.append(thread)
294
  thread.start()
295
- # Wait for all threads to finish
296
  for thread in threads:
297
  thread.join()
298
- # Retrieve results from the queue
299
- while not queue.empty():
300
- all_results.append(queue.get())
301
  return all_results
302
 
303
  # Format the prompt for the language model
 
226
 
227
  from duckduckgo_search import DDGS
228
  from threading import Thread
 
229
  import random
230
+ from bs4 import BeautifulSoup
231
+ from functools import lru_cache
232
+ import requests
233
 
234
  _useragent_list = [
235
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
 
238
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
239
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
240
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
241
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0'
 
 
 
 
 
 
 
 
 
 
 
 
242
  ]
243
 
244
+ def get_useragent():
245
+ return random.choice(_useragent_list)
246
+
247
+ @lru_cache(maxsize=128)
248
  def extract_text_from_webpage(html_content):
 
249
  soup = BeautifulSoup(html_content, "html.parser")
250
+ for tag in soup(["script", "style", "header", "footer", "nav"]):
 
 
 
251
  tag.extract()
252
  return soup.get_text(strip=True)
253
 
254
+ def fetch_and_extract(link, max_chars_per_page):
255
+ """Fetches webpage content and extracts text."""
256
  try:
257
  webpage = requests.get(link, headers={"User-Agent": get_useragent()})
258
  webpage.raise_for_status()
259
  visible_text = extract_text_from_webpage(webpage.text)
260
  if len(visible_text) > max_chars_per_page:
261
  visible_text = visible_text[:max_chars_per_page] + "..."
262
+ return {"link": link, "text": visible_text}
263
  except requests.exceptions.RequestException as e:
 
264
  print(f"Error fetching or processing {link}: {e}")
265
+ return {"link": link, "text": None}
266
 
267
+ def search(term, max_results=2, max_chars_per_page=8000, max_threads=10):
268
+ """Performs a DuckDuckGo search and extracts text from webpages."""
269
  all_results = []
270
  result_block = DDGS().text(term, max_results=max_results)
 
 
 
271
  threads = []
272
  for result in result_block:
273
  if 'href' in result:
274
  link = result["href"]
275
+ thread = Thread(target=lambda: all_results.append(fetch_and_extract(link, max_chars_per_page)))
276
  threads.append(thread)
277
  thread.start()
 
278
  for thread in threads:
279
  thread.join()
 
 
 
280
  return all_results
281
 
282
  # Format the prompt for the language model