Spaces:
Running
Running
Update chatbot.py
Browse files- chatbot.py +25 -42
chatbot.py
CHANGED
@@ -224,47 +224,20 @@ def extract_images_from_msg_list(msg_list):
|
|
224 |
all_images.append(c_)
|
225 |
return all_images
|
226 |
|
227 |
-
|
228 |
-
from threading import Thread
|
229 |
-
import random
|
230 |
-
from bs4 import BeautifulSoup
|
231 |
-
from functools import lru_cache
|
232 |
-
import requests
|
233 |
-
|
234 |
-
_useragent_list = [
|
235 |
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
|
236 |
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
|
237 |
-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
|
238 |
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
|
239 |
-
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
|
240 |
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
|
241 |
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0'
|
242 |
-
]
|
243 |
-
|
244 |
-
def get_useragent():
|
245 |
-
return random.choice(_useragent_list)
|
246 |
-
|
247 |
@lru_cache(maxsize=128)
|
248 |
def extract_text_from_webpage(html_content):
|
|
|
249 |
soup = BeautifulSoup(html_content, "html.parser")
|
|
|
250 |
for tag in soup(["script", "style", "header", "footer", "nav"]):
|
251 |
tag.extract()
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
"""Fetches webpage content and extracts text."""
|
256 |
-
try:
|
257 |
-
webpage = requests.get(link, headers={"User-Agent": get_useragent()})
|
258 |
-
webpage.raise_for_status()
|
259 |
-
visible_text = extract_text_from_webpage(webpage.text)
|
260 |
-
if len(visible_text) > max_chars_per_page:
|
261 |
-
visible_text = visible_text[:max_chars_per_page] + "..."
|
262 |
-
return {"link": link, "text": visible_text}
|
263 |
-
except requests.exceptions.RequestException as e:
|
264 |
-
return {"link": link, "text": None}
|
265 |
|
266 |
# Perform a Google search and return the results
|
267 |
-
def search(term, num_results=
|
268 |
"""Performs a Google search and returns the results."""
|
269 |
escaped_term = urllib.parse.quote_plus(term)
|
270 |
start = 0
|
@@ -295,13 +268,22 @@ def search(term, num_results=3, lang="en", timeout=5, safe="active", ssl_verify=
|
|
295 |
continue
|
296 |
for result in result_block:
|
297 |
link = result.find("a", href=True)
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
return all_results
|
306 |
|
307 |
# Format the prompt for the language model
|
@@ -330,7 +312,7 @@ def update_history(answer="", question=""):
|
|
330 |
return history
|
331 |
|
332 |
# Define a function for model inference
|
333 |
-
@spaces.GPU(duration=
|
334 |
def model_inference(
|
335 |
user_prompt,
|
336 |
chat_history,
|
@@ -390,6 +372,7 @@ def model_inference(
|
|
390 |
output += response.token.text
|
391 |
yield output
|
392 |
update_history(output, user_prompt)
|
|
|
393 |
return
|
394 |
else:
|
395 |
if user_prompt["text"].strip() == "" and not user_prompt["files"]:
|
|
|
224 |
all_images.append(c_)
|
225 |
return all_images
|
226 |
|
227 |
+
# Perform a Google search and return the results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
@lru_cache(maxsize=128)
|
229 |
def extract_text_from_webpage(html_content):
|
230 |
+
"""Extracts visible text from HTML content using BeautifulSoup."""
|
231 |
soup = BeautifulSoup(html_content, "html.parser")
|
232 |
+
# Remove unwanted tags
|
233 |
for tag in soup(["script", "style", "header", "footer", "nav"]):
|
234 |
tag.extract()
|
235 |
+
# Get the remaining visible text
|
236 |
+
visible_text = soup.get_text(strip=True)
|
237 |
+
return visible_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
|
239 |
# Perform a Google search and return the results
|
240 |
+
def search(term, num_results=2, lang="en", advanced=True, timeout=5, safe="active", ssl_verify=None):
|
241 |
"""Performs a Google search and returns the results."""
|
242 |
escaped_term = urllib.parse.quote_plus(term)
|
243 |
start = 0
|
|
|
268 |
continue
|
269 |
for result in result_block:
|
270 |
link = result.find("a", href=True)
|
271 |
+
if link:
|
272 |
+
link = link["href"]
|
273 |
+
try:
|
274 |
+
webpage = session.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
|
275 |
+
webpage.raise_for_status()
|
276 |
+
visible_text = extract_text_from_webpage(webpage.text)
|
277 |
+
# Truncate text if it's too long
|
278 |
+
if len(visible_text) > max_chars_per_page:
|
279 |
+
visible_text = visible_text[:max_chars_per_page] + "..."
|
280 |
+
all_results.append({"link": link, "text": visible_text})
|
281 |
+
except requests.exceptions.RequestException as e:
|
282 |
+
print(f"Error fetching or processing {link}: {e}")
|
283 |
+
all_results.append({"link": link, "text": None})
|
284 |
+
else:
|
285 |
+
all_results.append({"link": None, "text": None})
|
286 |
+
start += len(result_block)
|
287 |
return all_results
|
288 |
|
289 |
# Format the prompt for the language model
|
|
|
312 |
return history
|
313 |
|
314 |
# Define a function for model inference
|
315 |
+
@spaces.GPU(duration=30, queue=False)
|
316 |
def model_inference(
|
317 |
user_prompt,
|
318 |
chat_history,
|
|
|
372 |
output += response.token.text
|
373 |
yield output
|
374 |
update_history(output, user_prompt)
|
375 |
+
print(history)
|
376 |
return
|
377 |
else:
|
378 |
if user_prompt["text"].strip() == "" and not user_prompt["files"]:
|