Shreyas094
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -20,6 +20,9 @@ from datetime import datetime
|
|
20 |
import os
|
21 |
from dotenv import load_dotenv
|
22 |
import certifi
|
|
|
|
|
|
|
23 |
|
24 |
# Load environment variables from a .env file
|
25 |
load_dotenv()
|
@@ -88,12 +91,24 @@ def scrape_with_bs4(url, session, max_chars=None):
|
|
88 |
logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
|
89 |
return ""
|
90 |
|
91 |
-
def scrape_with_trafilatura(url, max_chars=None, timeout=5):
|
92 |
try:
|
93 |
response = requests.get(url, timeout=timeout)
|
94 |
response.raise_for_status()
|
95 |
downloaded = response.text
|
96 |
-
content =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
return (content or "")[:max_chars] if max_chars else (content or "")
|
98 |
except Timeout:
|
99 |
logger.error(f"Timeout error while scraping {url} with Trafilatura")
|
@@ -252,7 +267,7 @@ def scrape_full_content(url, scraper="trafilatura", max_chars=3000, timeout=5):
|
|
252 |
else:
|
253 |
content = soup.get_text(strip=True, separator='\n')
|
254 |
else: # trafilatura
|
255 |
-
content = scrape_with_trafilatura(url, max_chars, timeout)
|
256 |
|
257 |
# Limit the content to max_chars
|
258 |
return content[:max_chars] if content else ""
|
@@ -378,7 +393,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
|
|
378 |
logger.warning(f"No more results returned from SearXNG on page {page}.")
|
379 |
break
|
380 |
|
381 |
-
|
382 |
if len(scraped_content) >= num_results:
|
383 |
break
|
384 |
|
@@ -415,7 +430,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
|
|
415 |
config = use_config()
|
416 |
config.set("DEFAULT", "USER_AGENT", ua)
|
417 |
|
418 |
-
content = scrape_with_trafilatura(url, max_chars, timeout=timeout)
|
419 |
|
420 |
if content:
|
421 |
break
|
|
|
20 |
import os
|
21 |
from dotenv import load_dotenv
|
22 |
import certifi
|
23 |
+
from bs4 import BeautifulSoup
|
24 |
+
from trafilatura import extract
|
25 |
+
from trafilatura.htmlprocessing import convert_tree
|
26 |
|
27 |
# Load environment variables from a .env file
|
28 |
load_dotenv()
|
|
|
91 |
logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
|
92 |
return ""
|
93 |
|
94 |
+
def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
|
95 |
try:
|
96 |
response = requests.get(url, timeout=timeout)
|
97 |
response.raise_for_status()
|
98 |
downloaded = response.text
|
99 |
+
content = ""
|
100 |
+
if use_beautifulsoup:
|
101 |
+
soup = BeautifulSoup(downloaded, "lxml")
|
102 |
+
lxml_tree = convert_tree(soup)[0]
|
103 |
+
content = extract(lxml_tree, include_comments=False, include_tables=True, no_fallback=False)
|
104 |
+
|
105 |
+
# Fallback mechanism: if BeautifulSoup didn't yield results, try without it
|
106 |
+
if not content and use_beautifulsoup:
|
107 |
+
logger.info("BeautifulSoup method failed to extract content. Trying without BeautifulSoup.")
|
108 |
+
content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
|
109 |
+
# If still no content, use the direct method
|
110 |
+
if not content:
|
111 |
+
content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
|
112 |
return (content or "")[:max_chars] if max_chars else (content or "")
|
113 |
except Timeout:
|
114 |
logger.error(f"Timeout error while scraping {url} with Trafilatura")
|
|
|
267 |
else:
|
268 |
content = soup.get_text(strip=True, separator='\n')
|
269 |
else: # trafilatura
|
270 |
+
content = scrape_with_trafilatura(url, max_chars, timeout, use_beautifulsoup=True)
|
271 |
|
272 |
# Limit the content to max_chars
|
273 |
return content[:max_chars] if content else ""
|
|
|
393 |
logger.warning(f"No more results returned from SearXNG on page {page}.")
|
394 |
break
|
395 |
|
396 |
+
for result in results:
|
397 |
if len(scraped_content) >= num_results:
|
398 |
break
|
399 |
|
|
|
430 |
config = use_config()
|
431 |
config.set("DEFAULT", "USER_AGENT", ua)
|
432 |
|
433 |
+
content = scrape_with_trafilatura(url, max_chars, timeout=timeout, use_beautifulsoup=True)
|
434 |
|
435 |
if content:
|
436 |
break
|