Shreyas094 committed
Commit fb5d60f · verified · 1 Parent(s): eb6b9ef

Update app.py

Files changed (1)
  1. app.py +20 -5
app.py CHANGED
@@ -20,6 +20,9 @@ from datetime import datetime
 import os
 from dotenv import load_dotenv
 import certifi
+from bs4 import BeautifulSoup
+from trafilatura import extract
+from trafilatura.htmlprocessing import convert_tree
 
 # Load environment variables from a .env file
 load_dotenv()
@@ -88,12 +91,24 @@ def scrape_with_bs4(url, session, max_chars=None):
         logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
         return ""
 
-def scrape_with_trafilatura(url, max_chars=None, timeout=5):
+def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
     try:
         response = requests.get(url, timeout=timeout)
         response.raise_for_status()
         downloaded = response.text
-        content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
+        content = ""
+        if use_beautifulsoup:
+            soup = BeautifulSoup(downloaded, "lxml")
+            lxml_tree = convert_tree(soup)[0]
+            content = extract(lxml_tree, include_comments=False, include_tables=True, no_fallback=False)
+
+        # Fallback mechanism: if BeautifulSoup didn't yield results, try without it
+        if not content and use_beautifulsoup:
+            logger.info("BeautifulSoup method failed to extract content. Trying without BeautifulSoup.")
+            content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
+        # If still no content, use the direct method
+        if not content:
+            content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
         return (content or "")[:max_chars] if max_chars else (content or "")
     except Timeout:
         logger.error(f"Timeout error while scraping {url} with Trafilatura")
@@ -252,7 +267,7 @@ def scrape_full_content(url, scraper="trafilatura", max_chars=3000, timeout=5):
         else:
             content = soup.get_text(strip=True, separator='\n')
     else:  # trafilatura
-        content = scrape_with_trafilatura(url, max_chars, timeout)
+        content = scrape_with_trafilatura(url, max_chars, timeout, use_beautifulsoup=True)
 
     # Limit the content to max_chars
     return content[:max_chars] if content else ""
@@ -378,7 +393,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
             logger.warning(f"No more results returned from SearXNG on page {page}.")
             break
 
-        for result in results:
+        for result in results:
             if len(scraped_content) >= num_results:
                 break
 
@@ -415,7 +430,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
             config = use_config()
            config.set("DEFAULT", "USER_AGENT", ua)
 
-            content = scrape_with_trafilatura(url, max_chars, timeout=timeout)
+            content = scrape_with_trafilatura(url, max_chars, timeout=timeout, use_beautifulsoup=True)
 
            if content:
                break
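For reference, the idea behind this change is to parse the downloaded HTML with BeautifulSoup before handing it to trafilatura, and to fall back to trafilatura's own parsing when that first pass yields nothing. Below is a minimal, self-contained sketch of that flow; it is an illustration, not the committed code. It passes the re-serialized soup to trafilatura.extract() as a string instead of using convert_tree, the function name fetch_main_text and its defaults are placeholders, and it assumes the requests, beautifulsoup4, lxml, and trafilatura packages are installed.

# Illustrative sketch of the BeautifulSoup-then-trafilatura flow
# (function name and defaults are placeholders, not the app's API).
import logging

import requests
from bs4 import BeautifulSoup
from trafilatura import extract

logger = logging.getLogger(__name__)

def fetch_main_text(url, max_chars=3000, timeout=5):
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    downloaded = response.text

    # First pass: normalise the markup with BeautifulSoup (lxml parser assumed
    # available), then hand the re-serialized HTML to trafilatura as a string.
    soup = BeautifulSoup(downloaded, "lxml")
    content = extract(str(soup), include_comments=False,
                      include_tables=True, no_fallback=False)

    # Fallback: if the BeautifulSoup round-trip yields nothing, let
    # trafilatura work on the raw download directly.
    if not content:
        logger.info("BeautifulSoup pass returned no content; retrying on raw HTML.")
        content = extract(downloaded, include_comments=False,
                          include_tables=True, no_fallback=False)

    return (content or "")[:max_chars]

if __name__ == "__main__":
    print(fetch_main_text("https://example.com"))

The sketch serializes the soup back to a string to stay on trafilatura's documented string input, whereas the committed code converts the soup to an lxml tree via convert_tree(soup)[0] and passes that element to extract(); both approaches aim at the same BeautifulSoup-first extraction with a raw-HTML fallback.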