Shreyas094 committed
Commit 6773bde · verified · 1 Parent(s): 69ddd17

Update app.py

Files changed (1)
  1. app.py +73 -46
app.py CHANGED
@@ -20,10 +20,15 @@ from datetime import datetime
 import os
 from dotenv import load_dotenv
 import certifi
-from bs4 import BeautifulSoup
 import requests
-from trafilatura.settings import use_config
-from urllib.request import urlopen, Request
+import scrapy
+from scrapy.crawler import CrawlerProcess
+from scrapy import signals
+from scrapy.signalmanager import dispatcher
+from scrapy.utils.log import configure_logging
+from newspaper import Article
+
+
 
 # Load environment variables from a .env file
 load_dotenv()
@@ -37,7 +42,7 @@ SEARXNG_URL = 'https://shreyas094-searxng-local.hf.space/search'
 SEARXNG_KEY = 'f9f07f93b37b8483aadb5ba717f556f3a4ac507b281b4ca01e6c6288aa3e3ae5'
 
 # Use the environment variable
-HF_TOKEN = os.getenv('HF_TOKEN')
+HF_TOKEN = os.getenv("HF_TOKEN")
 client = InferenceClient(
     "mistralai/Mistral-Nemo-Instruct-2407",
     token=HF_TOKEN,
@@ -74,6 +79,51 @@ def is_valid_url(url):
     except ValueError:
         return False
 
+class NewsSpider(scrapy.Spider):
+    name = 'news_spider'
+
+    def __init__(self, url=None, *args, **kwargs):
+        super(NewsSpider, self).__init__(*args, **kwargs)
+        self.start_urls = [url] if url else []
+
+    def parse(self, response):
+        content = ' '.join(response.css('p::text').getall())
+        self.logger.info(f"Scraped content length: {len(content)}")
+        return {'content': content}
+
+def scrape_with_scrapy(url, timeout=30):
+    logger.info(f"Starting to scrape with Scrapy: {url}")
+    configure_logging(install_root_handler=False)
+    logging.getLogger('scrapy').setLevel(logging.WARNING)
+
+    results = []
+
+    def spider_results(signal, sender, item, response, spider):
+        results.append(item)
+
+    process = CrawlerProcess(settings={
+        'LOG_ENABLED': True,
+        'LOG_LEVEL': 'WARNING',
+        'DOWNLOAD_TIMEOUT': timeout
+    })
+
+    dispatcher.connect(spider_results, signal=signals.item_scraped)
+
+    process.crawl(NewsSpider, url=url)
+    process.start()
+
+    # Get the content from results
+    if results:
+        return results[0]['content']
+    return ''
+
+def scrape_with_newspaper(url):
+    logger.info(f"Starting to scrape with Newspaper3k: {url}")
+    article = Article(url)
+    article.download()
+    article.parse()
+    return article.text
+
 def scrape_with_bs4(url, session, max_chars=None):
     try:
         response = session.get(url, timeout=5)
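A note on the new Scrapy path: CrawlerProcess.start() runs Twisted's reactor, which cannot be restarted within a single Python process, so a second call to scrape_with_scrapy in the same session will typically raise ReactorNotRestartable. One common workaround is to isolate each crawl in a child process; a minimal sketch (scrape_with_scrapy_isolated and _crawl_worker are hypothetical helpers, not part of this commit):

import multiprocessing

def _crawl_worker(url, timeout, queue):
    # A fresh process gets a fresh Twisted reactor, so start() is safe here.
    queue.put(scrape_with_scrapy(url, timeout))

def scrape_with_scrapy_isolated(url, timeout=30):
    # Hypothetical wrapper: run each crawl in its own short-lived process.
    queue = multiprocessing.Queue()
    worker = multiprocessing.Process(target=_crawl_worker, args=(url, timeout, queue))
    worker.start()
    worker.join(timeout + 5)  # allow a little slack beyond DOWNLOAD_TIMEOUT
    if worker.is_alive():
        worker.terminate()
        return ''
    return queue.get() if not queue.empty() else ''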
@@ -248,7 +298,8 @@ Remember to focus on financial aspects and implications in your assessment and s
         response = llm_client.chat_completion(
             messages=messages,
             max_tokens=150,
-            temperature=temperature
+            temperature=temperature,
+            top_p=0.9
         )
         return response.choices[0].message.content.strip()
     except Exception as e:
@@ -272,8 +323,15 @@ def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
                 content = main_content.get_text(strip=True, separator='\n')
             else:
                 content = soup.get_text(strip=True, separator='\n')
-        else:  # trafilatura
+        elif scraper == "trafilatura":
             content = scrape_with_trafilatura(url, max_chars, timeout, use_beautifulsoup=True)
+        elif scraper == "scrapy":
+            content = scrape_with_scrapy(url, timeout)
+        elif scraper == "newspaper":
+            content = scrape_with_newspaper(url)
+        else:
+            logger.error(f"Unknown scraper: {scraper}")
+            return ""
 
         # Limit the content to max_chars
         return content[:max_chars] if content else ""
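The dropdown values now map one-to-one onto this elif chain, and an unrecognized name is a logged error plus an empty string rather than a silent fall-through to trafilatura as in the old else branch. A quick illustrative smoke test of the dispatch (the URL and limits are placeholder values, not part of the commit):

test_url = "https://example.com/some-article"  # placeholder URL

for method in ["bs4", "trafilatura", "scrapy", "newspaper"]:
    text = scrape_full_content(test_url, scraper=method, max_chars=1500, timeout=5)
    print(f"{method}: {len(text)} chars")

# A typo in the scraper name now yields "" instead of trafilatura output.
assert scrape_full_content(test_url, scraper="lxml") == ""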
@@ -314,7 +372,10 @@ Your response should be detailed, informative, accurate, and directly relevant t
         response = llm_client.chat_completion(
             messages=messages,
             max_tokens=10000,
-            temperature=temperature
+            temperature=temperature,
+            frequency_penalty=1.1,
+            top_p=0.9,
+            stream=True
         )
         return response.choices[0].message.content.strip()
     except Exception as e:
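One thing to watch with stream=True: InferenceClient.chat_completion then returns an iterator of chunks rather than a single response object, so the unchanged response.choices[0].message.content access would need to aggregate the streamed deltas instead, roughly like this (a sketch, not part of the commit):

# Sketch: consume the stream instead of indexing a single response object.
answer = "".join(
    chunk.choices[0].delta.content or ""  # content can be None on some chunks
    for chunk in response
)
return answer.strip()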
@@ -408,51 +469,17 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
             try:
                 logger.info(f"Scraping content from: {url}")
 
-                # Implement a retry mechanism with different user agents
-                user_agents = [
-                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
-                    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-                ]
-
-                content = ""
-                for ua in user_agents:
-                    try:
-                        if scraper == "bs4":
-                            session.headers.update({'User-Agent': ua})
-                            content = scrape_with_bs4(url, session, max_chars)
-                        else:  # trafilatura
-                            # Use urllib to handle custom headers for trafilatura
-                            req = Request(url, headers={'User-Agent': ua})
-                            with urlopen(req) as response:
-                                downloaded = response.read()
-
-                            # Configure trafilatura to use a specific user agent
-                            config = use_config()
-                            config.set("DEFAULT", "USER_AGENT", ua)
-
-                            content = scrape_with_trafilatura(url, max_chars, timeout=timeout, use_beautifulsoup=True)
-
-                        if content:
-                            break
-                    except requests.exceptions.HTTPError as e:
-                        if e.response.status_code == 403:
-                            logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
-                            continue
-                        else:
-                            raise
-                    except Exception as e:
-                        logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
-                        continue
+                # MODIFY: Remove the user agent loop and use a single scraping method
+                content = scrape_full_content(url, scraper, max_chars, timeout)
 
                 if not content:
-                    logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
+                    logger.warning(f"Failed to scrape content from {url}")
                     continue
 
                 scraped_content.append({
                     "title": title,
                     "url": url,
-                    "content": content,  # No need to slice here as it's already limited
+                    "content": content,
                     "scraper": scraper
                 })
                 logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
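With the user-agent rotation gone, a site that rejects the default client now simply yields no content for that URL and gets skipped. If that turns out to matter, a thin retry shim around the unified call is one option (hypothetical helper, not part of this commit):

def scrape_full_content_with_retry(url, scraper, max_chars, timeout, attempts=2):
    # Hypothetical shim: re-invoke the unified scraper when it comes back empty.
    for _ in range(attempts):
        content = scrape_full_content(url, scraper, max_chars, timeout)
        if content:
            return content
    return ""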
@@ -558,7 +585,7 @@ iface = gr.ChatInterface(
     description="Enter your query, and I'll search the web for the most recent and relevant financial news, scrape content, and provide summarized results.",
     additional_inputs=[
         gr.Slider(5, 20, value=10, step=1, label="Number of initial results"),
-        gr.Dropdown(["bs4", "trafilatura"], value="bs4", label="Scraping Method"),
+        gr.Dropdown(["bs4", "trafilatura", "scrapy", "newspaper"], value="bs4", label="Scraping Method"),
         gr.Slider(500, 10000, value=1500, step=100, label="Max characters to retrieve"),
         gr.Dropdown(["", "day", "week", "month", "year"], value="year", label="Time Range"),
         gr.Dropdown(["all", "en", "fr", "de", "es", "it", "nl", "pt", "pl", "ru", "zh"], value="en", label="Language"),
 