Shreyas094
committed on
Update app.py

app.py CHANGED
@@ -20,10 +20,15 @@ from datetime import datetime
 import os
 from dotenv import load_dotenv
 import certifi
-from bs4 import BeautifulSoup
 import requests
-
-from …
+import scrapy
+from scrapy.crawler import CrawlerProcess
+from scrapy import signals
+from scrapy.signalmanager import dispatcher
+from scrapy.utils.log import configure_logging
+from newspaper import Article
+
+
 
 # Load environment variables from a .env file
 load_dotenv()
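Note: the added imports assume the scrapy and newspaper3k packages are installed (from newspaper import Article is provided by newspaper3k, not the older Python 2 newspaper distribution), so both would need to be added to the Space's requirements.txt for this commit to run.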
@@ -37,7 +42,7 @@ SEARXNG_URL = 'https://shreyas094-searxng-local.hf.space/search'
 SEARXNG_KEY = 'f9f07f93b37b8483aadb5ba717f556f3a4ac507b281b4ca01e6c6288aa3e3ae5'
 
 # Use the environment variable
-HF_TOKEN = os.getenv(…
+HF_TOKEN = os.getenv("HF_TOKEN")
 client = InferenceClient(
     "mistralai/Mistral-Nemo-Instruct-2407",
     token=HF_TOKEN,
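Since os.getenv("HF_TOKEN") returns None when the variable is unset, the client would silently fall back to unauthenticated Inference API calls. A minimal fail-fast sketch (the error message is illustrative, not part of this commit):

    HF_TOKEN = os.getenv("HF_TOKEN")
    if not HF_TOKEN:
        # Surface the misconfiguration at startup instead of a 401 at request time
        raise RuntimeError("HF_TOKEN is not set; add it to .env or the Space secrets")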
@@ -74,6 +79,51 @@ def is_valid_url(url):
     except ValueError:
         return False
 
+class NewsSpider(scrapy.Spider):
+    name = 'news_spider'
+
+    def __init__(self, url=None, *args, **kwargs):
+        super(NewsSpider, self).__init__(*args, **kwargs)
+        self.start_urls = [url] if url else []
+
+    def parse(self, response):
+        content = ' '.join(response.css('p::text').getall())
+        self.logger.info(f"Scraped content length: {len(content)}")
+        return {'content': content}
+
+def scrape_with_scrapy(url, timeout=30):
+    logger.info(f"Starting to scrape with Scrapy: {url}")
+    configure_logging(install_root_handler=False)
+    logging.getLogger('scrapy').setLevel(logging.WARNING)
+
+    results = []
+
+    def spider_results(signal, sender, item, response, spider):
+        results.append(item)
+
+    process = CrawlerProcess(settings={
+        'LOG_ENABLED': True,
+        'LOG_LEVEL': 'WARNING',
+        'DOWNLOAD_TIMEOUT': timeout
+    })
+
+    dispatcher.connect(spider_results, signal=signals.item_scraped)
+
+    process.crawl(NewsSpider, url=url)
+    process.start()
+
+    # Get the content from results
+    if results:
+        return results[0]['content']
+    return ''
+
+def scrape_with_newspaper(url):
+    logger.info(f"Starting to scrape with Newspaper3k: {url}")
+    article = Article(url)
+    article.download()
+    article.parse()
+    return article.text
+
 def scrape_with_bs4(url, session, max_chars=None):
     try:
         response = session.get(url, timeout=5)
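A caveat with scrape_with_scrapy as written: CrawlerProcess.start() runs Scrapy's Twisted reactor, which cannot be restarted within one Python process, so a second call in the same app session raises ReactorNotRestartable. A sketch of one common workaround, running each crawl in a child process so the reactor is fresh every time (the helper names are illustrative, not part of the commit):

    import multiprocessing

    def _crawl_to_queue(url, timeout, queue):
        # Runs in a child process, so scrape_with_scrapy gets a fresh reactor
        queue.put(scrape_with_scrapy(url, timeout))

    def scrape_with_scrapy_once(url, timeout=30):
        queue = multiprocessing.Queue()
        proc = multiprocessing.Process(target=_crawl_to_queue, args=(url, timeout, queue))
        proc.start()
        proc.join(timeout + 5)  # small slack beyond Scrapy's DOWNLOAD_TIMEOUT
        return queue.get() if not queue.empty() else ''

Similarly, scrape_with_newspaper has no error handling: a failed download() surfaces as an ArticleException once parse() runs, so callers expecting an empty string on failure would need to catch it.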
@@ -248,7 +298,8 @@ Remember to focus on financial aspects and implications in your assessment and s
         response = llm_client.chat_completion(
             messages=messages,
             max_tokens=150,
-            temperature=temperature
+            temperature=temperature,
+            top_p=0.9
         )
         return response.choices[0].message.content.strip()
     except Exception as e:
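For context on the new parameter: top_p=0.9 enables nucleus sampling, limiting each step to the smallest set of tokens whose cumulative probability reaches 0.9, so together with temperature it bounds how much the summary output can vary.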
@@ -272,8 +323,15 @@ def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
             content = main_content.get_text(strip=True, separator='\n')
         else:
             content = soup.get_text(strip=True, separator='\n')
-    else:  # trafilatura
+    elif scraper == "trafilatura":
         content = scrape_with_trafilatura(url, max_chars, timeout, use_beautifulsoup=True)
+    elif scraper == "scrapy":
+        content = scrape_with_scrapy(url, timeout)
+    elif scraper == "newspaper":
+        content = scrape_with_newspaper(url)
+    else:
+        logger.error(f"Unknown scraper: {scraper}")
+        return ""
 
     # Limit the content to max_chars
     return content[:max_chars] if content else ""
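A hypothetical call showing how the extended dispatch is exercised (the URL is illustrative):

    # "scrapy" and "newspaper" now route to the new helpers; an unrecognized
    # scraper name is logged and yields "" instead of raising.
    text = scrape_full_content("https://example.com/article", scraper="newspaper", max_chars=1500)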
@@ -314,7 +372,10 @@ Your response should be detailed, informative, accurate, and directly relevant t
         response = llm_client.chat_completion(
             messages=messages,
             max_tokens=10000,
-            temperature=temperature
+            temperature=temperature,
+            frequency_penalty=1.1,
+            top_p=0.9,
+            stream=True
         )
         return response.choices[0].message.content.strip()
     except Exception as e:
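Note that with stream=True, InferenceClient.chat_completion returns an iterator of chunks rather than a single completion object, so the response.choices[0].message.content access on the next line would fail. A sketch of consuming the stream instead, keeping this commit's parameters:

    response = llm_client.chat_completion(
        messages=messages,
        max_tokens=10000,
        temperature=temperature,
        frequency_penalty=1.1,
        top_p=0.9,
        stream=True,
    )
    # Each streamed chunk carries an incremental delta; join them into the full text
    return "".join(chunk.choices[0].delta.content or "" for chunk in response).strip()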
@@ -408,51 +469,17 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
         try:
             logger.info(f"Scraping content from: {url}")
 
-            # …
-            user_agents = [
-                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
-                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-            ]
-
-            content = ""
-            for ua in user_agents:
-                try:
-                    if scraper == "bs4":
-                        session.headers.update({'User-Agent': ua})
-                        content = scrape_with_bs4(url, session, max_chars)
-                    else:  # trafilatura
-                        # Use urllib to handle custom headers for trafilatura
-                        req = Request(url, headers={'User-Agent': ua})
-                        with urlopen(req) as response:
-                            downloaded = response.read()
-
-                        # Configure trafilatura to use a specific user agent
-                        config = use_config()
-                        config.set("DEFAULT", "USER_AGENT", ua)
-
-                        content = scrape_with_trafilatura(url, max_chars, timeout=timeout, use_beautifulsoup=True)
-
-                    if content:
-                        break
-                except requests.exceptions.HTTPError as e:
-                    if e.response.status_code == 403:
-                        logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
-                        continue
-                    else:
-                        raise
-                except Exception as e:
-                    logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
-                    continue
+            # MODIFY: Remove the user agent loop and use a single scraping method
+            content = scrape_full_content(url, scraper, max_chars, timeout)
 
             if not content:
-                logger.warning(f"Failed to scrape content from {url}…
+                logger.warning(f"Failed to scrape content from {url}")
                 continue
 
             scraped_content.append({
                 "title": title,
                 "url": url,
-                "content": content,
+                "content": content,
                 "scraper": scraper
             })
             logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
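With the user-agent rotation removed, scraper selection and request headers now live entirely inside scrape_full_content, so any per-site failure (including the 403s the old loop retried) surfaces here only as empty content and the warning above.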
@@ -558,7 +585,7 @@ iface = gr.ChatInterface(
     description="Enter your query, and I'll search the web for the most recent and relevant financial news, scrape content, and provide summarized results.",
     additional_inputs=[
         gr.Slider(5, 20, value=10, step=1, label="Number of initial results"),
-        gr.Dropdown(["bs4", "trafilatura"], value="bs4", label="Scraping Method"),
+        gr.Dropdown(["bs4", "trafilatura", "scrapy", "newspaper"], value="bs4", label="Scraping Method"),
         gr.Slider(500, 10000, value=1500, step=100, label="Max characters to retrieve"),
         gr.Dropdown(["", "day", "week", "month", "year"], value="year", label="Time Range"),
         gr.Dropdown(["all", "en", "fr", "de", "es", "it", "nl", "pt", "pl", "ru", "zh"], value="en", label="Language"),
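gr.ChatInterface passes additional_inputs values positionally after the message and chat history, so the value chosen in this dropdown arrives as the scraper argument of search_and_scrape, which is how "scrapy" and "newspaper" reach the scrape_full_content dispatch shown above.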