Create app.py
app.py
ADDED
@@ -0,0 +1,190 @@
import feedparser
import urllib.parse
import newspaper
import functools
from transformers import pipeline, BartForConditionalGeneration, BartTokenizer
from sentence_transformers import SentenceTransformer, util
from datetime import datetime
import pandas as pd
import time
import gradio as gr
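
# Assumed third-party packages (not pinned in this commit): feedparser,
# newspaper3k, transformers, sentence-transformers, torch, pandas,
# openpyxl (used by pd.ExcelWriter), and gradio.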

# Define sentiment analysis pipeline
sentiment_analysis = pipeline("sentiment-analysis", model="ProsusAI/finbert")

# Load Sentence Transformer model
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Load BART model and tokenizer for detailed news summary
bart_model_name = "facebook/bart-large-cnn"
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name)
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)

# Cache for storing fetched articles
article_cache = {}
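
# Note: ProsusAI/finbert returns lowercase labels ('positive', 'negative',
# 'neutral'); the label mapping in fetch_and_analyze_news_entry relies on this.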

def fetch_article(url):
    """Fetch article text from a URL, caching the result by URL."""
    if url not in article_cache:
        article = newspaper.Article(url)
        article.download()
        article.parse()
        article_cache[url] = article.text
    return article_cache[url]
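
# fetch_article relies on newspaper3k; paywalled or JavaScript-heavy pages
# typically raise during download()/parse(), which callers handle by falling
# back to headline-only analysis.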

def fetch_and_analyze_news_entry(entry, company_name, company_ticker, location):
    """Fetch and analyze sentiment for a single news entry."""
    title = entry.title
    url = entry.link
    domain = urllib.parse.urlparse(url).netloc  # Extract domain from URL
    publishing_date = entry.published_parsed  # struct_time; may be missing in some feeds

    # Analyze headline sentiment regardless of article text availability
    try:
        label, score = analyze_sentiment(title)
        sentiment_label = "Positive" if label == "positive" else "Negative" if label == "negative" else "Neutral"
    except Exception as e:
        print(f"Error analyzing sentiment for title: {title}. Error: {e}")
        sentiment_label = "Unknown"

    result = {
        "title": title,
        "url": url,
        "domain": domain,  # Include domain in the result
        "location": location,  # Include location in the result
        "publishing_date": datetime.fromtimestamp(time.mktime(publishing_date)).strftime("%Y-%m-%d %H:%M:%S") if publishing_date else "Unknown",  # Convert struct_time to a readable date
        "sentiment": sentiment_label,
        "detailed_summary": "Paywall Detected",  # Overwritten below if the article text is readable
        "similarity_score": calculate_similarity(company_name, company_ticker, title)  # Similarity based on the title
    }

    try:
        # Fetch article text using the cache
        article_text = fetch_article(url)
    except Exception as e:
        # Could not download/parse the article (often a paywall); keep the placeholder summary
        print(f"Error fetching article at URL: {url}. Error: {e}")
        return result

    # Generate detailed news summary using the BART model
    result["detailed_summary"] = news_detailed(article_text)
    return result
|
82 |
+
|
83 |
+
def fetch_and_analyze_news(company_name, company_ticker, event_name, start_date=None, end_date=None, location=None, num_news=5, include_domains=None, exclude_domains=None):
|
84 |
+
"""Fetch and analyze news entries."""
|
85 |
+
# Constructing the Google News RSS feed URL
|
86 |
+
query_name = f"{company_name} {event_name} {location}"
|
87 |
+
|
88 |
+
# Add date range to the query if start_date and end_date are provided
|
89 |
+
if start_date and end_date:
|
90 |
+
query_name += f" after:{start_date} before:{end_date}"
|
91 |
+
|
92 |
+
# Add domain suggestions and exclusions to the query
|
93 |
+
if include_domains:
|
94 |
+
include_domains_query = " OR ".join(f"site:{domain.strip()}" for domain in include_domains)
|
95 |
+
query_name += f" {include_domains_query}"
|
96 |
+
|
97 |
+
if exclude_domains:
|
98 |
+
exclude_domains_query = " ".join(f"-site:{domain.strip()}" for domain in exclude_domains)
|
99 |
+
query_name += f" {exclude_domains_query}"
|
100 |
+
|
101 |
+
encoded_query_name = urllib.parse.quote(query_name)
|
102 |
+
rss_url_name = f"https://news.google.com/rss/search?q={encoded_query_name}"
|
103 |
+
|
104 |
+
# Parsing the RSS feed for company name
|
105 |
+
feed_name = feedparser.parse(rss_url_name)
|
106 |
+
news_entries_name = feed_name.entries[:num_news]
|
107 |
+
|
108 |
+
analyzed_news_name = []
|
109 |
+
|
110 |
+
# Fetch and analyze news entries for company name
|
111 |
+
analyze_news_entry_func = functools.partial(fetch_and_analyze_news_entry, company_name=company_name, company_ticker=company_ticker, location=location)
|
112 |
+
for entry in news_entries_name:
|
113 |
+
analyzed_news_name.append(analyze_news_entry_func(entry))
|
114 |
+
|
115 |
+
return analyzed_news_name
|
116 |
+
|
117 |
+
def news_detailed(article_text, max_length=250):
|
118 |
+
"""Generate detailed news summary using BART model."""
|
119 |
+
inputs = bart_tokenizer([article_text], max_length=max_length, truncation=True, return_tensors="pt")
|
120 |
+
summary_ids = bart_model.generate(inputs["input_ids"], num_beams=4, max_length=max_length, length_penalty=2.0, early_stopping=True)
|
121 |
+
detailed_summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
122 |
+
return detailed_summary
|
123 |
+
|
124 |
+
def calculate_similarity(company_name, company_ticker, title, threshold=0.4):
|
125 |
+
"""Calculate sentence similarity."""
|
126 |
+
company_name_prefix = f"News Regarding {company_name}"
|
127 |
+
|
128 |
+
embeddings_company_name = sentence_model.encode([company_name_prefix], convert_to_tensor=True)
|
129 |
+
embeddings_title = sentence_model.encode([title], convert_to_tensor=True)
|
130 |
+
|
131 |
+
similarity_score_company_name = util.pytorch_cos_sim(embeddings_company_name, embeddings_title).item()
|
132 |
+
|
133 |
+
weighted_similarity_score = similarity_score_company_name
|
134 |
+
|
135 |
+
return weighted_similarity_score
|
136 |
+
|
137 |
+
def analyze_sentiment(title):
|
138 |
+
# Perform sentiment analysis on the input title
|
139 |
+
result = sentiment_analysis(title)
|
140 |
+
# Extract sentiment label and score from the result
|
141 |
+
labels = result[0]['label']
|
142 |
+
scores = result[0]['score']
|
143 |
+
return labels, scores
|
144 |
+
|
145 |
+
def fetch_news(company_name, company_ticker, event_name, start_date, end_date, location, num_news, include_domains, exclude_domains):
|
146 |
+
start_time = time.time() # Record the start time
|
147 |
+
|
148 |
+
include_domains = [domain.strip() for domain in include_domains.split(',')] if include_domains else None
|
149 |
+
exclude_domains = [domain.strip() for domain in exclude_domains.split(',')] if exclude_domains else None
|
150 |
+
|
151 |
+
analyzed_news_name = fetch_and_analyze_news(company_name, company_ticker, event_name, start_date, end_date, location, int(num_news), include_domains=include_domains, exclude_domains=exclude_domains)
|
152 |
+
|
153 |
+
above_threshold_news = [news for news in analyzed_news_name if news is not None and news['similarity_score'] >= 0.3]
|
154 |
+
below_threshold_news = [news for news in analyzed_news_name if news is not None and news['similarity_score'] < 0.3]
|
155 |
+
|
156 |
+
above_threshold_df = pd.DataFrame(above_threshold_news)
|
157 |
+
below_threshold_df = pd.DataFrame(below_threshold_news)
|
158 |
+
|
159 |
+
file_name = f"{company_name}_News_Data_10002.xlsx"
|
160 |
+
|
161 |
+
with pd.ExcelWriter(file_name) as writer:
|
162 |
+
above_threshold_df.to_excel(writer, sheet_name='Above_Threshold', index=False)
|
163 |
+
below_threshold_df.to_excel(writer, sheet_name='Below_Threshold', index=False)
|
164 |
+
|
165 |
+
end_time = time.time() # Record the end time
|
166 |
+
elapsed_time = end_time - start_time # Calculate the elapsed time
|
167 |
+
|
168 |
+
return f"News data saved to {file_name} with separate sheets for above and below threshold news. Computation Time: {elapsed_time:.2f} seconds"
|
169 |
+
|
170 |
+
# Gradio interface
|
171 |
+
iface = gr.Interface(
|
172 |
+
fn=fetch_news,
|
173 |
+
inputs=[
|
174 |
+
gr.inputs.Textbox(label="Company Name"),
|
175 |
+
gr.inputs.Textbox(label="Company Ticker"),
|
176 |
+
gr.inputs.Textbox(label="Event Name"),
|
177 |
+
gr.inputs.Textbox(label="Start Date (optional)"),
|
178 |
+
gr.inputs.Textbox(label="End Date (optional)"),
|
179 |
+
gr.inputs.Textbox(label="Location (optional)"),
|
180 |
+
gr.inputs.Textbox(label="Number of News to Fetch"),
|
181 |
+
gr.inputs.Textbox(label="Include Domains (comma-separated)"),
|
182 |
+
gr.inputs.Textbox(label="Exclude Domains (comma-separated)")
|
183 |
+
],
|
184 |
+
outputs="text",
|
185 |
+
title="News Fetcher",
|
186 |
+
description="Fetch and analyze news articles for a specific company and event."
|
187 |
+
)
|
188 |
+
|
189 |
+
if __name__ == "__main__":
|
190 |
+
iface.launch()
|
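
# Running `python app.py` launches the Gradio UI, by default at
# http://127.0.0.1:7860; the Excel workbook is written to the working directory.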