Shreyas94 committed on
Commit
c0518a9
1 Parent(s): ea5c525

Create app.py

Files changed (1)
  1. app.py +190 -0
app.py ADDED
@@ -0,0 +1,190 @@
+ import feedparser
+ import urllib.parse
+ import newspaper
+ import functools
+ from transformers import pipeline, BartForConditionalGeneration, BartTokenizer
+ from sentence_transformers import SentenceTransformer, util
+ from datetime import datetime
+ import pandas as pd
+ import time
+ import gradio as gr
+
+ # Define sentiment analysis pipeline
+ sentiment_analysis = pipeline("sentiment-analysis", model="ProsusAI/finbert")
+
+ # Load Sentence Transformer model
+ sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+ # Load BART model and tokenizer for detailed news summary
+ bart_model_name = "facebook/bart-large-cnn"
+ bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name)
+ bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)
+
+ # Cache for storing fetched articles, keyed by URL
+ article_cache = {}
+
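+ # Note: the three models above are downloaded from the Hugging Face Hub on
+ # first run and cached locally; subsequent runs load them from that cache.
+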
+ def fetch_article(url):
+     """Fetch article text from URL, caching the result by URL."""
+     if url not in article_cache:
+         article = newspaper.Article(url)
+         article.download()
+         article.parse()
+         article_cache[url] = article.text
+     return article_cache[url]
+
+ def fetch_and_analyze_news_entry(entry, company_name, company_ticker, location):
+     """Fetch and analyze sentiment for a single news entry."""
+     title = entry.title
+     url = entry.link
+     domain = urllib.parse.urlparse(url).netloc  # Extract domain from URL
+     publishing_date = entry.get("published_parsed")  # May be missing for some feeds
+     published_str = (
+         datetime.fromtimestamp(time.mktime(publishing_date)).strftime("%Y-%m-%d %H:%M:%S")
+         if publishing_date else "Unknown"
+     )
+
+     # Analyze sentiment regardless of article text availability
+     try:
+         label, score = analyze_sentiment(title)
+         sentiment_label = "Positive" if label == "positive" else "Negative" if label == "negative" else "Neutral"
+     except Exception as e:
+         print(f"Error analyzing sentiment for title: {title}. Error: {e}")
+         sentiment_label = "Unknown"
+
+     try:
+         # Fetch article text using caching
+         article_text = fetch_article(url)
+     except Exception as e:
+         print(f"Error fetching article at URL: {url}. Error: {e}. Skipping article.")
+         return {
+             "title": title,
+             "url": url,
+             "domain": domain,  # Include domain in the result
+             "location": location,  # Include location in the result
+             "publishing_date": published_str,
+             "sentiment": sentiment_label,
+             "detailed_summary": "Paywall Detected",
+             "similarity_score": calculate_similarity(company_name, company_ticker, title)  # Based on title only
+         }
+
+     # Generate detailed news summary using BART model
+     detailed_summary = news_detailed(article_text)
+
+     # Calculate sentence similarity
+     similarity_score = calculate_similarity(company_name, company_ticker, title)
+
+     return {
+         "title": title,
+         "url": url,
+         "domain": domain,  # Include domain in the result
+         "location": location,  # Include location in the result
+         "publishing_date": published_str,
+         "sentiment": sentiment_label,
+         "detailed_summary": detailed_summary,
+         "similarity_score": similarity_score
+     }
+
+ def fetch_and_analyze_news(company_name, company_ticker, event_name, start_date=None, end_date=None, location=None, num_news=5, include_domains=None, exclude_domains=None):
+     """Fetch and analyze news entries."""
+     # Construct the Google News RSS feed query (omit location if not given)
+     query_parts = [company_name, event_name]
+     if location:
+         query_parts.append(location)
+     query_name = " ".join(query_parts)
+
+     # Add date range to the query if start_date and end_date are provided
+     if start_date and end_date:
+         query_name += f" after:{start_date} before:{end_date}"
+
+     # Add domain inclusions and exclusions to the query
+     if include_domains:
+         include_domains_query = " OR ".join(f"site:{domain.strip()}" for domain in include_domains)
+         query_name += f" {include_domains_query}"
+
+     if exclude_domains:
+         exclude_domains_query = " ".join(f"-site:{domain.strip()}" for domain in exclude_domains)
+         query_name += f" {exclude_domains_query}"
+
+     encoded_query_name = urllib.parse.quote(query_name)
+     rss_url_name = f"https://news.google.com/rss/search?q={encoded_query_name}"
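+     # Illustrative example (values are hypothetical): with company_name="Acme",
+     # event_name="merger", location="Germany" and include_domains=["reuters.com"],
+     # the query becomes
+     #   "Acme merger Germany site:reuters.com"
+     # and the resulting feed URL is
+     #   https://news.google.com/rss/search?q=Acme%20merger%20Germany%20site%3Areuters.com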
+
+     # Parsing the RSS feed for company name
+     feed_name = feedparser.parse(rss_url_name)
+     news_entries_name = feed_name.entries[:num_news]
+
+     analyzed_news_name = []
+
+     # Fetch and analyze news entries for company name
+     analyze_news_entry_func = functools.partial(fetch_and_analyze_news_entry, company_name=company_name, company_ticker=company_ticker, location=location)
+     for entry in news_entries_name:
+         analyzed_news_name.append(analyze_news_entry_func(entry))
+
+     return analyzed_news_name
+
+ def news_detailed(article_text, max_length=250):
+     """Generate detailed news summary using BART model."""
+     # Truncate the input to BART's 1024-token limit; max_length caps the summary.
+     inputs = bart_tokenizer([article_text], max_length=1024, truncation=True, return_tensors="pt")
+     summary_ids = bart_model.generate(inputs["input_ids"], num_beams=4, max_length=max_length, length_penalty=2.0, early_stopping=True)
+     detailed_summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+     return detailed_summary
+
+ def calculate_similarity(company_name, company_ticker, title, threshold=0.4):
+     """Calculate sentence similarity between the company and the news title.
+
+     Note: company_ticker and threshold are accepted for interface
+     compatibility but do not currently affect the score.
+     """
+     company_name_prefix = f"News Regarding {company_name}"
+
+     embeddings_company_name = sentence_model.encode([company_name_prefix], convert_to_tensor=True)
+     embeddings_title = sentence_model.encode([title], convert_to_tensor=True)
+
+     # Cosine similarity ranges from -1 to 1; higher means more related
+     similarity_score_company_name = util.pytorch_cos_sim(embeddings_company_name, embeddings_title).item()
+
+     return similarity_score_company_name
+
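+ # Illustrative note: the Hugging Face pipeline below returns a list of dicts
+ # such as [{'label': 'positive', 'score': 0.93}] (values made up); FinBERT's
+ # labels are 'positive', 'negative' and 'neutral'.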
+ def analyze_sentiment(title):
+     """Run FinBERT sentiment analysis on a title and return (label, score)."""
+     result = sentiment_analysis(title)
+     # Extract sentiment label and score from the result
+     label = result[0]['label']
+     score = result[0]['score']
+     return label, score
+
+ def fetch_news(company_name, company_ticker, event_name, start_date, end_date, location, num_news, include_domains, exclude_domains):
+     start_time = time.time()  # Record the start time
+
+     include_domains = [domain.strip() for domain in include_domains.split(',')] if include_domains else None
+     exclude_domains = [domain.strip() for domain in exclude_domains.split(',')] if exclude_domains else None
+
+     analyzed_news_name = fetch_and_analyze_news(company_name, company_ticker, event_name, start_date, end_date, location, int(num_news), include_domains=include_domains, exclude_domains=exclude_domains)
+
+     # Split results on the 0.3 similarity threshold
+     above_threshold_news = [news for news in analyzed_news_name if news is not None and news['similarity_score'] >= 0.3]
+     below_threshold_news = [news for news in analyzed_news_name if news is not None and news['similarity_score'] < 0.3]
+
+     above_threshold_df = pd.DataFrame(above_threshold_news)
+     below_threshold_df = pd.DataFrame(below_threshold_news)
+
+     file_name = f"{company_name}_News_Data_10002.xlsx"
+
+     # Writing .xlsx requires an Excel engine such as openpyxl
+     with pd.ExcelWriter(file_name) as writer:
+         above_threshold_df.to_excel(writer, sheet_name='Above_Threshold', index=False)
+         below_threshold_df.to_excel(writer, sheet_name='Below_Threshold', index=False)
+
+     end_time = time.time()  # Record the end time
+     elapsed_time = end_time - start_time  # Calculate the elapsed time
+
+     return f"News data saved to {file_name} with separate sheets for above and below threshold news. Computation Time: {elapsed_time:.2f} seconds"
+
+ # Gradio interface
+ iface = gr.Interface(
+     fn=fetch_news,
+     inputs=[
+         gr.Textbox(label="Company Name"),
+         gr.Textbox(label="Company Ticker"),
+         gr.Textbox(label="Event Name"),
+         gr.Textbox(label="Start Date (optional)"),
+         gr.Textbox(label="End Date (optional)"),
+         gr.Textbox(label="Location (optional)"),
+         gr.Textbox(label="Number of News to Fetch"),
+         gr.Textbox(label="Include Domains (comma-separated)"),
+         gr.Textbox(label="Exclude Domains (comma-separated)")
+     ],
+     outputs="text",
+     title="News Fetcher",
+     description="Fetch and analyze news articles for a specific company and event."
+ )
+
+ if __name__ == "__main__":
+     iface.launch()
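
Usage (a minimal sketch, inferred from the imports above): install the dependencies and launch the Gradio app locally. `newspaper3k` is the package that provides the `newspaper` module, and pandas needs `openpyxl` to write the .xlsx output:

    pip install feedparser newspaper3k transformers sentence-transformers pandas openpyxl gradio torch
    python app.py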