Create app.py
app.py
ADDED
@@ -0,0 +1,190 @@
import feedparser
import urllib.parse
import newspaper
import functools
from transformers import pipeline, BartForConditionalGeneration, BartTokenizer
from sentence_transformers import SentenceTransformer, util
from datetime import datetime
import pandas as pd
import time
import gradio as gr
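
# Assumed third-party packages (not pinned in this commit): feedparser,
# newspaper3k, transformers, sentence-transformers, torch, pandas,
# openpyxl (used by pd.ExcelWriter), and gradio.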

# Define sentiment analysis pipeline
sentiment_analysis = pipeline("sentiment-analysis", model="ProsusAI/finbert")

# Load Sentence Transformer model
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Load BART model and tokenizer for detailed news summary
bart_model_name = "facebook/bart-large-cnn"
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name)
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)

# Cache for storing fetched articles
article_cache = {}
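
# Note: ProsusAI/finbert returns lowercase labels ('positive', 'negative',
# 'neutral'); the label mapping in fetch_and_analyze_news_entry relies on this.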

def fetch_article(url):
    """Fetch article text from a URL, caching the result by URL."""
    if url not in article_cache:
        article = newspaper.Article(url)
        article.download()
        article.parse()
        article_cache[url] = article.text
    return article_cache[url]
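
# fetch_article relies on newspaper3k; paywalled or JavaScript-heavy pages
# typically raise during download()/parse(), which callers handle by falling
# back to headline-only analysis.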

def fetch_and_analyze_news_entry(entry, company_name, company_ticker, location):
    """Fetch and analyze sentiment for a single news entry."""
    title = entry.title
    url = entry.link
    domain = urllib.parse.urlparse(url).netloc  # Extract domain from URL
    publishing_date = entry.published_parsed  # struct_time; may be missing in some feeds

    # Analyze headline sentiment regardless of article text availability
    try:
        label, score = analyze_sentiment(title)
        sentiment_label = "Positive" if label == "positive" else "Negative" if label == "negative" else "Neutral"
    except Exception as e:
        print(f"Error analyzing sentiment for title: {title}. Error: {e}")
        sentiment_label = "Unknown"

    result = {
        "title": title,
        "url": url,
        "domain": domain,  # Include domain in the result
        "location": location,  # Include location in the result
        "publishing_date": datetime.fromtimestamp(time.mktime(publishing_date)).strftime("%Y-%m-%d %H:%M:%S") if publishing_date else "Unknown",  # Convert struct_time to a readable date
        "sentiment": sentiment_label,
        "detailed_summary": "Paywall Detected",  # Overwritten below if the article text is readable
        "similarity_score": calculate_similarity(company_name, company_ticker, title)  # Similarity based on the title
    }

    try:
        # Fetch article text using the cache
        article_text = fetch_article(url)
    except Exception as e:
        # Could not download/parse the article (often a paywall); keep the placeholder summary
        print(f"Error fetching article at URL: {url}. Error: {e}")
        return result

    # Generate detailed news summary using the BART model
    result["detailed_summary"] = news_detailed(article_text)
    return result
|
82 |
+
|
83 |
+
def fetch_and_analyze_news(company_name, company_ticker, event_name, start_date=None, end_date=None, location=None, num_news=5, include_domains=None, exclude_domains=None):
|
84 |
+
"""Fetch and analyze news entries."""
|
85 |
+
# Constructing the Google News RSS feed URL
|
86 |
+
query_name = f"{company_name} {event_name} {location}"
|
87 |
+
|
88 |
+
# Add date range to the query if start_date and end_date are provided
|
89 |
+
if start_date and end_date:
|
90 |
+
query_name += f" after:{start_date} before:{end_date}"
|
91 |
+
|
92 |
+
# Add domain suggestions and exclusions to the query
|
93 |
+
if include_domains:
|
94 |
+
include_domains_query = " OR ".join(f"site:{domain.strip()}" for domain in include_domains)
|
95 |
+
query_name += f" {include_domains_query}"
|
96 |
+
|
97 |
+
if exclude_domains:
|
98 |
+
exclude_domains_query = " ".join(f"-site:{domain.strip()}" for domain in exclude_domains)
|
99 |
+
query_name += f" {exclude_domains_query}"
|
100 |
+
|
101 |
+
encoded_query_name = urllib.parse.quote(query_name)
|
102 |
+
rss_url_name = f"https://news.google.com/rss/search?q={encoded_query_name}"
|
103 |
+
|
104 |
+
# Parsing the RSS feed for company name
|
105 |
+
feed_name = feedparser.parse(rss_url_name)
|
106 |
+
news_entries_name = feed_name.entries[:num_news]
|
107 |
+
|
108 |
+
analyzed_news_name = []
|
109 |
+
|
110 |
+
# Fetch and analyze news entries for company name
|
111 |
+
analyze_news_entry_func = functools.partial(fetch_and_analyze_news_entry, company_name=company_name, company_ticker=company_ticker, location=location)
|
112 |
+
for entry in news_entries_name:
|
113 |
+
analyzed_news_name.append(analyze_news_entry_func(entry))
|
114 |
+
|
115 |
+
return analyzed_news_name
|
116 |
+
|
117 |
+
def news_detailed(article_text, max_length=250):
|
118 |
+
"""Generate detailed news summary using BART model."""
|
119 |
+
inputs = bart_tokenizer([article_text], max_length=max_length, truncation=True, return_tensors="pt")
|
120 |
+
summary_ids = bart_model.generate(inputs["input_ids"], num_beams=4, max_length=max_length, length_penalty=2.0, early_stopping=True)
|
121 |
+
detailed_summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
122 |
+
return detailed_summary
|
123 |
+
|
124 |
+
def calculate_similarity(company_name, company_ticker, title, threshold=0.4):
|
125 |
+
"""Calculate sentence similarity."""
|
126 |
+
company_name_prefix = f"News Regarding {company_name}"
|
127 |
+
|
128 |
+
embeddings_company_name = sentence_model.encode([company_name_prefix], convert_to_tensor=True)
|
129 |
+
embeddings_title = sentence_model.encode([title], convert_to_tensor=True)
|
130 |
+
|
131 |
+
similarity_score_company_name = util.pytorch_cos_sim(embeddings_company_name, embeddings_title).item()
|
132 |
+
|
133 |
+
weighted_similarity_score = similarity_score_company_name
|
134 |
+
|
135 |
+
return weighted_similarity_score
|
136 |
+
|
137 |
+
def analyze_sentiment(title):
|
138 |
+
# Perform sentiment analysis on the input title
|
139 |
+
result = sentiment_analysis(title)
|
140 |
+
# Extract sentiment label and score from the result
|
141 |
+
labels = result[0]['label']
|
142 |
+
scores = result[0]['score']
|
143 |
+
return labels, scores
|
144 |
+
|
145 |
+
def fetch_news(company_name, company_ticker, event_name, start_date, end_date, location, num_news, include_domains, exclude_domains):
|
146 |
+
start_time = time.time() # Record the start time
|
147 |
+
|
148 |
+
include_domains = [domain.strip() for domain in include_domains.split(',')] if include_domains else None
|
149 |
+
exclude_domains = [domain.strip() for domain in exclude_domains.split(',')] if exclude_domains else None
|
150 |
+
|
151 |
+
analyzed_news_name = fetch_and_analyze_news(company_name, company_ticker, event_name, start_date, end_date, location, int(num_news), include_domains=include_domains, exclude_domains=exclude_domains)
|
152 |
+
|
153 |
+
above_threshold_news = [news for news in analyzed_news_name if news is not None and news['similarity_score'] >= 0.3]
|
154 |
+
below_threshold_news = [news for news in analyzed_news_name if news is not None and news['similarity_score'] < 0.3]
|
155 |
+
|
156 |
+
above_threshold_df = pd.DataFrame(above_threshold_news)
|
157 |
+
below_threshold_df = pd.DataFrame(below_threshold_news)
|
158 |
+
|
159 |
+
file_name = f"{company_name}_News_Data_10002.xlsx"
|
160 |
+
|
161 |
+
with pd.ExcelWriter(file_name) as writer:
|
162 |
+
above_threshold_df.to_excel(writer, sheet_name='Above_Threshold', index=False)
|
163 |
+
below_threshold_df.to_excel(writer, sheet_name='Below_Threshold', index=False)
|
164 |
+
|
165 |
+
end_time = time.time() # Record the end time
|
166 |
+
elapsed_time = end_time - start_time # Calculate the elapsed time
|
167 |
+
|
168 |
+
return f"News data saved to {file_name} with separate sheets for above and below threshold news. Computation Time: {elapsed_time:.2f} seconds"
|
169 |
+
|
170 |
+
# Gradio interface
|
171 |
+
iface = gr.Interface(
|
172 |
+
fn=fetch_news,
|
173 |
+
inputs=[
|
174 |
+
gr.inputs.Textbox(label="Company Name"),
|
175 |
+
gr.inputs.Textbox(label="Company Ticker"),
|
176 |
+
gr.inputs.Textbox(label="Event Name"),
|
177 |
+
gr.inputs.Textbox(label="Start Date (optional)"),
|
178 |
+
gr.inputs.Textbox(label="End Date (optional)"),
|
179 |
+
gr.inputs.Textbox(label="Location (optional)"),
|
180 |
+
gr.inputs.Textbox(label="Number of News to Fetch"),
|
181 |
+
gr.inputs.Textbox(label="Include Domains (comma-separated)"),
|
182 |
+
gr.inputs.Textbox(label="Exclude Domains (comma-separated)")
|
183 |
+
],
|
184 |
+
outputs="text",
|
185 |
+
title="News Fetcher",
|
186 |
+
description="Fetch and analyze news articles for a specific company and event."
|
187 |
+
)
|
188 |
+
|
189 |
+
if __name__ == "__main__":
|
190 |
+
iface.launch()
|
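
# Running `python app.py` launches the Gradio UI, by default at
# http://127.0.0.1:7860; the Excel workbook is written to the working directory.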