import requests
import random
from io import BytesIO
from datetime import datetime
from bs4 import BeautifulSoup
from gtts import gTTS
from rake_nltk import Rake
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# from googletrans import Translator
import re
import nltk
from collections import Counter


nltk.download('punkt')
nltk.download('stopwords')  # Needed for filtering keywords
nltk.download('punkt_tab')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


# Initialize Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

# Initialize RAKE for keyword extraction
rake = Rake()


def get_news_articles(topic, max_articles_per_source=5):
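    """Scrape news articles about `topic` from Times of India and Economic
    Times, score each with VADER sentiment, extract key topics, and return a
    comparative report including a Hindi audio summary.

    Note: the CSS class names used below match the two sites' markup at the
    time of writing and will break silently if either site changes its layout.
    """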
    all_articles = []

    # --- Source 1: Times of India ---
    toi_url = f"https://timesofindia.indiatimes.com/topic/{topic}"
    try:
        response = requests.get(toi_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        results = soup.find_all("div", class_="uwU81")[:max_articles_per_source]

        for result in results:
            title_tag = result.find("div", class_="fHv_i o58kM")
            summary_tag = result.find("p", class_="oxXSK o58kM")
            link_tag = result.find("a")
            date_tag = result.find("div", class_="ZxBIG")

            title = title_tag.text.strip() if title_tag else "No title"
            summary = summary_tag.text.strip() if summary_tag else "No summary"
            link = f"https://timesofindia.indiatimes.com{link_tag['href']}" if link_tag else "#"

            formatted_date = "Date not found"
            if date_tag:
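                # The TOI date element appears to read like
                # "Author / Mar 18, 2025, 14:30 (IST)"; capture the part after
                # the slash, then drop the "(IST)" suffix before parsing.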
                match = re.search(r"/\s+(.*?\(\w+\))", date_tag.get_text())
                if match:
                    date_str = match.group(1).replace("(IST)", "").strip()
                    try:
                        dt = datetime.strptime(date_str, "%b %d, %Y, %H:%M")
                        formatted_date = dt.strftime("%b %d, %Y")
                    except Exception:
                        formatted_date = date_str

            sentiment_score = sia.polarity_scores(f"{title}. {summary}")["compound"]
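            # VADER's recommended thresholds: compound >= 0.05 is positive,
            # <= -0.05 is negative, anything in between is neutral.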
            sentiment = "Positive" if sentiment_score >= 0.05 else "Negative" if sentiment_score <= -0.05 else "Neutral"
            topics = extract_topics(title + " " + summary)

            all_articles.append({
                "Source": "Times of India",
                "Title": title,
                "Summary": summary,
                "Link": link,
                "Date": formatted_date,
                "Sentiment": sentiment,
                "Topics": topics
            })

    except Exception as e:
        print(f"Error scraping TOI: {e}")

    # --- Source 2: Economic Times ---
    et_url = f"https://economictimes.indiatimes.com/topic/{topic}"
    try:
        response = requests.get(et_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        results = soup.find_all("div", class_="contentD")[:max_articles_per_source]

        for result in results:
            a_tag = result.find("a", class_="wrapLines l2")
            summary_tag = result.find("p", class_="wrapLines l3")
            time_tag = result.find("time")

            title = a_tag.text.strip() if a_tag else "No title"
            link = f"https://economictimes.indiatimes.com{a_tag['href']}" if a_tag and "href" in a_tag.attrs else "#"
            summary = summary_tag.text.strip() if summary_tag else "No summary"
            date_str = time_tag.text.strip() if time_tag else "Date not found"

            try:
                dt = datetime.strptime(date_str.replace(" IST", ""), "%d %b, %Y, %I:%M %p")
                formatted_date = dt.strftime("%b %d, %Y")
            except Exception:
                formatted_date = date_str

            sentiment_score = sia.polarity_scores(f"{title}. {summary}")["compound"]
            sentiment = "Positive" if sentiment_score >= 0.05 else "Negative" if sentiment_score <= -0.05 else "Neutral"
            topics = extract_topics(title + " " + summary)

            all_articles.append({
                "Source": "Economic Times",
                "Title": title,
                "Summary": summary,
                "Link": link,
                "Date": formatted_date,
                "Sentiment": sentiment,
                "Topics": topics
            })

    except Exception as e:
        print(f"Error scraping Economic Times: {e}")

    
    # Sentiment Distribution
    sentiment_counts = Counter(article["Sentiment"] for article in all_articles)

    # Topic Overlap
    topic_overlap = analyze_topic_overlap(all_articles)

    # Coverage Differences
    coverage_differences = generate_coverage_differences(all_articles)

    # Final Sentiment Summary
    final_sentiment_summary_english = generate_final_sentiment_analysis(sentiment_counts, topic)

    # Translation & TTS
    final_sentiment_summary_hindi = translate_to_hindi(final_sentiment_summary_english)
    audio_bytes = text_to_speech_hindi(final_sentiment_summary_hindi)


    return {
        "Company": topic,
        "Articles": all_articles,
        "Comparative Sentiment Score": {
            "Sentiment Distribution": dict(sentiment_counts),
            "Topic Overlap": topic_overlap,
            "Coverage Differences": coverage_differences  # Can be implemented later
        },
        "Final Sentiment Analysis": final_sentiment_summary_english,
        "Audio Bytes": audio_bytes
    }


def extract_topics(text, max_keywords=3):
    """Extracts key topics using RAKE, filtering out irrelevant keywords."""
    rake.extract_keywords_from_text(text)
    
    keywords = []
    for kw in rake.get_ranked_phrases():
        cleaned_kw = kw.title().strip()
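        # Keep only multi-word phrases that are free of digits, punctuation
        # (other than hyphens), stopwords, and placeholder "summary available"
        # text left over from articles with no real summary.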
        if (
            len(kw.split()) > 1 and
            "summary available" not in kw.lower() and
            not re.search(r"\b\d+\b", kw) and
            not re.search(r"[^\w\s-]", kw) and
            len(re.sub(r"[^a-zA-Z\s]", "", kw).strip()) > 1 and
            not any(word in stop_words for word in kw.lower().split())
        ):
            keywords.append(cleaned_kw)

    return keywords[:max_keywords] if keywords else ["General News"]


def generate_coverage_differences(articles):
    """Compares three random pairs of articles and generates coverage differences."""
    if len(articles) < 6:
        return [{"Comparison": "Not enough articles to compare 3 pairs.", "Impact": "At least 6 articles required."}]
    
    sampled_indices = random.sample(range(len(articles)), 6)
    pairs = [(sampled_indices[i], sampled_indices[i+1]) for i in range(0, 6, 2)]

    comparisons = []
    for idx1, idx2 in pairs:
        article1 = articles[idx1]
        article2 = articles[idx2]

        title1 = article1['Title'].replace('\n', ' ').strip()
        title2 = article2['Title'].replace('\n', ' ').strip()
        sentiment1 = article1['Sentiment'].strip().lower()
        sentiment2 = article2['Sentiment'].strip().lower()

        comparisons.append({
            "Comparison": f"Article {idx1+1}: '{title1}' vs Article {idx2+1}: '{title2}'.",
            "Impact": f"Article {idx1+1} is {sentiment1}, while Article {idx2+1} is {sentiment2}."
        })

    return comparisons

def analyze_topic_overlap(articles):
    """Finds common and unique topics among articles."""
    if len(articles) < 2:
        return {"Common Topics": [], "Unique Topics": {}}
    
    all_topics = [set(article["Topics"]) for article in articles if article["Topics"]]
    common_topics = set.intersection(*all_topics) if len(all_topics) > 1 else set()
    unique_topics = {f"Article {idx+1}": list(set(article["Topics"]) - common_topics) for idx, article in enumerate(articles)}

    return {"Common Topics": list(common_topics), "Unique Topics": unique_topics}


def generate_final_sentiment_analysis(sentiment_counts, company_name):
    """Generates a final summary based on sentiment distribution."""
    if sentiment_counts["Positive"] > sentiment_counts["Negative"]:
        return f"{company_name}’s latest news coverage is mostly positive. Potential stock growth expected."
    elif sentiment_counts["Negative"] > sentiment_counts["Positive"]:
        return f"{company_name} is facing challenges, with a high number of negative reports. Investors may remain cautious."
    else:
        return f"{company_name}'s news sentiment is neutral or mixed. Market response could go either way."


def translate_to_hindi(text):
    """Fallback translation using pre-defined mappings."""
    translations = {
        "’s latest news coverage is mostly positive. Potential stock growth expected.":
            "की ताज़ा ख़बरों की कवरेज ज्यादातर सकारात्मक है। स्टॉक में वृद्धि की संभावना है।",
        " is facing challenges, with a high number of negative reports. Investors may remain cautious.":
            " चुनौतियों का सामना कर रहा है, कई नकारात्मक रिपोर्टों के साथ। निवेशक सतर्क रह सकते हैं।",
        "'s news sentiment is neutral or mixed. Market response could go either way.":
            "की खबरों की भावना तटस्थ या मिली-जुली है। बाज़ार की प्रतिक्रिया किसी भी दिशा में जा सकती है।"
    }
    for key, val in translations.items():
        if key in text:
            return text.split(key)[0] + val
    return "अनुवाद करने में त्रुटि हुई।"


def text_to_speech_hindi(text):
    """Converts text to Hindi speech using gTTS and returns audio bytes."""
    tts = gTTS(text=text, lang="hi")
    audio_buffer = BytesIO()
    tts.write_to_fp(audio_buffer)
    audio_buffer.seek(0)
    return audio_buffer
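

# Minimal usage sketch, assuming network access and that the NLTK data
# downloads above have completed. "Tesla" is an arbitrary example topic.
if __name__ == "__main__":
    report = get_news_articles("Tesla", max_articles_per_source=3)
    print(f"Fetched {len(report['Articles'])} articles on {report['Company']}")
    print("Sentiment distribution:",
          report["Comparative Sentiment Score"]["Sentiment Distribution"])
    print("Final analysis:", report["Final Sentiment Analysis"])
    # Persist the Hindi TTS audio (a BytesIO buffer) for playback
    with open("summary_hi.mp3", "wb") as f:
        f.write(report["Audio Bytes"].read())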