import requests
import random
from io import BytesIO
from datetime import datetime
from bs4 import BeautifulSoup
from gtts import gTTS
from rake_nltk import Rake
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# from googletrans import Translator
import re
import nltk
from collections import Counter


nltk.download('punkt')
nltk.download('stopwords')  # Needed for filtering keywords
nltk.download('punkt_tab')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


# Initialize Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

# Initialize RAKE for keyword extraction
rake = Rake()


def get_news_articles(topic, max_articles_per_source=5):
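    """Scrape news articles about `topic` from Times of India and Economic
    Times, score each with VADER sentiment, extract key topics, and return a
    comparative report including a Hindi audio summary.

    Note: the CSS class names used below match the two sites' markup at the
    time of writing and will break silently if either site changes its layout.
    """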
    all_articles = []

    # --- Source 1: Times of India ---
    toi_url = f"https://timesofindia.indiatimes.com/topic/{topic}"
    try:
        response = requests.get(toi_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        results = soup.find_all("div", class_="uwU81")[:max_articles_per_source]

        for result in results:
            title_tag = result.find("div", class_="fHv_i o58kM")
            summary_tag = result.find("p", class_="oxXSK o58kM")
            link_tag = result.find("a")
            date_tag = result.find("div", class_="ZxBIG")

            title = title_tag.text.strip() if title_tag else "No title"
            summary = summary_tag.text.strip() if summary_tag else "No summary"
            link = f"https://timesofindia.indiatimes.com{link_tag['href']}" if link_tag else "#"

            formatted_date = "Date not found"
            if date_tag:
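                # The TOI date element appears to read like
                # "Author / Mar 18, 2025, 14:30 (IST)"; capture the part after
                # the slash, then drop the "(IST)" suffix before parsing.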
                match = re.search(r"/\s+(.*?\(\w+\))", date_tag.get_text())
                if match:
                    date_str = match.group(1).replace("(IST)", "").strip()
                    try:
                        dt = datetime.strptime(date_str, "%b %d, %Y, %H:%M")
                        formatted_date = dt.strftime("%b %d, %Y")
                    except Exception:
                        formatted_date = date_str

            sentiment_score = sia.polarity_scores(f"{title}. {summary}")["compound"]
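            # VADER's recommended thresholds: compound >= 0.05 is positive,
            # <= -0.05 is negative, anything in between is neutral.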
            sentiment = "Positive" if sentiment_score >= 0.05 else "Negative" if sentiment_score <= -0.05 else "Neutral"
            topics = extract_topics(title + " " + summary)

            all_articles.append({
                "Source": "Times of India",
                "Title": title,
                "Summary": summary,
                "Link": link,
                "Date": formatted_date,
                "Sentiment": sentiment,
                "Topics": topics
            })

    except Exception as e:
        print(f"Error scraping TOI: {e}")

    # --- Source 2: Economic Times ---
    et_url = f"https://economictimes.indiatimes.com/topic/{topic}"
    try:
        response = requests.get(et_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        results = soup.find_all("div", class_="contentD")[:max_articles_per_source]

        for result in results:
            a_tag = result.find("a", class_="wrapLines l2")
            summary_tag = result.find("p", class_="wrapLines l3")
            time_tag = result.find("time")

            title = a_tag.text.strip() if a_tag else "No title"
            link = f"https://economictimes.indiatimes.com{a_tag['href']}" if a_tag and "href" in a_tag.attrs else "#"
            summary = summary_tag.text.strip() if summary_tag else "No summary"
            date_str = time_tag.text.strip() if time_tag else "Date not found"

            try:
                dt = datetime.strptime(date_str.replace(" IST", ""), "%d %b, %Y, %I:%M %p")
                formatted_date = dt.strftime("%b %d, %Y")
            except Exception:
                formatted_date = date_str

            sentiment_score = sia.polarity_scores(f"{title}. {summary}")["compound"]
            sentiment = "Positive" if sentiment_score >= 0.05 else "Negative" if sentiment_score <= -0.05 else "Neutral"
            topics = extract_topics(title + " " + summary)

            all_articles.append({
                "Source": "Economic Times",
                "Title": title,
                "Summary": summary,
                "Link": link,
                "Date": formatted_date,
                "Sentiment": sentiment,
                "Topics": topics
            })

    except Exception as e:
        print(f"Error scraping Economic Times: {e}")

    
    # Sentiment Distribution
    sentiment_counts = Counter(article["Sentiment"] for article in all_articles)

    # Topic Overlap
    topic_overlap = analyze_topic_overlap(all_articles)

    # Coverage Differences
    coverage_differences = generate_coverage_differences(all_articles)

    # Final Sentiment Summary
    final_sentiment_summary_english = generate_final_sentiment_analysis(sentiment_counts, topic)

    # Translation & TTS
    final_sentiment_summary_hindi = translate_to_hindi(final_sentiment_summary_english)
    audio_bytes = text_to_speech_hindi(final_sentiment_summary_hindi)


    return {
        "Company": topic,
        "Articles": all_articles,
        "Comparative Sentiment Score": {
            "Sentiment Distribution": dict(sentiment_counts),
            "Topic Overlap": topic_overlap,
            "Coverage Differences": coverage_differences  # Can be implemented later
        },
        "Final Sentiment Analysis": final_sentiment_summary_english,
        "Audio Bytes": audio_bytes
    }


def extract_topics(text, max_keywords=3):
    """Extracts key topics using RAKE, filtering out irrelevant keywords."""
    rake.extract_keywords_from_text(text)
    
    keywords = []
    for kw in rake.get_ranked_phrases():
        cleaned_kw = kw.title().strip()
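        # Keep only multi-word phrases that are free of digits, punctuation
        # (other than hyphens), stopwords, and placeholder "summary available"
        # text left over from articles with no real summary.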
        if (
            len(kw.split()) > 1 and
            "summary available" not in kw.lower() and
            not re.search(r"\b\d+\b", kw) and
            not re.search(r"[^\w\s-]", kw) and
            len(re.sub(r"[^a-zA-Z\s]", "", kw).strip()) > 1 and
            not any(word in stop_words for word in kw.lower().split())
        ):
            keywords.append(cleaned_kw)

    return keywords[:max_keywords] if keywords else ["General News"]


def generate_coverage_differences(articles):
    """Compares three random pairs of articles and generates coverage differences."""
    if len(articles) < 6:
        return [{"Comparison": "Not enough articles to compare 3 pairs.", "Impact": "At least 6 articles required."}]
    
    sampled_indices = random.sample(range(len(articles)), 6)
    pairs = [(sampled_indices[i], sampled_indices[i+1]) for i in range(0, 6, 2)]

    comparisons = []
    for idx1, idx2 in pairs:
        article1 = articles[idx1]
        article2 = articles[idx2]

        title1 = article1['Title'].replace('\n', ' ').strip()
        title2 = article2['Title'].replace('\n', ' ').strip()
        sentiment1 = article1['Sentiment'].strip().lower()
        sentiment2 = article2['Sentiment'].strip().lower()

        comparisons.append({
            "Comparison": f"Article {idx1+1}: '{title1}' vs Article {idx2+1}: '{title2}'.",
            "Impact": f"Article {idx1+1} is {sentiment1}, while Article {idx2+1} is {sentiment2}."
        })

    return comparisons

def analyze_topic_overlap(articles):
    """Finds common and unique topics among articles."""
    if len(articles) < 2:
        return {"Common Topics": [], "Unique Topics": {}}
    
    all_topics = [set(article["Topics"]) for article in articles if article["Topics"]]
    common_topics = set.intersection(*all_topics) if len(all_topics) > 1 else set()
    unique_topics = {f"Article {idx+1}": list(set(article["Topics"]) - common_topics) for idx, article in enumerate(articles)}

    return {"Common Topics": list(common_topics), "Unique Topics": unique_topics}


def generate_final_sentiment_analysis(sentiment_counts, company_name):
    """Generates a final summary based on sentiment distribution."""
    if sentiment_counts["Positive"] > sentiment_counts["Negative"]:
        return f"{company_name}’s latest news coverage is mostly positive. Potential stock growth expected."
    elif sentiment_counts["Negative"] > sentiment_counts["Positive"]:
        return f"{company_name} is facing challenges, with a high number of negative reports. Investors may remain cautious."
    else:
        return f"{company_name}'s news sentiment is neutral or mixed. Market response could go either way."


def translate_to_hindi(text):
    """Fallback translation using pre-defined mappings."""
    translations = {
        "’s latest news coverage is mostly positive. Potential stock growth expected.":
            "की ताज़ा ख़बरों की कवरेज ज्यादातर सकारात्मक है। स्टॉक में वृद्धि की संभावना है।",
        " is facing challenges, with a high number of negative reports. Investors may remain cautious.":
            " चुनौतियों का सामना कर रहा है, कई नकारात्मक रिपोर्टों के साथ। निवेशक सतर्क रह सकते हैं।",
        "'s news sentiment is neutral or mixed. Market response could go either way.":
            "की खबरों की भावना तटस्थ या मिली-जुली है। बाज़ार की प्रतिक्रिया किसी भी दिशा में जा सकती है।"
    }
    for key, val in translations.items():
        if key in text:
            return text.split(key)[0] + val
    return "अनुवाद करने में त्रुटि हुई।"


def text_to_speech_hindi(text):
    """Converts text to Hindi speech using gTTS and returns audio bytes."""
    tts = gTTS(text=text, lang="hi")
    audio_buffer = BytesIO()
    tts.write_to_fp(audio_buffer)
    audio_buffer.seek(0)
    return audio_buffer
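

# Minimal usage sketch, assuming network access and that the NLTK data
# downloads above have completed. "Tesla" is an arbitrary example topic.
if __name__ == "__main__":
    report = get_news_articles("Tesla", max_articles_per_source=3)
    print(f"Fetched {len(report['Articles'])} articles on {report['Company']}")
    print("Sentiment distribution:",
          report["Comparative Sentiment Score"]["Sentiment Distribution"])
    print("Final analysis:", report["Final Sentiment Analysis"])
    # Persist the Hindi TTS audio (a BytesIO buffer) for playback
    with open("summary_hi.mp3", "wb") as f:
        f.write(report["Audio Bytes"].read())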