# NOTE(review): the following header was Hugging Face Spaces status residue
# ("Spaces: Sleeping") accidentally captured into the file; kept as a comment
# so the module remains valid Python.
import random
import re
from collections import Counter
from datetime import datetime  # required by get_news_articles date parsing
from io import BytesIO

import requests
from bs4 import BeautifulSoup
from gtts import gTTS
from rake_nltk import Rake
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# from googletrans import Translator

import nltk
nltk.download('punkt')
nltk.download('stopwords')  # Needed for filtering keywords
nltk.download('punkt_tab')
from nltk.corpus import stopwords
# English stop words; used by extract_topics() to filter RAKE phrases.
stop_words = set(stopwords.words('english'))
# Initialize Sentiment Analyzer (VADER); shared by article scraping below.
sia = SentimentIntensityAnalyzer()
# Initialize RAKE for keyword extraction (uses NLTK's tokenizer/stopwords).
rake = Rake()
def _sentiment_label(text):
    """Classify *text* as 'Positive'/'Negative'/'Neutral' via VADER compound score."""
    score = sia.polarity_scores(text)["compound"]
    if score >= 0.05:
        return "Positive"
    if score <= -0.05:
        return "Negative"
    return "Neutral"


def _build_article(source, title, summary, link, formatted_date):
    """Assemble the per-article record shared by both scrapers."""
    return {
        "Source": source,
        "Title": title,
        "Summary": summary,
        "Link": link,
        "Date": formatted_date,
        "Sentiment": _sentiment_label(f"{title}. {summary}"),
        "Topics": extract_topics(title + " " + summary),
    }


def _scrape_toi(topic, limit):
    """Scrape up to *limit* article records for *topic* from Times of India."""
    url = f"https://timesofindia.indiatimes.com/topic/{topic}"
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")
    articles = []
    for result in soup.find_all("div", class_="uwU81")[:limit]:
        title_tag = result.find("div", class_="fHv_i o58kM")
        summary_tag = result.find("p", class_="oxXSK o58kM")
        link_tag = result.find("a")
        date_tag = result.find("div", class_="ZxBIG")
        title = title_tag.text.strip() if title_tag else "No title"
        summary = summary_tag.text.strip() if summary_tag else "No summary"
        link = f"https://timesofindia.indiatimes.com{link_tag['href']}" if link_tag else "#"
        formatted_date = "Date not found"
        if date_tag:
            # NOTE(review): assumes TOI renders "<agency> / <date> (IST)";
            # the capture takes everything after the slash up to "(...)" — confirm.
            match = re.search(r"/\s+(.*?\(\w+\))", date_tag.get_text())
            if match:
                date_str = match.group(1).replace("(IST)", "").strip()
                try:
                    # Bug fix: `datetime` was never imported, so this always
                    # raised a silently-swallowed NameError and dates were
                    # never reformatted. Now parses e.g. "Mar 12, 2025, 14:30".
                    dt = datetime.strptime(date_str, "%b %d, %Y, %H:%M")
                    formatted_date = dt.strftime("%b %d, %Y")
                except ValueError:
                    formatted_date = date_str  # keep raw text on format mismatch
        articles.append(_build_article("Times of India", title, summary, link, formatted_date))
    return articles


def _scrape_et(topic, limit):
    """Scrape up to *limit* article records for *topic* from Economic Times."""
    url = f"https://economictimes.indiatimes.com/topic/{topic}"
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")
    articles = []
    for result in soup.find_all("div", class_="contentD")[:limit]:
        a_tag = result.find("a", class_="wrapLines l2")
        summary_tag = result.find("p", class_="wrapLines l3")
        time_tag = result.find("time")
        title = a_tag.text.strip() if a_tag else "No title"
        link = f"https://economictimes.indiatimes.com{a_tag['href']}" if a_tag and "href" in a_tag.attrs else "#"
        summary = summary_tag.text.strip() if summary_tag else "No summary"
        date_str = time_tag.text.strip() if time_tag else "Date not found"
        try:
            # Same missing-import bug as TOI; format e.g. "12 Mar, 2025, 02:30 PM IST".
            dt = datetime.strptime(date_str.replace(" IST", ""), "%d %b, %Y, %I:%M %p")
            formatted_date = dt.strftime("%b %d, %Y")
        except ValueError:
            formatted_date = date_str  # keep raw text on format mismatch
        articles.append(_build_article("Economic Times", title, summary, link, formatted_date))
    return articles


def get_news_articles(topic, max_articles_per_source=5):
    """Scrape news for *topic*, analyze sentiment and topics, and build a report.

    Args:
        topic: Topic/company slug used in the news sites' topic URLs.
        max_articles_per_source: Cap on articles taken from each source.

    Returns:
        Dict with the per-article records, comparative sentiment statistics
        (distribution, topic overlap, coverage differences), an English
        sentiment summary, and Hindi TTS audio bytes for that summary.

    A failure in one scraper is logged and skipped so the other source can
    still contribute articles.
    """
    all_articles = []
    for scraper, label in ((_scrape_toi, "TOI"), (_scrape_et, "Economic Times")):
        try:
            all_articles.extend(scraper(topic, max_articles_per_source))
        except Exception as e:  # network/parse errors: best-effort per source
            print(f"Error scraping {label}: {e}")
    # Sentiment distribution across every scraped article.
    sentiment_counts = Counter(article["Sentiment"] for article in all_articles)
    final_sentiment_summary_english = generate_final_sentiment_analysis(sentiment_counts, topic)
    # Translation & TTS for the summary line.
    final_sentiment_summary_hindi = translate_to_hindi(final_sentiment_summary_english)
    return {
        "Company": topic,
        "Articles": all_articles,
        "Comparative Sentiment Score": {
            "Sentiment Distribution": dict(sentiment_counts),
            "Topic Overlap": analyze_topic_overlap(all_articles),
            "Coverage Differences": generate_coverage_differences(all_articles),
        },
        "Final Sentiment Analysis": final_sentiment_summary_english,
        "Audio Bytes": text_to_speech_hindi(final_sentiment_summary_hindi),
    }
def extract_topics(text, max_keywords=3):
    """Extract up to *max_keywords* multi-word topics from *text* via RAKE.

    Phrases containing digits, punctuation, stop words, or the placeholder
    "summary available" are discarded. Falls back to ["General News"] when
    nothing survives the filters.
    """
    rake.extract_keywords_from_text(text)
    selected = []
    for phrase in rake.get_ranked_phrases():
        lowered = phrase.lower()
        is_multiword = len(phrase.split()) > 1
        has_placeholder = "summary available" in lowered
        has_number = re.search(r"\b\d+\b", phrase) is not None
        has_punct = re.search(r"[^\w\s-]", phrase) is not None
        letters_only = re.sub(r"[^a-zA-Z\s]", "", phrase).strip()
        has_stopword = any(word in stop_words for word in lowered.split())
        if (is_multiword and not has_placeholder and not has_number
                and not has_punct and len(letters_only) > 1 and not has_stopword):
            selected.append(phrase.title().strip())
    if not selected:
        return ["General News"]
    return selected[:max_keywords]
def generate_coverage_differences(articles): | |
"""Compares three random pairs of articles and generates coverage differences.""" | |
if len(articles) < 6: | |
return [{"Comparison": "Not enough articles to compare 3 pairs.", "Impact": "At least 6 articles required."}] | |
sampled_indices = random.sample(range(len(articles)), 6) | |
pairs = [(sampled_indices[i], sampled_indices[i+1]) for i in range(0, 6, 2)] | |
comparisons = [] | |
for idx1, idx2 in pairs: | |
article1 = articles[idx1] | |
article2 = articles[idx2] | |
title1 = article1['Title'].replace('\n', ' ').strip() | |
title2 = article2['Title'].replace('\n', ' ').strip() | |
sentiment1 = article1['Sentiment'].strip().lower() | |
sentiment2 = article2['Sentiment'].strip().lower() | |
comparisons.append({ | |
"Comparison": f"Article {idx1+1}: '{title1}' vs Article {idx2+1}: '{title2}'.", | |
"Impact": f"Article {idx1+1} is {sentiment1}, while Article {idx2+1} is {sentiment2}." | |
}) | |
return comparisons | |
def analyze_topic_overlap(articles):
    """Report topics shared by all topic-bearing articles plus each article's extras.

    Returns {"Common Topics": [...], "Unique Topics": {"Article N": [...]}};
    both empty when fewer than two articles are supplied.
    """
    if len(articles) < 2:
        return {"Common Topics": [], "Unique Topics": {}}
    topic_sets = [set(entry["Topics"]) for entry in articles if entry["Topics"]]
    # Only intersect when more than one article actually contributed topics.
    shared = set.intersection(*topic_sets) if len(topic_sets) > 1 else set()
    per_article = {}
    for position, entry in enumerate(articles, start=1):
        per_article[f"Article {position}"] = list(set(entry["Topics"]) - shared)
    return {"Common Topics": list(shared), "Unique Topics": per_article}
def generate_final_sentiment_analysis(sentiment_counts, company_name):
    """Summarize the overall tone of coverage from a Positive/Negative tally.

    *sentiment_counts* is expected to support ["Positive"]/["Negative"] lookups
    (e.g. a collections.Counter).
    """
    positives = sentiment_counts["Positive"]
    negatives = sentiment_counts["Negative"]
    if positives > negatives:
        return f"{company_name}’s latest news coverage is mostly positive. Potential stock growth expected."
    if negatives > positives:
        return f"{company_name} is facing challenges, with a high number of negative reports. Investors may remain cautious."
    return f"{company_name}'s news sentiment is neutral or mixed. Market response could go either way."
def translate_to_hindi(text):
    """Translate a known English sentiment summary to Hindi via fixed mappings.

    Keeps whatever precedes the matched English tail (the company name) and
    appends the Hindi equivalent; unrecognized text yields a generic Hindi
    error message.
    """
    mappings = (
        ("’s latest news coverage is mostly positive. Potential stock growth expected.",
         "की ताज़ा ख़बरों की कवरेज ज्यादातर सकारात्मक है। स्टॉक में वृद्धि की संभावना है।"),
        (" is facing challenges, with a high number of negative reports. Investors may remain cautious.",
         " चुनौतियों का सामना कर रहा है, कई नकारात्मक रिपोर्टों के साथ। निवेशक सतर्क रह सकते हैं।"),
        ("'s news sentiment is neutral or mixed. Market response could go either way.",
         "की खबरों की भावना तटस्थ या मिली-जुली है। बाज़ार की प्रतिक्रिया किसी भी दिशा में जा सकती है।"),
    )
    for english_tail, hindi_tail in mappings:
        if english_tail in text:
            prefix = text.split(english_tail)[0]
            return prefix + hindi_tail
    return "अनुवाद करने में त्रुटि हुई।"
def text_to_speech_hindi(text):
    """Synthesize Hindi speech for *text* with gTTS.

    Returns a BytesIO positioned at the start, containing the generated audio.
    """
    audio = BytesIO()
    gTTS(text=text, lang="hi").write_to_fp(audio)
    audio.seek(0)  # rewind so callers can read from the beginning
    return audio