import re
import random
from io import BytesIO
from datetime import datetime
from collections import Counter

import requests
import nltk
from bs4 import BeautifulSoup
from gtts import gTTS
from rake_nltk import Rake
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# from googletrans import Translator  # Unused: translate_to_hindi() falls back to static mappings

# Download the NLTK data that RAKE and the stopword filter depend on
nltk.download('punkt')
nltk.download('stopwords')  # Needed for filtering keywords
nltk.download('punkt_tab')

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
# Initialize Sentiment Analyzer
sia = SentimentIntensityAnalyzer()
# Initialize RAKE for keyword extraction
rake = Rake()
def get_news_articles(topic, max_articles_per_source=5):
    all_articles = []

    # --- Source 1: Times of India ---
    toi_url = f"https://timesofindia.indiatimes.com/topic/{topic}"
    try:
        response = requests.get(toi_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        results = soup.find_all("div", class_="uwU81")[:max_articles_per_source]
        for result in results:
            title_tag = result.find("div", class_="fHv_i o58kM")
            summary_tag = result.find("p", class_="oxXSK o58kM")
            link_tag = result.find("a")
            date_tag = result.find("div", class_="ZxBIG")

            title = title_tag.text.strip() if title_tag else "No title"
            summary = summary_tag.text.strip() if summary_tag else "No summary"
            link = f"https://timesofindia.indiatimes.com{link_tag['href']}" if link_tag else "#"

            # The date element typically reads "<byline> / Mar 25, 2025, 14:30 (IST)";
            # capture the part after the slash and drop the "(IST)" suffix.
            formatted_date = "Date not found"
            if date_tag:
                match = re.search(r"/\s+(.*?\(\w+\))", date_tag.get_text())
                if match:
                    date_str = match.group(1).replace("(IST)", "").strip()
                    try:
                        dt = datetime.strptime(date_str, "%b %d, %Y, %H:%M")
                        formatted_date = dt.strftime("%b %d, %Y")
                    except Exception:
                        formatted_date = date_str

            sentiment_score = sia.polarity_scores(f"{title}. {summary}")["compound"]
            sentiment = "Positive" if sentiment_score >= 0.05 else "Negative" if sentiment_score <= -0.05 else "Neutral"
            topics = extract_topics(title + " " + summary)

            all_articles.append({
                "Source": "Times of India",
                "Title": title,
                "Summary": summary,
                "Link": link,
                "Date": formatted_date,
                "Sentiment": sentiment,
                "Topics": topics
            })
    except Exception as e:
        print(f"Error scraping TOI: {e}")
    # --- Source 2: Economic Times ---
    et_url = f"https://economictimes.indiatimes.com/topic/{topic}"
    try:
        response = requests.get(et_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        results = soup.find_all("div", class_="contentD")[:max_articles_per_source]
        for result in results:
            a_tag = result.find("a", class_="wrapLines l2")
            summary_tag = result.find("p", class_="wrapLines l3")
            time_tag = result.find("time")

            title = a_tag.text.strip() if a_tag else "No title"
            link = f"https://economictimes.indiatimes.com{a_tag['href']}" if a_tag and "href" in a_tag.attrs else "#"
            summary = summary_tag.text.strip() if summary_tag else "No summary"
            date_str = time_tag.text.strip() if time_tag else "Date not found"
            try:
                dt = datetime.strptime(date_str.replace(" IST", ""), "%d %b, %Y, %I:%M %p")
                formatted_date = dt.strftime("%b %d, %Y")
            except Exception:
                formatted_date = date_str

            sentiment_score = sia.polarity_scores(f"{title}. {summary}")["compound"]
            sentiment = "Positive" if sentiment_score >= 0.05 else "Negative" if sentiment_score <= -0.05 else "Neutral"
            topics = extract_topics(title + " " + summary)

            all_articles.append({
                "Source": "Economic Times",
                "Title": title,
                "Summary": summary,
                "Link": link,
                "Date": formatted_date,
                "Sentiment": sentiment,
                "Topics": topics
            })
    except Exception as e:
        print(f"Error scraping Economic Times: {e}")
    # Sentiment distribution across all scraped articles
    sentiment_counts = Counter(article["Sentiment"] for article in all_articles)

    # Topic overlap across sources
    topic_overlap = analyze_topic_overlap(all_articles)

    # Coverage differences between randomly paired articles
    coverage_differences = generate_coverage_differences(all_articles)

    # Final sentiment summary, plus Hindi translation and TTS audio
    final_sentiment_summary_english = generate_final_sentiment_analysis(sentiment_counts, topic)
    final_sentiment_summary_hindi = translate_to_hindi(final_sentiment_summary_english)
    audio_bytes = text_to_speech_hindi(final_sentiment_summary_hindi)

    return {
        "Company": topic,
        "Articles": all_articles,
        "Comparative Sentiment Score": {
            "Sentiment Distribution": dict(sentiment_counts),
            "Topic Overlap": topic_overlap,
            "Coverage Differences": coverage_differences
        },
        "Final Sentiment Analysis": final_sentiment_summary_english,
        "Audio Bytes": audio_bytes
    }
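
# A minimal usage sketch (assumes network access and that the CSS class names
# scraped above still match the live markup of both sites; "Tesla" is an
# arbitrary example topic):
#
#   report = get_news_articles("Tesla")
#   print(report["Comparative Sentiment Score"]["Sentiment Distribution"])
#   for art in report["Articles"]:
#       print(f'{art["Source"]}: {art["Title"]} [{art["Sentiment"]}]')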
def extract_topics(text, max_keywords=3):
    """Extracts key topics using RAKE, filtering out irrelevant keywords."""
    rake.extract_keywords_from_text(text)
    keywords = []
    for kw in rake.get_ranked_phrases():
        cleaned_kw = kw.title().strip()
        if (
            len(kw.split()) > 1 and                                     # multi-word phrases only
            "summary available" not in kw.lower() and                   # drop "no summary available" placeholders
            not re.search(r"\b\d+\b", kw) and                           # no standalone numbers
            not re.search(r"[^\w\s-]", kw) and                          # no punctuation or symbols
            len(re.sub(r"[^a-zA-Z\s]", "", kw).strip()) > 1 and         # must contain actual letters
            not any(word in stop_words for word in kw.lower().split())  # no stopwords
        ):
            keywords.append(cleaned_kw)
    return keywords[:max_keywords] if keywords else ["General News"]
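
# Illustrative example (hypothetical output; actual phrases depend on RAKE's
# ranking and the installed NLTK stopword list):
#
#   extract_topics("Quarterly profits jump on strong electric vehicle demand")
#   -> e.g. ['Strong Electric Vehicle Demand', 'Quarterly Profits Jump']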
def generate_coverage_differences(articles):
    """Compares three random pairs of articles and generates coverage differences."""
    if len(articles) < 6:
        return [{"Comparison": "Not enough articles to compare 3 pairs.", "Impact": "At least 6 articles required."}]

    # Draw six distinct articles and pair them off: (0,1), (2,3), (4,5)
    sampled_indices = random.sample(range(len(articles)), 6)
    pairs = [(sampled_indices[i], sampled_indices[i + 1]) for i in range(0, 6, 2)]

    comparisons = []
    for idx1, idx2 in pairs:
        article1 = articles[idx1]
        article2 = articles[idx2]
        title1 = article1['Title'].replace('\n', ' ').strip()
        title2 = article2['Title'].replace('\n', ' ').strip()
        sentiment1 = article1['Sentiment'].strip().lower()
        sentiment2 = article2['Sentiment'].strip().lower()
        comparisons.append({
            "Comparison": f"Article {idx1+1}: '{title1}' vs Article {idx2+1}: '{title2}'.",
            "Impact": f"Article {idx1+1} is {sentiment1}, while Article {idx2+1} is {sentiment2}."
        })
    return comparisons
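
# Output shape sketch (pairings are random, so the exact articles chosen vary
# between calls):
#
#   [{"Comparison": "Article 2: '...' vs Article 7: '...'.",
#     "Impact": "Article 2 is positive, while Article 7 is negative."},
#    ...]  # three entries, one per sampled pair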
def analyze_topic_overlap(articles):
    """Finds common and unique topics among articles."""
    if len(articles) < 2:
        return {"Common Topics": [], "Unique Topics": {}}
    all_topics = [set(article["Topics"]) for article in articles if article["Topics"]]
    common_topics = set.intersection(*all_topics) if len(all_topics) > 1 else set()
    unique_topics = {
        f"Article {idx+1}": list(set(article["Topics"]) - common_topics)
        for idx, article in enumerate(articles)
    }
    return {"Common Topics": list(common_topics), "Unique Topics": unique_topics}
def generate_final_sentiment_analysis(sentiment_counts, company_name):
    """Generates a final summary based on sentiment distribution."""
    # NOTE: these strings must exactly match the mapping keys in translate_to_hindi().
    if sentiment_counts["Positive"] > sentiment_counts["Negative"]:
        return f"{company_name}'s latest news coverage is mostly positive. Potential stock growth expected."
    elif sentiment_counts["Negative"] > sentiment_counts["Positive"]:
        return f"{company_name} is facing challenges, with a high number of negative reports. Investors may remain cautious."
    else:
        return f"{company_name}'s news sentiment is neutral or mixed. Market response could go either way."
def translate_to_hindi(text):
    """Fallback translation using pre-defined mappings (googletrans is not used)."""
    # Keys must exactly match the English summaries produced by
    # generate_final_sentiment_analysis(); values carry a leading space so the
    # company name and the Hindi text do not run together.
    translations = {
        "'s latest news coverage is mostly positive. Potential stock growth expected.":
            " की ताज़ा ख़बरों की कवरेज ज्यादातर सकारात्मक है। स्टॉक में वृद्धि की संभावना है।",
        " is facing challenges, with a high number of negative reports. Investors may remain cautious.":
            " चुनौतियों का सामना कर रहा है, कई नकारात्मक रिपोर्टों के साथ। निवेशक सतर्क रह सकते हैं।",
        "'s news sentiment is neutral or mixed. Market response could go either way.":
            " की खबरों की भावना तटस्थ या मिली-जुली है। बाज़ार की प्रतिक्रिया किसी भी दिशा में जा सकती है।"
    }
    for key, val in translations.items():
        if key in text:
            # Keep the company name (the part before the matched key) and
            # append the Hindi rendering of the rest.
            return text.split(key)[0] + val
    return "अनुवाद करने में त्रुटि हुई।"  # "An error occurred during translation."
def text_to_speech_hindi(text):
    """Converts text to Hindi speech using gTTS and returns an in-memory MP3 buffer."""
    tts = gTTS(text=text, lang="hi")
    audio_buffer = BytesIO()
    tts.write_to_fp(audio_buffer)
    audio_buffer.seek(0)  # Rewind so callers can read from the start
    return audio_buffer
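
# A minimal end-to-end sketch for local testing. "Tesla" and the output
# filename are arbitrary choices for illustration; running this performs
# live HTTP requests to both news sites.
if __name__ == "__main__":
    report = get_news_articles("Tesla", max_articles_per_source=5)
    print(report["Final Sentiment Analysis"])
    with open("news_report.mp3", "wb") as f:  # gTTS output is MP3-encoded
        f.write(report["Audio Bytes"].read())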