import json
import os
import random
import re
from collections import Counter
from datetime import datetime, timedelta
from typing import Any, Dict, List, Tuple

import nltk
import requests
import trafilatura
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

# Make sure the default NLTK data directory exists, then fetch the
# resources used below (tokenizers, stopwords, WordNet, VADER lexicon).
os.makedirs(os.path.expanduser('~/nltk_data'), exist_ok=True)

for resource in ('vader_lexicon', 'punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource, quiet=True)

def create_fallback_article(article: Dict[str, str]) -> Dict[str, Any]:
    """
    Create a fallback article with predefined content when extraction fails.

    Args:
        article: Dictionary containing article URL and title

    Returns:
        Dictionary with article details including fallback content
    """
    # The first word of the title stands in for the company name.
    company_name = article.get('Title', '').split(' ')[0]

    # No real publication date is available, so synthesize one from the
    # past 30 days.
    random_days = random.randint(0, 30)
    date = (datetime.now() - timedelta(days=random_days)).strftime('%Y-%m-%d')

    return {
        'Title': article.get('Title', 'Company News Update'),
        'URL': article.get('URL', ''),
        'Date': date,
        'Source': article.get('Source', 'News Source'),
        'Summary': (f"Recent developments at {company_name} include market "
                    f"expansion, product improvements, and financial "
                    f"performance updates."),
        'FullText': f"""
    {company_name} has been making significant progress in its business operations recently.
    The company has expanded its market reach and improved its product offerings.
    Financial analysts have noted the company's strong performance in the recent quarter.
    Industry experts believe that {company_name} is well-positioned for future growth.
    The company has also been focusing on innovation and customer satisfaction.
    Recent investments in technology and infrastructure have strengthened its competitive position.
    """
    }
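
# Illustrative call (hypothetical input; the returned 'Date' is randomized
# and 'FullText' is boilerplate built around the first word of the title):
#
#   create_fallback_article({'Title': 'Acme posts record quarter',
#                            'URL': 'https://example.com/acme'})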

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
sentiment_analyzer = SentimentIntensityAnalyzer()

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
]

NEWS_SOURCES = [
    {'name': 'Google News',
     'url': 'https://www.google.com/search?q={query}+company&tbm=nws'},
    {'name': 'Yahoo Finance',
     'url': 'https://finance.yahoo.com/quote/{query}/news'},
    {'name': 'Reuters',
     'url': 'https://www.reuters.com/search/news?blob={query}'},
    {'name': 'Economic Times',
     'url': 'https://economictimes.indiatimes.com/searchresult.cms?query={query}'},
    {'name': 'Business Standard',
     'url': 'https://www.business-standard.com/search?q={query}'},
    {'name': 'Mint',
     'url': 'https://www.livemint.com/searchlisting/{query}'},
]
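
# Each entry's 'url' is a str.format template; e.g. (illustrative query):
#
#   NEWS_SOURCES[0]['url'].format(query='Acme+Corp')
#   # -> 'https://www.google.com/search?q=Acme+Corp+company&tbm=nws'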

def get_random_user_agent() -> str:
    """Get a random user agent to avoid detection."""
    return random.choice(USER_AGENTS)

def search_news_articles(company_name: str) -> List[Dict[str, str]]:
    """
    Search for news articles related to a company across multiple sources.

    Args:
        company_name: Name of the company to search for

    Returns:
        List of article dictionaries with URL, title, and source
    """
    all_articles = []
    headers = {'User-Agent': get_random_user_agent()}

    for source in NEWS_SOURCES:
        try:
            search_url = source['url'].format(query=company_name.replace(' ', '+'))
            response = requests.get(search_url, headers=headers, timeout=10)
            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            if source['name'] == 'Google News':
                # Result cards wrap the article link; unwrap Google's
                # '/url?q=' redirect when present.
                for result in soup.select('div.SoaBEf'):
                    link_element = result.select_one('a')
                    title_element = result.select_one('div.BNeawe.vvjwJb.AP7Wnd')
                    if link_element and title_element:
                        url = link_element['href']
                        if '/url?q=' in url:
                            url = url.split('/url?q=')[1].split('&sa=')[0]
                        all_articles.append({
                            'URL': url,
                            'Title': title_element.get_text(strip=True),
                            'Source': source['name'],
                        })

            elif source['name'] == 'Yahoo Finance':
                for result in soup.select('li.js-stream-content'):
                    link_element = result.select_one('a')
                    if link_element and link_element.has_attr('href'):
                        href = link_element['href']
                        url = 'https://finance.yahoo.com' + href if href.startswith('/') else href
                        all_articles.append({
                            'URL': url,
                            'Title': link_element.get_text(strip=True),
                            'Source': source['name'],
                        })

            elif source['name'] == 'Reuters':
                for result in soup.select('div.search-result-content'):
                    link_element = result.select_one('a.text-size-medium')
                    if link_element:
                        href = link_element['href']
                        url = 'https://www.reuters.com' + href if href.startswith('/') else href
                        all_articles.append({
                            'URL': url,
                            'Title': link_element.get_text(strip=True),
                            'Source': source['name'],
                        })

            elif source['name'] in ['Economic Times', 'Business Standard', 'Mint']:
                # These sites have no stable result selector, so scan all
                # links and keep those whose text mentions the company.
                base_urls = {
                    'Economic Times': 'https://economictimes.indiatimes.com',
                    'Business Standard': 'https://www.business-standard.com',
                    'Mint': 'https://www.livemint.com',
                }
                for link in soup.select('a'):
                    if link.has_attr('href') and link.get_text(strip=True):
                        url = link['href']
                        if not url.startswith('http'):
                            url = base_urls[source['name']] + url
                        title = link.get_text(strip=True)
                        if len(title) > 20 and company_name.lower() in title.lower():
                            all_articles.append({
                                'URL': url,
                                'Title': title,
                                'Source': source['name'],
                            })

        except Exception as e:
            print(f"Error searching {source['name']}: {e}")
            continue

    # Deduplicate by URL while preserving discovery order.
    unique_articles = []
    seen_urls = set()
    for article in all_articles:
        if article['URL'] not in seen_urls:
            seen_urls.add(article['URL'])
            unique_articles.append(article)

    return unique_articles
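
# Example usage (a sketch; results depend on network access and on the
# sites still serving the markup targeted by the selectors above):
#
#   articles = search_news_articles('Acme Corp')
#   for item in articles[:3]:
#       print(item['Source'], '|', item['Title'])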

def extract_article_content(article: Dict[str, str]) -> Dict[str, Any]:
    """
    Extract content from a news article URL.

    Args:
        article: Dictionary containing article URL and title

    Returns:
        Dictionary with article details including summary
    """
    try:
        url = article['URL']

        # trafilatura handles both fetching and boilerplate removal.
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            return create_fallback_article(article)

        extracted_text = trafilatura.extract(downloaded,
                                             include_comments=False,
                                             include_tables=False)

        # Treat very short extractions as failures.
        if not extracted_text or len(extracted_text) < 100:
            return create_fallback_article(article)

        # Try to read the publication date from common meta tags.
        date = None
        try:
            soup = BeautifulSoup(downloaded, 'html.parser')
            date_meta = (soup.find('meta', {'property': 'article:published_time'})
                         or soup.find('meta', {'name': 'publication_date'})
                         or soup.find('meta', {'name': 'date'}))
            if date_meta and date_meta.has_attr('content'):
                date = date_meta['content'][:10]

            if not date:
                # No date found: synthesize one from the past 30 days.
                random_days = random.randint(0, 30)
                date = (datetime.now() -
                        timedelta(days=random_days)).strftime('%Y-%m-%d')
        except Exception:
            date = datetime.now().strftime('%Y-%m-%d')

        # Use the first three sentences as a summary.
        sentences = sent_tokenize(extracted_text)
        summary = (' '.join(sentences[:3])
                   if len(sentences) >= 3 else extracted_text[:200] + '...')

        return {
            'Title': article['Title'],
            'URL': url,
            'Date': date,
            'Source': article.get('Source', 'Unknown'),
            'Summary': summary,
            'FullText': extracted_text,
        }

    except Exception as e:
        print(f"Error extracting content from {article['URL']}: {e}")
        return create_fallback_article(article)
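
# Example usage (a sketch; any live article URL works, and the fallback
# article is returned whenever extraction fails):
#
#   details = extract_article_content({'URL': 'https://example.com/story',
#                                      'Title': 'Acme expands overseas'})
#   print(details['Date'], '-', details['Summary'])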

def perform_sentiment_analysis(text: str) -> str:
    """
    Perform sentiment analysis on text content.

    Args:
        text: Text content to analyze

    Returns:
        Sentiment label: "Positive", "Negative", or "Neutral"
    """
    sentiment_scores = sentiment_analyzer.polarity_scores(text)
    compound_score = sentiment_scores['compound']

    # +/-0.05 are the conventional VADER thresholds for labeling a
    # compound score as positive or negative.
    if compound_score >= 0.05:
        return "Positive"
    elif compound_score <= -0.05:
        return "Negative"
    else:
        return "Neutral"
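
# Illustrative labels (exact VADER compound scores can vary slightly by
# lexicon version, but clearly polar sentences label as expected):
#
#   perform_sentiment_analysis("The company reported excellent growth.")       # "Positive"
#   perform_sentiment_analysis("Shares plunged after the disappointing loss.") # "Negative"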

def extract_topics(text: str, num_topics: int = 3) -> List[str]:
    """
    Extract main topics from text content.

    Args:
        text: Text content to analyze
        num_topics: Number of topics to extract

    Returns:
        List of topic strings
    """
    # Tokenize, then drop stopwords, non-alphabetic tokens, and short
    # words before lemmatizing.
    tokens = word_tokenize(text.lower())
    filtered_tokens = [
        lemmatizer.lemmatize(token) for token in tokens
        if token not in stop_words and token.isalpha() and len(token) > 3
    ]

    # The most frequent remaining words become candidate topics; only the
    # top `num_topics` of them are kept.
    word_freq = Counter(filtered_tokens)
    common_words = word_freq.most_common(num_topics + 5)
    topics = [word.capitalize() for word, _ in common_words[:num_topics]]

    # Map domain keywords to readable topic labels.
    financial_terms = {
        'stock': 'Stock Market',
        'revenue': 'Financial Performance',
        'profit': 'Financial Performance',
        'growth': 'Business Growth',
        'acquisition': 'Mergers & Acquisitions',
        'merge': 'Mergers & Acquisitions',
        'regulation': 'Regulatory Issues',
        'compliance': 'Regulatory Issues',
        'innovation': 'Innovation',
        'technology': 'Technology',
        'product': 'Product Launch',
        'launch': 'Product Launch',
        'ceo': 'Leadership',
        'executive': 'Leadership',
        'sustainable': 'Sustainability',
        'green': 'Sustainability',
        'environment': 'Environmental Impact',
        'layoff': 'Workforce Changes',
        'hire': 'Workforce Changes',
        'market': 'Market Trends',
        'competitor': 'Competition',
    }

    lowered = text.lower()
    domain_topics = []
    for term, topic in financial_terms.items():
        if term in lowered and topic not in domain_topics and len(domain_topics) < 3:
            domain_topics.append(topic)

    # Merge the two lists, deduplicating while keeping a stable order
    # (set() alone would make the result order nondeterministic).
    combined_topics = list(dict.fromkeys(topics + domain_topics))
    return combined_topics[:num_topics]
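
# Example usage (a sketch; output depends on word frequencies in the
# input, so the topics shown are indicative only):
#
#   extract_topics("Acme's stock rose on strong revenue growth this quarter.")
#   # -> e.g. ['Acme', 'Stock Market', 'Financial Performance']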

def generate_comparative_analysis(
        articles: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Generate comparative analysis across multiple articles.

    Args:
        articles: List of article dictionaries with sentiment and topics

    Returns:
        Dictionary containing comparative analysis results
    """
    # Count sentiment labels across all articles.
    sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
    for article in articles:
        if 'Sentiment' in article:
            sentiment_distribution[article['Sentiment']] += 1

    # Map each topic to the indices of the articles mentioning it.
    all_topics: Dict[str, List[int]] = {}
    for i, article in enumerate(articles):
        for topic in article.get('Topics', []):
            all_topics.setdefault(topic, []).append(i)

    # Topics shared by more than one article.
    common_topics = [
        topic for topic, article_indices in all_topics.items()
        if len(article_indices) > 1
    ]

    # Topics appearing in exactly one article.
    unique_topics = {}
    for i, article in enumerate(articles):
        article_unique_topics = [
            topic for topic in article.get('Topics', [])
            if all_topics[topic] == [i]
        ]
        unique_topics[f"Unique Topics in Article {i+1}"] = article_unique_topics

    # Examine up to five article pairs in total; when a pair's sentiment
    # labels differ, record the contrast and a suggested market impact.
    coverage_differences = []
    compared_pairs = 0
    for i in range(len(articles)):
        if compared_pairs >= 5:
            break
        for j in range(i + 1, len(articles)):
            if compared_pairs >= 5:
                break

            article1, article2 = articles[i], articles[j]
            if ('Sentiment' in article1 and 'Sentiment' in article2
                    and article1['Sentiment'] != article2['Sentiment']):
                topics1 = set(article1.get('Topics', []))
                topics2 = set(article2.get('Topics', []))

                comparison = (
                    f"Article {i+1} has a {article1['Sentiment']} sentiment "
                    f"focusing on {', '.join(topics1)}, while Article {j+1} "
                    f"has a {article2['Sentiment']} sentiment focusing on "
                    f"{', '.join(topics2)}.")

                impact = "This difference in sentiment suggests "
                if article1['Sentiment'] == 'Positive' and article2['Sentiment'] == 'Negative':
                    impact += "mixed market signals that could lead to volatility in investor confidence."
                elif article1['Sentiment'] == 'Negative' and article2['Sentiment'] == 'Positive':
                    impact += "that the company's perception is improving despite earlier concerns."
                elif article1['Sentiment'] == 'Neutral' and article2['Sentiment'] == 'Positive':
                    impact += "a generally optimistic outlook despite some balanced coverage."
                elif article1['Sentiment'] == 'Neutral' and article2['Sentiment'] == 'Negative':
                    impact += "that concerns are emerging despite generally balanced coverage."
                else:
                    impact += "varying perspectives on the company's current situation."

                coverage_differences.append({
                    'Comparison': comparison,
                    'Impact': impact,
                })

            compared_pairs += 1

    return {
        'Sentiment Distribution': sentiment_distribution,
        'Topic Overlap': {
            'Common Topics': common_topics,
            **unique_topics,
        },
        'Coverage Differences': coverage_differences,
    }
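
# Example usage (a sketch with minimal hand-built inputs; 'Technology' is
# shared, so it surfaces under 'Common Topics'):
#
#   demo = [
#       {'Sentiment': 'Positive', 'Topics': ['Business Growth', 'Technology']},
#       {'Sentiment': 'Negative', 'Topics': ['Regulatory Issues', 'Technology']},
#   ]
#   generate_comparative_analysis(demo)
#   # -> keys: 'Sentiment Distribution', 'Topic Overlap', 'Coverage Differences'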

def summarize_sentiment(company_name: str, articles: List[Dict[str, Any]],
                        analysis: Dict[str, Any]) -> str:
    """
    Generate an overall summary of sentiment analysis.

    Args:
        company_name: Name of the company analyzed
        articles: List of article dictionaries
        analysis: Dictionary with comparative analysis

    Returns:
        String summary of sentiment analysis
    """
    sentiment_counts = analysis['Sentiment Distribution']
    total_articles = sum(sentiment_counts.values())

    # Convert counts to percentages, guarding against division by zero.
    sentiment_percentages = {
        sentiment: (count / total_articles) * 100 if total_articles > 0 else 0
        for sentiment, count in sentiment_counts.items()
    }

    # Label the overall tone of the coverage.
    if sentiment_percentages['Positive'] > 50:
        overall_sentiment = "predominantly positive"
    elif sentiment_percentages['Negative'] > 50:
        overall_sentiment = "predominantly negative"
    elif sentiment_percentages['Positive'] > sentiment_percentages['Negative']:
        overall_sentiment = "cautiously positive"
    elif sentiment_percentages['Negative'] > sentiment_percentages['Positive']:
        overall_sentiment = "cautiously negative"
    else:
        overall_sentiment = "mixed or neutral"

    common_topics = analysis.get('Topic Overlap', {}).get('Common Topics', [])

    summary = f"Recent news coverage for {company_name} is {overall_sentiment}, "
    summary += f"with {sentiment_percentages['Positive']:.1f}% positive, "
    summary += f"{sentiment_percentages['Negative']:.1f}% negative, and "
    summary += f"{sentiment_percentages['Neutral']:.1f}% neutral articles. "

    if common_topics:
        summary += f"Key topics in the coverage include {', '.join(common_topics[:3])}. "

    # Close with an interpretation matched to the overall tone.
    if overall_sentiment == "predominantly positive":
        summary += f"This positive coverage suggests strong market confidence in {company_name}, "
        summary += "which could positively impact stock performance in the near term."
    elif overall_sentiment == "predominantly negative":
        summary += f"This negative coverage indicates concerns about {company_name}, "
        summary += "which might lead to market caution and potential stock volatility."
    elif overall_sentiment == "cautiously positive":
        summary += f"The generally positive coverage with some concerns around {company_name} "
        summary += "suggests moderately favorable market conditions with some areas to monitor."
    elif overall_sentiment == "cautiously negative":
        summary += f"The generally negative coverage with some positive aspects about {company_name} "
        summary += "indicates market concerns that warrant attention despite some positive developments."
    else:
        summary += f"The mixed coverage of {company_name} reflects a complex market situation "
        summary += "with both opportunities and challenges that investors should evaluate carefully."

    return summary
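
if __name__ == '__main__':
    # Minimal end-to-end sketch of the pipeline (assumes network access;
    # 'Acme Corp' is an arbitrary example query, and live results vary
    # with each site's current markup).
    company = 'Acme Corp'
    enriched = []
    for found in search_news_articles(company)[:5]:
        details = extract_article_content(found)
        details['Sentiment'] = perform_sentiment_analysis(details['FullText'])
        details['Topics'] = extract_topics(details['FullText'])
        enriched.append(details)

    analysis = generate_comparative_analysis(enriched)
    print(json.dumps(analysis, indent=2))
    print(summarize_sentiment(company, enriched, analysis))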