# Hugging Face Spaces status banner ("Spaces / Running") captured along with
# the page scrape — not part of the program source.
import functools
import hashlib
import logging
import os
import threading
from datetime import datetime, timedelta

import feedparser
import gradio as gr
import pytz
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
# Set up logging | |
logging.basicConfig(level=logging.INFO) | |
logger = logging.getLogger(__name__) | |
# Global settings | |
OPENROUTER_API_KEY = "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" | |
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions" | |
SUMMARIZER_MODELS = { | |
"Default (facebook/bart-large-cnn)": "local_bart", | |
"Free Model (distilbart-cnn-6-6)": "local_distilbart", | |
"OpenRouter (Claude-3)": "anthropic/claude-3-haiku", | |
"OpenRouter (GPT-4)": "openai/gpt-4" | |
} | |
CACHE_SIZE = 500 | |
RSS_FETCH_INTERVAL = timedelta(hours=8) | |
ARTICLE_LIMIT = 5 | |
# Updated categories and news sources | |
CATEGORIES = ["Technology", "Business", "Science", "World News", "Sports", "Health"] | |
NEWS_SOURCES = { | |
"Technology": { | |
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml", | |
"reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best", | |
"alarabiya arabic": "https://www.alarabiya.net/feed/rss2/ar/technology.xml", | |
}, | |
"Business": { | |
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml", | |
"reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best", | |
"alwatanvoice arabic": "https://feeds.alwatanvoice.com/ar/business.xml", | |
}, | |
"Science": { | |
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml" | |
}, | |
"World News": { | |
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml", | |
"BBC": "http://feeds.bbci.co.uk/news/world/rss.xml", | |
"CNN": "http://rss.cnn.com/rss/edition_world.rss", | |
"reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best", | |
"france24 arabic": "https://www.france24.com/ar/rss", | |
"aljazera arabic": "https://www.aljazeera.net/aljazeerarss/a7c186be-1baa-4bd4-9d80-a84db769f779/73d0e1b4-532f-45ef-b135-bfdff8b8cab9", | |
}, | |
"Sports": { | |
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml", | |
"reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best", | |
"france24 arabic": "https://www.france24.com/ar/%D8%B1%D9%8A%D8%A7%D8%B6%D8%A9/rss", | |
}, | |
"Health": { | |
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml", | |
"politico": "http://rss.politico.com/healthcare.xml", | |
"reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best" | |
}, | |
} | |
class NewsCache: | |
def __init__(self, size): | |
self.cache = {} | |
self.size = size | |
self.lock = threading.Lock() | |
def get(self, key): | |
with self.lock: | |
return self.cache.get(key) | |
def set(self, key, value): | |
with self.lock: | |
if len(self.cache) >= self.size: | |
oldest_key = next(iter(self.cache)) | |
del self.cache[oldest_key] | |
self.cache[key] = value | |
cache = NewsCache(CACHE_SIZE) | |
def detect_language(text): | |
"""Detect if the text is primarily Arabic""" | |
if not text: | |
return False | |
arabic_chars = len([c for c in text if '\u0600' <= c <= '\u06FF']) | |
return (arabic_chars / len(text)) > 0.5 | |
def summarize_text(text, model_name): | |
try: | |
content_hash = hashlib.md5(text.encode()).hexdigest() | |
cached_summary = cache.get(content_hash) | |
if cached_summary: | |
logger.info("Using cached summary") | |
return cached_summary | |
is_arabic = detect_language(text) | |
if is_arabic or model_name in ["OpenRouter (Claude-3)", "OpenRouter (GPT-4)"]: | |
logger.info(f"Using OpenRouter with model {model_name} for summarization") | |
headers = { | |
"Authorization": f"Bearer {OPENROUTER_API_KEY}", | |
"HTTP-Referer": "https://localhost:7860", | |
"X-Title": "News Summarizer App", | |
"Content-Type": "application/json" | |
} | |
prompt = f"Please provide a concise summary of the following news article in the same language as the original text. Keep the summary brief and focused on key points:\n\n{text}" | |
model_id = SUMMARIZER_MODELS[model_name] | |
data = { | |
"model": model_id, | |
"messages": [{"role": "user", "content": prompt}], | |
"max_tokens": 150 | |
} | |
response = requests.post(OPENROUTER_API_URL, headers=headers, json=data) | |
response.raise_for_status() | |
summary = response.json()["choices"][0]["message"]["content"] | |
else: | |
logger.info("Using local model for summarization") | |
model_path = "facebook/bart-large-cnn" if model_name == "Default (facebook/bart-large-cnn)" else "sshleifer/distilbart-cnn-6-6" | |
summarizer = pipeline("summarization", model=model_path, device=-1) | |
result = summarizer(text, max_length=120, min_length=40, truncation=True) | |
summary = result[0]['summary_text'] | |
cache.set(content_hash, summary) | |
return summary | |
except Exception as e: | |
logger.error(f"Error in summarization: {str(e)}") | |
return f"Summary unavailable. Error: {str(e)}" | |
def fetch_rss_news(tech_sources, business_sources, science_sources, world_sources, sports_sources, health_sources): | |
articles = [] | |
cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL | |
category_sources = { | |
"Technology": tech_sources if tech_sources else [], | |
"Business": business_sources if business_sources else [], | |
"Science": science_sources if science_sources else [], | |
"World News": world_sources if world_sources else [], | |
"Sports": sports_sources if sports_sources else [], | |
"Health": health_sources if health_sources else [] | |
} | |
logger.info(f"Selected sources: {category_sources}") | |
for category, sources in category_sources.items(): | |
if not sources: | |
continue | |
logger.info(f"Processing category: {category} with sources: {sources}") | |
for source in sources: | |
if source in NEWS_SOURCES[category]: | |
url = NEWS_SOURCES[category][source] | |
try: | |
logger.info(f"Fetching from URL: {url}") | |
feed = feedparser.parse(url) | |
if hasattr(feed, 'status') and feed.status != 200: | |
logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}") | |
continue | |
for entry in feed.entries: | |
try: | |
published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC) | |
if published > cutoff_time: | |
articles.append({ | |
"title": entry.title, | |
"description": BeautifulSoup(entry.description, "html.parser").get_text(), | |
"link": entry.link, | |
"category": category, | |
"source": source, | |
"published": published | |
}) | |
except (AttributeError, TypeError) as e: | |
logger.error(f"Error processing entry: {str(e)}") | |
continue | |
except Exception as e: | |
logger.error(f"Error fetching feed from {url}: {str(e)}") | |
continue | |
logger.info(f"Total articles fetched: {len(articles)}") | |
articles = sorted(articles, key=lambda x: x["published"], reverse=True)[:ARTICLE_LIMIT] | |
return articles | |
def summarize_articles(articles, model_name): | |
summaries = [] | |
for article in articles: | |
content = article["description"] | |
summary = summarize_text(content, model_name) | |
summaries.append(f""" | |
<div style='margin-bottom: 20px; white-space: pre-wrap;'> | |
π° {article['title']} | |
π Summary: {summary} | |
- π Category: {article['category']} | |
- π‘ Source: {article['source']} | |
- π Read More: <a href="{article['link']}" target="_blank" style="text-decoration: none;">click here</a> | |
</div> | |
""") | |
return "\n".join(summaries) | |
def get_summary(tech_sources, business_sources, science_sources, world_sources, | |
sports_sources, health_sources, selected_model): | |
try: | |
if not any([tech_sources, business_sources, science_sources, | |
world_sources, sports_sources, health_sources]): | |
return "Please select at least one news source." | |
articles = fetch_rss_news(tech_sources, business_sources, science_sources, | |
world_sources, sports_sources, health_sources) | |
if not articles: | |
return "No recent news found from the selected sources." | |
return summarize_articles(articles, selected_model) | |
except Exception as e: | |
logger.error(f"Error in get_summary: {str(e)}") | |
return f"An error occurred while processing your request: {str(e)}" | |
# Gradio Interface | |
demo = gr.Blocks() | |
with demo: | |
gr.Markdown("# π° AI News Summarizer") | |
with gr.Row(): | |
with gr.Column(): | |
tech_sources = gr.CheckboxGroup( | |
choices=list(NEWS_SOURCES["Technology"].keys()), | |
label="Technology Sources", | |
value=[] | |
) | |
business_sources = gr.CheckboxGroup( | |
choices=list(NEWS_SOURCES["Business"].keys()), | |
label="Business Sources", | |
value=[] | |
) | |
science_sources = gr.CheckboxGroup( | |
choices=list(NEWS_SOURCES["Science"].keys()), | |
label="Science Sources", | |
value=[] | |
) | |
with gr.Column(): | |
world_sources = gr.CheckboxGroup( | |
choices=list(NEWS_SOURCES["World News"].keys()), | |
label="World News Sources", | |
value=[] | |
) | |
sports_sources = gr.CheckboxGroup( | |
choices=list(NEWS_SOURCES["Sports"].keys()), | |
label="Sports Sources", | |
value=[] | |
) | |
health_sources = gr.CheckboxGroup( | |
choices=list(NEWS_SOURCES["Health"].keys()), | |
label="Health Sources", | |
value=[] | |
) | |
with gr.Column(): | |
model_selector = gr.Radio( | |
choices=list(SUMMARIZER_MODELS.keys()), | |
label="Choose Summarization Model", | |
value="OpenRouter (Claude-3)" | |
) | |
summarize_button = gr.Button("Get News Summary") | |
summary_output = gr.HTML(label="News Summary") | |
summarize_button.click( | |
get_summary, | |
inputs=[ | |
tech_sources, | |
business_sources, | |
science_sources, | |
world_sources, | |
sports_sources, | |
health_sources, | |
model_selector | |
], | |
outputs=summary_output | |
) | |
if __name__ == "__main__": | |
demo.launch() | |