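"""Personalized AI news summarizer.

Fetches recent articles from category-based RSS feeds, summarizes them with a
Hugging Face `transformers` pipeline, and serves the results through a Gradio
interface backed by simple per-user JSON preference files.
"""
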
import hashlib
import json
import logging
import os
import threading
from datetime import datetime, timedelta

import feedparser
import gradio as gr
import pytz
from bs4 import BeautifulSoup
from transformers import pipeline

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Global settings
SUMMARIZER_MODEL = "facebook/bart-large-cnn"  # Any transformers summarization checkpoint can be swapped in
CACHE_SIZE = 500  # Maximum number of cached summaries
RSS_FETCH_INTERVAL = timedelta(hours=8)  # Only keep articles published within the last 8 hours
TIMEOUT_LIMIT = 30  # Intended cap in seconds on summary processing (currently unused)

# News sources
NEWS_SOURCES = {
    "Technology": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best"
    },
    "Business": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best"
    },
    "Science": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml"
    },
    "World News": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
        "CNN": "http://rss.cnn.com/rss/edition_world.rss",
        "reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best"
    },
    "Sports": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best"
    },
    "Health": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
        "politico": "http://rss.politico.com/healthcare.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best"
    },
}
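# Note: RSS endpoints change over time (the Reuters and CNN feeds above may no
# longer resolve); fetch failures are logged and skipped rather than treated as fatal.
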
# Thread-safe FIFO cache for generated summaries
class NewsCache:
    def __init__(self, size):
        self.cache = {}
        self.size = size
        self.lock = threading.Lock()

    def get(self, key):
        with self.lock:
            return self.cache.get(key)

    def set(self, key, value):
        with self.lock:
            if len(self.cache) >= self.size:
                # Evict the oldest entry (dicts preserve insertion order in Python 3.7+)
                oldest_key = next(iter(self.cache))
                del self.cache[oldest_key]
            self.cache[key] = value

cache = NewsCache(CACHE_SIZE)
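# Usage sketch with a hypothetical key/value pair (real keys are MD5 hex digests):
#   cache.set("9e107d9d...", "A short summary.")
#   cache.get("9e107d9d...")  # -> "A short summary." (or None on a miss)
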
# Initialize summarizer
summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=-1)
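# device=-1 runs the pipeline on CPU; pass device=0 to use the first CUDA GPU instead.
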
# Utility functions
def fetch_rss_news(categories):
    """Fetch news articles from RSS feeds for the selected categories."""
    articles = []
    cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
    for category in categories:
        for source, url in NEWS_SOURCES.get(category, {}).items():
            try:
                feed = feedparser.parse(url)
                for entry in feed.entries:
                    # Skip entries without a parseable publication time
                    if not getattr(entry, "published_parsed", None):
                        continue
                    published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
                    if published > cutoff_time:
                        articles.append({
                            "title": entry.get("title", ""),
                            "description": BeautifulSoup(entry.get("description", ""), "html.parser").get_text(),
                            "link": entry.get("link", ""),
                            "category": category,
                            "source": source,
                            "published": published,
                        })
            except Exception as e:
                logging.error(f"Failed to fetch from {url}: {e}")
    return articles

def summarize_text(text):
    """Summarize the text with the model, caching results by content hash."""
    if not text or not text.strip():
        return "Summary unavailable."
    content_hash = hashlib.md5(text.encode()).hexdigest()
    cached_summary = cache.get(content_hash)
    if cached_summary:
        return cached_summary
    try:
        result = summarizer(text, max_length=120, min_length=40, truncation=True)
        summary = result[0]['summary_text']
        cache.set(content_hash, summary)
        return summary
    except Exception as e:
        logging.error(f"Summarization failed: {e}")
        return "Summary unavailable."

def summarize_articles(articles):
    """Summarize all fetched articles and format each as a Markdown block."""
    summaries = []
    for article in articles:
        try:
            content = article["description"]
            title = article["title"]
            category = article["category"]
            source = article["source"]
            link = article["link"]
            published = article["published"].strftime('%Y-%m-%d %H:%M')
            # Summarize article content
            summary = summarize_text(content)
            if summary:
                # Lines inside the f-string stay flush-left so the Markdown renders cleanly
                summaries.append(f"""
**{title}**
**Category:** {category} | **Source:** {source} | **Published:** {published}
{summary}
[Read more]({link})
---""")
        except Exception as e:
            logging.error(f"Error summarizing article: {e}")
            continue
    return summaries

def generate_user_summary(name):
    """Generate a personalized news summary based on user preferences."""
    # Load preferences
    try:
        with open(f"user_preferences/preferences_{name}.json") as f:
            preferences = json.load(f)
    except FileNotFoundError:
        return "Preferences not found. Please set your preferences first."
    except Exception as e:
        logging.error(f"Error loading preferences: {e}")
        return "Failed to load preferences."

    categories = preferences.get("interests", [])
    if not categories:
        return "No categories selected. Please update your preferences."

    # Fetch news
    articles = fetch_rss_news(categories)
    if not articles:
        return "No recent news found in your selected categories."

    # Summarize all articles
    summaries = summarize_articles(articles)

    # Combine and return summaries
    return "\n\n".join(summaries) if summaries else "No summaries available."
# Gradio interface
demo = gr.Blocks()

with demo:
    gr.Markdown("# 📰 Personalized AI News Summarizer")

    with gr.Tab("Set Preferences"):
        name_input = gr.Textbox(label="Your Name")
        interests = gr.CheckboxGroup(
            choices=list(NEWS_SOURCES.keys()),
            label="Select Your Interests"
        )
        save_button = gr.Button("Save Preferences")
        save_status = gr.Textbox(label="Status")

        def save_preferences(name, selected_interests):
            if not name or not selected_interests:
                return "Name and interests are required!"
            # Note: the filename embeds the raw user-supplied name; sanitizing it
            # before building the path would be safer in production.
            preferences = {"name": name, "interests": selected_interests}
            try:
                os.makedirs("user_preferences", exist_ok=True)
                with open(f"user_preferences/preferences_{name}.json", "w") as f:
                    json.dump(preferences, f)
                return "Preferences saved successfully!"
            except Exception as e:
                logging.error(f"Failed to save preferences: {e}")
                return "Failed to save preferences."

        save_button.click(save_preferences, inputs=[name_input, interests], outputs=save_status)

    with gr.Tab("Get News Summary"):
        name_input_summary = gr.Textbox(label="Your Name")
        fetch_button = gr.Button("Get Summary")
        summary_output = gr.Textbox(label="News Summary", lines=20)
        fetch_button.click(generate_user_summary, inputs=[name_input_summary], outputs=summary_output)
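
# Running `python app.py` launches the Gradio server (http://127.0.0.1:7860 by default).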
if __name__ == "__main__":
    demo.launch()