import gradio as gr
from transformers import pipeline
import feedparser
from datetime import datetime, timedelta
import json
import os
import logging
import pytz
from bs4 import BeautifulSoup
import hashlib
import threading

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Global settings
SUMMARIZER_MODEL = "facebook/bart-large-cnn"  # Swap in any other Hugging Face summarization model here
CACHE_SIZE = 500  # Maximum number of cached summaries
RSS_FETCH_INTERVAL = timedelta(hours=8)  # Only keep news published within the last 8 hours
TIMEOUT_LIMIT = 30  # Intended cap (seconds) on summary processing; currently not enforced anywhere below

# News sources
NEWS_SOURCES = {
    "Technology": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best"
    },
    "Business": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best"
    },
    "Science": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml"
    },
    "World News": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
        "CNN": "http://rss.cnn.com/rss/edition_world.rss",
        "reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best"
    },
    "Sports": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best"
    },
    "Health": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
        "politico": "http://rss.politico.com/healthcare.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best"
    },
}
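
# Note: RSS feed URLs tend to go stale over time; a source that stops responding is
# simply logged and skipped in fetch_rss_news() below, so the app degrades gracefully.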

# Initialize cache
class NewsCache:
    def __init__(self, size):
        self.cache = {}
        self.size = size
        self.lock = threading.Lock()

    def get(self, key):
        with self.lock:
            return self.cache.get(key)

    def set(self, key, value):
        with self.lock:
            if len(self.cache) >= self.size:
                # Remove the oldest cached item
                oldest_key = next(iter(self.cache))
                del self.cache[oldest_key]
            self.cache[key] = value
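
# Eviction behavior: Python dicts preserve insertion order, so next(iter(self.cache))
# is always the oldest entry and a full cache evicts in plain FIFO order (not LRU;
# get() does not refresh an entry's position).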

cache = NewsCache(CACHE_SIZE)

# Initialize summarizer
summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=-1)  # device=-1 runs on CPU
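
# Note: on first run the pipeline downloads the model weights from the Hugging Face Hub,
# so the initial launch can take a while; later runs load from the local cache.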

# Utility functions
def fetch_rss_news(categories):
    """Fetch news articles from RSS feeds based on selected categories."""
    articles = []
    cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
    for category in categories:
        for source, url in NEWS_SOURCES.get(category, {}).items():
            try:
                feed = feedparser.parse(url)
                for entry in feed.entries:
                    # Skip entries without a parseable publication time
                    if not getattr(entry, "published_parsed", None):
                        continue
                    published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
                    if published > cutoff_time:
                        articles.append({
                            "title": entry.title,
                            "description": BeautifulSoup(entry.get("description", ""), "html.parser").get_text(),
                            "link": entry.link,
                            "category": category,
                            "source": source,
                            "published": published
                        })
            except Exception as e:
                logging.error(f"Failed to fetch from {url}: {e}")
    return articles
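
# Note: feedparser.parse() rarely raises on bad input; an unreachable or malformed feed
# usually just yields an empty entries list (with feed.bozo set), so the except branch
# mostly catches unexpected errors while building the article dicts.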

def summarize_text(text):
    """Summarize the text using the AI model."""
    # MD5 is used only as a cheap cache key here, not for security
    content_hash = hashlib.md5(text.encode()).hexdigest()
    cached_summary = cache.get(content_hash)
    if cached_summary:
        return cached_summary
    try:
        result = summarizer(text, max_length=120, min_length=40, truncation=True)
        summary = result[0]['summary_text']
        cache.set(content_hash, summary)
        return summary
    except Exception as e:
        logging.error(f"Summarization failed: {e}")
        return "Summary unavailable."

def summarize_articles(articles):
    """Summarize all fetched articles."""
    summaries = []
    for article in articles:
        try:
            content = article["description"]
            title = article["title"]
            category = article["category"]
            source = article["source"]
            link = article["link"]
            published = article["published"].strftime('%Y-%m-%d %H:%M')
            # Summarize article content
            summary = summarize_text(content)
            if summary:
                summaries.append(f"""
**{title}**
**Category:** {category} | **Source:** {source} | **Published:** {published}
{summary}
[Read more]({link})
---""")
        except Exception as e:
            logging.error(f"Error summarizing article: {e}")
            continue
    return summaries

def generate_user_summary(name):
    """Generate a personalized news summary based on user preferences."""
    # Apply the same filename sanitization used in save_preferences below,
    # so lookups match what was written to disk
    safe_name = "".join(c for c in name if c.isalnum() or c in ("-", "_"))
    # Load preferences
    try:
        with open(f"user_preferences/preferences_{safe_name}.json") as f:
            preferences = json.load(f)
    except FileNotFoundError:
        return "Preferences not found. Please set your preferences first."
    except Exception as e:
        logging.error(f"Error loading preferences: {e}")
        return "Failed to load preferences."

    categories = preferences.get("interests", [])
    if not categories:
        return "No categories selected. Please update your preferences."

    # Fetch news
    articles = fetch_rss_news(categories)
    if not articles:
        return "No recent news found in your selected categories."

    # Summarize all articles and combine the results
    summaries = summarize_articles(articles)
    return "\n\n".join(summaries) if summaries else "No summaries available."

# Gradio interface
demo = gr.Blocks()
with demo:
    gr.Markdown("# 📰 Personalized AI News Summarizer")
    with gr.Tab("Set Preferences"):
        name_input = gr.Textbox(label="Your Name")
        interests = gr.CheckboxGroup(
            choices=list(NEWS_SOURCES.keys()),
            label="Select Your Interests"
        )
        save_button = gr.Button("Save Preferences")
        save_status = gr.Textbox(label="Status")

        def save_preferences(name, selected_interests):
            if not name or not selected_interests:
                return "Name and interests are required!"
            # The name feeds into a filename, so strip anything that could escape the directory
            safe_name = "".join(c for c in name if c.isalnum() or c in ("-", "_"))
            if not safe_name:
                return "Please use letters, numbers, hyphens, or underscores in your name."
            preferences = {"name": name, "interests": selected_interests}
            try:
                os.makedirs("user_preferences", exist_ok=True)
                with open(f"user_preferences/preferences_{safe_name}.json", "w") as f:
                    json.dump(preferences, f)
                return "Preferences saved successfully!"
            except Exception as e:
                logging.error(f"Failed to save preferences: {e}")
                return "Failed to save preferences."

        save_button.click(save_preferences, inputs=[name_input, interests], outputs=save_status)

    with gr.Tab("Get News Summary"):
        name_input_summary = gr.Textbox(label="Your Name")
        fetch_button = gr.Button("Get Summary")
        summary_output = gr.Textbox(label="News Summary", lines=20)

        fetch_button.click(generate_user_summary, inputs=[name_input_summary], outputs=summary_output)

if __name__ == "__main__":
    demo.launch()
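
# By default demo.launch() serves on http://127.0.0.1:7860; pass share=True for a
# temporary public link, or server_name="0.0.0.0" when running inside a container.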