# Hugging Face Spaces status banner ("Spaces / Running") captured along with
# the page scrape — not part of the program source.
import functools
import hashlib
import logging
import os
import threading
from datetime import datetime, timedelta

import feedparser
import gradio as gr
import pytz
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
# Set up logging | |
logging.basicConfig(level=logging.INFO) | |
logger = logging.getLogger(__name__) | |
# Global settings | |
OPENROUTER_API_KEY = "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" | |
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions" | |
SUMMARIZER_MODELS = { | |
"Default (facebook/bart-large-cnn)": "local_bart", | |
"Free Model (distilbart-cnn-6-6)": "local_distilbart", | |
"OpenRouter (Claude-3)": "anthropic/claude-3-haiku", | |
"OpenRouter (GPT-4)": "openai/gpt-4" | |
} | |
CACHE_SIZE = 500 | |
RSS_FETCH_INTERVAL = timedelta(hours=8) | |
ARTICLE_LIMIT = 5 | |
# Updated categories and news sources | |
CATEGORIES = ["Technology", "Business", "Science", "World News", "Sports", "Health"] | |
NEWS_SOURCES = { | |
"Technology": { | |
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml", | |
"reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best", | |
"alarabiya arabic": "https://www.alarabiya.net/feed/rss2/ar/technology.xml", | |
}, | |
"Business": { | |
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml", | |
"reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best", | |
"alwatanvoice arabic": "https://feeds.alwatanvoice.com/ar/business.xml", | |
}, | |
"Science": { | |
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml" | |
}, | |
"World News": { | |
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml", | |
"BBC": "http://feeds.bbci.co.uk/news/world/rss.xml", | |
"CNN": "http://rss.cnn.com/rss/edition_world.rss", | |
"reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best", | |
"france24 arabic": "https://www.france24.com/ar/rss", | |
"aljazera arabic": "https://www.aljazeera.net/aljazeerarss/a7c186be-1baa-4bd4-9d80-a84db769f779/73d0e1b4-532f-45ef-b135-bfdff8b8cab9", | |
}, | |
"Sports": { | |
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml", | |
"reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best", | |
"france24 arabic": "https://www.france24.com/ar/%D8%B1%D9%8A%D8%A7%D8%B6%D8%A9/rss", | |
}, | |
"Health": { | |
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml", | |
"politico": "http://rss.politico.com/healthcare.xml", | |
"reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best" | |
}, | |
} | |
class NewsCache: | |
def __init__(self, size): | |
self.cache = {} | |
self.size = size | |
self.lock = threading.Lock() | |
def get(self, key): | |
with self.lock: | |
return self.cache.get(key) | |
def set(self, key, value): | |
with self.lock: | |
if len(self.cache) >= self.size: | |
oldest_key = next(iter(self.cache)) | |
del self.cache[oldest_key] | |
self.cache[key] = value | |
cache = NewsCache(CACHE_SIZE) | |
def detect_language(text): | |
"""Detect if the text is primarily Arabic""" | |
if not text: | |
return False | |
arabic_chars = len([c for c in text if '\u0600' <= c <= '\u06FF']) | |
return (arabic_chars / len(text)) > 0.5 | |
def summarize_text(text, model_name): | |
try: | |
content_hash = hashlib.md5(text.encode()).hexdigest() | |
cached_summary = cache.get(content_hash) | |
if cached_summary: | |
logger.info("Using cached summary") | |
return cached_summary | |
is_arabic = detect_language(text) | |
if is_arabic or model_name in ["OpenRouter (Claude-3)", "OpenRouter (GPT-4)"]: | |
logger.info(f"Using OpenRouter with model {model_name} for summarization") | |
headers = { | |
"Authorization": f"Bearer {OPENROUTER_API_KEY}", | |
"HTTP-Referer": "https://localhost:7860", | |
"X-Title": "News Summarizer App", | |
"Content-Type": "application/json" | |
} | |
prompt = f"Please provide a concise summary of the following news article in the same language as the original text. Keep the summary brief and focused on key points:\n\n{text}" | |
model_id = SUMMARIZER_MODELS[model_name] | |
data = { | |
"model": model_id, | |
"messages": [{"role": "user", "content": prompt}], | |
"max_tokens": 150 | |
} | |
response = requests.post(OPENROUTER_API_URL, headers=headers, json=data) | |
response.raise_for_status() | |
summary = response.json()["choices"][0]["message"]["content"] | |
else: | |
logger.info("Using local model for summarization") | |
model_path = "facebook/bart-large-cnn" if model_name == "Default (facebook/bart-large-cnn)" else "sshleifer/distilbart-cnn-6-6" | |
summarizer = pipeline("summarization", model=model_path, device=-1) | |
result = summarizer(text, max_length=120, min_length=40, truncation=True) | |
summary = result[0]['summary_text'] | |
cache.set(content_hash, summary) | |
return summary | |
except Exception as e: | |
logger.error(f"Error in summarization: {str(e)}") | |
return f"Summary unavailable. Error: {str(e)}" | |
def fetch_rss_news(tech_sources, business_sources, science_sources, world_sources, sports_sources, health_sources): | |
articles = [] | |
cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL | |
category_sources = { | |
"Technology": tech_sources if tech_sources else [], | |
"Business": business_sources if business_sources else [], | |
"Science": science_sources if science_sources else [], | |
"World News": world_sources if world_sources else [], | |
"Sports": sports_sources if sports_sources else [], | |
"Health": health_sources if health_sources else [] | |
} | |
logger.info(f"Selected sources: {category_sources}") | |
for category, sources in category_sources.items(): | |
if not sources: | |
continue | |
logger.info(f"Processing category: {category} with sources: {sources}") | |
for source in sources: | |
if source in NEWS_SOURCES[category]: | |
url = NEWS_SOURCES[category][source] | |
try: | |
logger.info(f"Fetching from URL: {url}") | |
feed = feedparser.parse(url) | |
if hasattr(feed, 'status') and feed.status != 200: | |
logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}") | |
continue | |
for entry in feed.entries: | |
try: | |
published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC) | |
if published > cutoff_time: | |
articles.append({ | |
"title": entry.title, | |
"description": BeautifulSoup(entry.description, "html.parser").get_text(), | |
"link": entry.link, | |
"category": category, | |
"source": source, | |
"published": published | |
}) | |
except (AttributeError, TypeError) as e: | |
logger.error(f"Error processing entry: {str(e)}") | |
continue | |
except Exception as e: | |
logger.error(f"Error fetching feed from {url}: {str(e)}") | |
continue | |
logger.info(f"Total articles fetched: {len(articles)}") | |
articles = sorted(articles, key=lambda x: x["published"], reverse=True)[:ARTICLE_LIMIT] | |
return articles | |
def summarize_articles(articles, model_name): | |
summaries = [] | |
for article in articles: | |
content = article["description"] | |
summary = summarize_text(content, model_name) | |
summaries.append(f""" | |
<div style='margin-bottom: 20px; white-space: pre-wrap;'> | |
π° {article['title']} | |
π Summary: {summary} | |
- π Category: {article['category']} | |
- π‘ Source: {article['source']} | |
- π Read More: <a href="{article['link']}" target="_blank" style="text-decoration: none;">click here</a> | |
</div> | |
""") | |
return "\n".join(summaries) | |
def get_summary(tech_sources, business_sources, science_sources, world_sources, | |
sports_sources, health_sources, selected_model): | |
try: | |
if not any([tech_sources, business_sources, science_sources, | |
world_sources, sports_sources, health_sources]): | |
return "Please select at least one news source." | |
articles = fetch_rss_news(tech_sources, business_sources, science_sources, | |
world_sources, sports_sources, health_sources) | |
if not articles: | |
return "No recent news found from the selected sources." | |
return summarize_articles(articles, selected_model) | |
except Exception as e: | |
logger.error(f"Error in get_summary: {str(e)}") | |
return f"An error occurred while processing your request: {str(e)}" | |
# Gradio Interface | |
demo = gr.Blocks() | |
with demo: | |
gr.Markdown("# π° AI News Summarizer") | |
with gr.Row(): | |
with gr.Column(): | |
tech_sources = gr.CheckboxGroup( | |
choices=list(NEWS_SOURCES["Technology"].keys()), | |
label="Technology Sources", | |
value=[] | |
) | |
business_sources = gr.CheckboxGroup( | |
choices=list(NEWS_SOURCES["Business"].keys()), | |
label="Business Sources", | |
value=[] | |
) | |
science_sources = gr.CheckboxGroup( | |
choices=list(NEWS_SOURCES["Science"].keys()), | |
label="Science Sources", | |
value=[] | |
) | |
with gr.Column(): | |
world_sources = gr.CheckboxGroup( | |
choices=list(NEWS_SOURCES["World News"].keys()), | |
label="World News Sources", | |
value=[] | |
) | |
sports_sources = gr.CheckboxGroup( | |
choices=list(NEWS_SOURCES["Sports"].keys()), | |
label="Sports Sources", | |
value=[] | |
) | |
health_sources = gr.CheckboxGroup( | |
choices=list(NEWS_SOURCES["Health"].keys()), | |
label="Health Sources", | |
value=[] | |
) | |
with gr.Column(): | |
model_selector = gr.Radio( | |
choices=list(SUMMARIZER_MODELS.keys()), | |
label="Choose Summarization Model", | |
value="OpenRouter (Claude-3)" | |
) | |
summarize_button = gr.Button("Get News Summary") | |
summary_output = gr.HTML(label="News Summary") | |
summarize_button.click( | |
get_summary, | |
inputs=[ | |
tech_sources, | |
business_sources, | |
science_sources, | |
world_sources, | |
sports_sources, | |
health_sources, | |
model_selector | |
], | |
outputs=summary_output | |
) | |
if __name__ == "__main__": | |
demo.launch() | |