news-summary / app.py
loayshabet's picture
Update app.py
e437fbf verified
raw
history blame
11.8 kB
# Standard library
import hashlib
import logging
import os
import threading
from datetime import datetime, timedelta

# Third-party
import feedparser
import gradio as gr
import pytz
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global settings
# SECURITY: the OpenRouter key is read from the environment instead of being
# hard-coded — committing a live API key to source control leaks the credential
# to anyone who can read the repo. Set OPENROUTER_API_KEY before launching.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"

# UI display name -> model identifier. The "local_*" sentinels select the
# on-device transformers pipelines; any other id is sent to OpenRouter.
SUMMARIZER_MODELS = {
    "Default (facebook/bart-large-cnn)": "local_bart",
    "Free Model (distilbart-cnn-6-6)": "local_distilbart",
    "OpenRouter (Claude-3)": "anthropic/claude-3-haiku",
    "OpenRouter (GPT-4)": "openai/gpt-4"
}

CACHE_SIZE = 500                         # max number of cached summaries
RSS_FETCH_INTERVAL = timedelta(hours=8)  # ignore articles older than this
ARTICLE_LIMIT = 5                        # max articles returned per request

# Updated categories and news sources
CATEGORIES = ["Technology", "Business", "Science", "World News", "Sports", "Health"]
# Category -> {display source name -> RSS feed URL}.
# Keys must match CATEGORIES exactly: the Gradio UI builds one checkbox
# group per category from this mapping, and fetch_rss_news() indexes it
# with the category name and the checked source names.
# Sources whose key ends in "arabic" publish Arabic-language feeds;
# summarize_text() routes Arabic content to OpenRouter.
NEWS_SOURCES = {
    "Technology": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best",
        "alarabiya arabic": "https://www.alarabiya.net/feed/rss2/ar/technology.xml",
    },
    "Business": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best",
        "alwatanvoice arabic": "https://feeds.alwatanvoice.com/ar/business.xml",
    },
    "Science": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml"
    },
    "World News": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
        "CNN": "http://rss.cnn.com/rss/edition_world.rss",
        "reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best",
        "france24 arabic": "https://www.france24.com/ar/rss",
        "aljazera arabic": "https://www.aljazeera.net/aljazeerarss/a7c186be-1baa-4bd4-9d80-a84db769f779/73d0e1b4-532f-45ef-b135-bfdff8b8cab9",
    },
    "Sports": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best",
        "france24 arabic": "https://www.france24.com/ar/%D8%B1%D9%8A%D8%A7%D8%B6%D8%A9/rss",
    },
    "Health": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
        "politico": "http://rss.politico.com/healthcare.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best"
    },
}
class NewsCache:
    """Thread-safe bounded LRU cache mapping content hashes to summaries.

    The original implementation evicted in pure FIFO order: a `get` never
    refreshed an entry, so frequently-reused summaries could be evicted
    while stale ones survived, and `set` on an existing key while full
    needlessly evicted an unrelated entry. This version keeps true LRU
    order by exploiting dict insertion ordering (Python 3.7+).
    """

    def __init__(self, size):
        self.cache = {}               # insertion order == recency order
        self.size = size              # maximum number of entries
        self.lock = threading.Lock()  # guards all access from Gradio workers

    def get(self, key):
        """Return the cached value for *key*, or None; refreshes recency."""
        with self.lock:
            if key not in self.cache:
                return None
            # Pop and re-insert so the entry moves to the "newest" end.
            value = self.cache.pop(key)
            self.cache[key] = value
            return value

    def set(self, key, value):
        """Insert or update *key*, evicting the least recently used entry."""
        with self.lock:
            if key in self.cache:
                # Updating an existing key must not trigger an eviction.
                self.cache.pop(key)
            elif len(self.cache) >= self.size:
                # First key in the dict is the least recently used.
                del self.cache[next(iter(self.cache))]
            self.cache[key] = value
# Process-wide summary cache shared by all requests.
cache = NewsCache(size=CACHE_SIZE)
def detect_language(text):
    """Return True when more than half of *text* is Arabic characters.

    Only the base Arabic Unicode block (U+0600..U+06FF) is checked;
    empty or falsy input is treated as non-Arabic.
    """
    if not text:
        return False
    arabic_count = sum(1 for ch in text if '\u0600' <= ch <= '\u06FF')
    return arabic_count / len(text) > 0.5
# Lazily-built local summarization pipelines, keyed by HF model path.
# Constructing a transformers pipeline loads the whole model from disk, so
# doing it once per process (instead of once per call, as before) avoids a
# multi-second reload on every article.
_LOCAL_PIPELINES = {}

def summarize_text(text, model_name):
    """Summarize *text* with the selected model, using the global cache.

    Arabic text (or an explicit OpenRouter model choice) is routed to the
    OpenRouter chat-completions API; everything else runs a local BART
    pipeline on CPU. Returns the summary string, or an error message
    (never raises) so one bad article cannot break the whole response.
    """
    try:
        # Cache key is the article content only (not the model), matching
        # the original behavior: the first summary produced for a text wins.
        content_hash = hashlib.md5(text.encode()).hexdigest()
        cached_summary = cache.get(content_hash)
        if cached_summary:
            logger.info("Using cached summary")
            return cached_summary

        is_arabic = detect_language(text)

        if is_arabic or model_name in ["OpenRouter (Claude-3)", "OpenRouter (GPT-4)"]:
            logger.info(f"Using OpenRouter with model {model_name} for summarization")
            headers = {
                "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                "HTTP-Referer": "https://localhost:7860",
                "X-Title": "News Summarizer App",
                "Content-Type": "application/json"
            }
            prompt = f"Please provide a concise summary of the following news article in the same language as the original text. Keep the summary brief and focused on key points:\n\n{text}"
            model_id = SUMMARIZER_MODELS[model_name]
            if model_id.startswith("local_"):
                # Arabic text forced us onto OpenRouter while a local model
                # was selected; "local_*" is a sentinel, not a valid
                # OpenRouter model id (the original code sent it verbatim
                # and the API call failed). Fall back to the hosted model.
                model_id = "anthropic/claude-3-haiku"
            data = {
                "model": model_id,
                "messages": [{"role": "user", "content": prompt}],
                "max_tokens": 150
            }
            # Explicit timeout: requests.post without one can block forever
            # on a stalled connection, hanging the Gradio worker.
            response = requests.post(OPENROUTER_API_URL, headers=headers,
                                     json=data, timeout=60)
            response.raise_for_status()
            summary = response.json()["choices"][0]["message"]["content"]
        else:
            logger.info("Using local model for summarization")
            model_path = ("facebook/bart-large-cnn"
                          if model_name == "Default (facebook/bart-large-cnn)"
                          else "sshleifer/distilbart-cnn-6-6")
            summarizer = _LOCAL_PIPELINES.get(model_path)
            if summarizer is None:
                summarizer = pipeline("summarization", model=model_path, device=-1)
                _LOCAL_PIPELINES[model_path] = summarizer
            result = summarizer(text, max_length=120, min_length=40, truncation=True)
            summary = result[0]['summary_text']

        cache.set(content_hash, summary)
        return summary
    except Exception as e:
        logger.error(f"Error in summarization: {str(e)}")
        return f"Summary unavailable. Error: {str(e)}"
def fetch_rss_news(tech_sources, business_sources, science_sources, world_sources, sports_sources, health_sources):
    """Collect recent articles from the user-selected RSS feeds.

    Each *_sources argument is a list of source names (keys of the
    matching NEWS_SOURCES category) or None. Returns at most
    ARTICLE_LIMIT article dicts published within RSS_FETCH_INTERVAL,
    newest first; feed and entry failures are logged and skipped.
    """
    cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
    category_sources = {
        "Technology": tech_sources or [],
        "Business": business_sources or [],
        "Science": science_sources or [],
        "World News": world_sources or [],
        "Sports": sports_sources or [],
        "Health": health_sources or [],
    }
    logger.info(f"Selected sources: {category_sources}")

    articles = []
    for category, sources in category_sources.items():
        if not sources:
            continue
        logger.info(f"Processing category: {category} with sources: {sources}")
        feeds = NEWS_SOURCES[category]
        for source in sources:
            if source not in feeds:
                # Unknown source name for this category; ignore it.
                continue
            url = feeds[source]
            try:
                logger.info(f"Fetching from URL: {url}")
                feed = feedparser.parse(url)
                # feedparser only exposes .status for HTTP fetches.
                if hasattr(feed, 'status') and feed.status != 200:
                    logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}")
                    continue
                for entry in feed.entries:
                    try:
                        published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
                        if published <= cutoff_time:
                            continue
                        articles.append({
                            "title": entry.title,
                            # Strip HTML markup from the feed description.
                            "description": BeautifulSoup(entry.description, "html.parser").get_text(),
                            "link": entry.link,
                            "category": category,
                            "source": source,
                            "published": published,
                        })
                    except (AttributeError, TypeError) as e:
                        # Entries missing published_parsed/title/etc.
                        logger.error(f"Error processing entry: {str(e)}")
                        continue
            except Exception as e:
                logger.error(f"Error fetching feed from {url}: {str(e)}")
                continue

    logger.info(f"Total articles fetched: {len(articles)}")
    articles.sort(key=lambda a: a["published"], reverse=True)
    return articles[:ARTICLE_LIMIT]
def summarize_articles(articles, model_name):
    """Summarize each article dict and render the results as one HTML string."""
    rendered = []
    for item in articles:
        item_summary = summarize_text(item["description"], model_name)
        rendered.append(f"""
        <div style='margin-bottom: 20px; white-space: pre-wrap;'>
        📰 {item['title']}
        📃 Summary: {item_summary}
        - 📁 Category: {item['category']}
        - 💡 Source: {item['source']}
        - 🔗 Read More: <a href="{item['link']}" target="_blank" style="text-decoration: none;">click here</a>
        </div>
        """)
    return "\n".join(rendered)
def get_summary(tech_sources, business_sources, science_sources, world_sources,
                sports_sources, health_sources, selected_model):
    """Gradio click handler: fetch selected feeds and return an HTML summary.

    Returns a plain message string when nothing is selected or nothing
    recent is found; never raises (errors are logged and reported inline).
    """
    source_groups = [tech_sources, business_sources, science_sources,
                     world_sources, sports_sources, health_sources]
    try:
        if not any(source_groups):
            return "Please select at least one news source."
        articles = fetch_rss_news(*source_groups)
        if not articles:
            return "No recent news found from the selected sources."
        return summarize_articles(articles, selected_model)
    except Exception as e:
        logger.error(f"Error in get_summary: {str(e)}")
        return f"An error occurred while processing your request: {str(e)}"
# Gradio Interface
# Layout: a header, then three columns — two of per-category source
# checkboxes (choices come straight from NEWS_SOURCES keys) and one with
# the model selector, the action button, and the HTML output area.
demo = gr.Blocks()
with demo:
    gr.Markdown("# 📰 AI News Summarizer")
    with gr.Row():
        with gr.Column():
            tech_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Technology"].keys()),
                label="Technology Sources",
                value=[]
            )
            business_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Business"].keys()),
                label="Business Sources",
                value=[]
            )
            science_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Science"].keys()),
                label="Science Sources",
                value=[]
            )
        with gr.Column():
            world_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["World News"].keys()),
                label="World News Sources",
                value=[]
            )
            sports_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Sports"].keys()),
                label="Sports Sources",
                value=[]
            )
            health_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Health"].keys()),
                label="Health Sources",
                value=[]
            )
        with gr.Column():
            model_selector = gr.Radio(
                choices=list(SUMMARIZER_MODELS.keys()),
                label="Choose Summarization Model",
                value="OpenRouter (Claude-3)"
            )
            summarize_button = gr.Button("Get News Summary")
            summary_output = gr.HTML(label="News Summary")
    # Wire the button to get_summary; input order must match its signature.
    summarize_button.click(
        get_summary,
        inputs=[
            tech_sources,
            business_sources,
            science_sources,
            world_sources,
            sports_sources,
            health_sources,
            model_selector
        ],
        outputs=summary_output
    )

if __name__ == "__main__":
    demo.launch()