"""AI news summarizer.

Fetches recent articles from category-grouped RSS feeds, summarizes each one
with either a local HuggingFace model or the OpenRouter chat-completions API
(always used for Arabic text, since the local BART models are English-only),
and presents the result through a Gradio interface.
"""

import hashlib
import logging
import os
import threading
from datetime import datetime, timedelta

import feedparser
import gradio as gr
import pytz
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global settings.
# SECURITY: a live OpenRouter API key was previously hard-coded here and must
# be considered compromised -- rotate it, and supply the replacement via the
# OPENROUTER_API_KEY environment variable instead of committing it to source.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"

# UI label -> model identifier. "local_*" markers select a local pipeline;
# anything else is an OpenRouter model id.
SUMMARIZER_MODELS = {
    "Default (facebook/bart-large-cnn)": "local_bart",
    "Free Model (distilbart-cnn-6-6)": "local_distilbart",
    "OpenRouter (Claude-3)": "anthropic/claude-3-haiku",
    "OpenRouter (GPT-4)": "openai/gpt-4",
}

CACHE_SIZE = 500                         # max number of cached summaries
RSS_FETCH_INTERVAL = timedelta(hours=8)  # only keep articles newer than this
ARTICLE_LIMIT = 5                        # max articles summarized per request

# Categories and news sources (category -> {source label: RSS feed URL}).
CATEGORIES = ["Technology", "Business", "Science", "World News", "Sports", "Health"]
NEWS_SOURCES = {
    "Technology": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best",
        "alarabiya arabic": "https://www.alarabiya.net/feed/rss2/ar/technology.xml",
    },
    "Business": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best",
        "alwatanvoice arabic": "https://feeds.alwatanvoice.com/ar/business.xml",
    },
    "Science": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml",
    },
    "World News": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
        "CNN": "http://rss.cnn.com/rss/edition_world.rss",
        "reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best",
        "france24 arabic": "https://www.france24.com/ar/rss",
        "aljazera arabic": "https://www.aljazeera.net/aljazeerarss/a7c186be-1baa-4bd4-9d80-a84db769f779/73d0e1b4-532f-45ef-b135-bfdff8b8cab9",
    },
    "Sports": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best",
        "france24 arabic": "https://www.france24.com/ar/%D8%B1%D9%8A%D8%A7%D8%B6%D8%A9/rss",
    },
    "Health": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
        "politico": "http://rss.politico.com/healthcare.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best",
    },
}


class NewsCache:
    """Thread-safe, size-bounded summary cache with FIFO eviction."""

    def __init__(self, size):
        self.cache = {}
        self.size = size
        self.lock = threading.Lock()

    def get(self, key):
        """Return the cached value for *key*, or None if absent."""
        with self.lock:
            return self.cache.get(key)

    def set(self, key, value):
        """Store *key* -> *value*, evicting the oldest entry when full."""
        with self.lock:
            if len(self.cache) >= self.size:
                # dicts preserve insertion order, so the first key is oldest.
                oldest_key = next(iter(self.cache))
                del self.cache[oldest_key]
            self.cache[key] = value


cache = NewsCache(CACHE_SIZE)

# Local HF pipelines are expensive to construct; build each at most once
# instead of reloading the model on every summarize_text() call.
_local_pipelines = {}
_pipeline_lock = threading.Lock()


def _get_local_summarizer(model_path):
    """Return a cached local summarization pipeline for *model_path*."""
    with _pipeline_lock:
        if model_path not in _local_pipelines:
            _local_pipelines[model_path] = pipeline(
                "summarization", model=model_path, device=-1
            )
        return _local_pipelines[model_path]


def detect_language(text):
    """Return True if *text* is primarily Arabic (>50% Arabic-block chars)."""
    if not text:
        return False
    arabic_chars = sum(1 for c in text if '\u0600' <= c <= '\u06FF')
    return (arabic_chars / len(text)) > 0.5


def summarize_text(text, model_name):
    """Summarize *text* using the model selected by *model_name*.

    Arabic text is always routed through OpenRouter (the local BART models
    are English-only). Results are cached keyed on the MD5 of the input.
    On any failure a human-readable error string is returned instead of
    raising, so the UI always gets something to display.
    """
    try:
        content_hash = hashlib.md5(text.encode()).hexdigest()
        cached_summary = cache.get(content_hash)
        if cached_summary:
            logger.info("Using cached summary")
            return cached_summary

        is_arabic = detect_language(text)
        if is_arabic or model_name in ["OpenRouter (Claude-3)", "OpenRouter (GPT-4)"]:
            logger.info(f"Using OpenRouter with model {model_name} for summarization")
            headers = {
                "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                "HTTP-Referer": "https://localhost:7860",
                "X-Title": "News Summarizer App",
                "Content-Type": "application/json",
            }
            prompt = (
                "Please provide a concise summary of the following news article "
                "in the same language as the original text. Keep the summary "
                f"brief and focused on key points:\n\n{text}"
            )
            model_id = SUMMARIZER_MODELS[model_name]
            data = {
                "model": model_id,
                "messages": [{"role": "user", "content": prompt}],
                "max_tokens": 150,
            }
            # timeout so a hung API call cannot block the UI indefinitely
            response = requests.post(
                OPENROUTER_API_URL, headers=headers, json=data, timeout=60
            )
            response.raise_for_status()
            summary = response.json()["choices"][0]["message"]["content"]
        else:
            logger.info("Using local model for summarization")
            model_path = (
                "facebook/bart-large-cnn"
                if model_name == "Default (facebook/bart-large-cnn)"
                else "sshleifer/distilbart-cnn-6-6"
            )
            summarizer = _get_local_summarizer(model_path)
            result = summarizer(text, max_length=120, min_length=40, truncation=True)
            summary = result[0]['summary_text']

        cache.set(content_hash, summary)
        return summary
    except Exception as e:
        logger.error(f"Error in summarization: {str(e)}")
        return f"Summary unavailable. Error: {str(e)}"


def fetch_rss_news(tech_sources, business_sources, science_sources,
                   world_sources, sports_sources, health_sources):
    """Fetch recent articles from the selected RSS sources.

    Each ``*_sources`` argument is a list of source labels (keys of the
    matching NEWS_SOURCES category), or None/empty when the user selected
    nothing for that category. Returns at most ARTICLE_LIMIT article dicts
    published within RSS_FETCH_INTERVAL, newest first.
    """
    articles = []
    cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL

    category_sources = {
        "Technology": tech_sources or [],
        "Business": business_sources or [],
        "Science": science_sources or [],
        "World News": world_sources or [],
        "Sports": sports_sources or [],
        "Health": health_sources or [],
    }
    logger.info(f"Selected sources: {category_sources}")

    for category, sources in category_sources.items():
        if not sources:
            continue
        logger.info(f"Processing category: {category} with sources: {sources}")
        for source in sources:
            if source not in NEWS_SOURCES[category]:
                continue
            url = NEWS_SOURCES[category][source]
            try:
                logger.info(f"Fetching from URL: {url}")
                feed = feedparser.parse(url)
                if hasattr(feed, 'status') and feed.status != 200:
                    logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}")
                    continue
                for entry in feed.entries:
                    try:
                        published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
                        if published > cutoff_time:
                            articles.append({
                                "title": entry.title,
                                "description": BeautifulSoup(
                                    entry.description, "html.parser"
                                ).get_text(),
                                "link": entry.link,
                                "category": category,
                                "source": source,
                                "published": published,
                            })
                    except (AttributeError, TypeError) as e:
                        # Entries with missing or malformed fields are skipped.
                        logger.error(f"Error processing entry: {str(e)}")
                        continue
            except Exception as e:
                logger.error(f"Error fetching feed from {url}: {str(e)}")
                continue

    logger.info(f"Total articles fetched: {len(articles)}")
    articles.sort(key=lambda x: x["published"], reverse=True)
    return articles[:ARTICLE_LIMIT]


def summarize_articles(articles, model_name):
    """Build the HTML report: one summary section per fetched article."""
    summaries = []
    for article in articles:
        content = article["description"]
        summary = summarize_text(content, model_name)
        # The output widget is gr.HTML, so render "Read More" as a real link
        # (previously the article link was fetched but never shown).
        summaries.append(f"""
📰 {article['title']}

📃 Summary: {summary}

- 📁 Category: {article['category']}
- 💡 Source: {article['source']}
- 🔗 Read More: <a href="{article['link']}" target="_blank">click here</a>
""")
    return "\n".join(summaries)


def get_summary(tech_sources, business_sources, science_sources,
                world_sources, sports_sources, health_sources, selected_model):
    """Gradio click handler: validate selection, fetch, and summarize."""
    try:
        if not any([tech_sources, business_sources, science_sources,
                    world_sources, sports_sources, health_sources]):
            return "Please select at least one news source."
        articles = fetch_rss_news(tech_sources, business_sources, science_sources,
                                  world_sources, sports_sources, health_sources)
        if not articles:
            return "No recent news found from the selected sources."
        return summarize_articles(articles, selected_model)
    except Exception as e:
        logger.error(f"Error in get_summary: {str(e)}")
        return f"An error occurred while processing your request: {str(e)}"


# Gradio Interface
demo = gr.Blocks()
with demo:
    gr.Markdown("# 📰 AI News Summarizer")
    with gr.Row():
        with gr.Column():
            tech_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Technology"].keys()),
                label="Technology Sources",
                value=[]
            )
            business_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Business"].keys()),
                label="Business Sources",
                value=[]
            )
            science_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Science"].keys()),
                label="Science Sources",
                value=[]
            )
        with gr.Column():
            world_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["World News"].keys()),
                label="World News Sources",
                value=[]
            )
            sports_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Sports"].keys()),
                label="Sports Sources",
                value=[]
            )
            health_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Health"].keys()),
                label="Health Sources",
                value=[]
            )
        with gr.Column():
            model_selector = gr.Radio(
                choices=list(SUMMARIZER_MODELS.keys()),
                label="Choose Summarization Model",
                value="OpenRouter (Claude-3)"
            )
    summarize_button = gr.Button("Get News Summary")
    summary_output = gr.HTML(label="News Summary")
    summarize_button.click(
        get_summary,
        inputs=[
            tech_sources, business_sources, science_sources,
            world_sources, sports_sources, health_sources,
            model_selector
        ],
        outputs=summary_output
    )

if __name__ == "__main__":
    demo.launch()