"""AI news summarizer.

Fetches recent articles from category-grouped RSS feeds, summarizes each one
with either a local HuggingFace model or the OpenRouter chat-completions API
(always used for Arabic text, since the local BART models are English-only),
and presents the result through a Gradio interface.
"""

import hashlib
import logging
import os
import threading
from datetime import datetime, timedelta

import feedparser
import gradio as gr
import pytz
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global settings.
# SECURITY: a live OpenRouter API key was previously hard-coded here and must
# be considered compromised -- rotate it, and supply the replacement via the
# OPENROUTER_API_KEY environment variable instead of committing it to source.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"

# UI label -> model identifier. "local_*" markers select a local pipeline;
# anything else is an OpenRouter model id.
SUMMARIZER_MODELS = {
    "Default (facebook/bart-large-cnn)": "local_bart",
    "Free Model (distilbart-cnn-6-6)": "local_distilbart",
    "OpenRouter (Claude-3)": "anthropic/claude-3-haiku",
    "OpenRouter (GPT-4)": "openai/gpt-4",
}

CACHE_SIZE = 500                         # max number of cached summaries
RSS_FETCH_INTERVAL = timedelta(hours=8)  # only keep articles newer than this
ARTICLE_LIMIT = 5                        # max articles summarized per request

# Categories and news sources (category -> {source label: RSS feed URL}).
CATEGORIES = ["Technology", "Business", "Science", "World News", "Sports", "Health"]
NEWS_SOURCES = {
    "Technology": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best",
        "alarabiya arabic": "https://www.alarabiya.net/feed/rss2/ar/technology.xml",
    },
    "Business": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best",
        "alwatanvoice arabic": "https://feeds.alwatanvoice.com/ar/business.xml",
    },
    "Science": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml",
    },
    "World News": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
        "CNN": "http://rss.cnn.com/rss/edition_world.rss",
        "reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best",
        "france24 arabic": "https://www.france24.com/ar/rss",
        "aljazera arabic": "https://www.aljazeera.net/aljazeerarss/a7c186be-1baa-4bd4-9d80-a84db769f779/73d0e1b4-532f-45ef-b135-bfdff8b8cab9",
    },
    "Sports": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best",
        "france24 arabic": "https://www.france24.com/ar/%D8%B1%D9%8A%D8%A7%D8%B6%D8%A9/rss",
    },
    "Health": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
        "politico": "http://rss.politico.com/healthcare.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best",
    },
}


class NewsCache:
    """Thread-safe, size-bounded summary cache with FIFO eviction."""

    def __init__(self, size):
        self.cache = {}
        self.size = size
        self.lock = threading.Lock()

    def get(self, key):
        """Return the cached value for *key*, or None if absent."""
        with self.lock:
            return self.cache.get(key)

    def set(self, key, value):
        """Store *key* -> *value*, evicting the oldest entry when full."""
        with self.lock:
            if len(self.cache) >= self.size:
                # dicts preserve insertion order, so the first key is oldest.
                oldest_key = next(iter(self.cache))
                del self.cache[oldest_key]
            self.cache[key] = value


cache = NewsCache(CACHE_SIZE)

# Local HF pipelines are expensive to construct; build each at most once
# instead of reloading the model on every summarize_text() call.
_local_pipelines = {}
_pipeline_lock = threading.Lock()


def _get_local_summarizer(model_path):
    """Return a cached local summarization pipeline for *model_path*."""
    with _pipeline_lock:
        if model_path not in _local_pipelines:
            _local_pipelines[model_path] = pipeline(
                "summarization", model=model_path, device=-1
            )
        return _local_pipelines[model_path]


def detect_language(text):
    """Return True if *text* is primarily Arabic (>50% Arabic-block chars)."""
    if not text:
        return False
    arabic_chars = sum(1 for c in text if '\u0600' <= c <= '\u06FF')
    return (arabic_chars / len(text)) > 0.5


def summarize_text(text, model_name):
    """Summarize *text* using the model selected by *model_name*.

    Arabic text is always routed through OpenRouter (the local BART models
    are English-only). Results are cached keyed on the MD5 of the input.
    On any failure a human-readable error string is returned instead of
    raising, so the UI always gets something to display.
    """
    try:
        content_hash = hashlib.md5(text.encode()).hexdigest()
        cached_summary = cache.get(content_hash)
        if cached_summary:
            logger.info("Using cached summary")
            return cached_summary

        is_arabic = detect_language(text)
        if is_arabic or model_name in ["OpenRouter (Claude-3)", "OpenRouter (GPT-4)"]:
            logger.info(f"Using OpenRouter with model {model_name} for summarization")
            headers = {
                "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                "HTTP-Referer": "https://localhost:7860",
                "X-Title": "News Summarizer App",
                "Content-Type": "application/json",
            }
            prompt = (
                "Please provide a concise summary of the following news article "
                "in the same language as the original text. Keep the summary "
                f"brief and focused on key points:\n\n{text}"
            )
            model_id = SUMMARIZER_MODELS[model_name]
            data = {
                "model": model_id,
                "messages": [{"role": "user", "content": prompt}],
                "max_tokens": 150,
            }
            # timeout so a hung API call cannot block the UI indefinitely
            response = requests.post(
                OPENROUTER_API_URL, headers=headers, json=data, timeout=60
            )
            response.raise_for_status()
            summary = response.json()["choices"][0]["message"]["content"]
        else:
            logger.info("Using local model for summarization")
            model_path = (
                "facebook/bart-large-cnn"
                if model_name == "Default (facebook/bart-large-cnn)"
                else "sshleifer/distilbart-cnn-6-6"
            )
            summarizer = _get_local_summarizer(model_path)
            result = summarizer(text, max_length=120, min_length=40, truncation=True)
            summary = result[0]['summary_text']

        cache.set(content_hash, summary)
        return summary
    except Exception as e:
        logger.error(f"Error in summarization: {str(e)}")
        return f"Summary unavailable. Error: {str(e)}"


def fetch_rss_news(tech_sources, business_sources, science_sources,
                   world_sources, sports_sources, health_sources):
    """Fetch recent articles from the selected RSS sources.

    Each ``*_sources`` argument is a list of source labels (keys of the
    matching NEWS_SOURCES category), or None/empty when the user selected
    nothing for that category. Returns at most ARTICLE_LIMIT article dicts
    published within RSS_FETCH_INTERVAL, newest first.
    """
    articles = []
    cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL

    category_sources = {
        "Technology": tech_sources or [],
        "Business": business_sources or [],
        "Science": science_sources or [],
        "World News": world_sources or [],
        "Sports": sports_sources or [],
        "Health": health_sources or [],
    }
    logger.info(f"Selected sources: {category_sources}")

    for category, sources in category_sources.items():
        if not sources:
            continue
        logger.info(f"Processing category: {category} with sources: {sources}")
        for source in sources:
            if source not in NEWS_SOURCES[category]:
                continue
            url = NEWS_SOURCES[category][source]
            try:
                logger.info(f"Fetching from URL: {url}")
                feed = feedparser.parse(url)
                if hasattr(feed, 'status') and feed.status != 200:
                    logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}")
                    continue
                for entry in feed.entries:
                    try:
                        published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
                        if published > cutoff_time:
                            articles.append({
                                "title": entry.title,
                                "description": BeautifulSoup(
                                    entry.description, "html.parser"
                                ).get_text(),
                                "link": entry.link,
                                "category": category,
                                "source": source,
                                "published": published,
                            })
                    except (AttributeError, TypeError) as e:
                        # Entries with missing or malformed fields are skipped.
                        logger.error(f"Error processing entry: {str(e)}")
                        continue
            except Exception as e:
                logger.error(f"Error fetching feed from {url}: {str(e)}")
                continue

    logger.info(f"Total articles fetched: {len(articles)}")
    articles.sort(key=lambda x: x["published"], reverse=True)
    return articles[:ARTICLE_LIMIT]


def summarize_articles(articles, model_name):
    """Build the HTML report: one summary section per fetched article."""
    summaries = []
    for article in articles:
        content = article["description"]
        summary = summarize_text(content, model_name)
        # The output widget is gr.HTML, so render "Read More" as a real link
        # (previously the article link was fetched but never shown).
        summaries.append(f"""
📰 {article['title']}

📃 Summary: {summary}

- 📁 Category: {article['category']}
- 💡 Source: {article['source']}
- 🔗 Read More: <a href="{article['link']}" target="_blank">click here</a>
""")
    return "\n".join(summaries)


def get_summary(tech_sources, business_sources, science_sources,
                world_sources, sports_sources, health_sources, selected_model):
    """Gradio click handler: validate selection, fetch, and summarize."""
    try:
        if not any([tech_sources, business_sources, science_sources,
                    world_sources, sports_sources, health_sources]):
            return "Please select at least one news source."
        articles = fetch_rss_news(tech_sources, business_sources, science_sources,
                                  world_sources, sports_sources, health_sources)
        if not articles:
            return "No recent news found from the selected sources."
        return summarize_articles(articles, selected_model)
    except Exception as e:
        logger.error(f"Error in get_summary: {str(e)}")
        return f"An error occurred while processing your request: {str(e)}"


# Gradio Interface
demo = gr.Blocks()
with demo:
    gr.Markdown("# 📰 AI News Summarizer")
    with gr.Row():
        with gr.Column():
            tech_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Technology"].keys()),
                label="Technology Sources",
                value=[]
            )
            business_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Business"].keys()),
                label="Business Sources",
                value=[]
            )
            science_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Science"].keys()),
                label="Science Sources",
                value=[]
            )
        with gr.Column():
            world_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["World News"].keys()),
                label="World News Sources",
                value=[]
            )
            sports_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Sports"].keys()),
                label="Sports Sources",
                value=[]
            )
            health_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Health"].keys()),
                label="Health Sources",
                value=[]
            )
        with gr.Column():
            model_selector = gr.Radio(
                choices=list(SUMMARIZER_MODELS.keys()),
                label="Choose Summarization Model",
                value="OpenRouter (Claude-3)"
            )
    summarize_button = gr.Button("Get News Summary")
    summary_output = gr.HTML(label="News Summary")
    summarize_button.click(
        get_summary,
        inputs=[
            tech_sources, business_sources, science_sources,
            world_sources, sports_sources, health_sources,
            model_selector
        ],
        outputs=summary_output
    )

if __name__ == "__main__":
    demo.launch()