import json
from typing import Any, Dict, List, Optional, Tuple, Union

import gradio as gr
import httpx
from cachetools import TTLCache, cached
from gradio_client import Client
from toolz import groupby

CACHE_TIME = 60 * 60 * 1  # 1 hour

client = Client("https://librarian-bots-collection-papers-extractor.hf.space/")


@cached(cache=TTLCache(maxsize=500, ttl=10))
def get_arxiv_ids_from_slug(
    slug: str,
) -> Dict[str, Union[None, Dict[str, Dict[str, List[str]]]]]:
    """Fetch the arXiv IDs referenced by a collection via the extractor Space.

    The Space returns a path to a JSON file mapping repo-type keys
    ("model papers", "dataset papers", "papers") to repos and their arXiv IDs.
    """
    result = client.predict(slug, api_name="/predict")
    with open(result) as f:
        return json.load(f)


def format_arxiv_id_for_semantic_scholar(arxiv_id: str) -> str:
    return f"ArXiv:{arxiv_id}"


def format_ids(
    data: Dict[str, Any], exclude_keys: Optional[List[str]] = None
) -> List[str]:
    """Flatten the extractor payload into Semantic Scholar-formatted arXiv IDs."""
    arxiv_ids = []
    if exclude_keys is not None:
        data = {k: v for k, v in data.items() if k not in exclude_keys}
        # the exclusions may have removed everything
        if not data:
            return []
    for repo in data.values():
        if repo is None:
            continue
        for item in repo.values():
            arxiv_ids.extend(item["arxiv_ids"])
    # format for Semantic Scholar
    return [format_arxiv_id_for_semantic_scholar(arxiv_id) for arxiv_id in arxiv_ids]


@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_recommendations_from_semantic_scholar(paper_ids: Tuple[str, ...]):
    # paper_ids is a tuple rather than a list so the arguments are hashable
    # and the call can be cached
    r = httpx.post(
        "https://api.semanticscholar.org/recommendations/v1/papers/",
        json={"positivePaperIds": list(paper_ids)},
        params={"fields": "externalIds,title,year", "limit": 10},
        timeout=30,
    )
    return r.json()
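
# A sketch of the response shape the rest of this app relies on, based on the
# fields requested above; see the Semantic Scholar recommendations API docs
# for the authoritative schema (the values here are placeholders):
#
# {
#   "recommendedPapers": [
#     {
#       "externalIds": {"ArXiv": "2106.01234", ...},
#       "title": "An example paper title",
#       "year": 2023,
#     },
#     ...
#   ]
# }
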
def is_arxiv_paper(recommendation: Dict[str, Any]) -> bool:
    return recommendation["externalIds"].get("ArXiv") is not None


def group_by_is_arxiv_paper(
    recommendations: List[Dict[str, Any]]
) -> Dict[bool, List[Dict[str, Any]]]:
    return groupby(is_arxiv_paper, recommendations)


def format_recommendation_into_markdown(
    grouped_recommendations: Dict[bool, List[Dict[str, Any]]]
) -> str:
    comment = "The following papers were recommended by the Semantic Scholar API \n\n"
    arxiv_papers = grouped_recommendations.get(True)
    if arxiv_papers:
        comment += "## Papers available on Hugging Face Papers:\n\n"
        for r in arxiv_papers:
            hub_paper_url = f"https://huggingface.co/papers/{r['externalIds']['ArXiv']}"
            comment += f"* [{r['title']}]({hub_paper_url}) ({r['year']})\n"
    other_papers = grouped_recommendations.get(False)
    if other_papers:
        comment += "\n\n## Other papers:\n\n"
        for r in other_papers:
            comment += f"* {r['title']} ({r['year']})\n"
    return comment


def map_repo_name_to_api_key(repo_name: str) -> str:
    # map the UI repo-type names to the keys used in the extractor payload
    return {
        "datasets": "dataset papers",
        "models": "model papers",
        "papers": "papers",
    }[repo_name]


def get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[List[str]] = None
):
    # convert to a hashable tuple (or None) so the cached helper can memoize the call
    excluded = tuple(excluded_repo_types) if excluded_repo_types else None
    return _get_recommendations_from_slug(slug, excluded_repo_types=excluded)


@cached(cache=TTLCache(maxsize=500, ttl=60))
def _get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[Tuple[str, ...]] = None
):
    data = get_arxiv_ids_from_slug(slug)
    exclude_keys = None
    if excluded_repo_types:
        exclude_keys = [map_repo_name_to_api_key(k) for k in excluded_repo_types]
    ids = format_ids(data, exclude_keys=exclude_keys)
    if not ids:
        return (
            "Based on your collection and exclusions"
            f" ({', '.join(exclude_keys or [])}), there are no papers to"
            " recommend. Try removing some excluded repo types or adding more"
            " items to your collection."
        )
    recommendations = get_recommendations_from_semantic_scholar(tuple(ids))
    recommended_papers = recommendations.get("recommendedPapers")
    if recommended_papers is None:
        raise gr.Error("Something went wrong with the Semantic Scholar API")
    grouped = group_by_is_arxiv_paper(recommended_papers)
    return format_recommendation_into_markdown(grouped)


title = """📚 Collections Reading List Generator 📚"""
description = """Hugging Face Collections allow you to curate models, datasets, Spaces, and papers from the Hugging Face Hub. This Space will generate a reading list based on the items in your collection. This can be a great way to find papers related to the models and datasets in your collection and dive more deeply into a topic!

The Space works by:

- finding any papers in your collection
- finding papers related to the models and datasets in your collection
- requesting recommendations from the [Semantic Scholar API](https://api.semanticscholar.org/api-docs/recommendations#tag/Paper-Recommendations/operation/post_papers) for these papers

You can optionally exclude certain repo types from consideration when generating the reading list.
"""

slug_input = gr.Textbox(
    lines=1,
    label="Collection Slug",
    placeholder="merve/video-classification-models-6509edd0a6f657faa425e8c3",
)

example_slugs = [
    ["merve/video-classification-models-6509edd0a6f657faa425e8c3", []],
    ["osanseviero/model-merging-65097893623330a3a51ead66", []],
    ["hf4h/clinical-language-models-64f9c1cd0cedc04f3caca264", []],
]

gr.Interface(
    get_recommendations_from_slug,
    inputs=[
        slug_input,
        gr.Dropdown(
            label="Repos to exclude from contributing to recommendations",
            choices=["datasets", "models", "papers"],
            multiselect=True,
        ),
    ],
    outputs="markdown",
    description=description,
    title=title,
    allow_flagging="never",
    examples=example_slugs,
).launch(debug=True)
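
# Example of calling the recommender directly, bypassing the UI (a sketch; it
# assumes the extractor Space and the Semantic Scholar API are reachable):
#
#   markdown = get_recommendations_from_slug(
#       "merve/video-classification-models-6509edd0a6f657faa425e8c3",
#       excluded_repo_types=["datasets"],
#   )
#   print(markdown)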