import json
from typing import Any, Dict, List, Optional, Tuple, Union

import gradio as gr
import httpx
from cachetools import TTLCache, cached
from gradio_client import Client
from toolz import groupby

CACHE_TIME = 60 * 60 * 1  # 1 hour

client = Client("https://librarian-bots-collection-papers-extractor.hf.space/")


@cached(cache=TTLCache(maxsize=500, ttl=10))
def get_arxiv_ids_from_slug(
    slug: str,
) -> Dict[str, Union[None, Dict[str, Dict[str, List[str]]]]]:
    """Fetch the arXiv IDs referenced by a collection via the extractor Space.

    The Space returns a path to a JSON file mapping repo-type keys
    ("model papers", "dataset papers", "papers") to repos and their arXiv IDs.
    """
    result = client.predict(slug, api_name="/predict")
    with open(result) as f:
        return json.load(f)


def format_arxiv_id_for_semantic_scholar(arxiv_id: str) -> str:
    return f"ArXiv:{arxiv_id}"


def format_ids(
    data: Dict[str, Any], exclude_keys: Optional[List[str]] = None
) -> List[str]:
    """Flatten the extractor payload into Semantic Scholar-formatted arXiv IDs."""
    arxiv_ids = []
    if exclude_keys is not None:
        data = {k: v for k, v in data.items() if k not in exclude_keys}
        # the exclusions may have removed everything
        if not data:
            return []
    for repo in data.values():
        if repo is None:
            continue
        for item in repo.values():
            arxiv_ids.extend(item["arxiv_ids"])
    # format for Semantic Scholar
    return [format_arxiv_id_for_semantic_scholar(arxiv_id) for arxiv_id in arxiv_ids]


@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_recommendations_from_semantic_scholar(paper_ids: Tuple[str, ...]):
    # paper_ids is a tuple rather than a list so the arguments are hashable
    # and the call can be cached
    r = httpx.post(
        "https://api.semanticscholar.org/recommendations/v1/papers/",
        json={"positivePaperIds": list(paper_ids)},
        params={"fields": "externalIds,title,year", "limit": 10},
        timeout=30,
    )
    return r.json()
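
# A sketch of the response shape the rest of this app relies on, based on the
# fields requested above; see the Semantic Scholar recommendations API docs
# for the authoritative schema (the values here are placeholders):
#
# {
#   "recommendedPapers": [
#     {
#       "externalIds": {"ArXiv": "2106.01234", ...},
#       "title": "An example paper title",
#       "year": 2023,
#     },
#     ...
#   ]
# }
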
def is_arxiv_paper(recommendation: Dict[str, Any]) -> bool:
    return recommendation["externalIds"].get("ArXiv") is not None


def group_by_is_arxiv_paper(
    recommendations: List[Dict[str, Any]]
) -> Dict[bool, List[Dict[str, Any]]]:
    return groupby(is_arxiv_paper, recommendations)


def format_recommendation_into_markdown(
    grouped_recommendations: Dict[bool, List[Dict[str, Any]]]
) -> str:
    comment = "The following papers were recommended by the Semantic Scholar API \n\n"
    arxiv_papers = grouped_recommendations.get(True)
    if arxiv_papers:
        comment += "## Papers available on Hugging Face Papers:\n\n"
        for r in arxiv_papers:
            hub_paper_url = f"https://huggingface.co/papers/{r['externalIds']['ArXiv']}"
            comment += f"* [{r['title']}]({hub_paper_url}) ({r['year']})\n"
    other_papers = grouped_recommendations.get(False)
    if other_papers:
        comment += "\n\n## Other papers:\n\n"
        for r in other_papers:
            comment += f"* {r['title']} ({r['year']})\n"
    return comment


def map_repo_name_to_api_key(repo_name: str) -> str:
    # map the UI repo-type names to the keys used in the extractor payload
    return {
        "datasets": "dataset papers",
        "models": "model papers",
        "papers": "papers",
    }[repo_name]


def get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[List[str]] = None
):
    # convert to a hashable tuple (or None) so the cached helper can memoize the call
    excluded = tuple(excluded_repo_types) if excluded_repo_types else None
    return _get_recommendations_from_slug(slug, excluded_repo_types=excluded)


@cached(cache=TTLCache(maxsize=500, ttl=60))
def _get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[Tuple[str, ...]] = None
):
    data = get_arxiv_ids_from_slug(slug)
    exclude_keys = None
    if excluded_repo_types:
        exclude_keys = [map_repo_name_to_api_key(k) for k in excluded_repo_types]
    ids = format_ids(data, exclude_keys=exclude_keys)
    if not ids:
        return (
            "Based on your collection and exclusions"
            f" ({', '.join(exclude_keys or [])}), there are no papers to"
            " recommend. Try removing some excluded repo types or adding more"
            " items to your collection."
        )
    recommendations = get_recommendations_from_semantic_scholar(tuple(ids))
    recommended_papers = recommendations.get("recommendedPapers")
    if recommended_papers is None:
        raise gr.Error("Something went wrong with the Semantic Scholar API")
    grouped = group_by_is_arxiv_paper(recommended_papers)
    return format_recommendation_into_markdown(grouped)


title = """📚 Collections Reading List Generator 📚"""
description = """Hugging Face Collections allow you to curate models, datasets, Spaces, and papers from the Hugging Face Hub. This Space will generate a reading list based on the items in your collection. This can be a great way to find papers related to the models and datasets in your collection and dive more deeply into a topic!

The Space works by:

- finding any papers in your collection
- finding papers related to the models and datasets in your collection
- requesting recommendations from the [Semantic Scholar API](https://api.semanticscholar.org/api-docs/recommendations#tag/Paper-Recommendations/operation/post_papers) for these papers

You can optionally exclude certain repo types from consideration when generating the reading list.
"""

slug_input = gr.Textbox(
    lines=1,
    label="Collection Slug",
    placeholder="merve/video-classification-models-6509edd0a6f657faa425e8c3",
)

example_slugs = [
    ["merve/video-classification-models-6509edd0a6f657faa425e8c3", []],
    ["osanseviero/model-merging-65097893623330a3a51ead66", []],
    ["hf4h/clinical-language-models-64f9c1cd0cedc04f3caca264", []],
]

gr.Interface(
    get_recommendations_from_slug,
    inputs=[
        slug_input,
        gr.Dropdown(
            label="Repos to exclude from contributing to recommendations",
            choices=["datasets", "models", "papers"],
            multiselect=True,
        ),
    ],
    outputs="markdown",
    description=description,
    title=title,
    allow_flagging="never",
    examples=example_slugs,
).launch(debug=True)
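
# Example of calling the recommender directly, bypassing the UI (a sketch; it
# assumes the extractor Space and the Semantic Scholar API are reachable):
#
#   markdown = get_recommendations_from_slug(
#       "merve/video-classification-models-6509edd0a6f657faa425e8c3",
#       excluded_repo_types=["datasets"],
#   )
#   print(markdown)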