import json
from typing import Any, Dict, List, Optional

import gradio as gr
import httpx
from cachetools import TTLCache, cached
from gradio_client import Client
from toolz import groupby

CACHE_TIME = 60 * 60 * 1  # 1 hour

client = Client("https://librarian-bots-collection-papers-extractor.hf.space/")


@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_arxiv_ids_from_slug(
    slug: str,
) -> Dict[str, Optional[Dict[str, Dict[str, List[str]]]]]:
    """Fetch the arXiv IDs for a collection via the papers-extractor Space."""
    result = client.predict(slug, api_name="/predict")
    with open(result) as f:
        data = json.load(f)
    return data
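
# A sketch of the payload shape assumed here, inferred from how format_ids()
# consumes it below (the actual keys come from the extractor Space, so this is
# illustrative, not authoritative):
#
#     {
#         "model papers": {"some-org/some-model": {"arxiv_ids": ["2106.09685"]}},
#         "dataset papers": None,
#         "papers": {"2304.12244": {"arxiv_ids": ["2304.12244"]}},
#     }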


def format_arxiv_id_for_semantic_scholar(arxiv_id: str) -> str:
    """Prefix an arXiv ID so the Semantic Scholar API recognizes it."""
    return f"ArXiv:{arxiv_id}"


def format_ids(data: Dict[str, Any], exclude_keys: Optional[List[str]] = None) -> List[str]:
    """Flatten the extractor payload into Semantic Scholar-formatted arXiv IDs."""
    arxiv_ids = []
    if exclude_keys is not None:
        data = {k: v for k, v in data.items() if k not in exclude_keys}
        # The exclusions may have removed every repo type
        if not data:
            return []
    for repo in data.values():
        if repo is None:
            continue
        for item in repo.values():
            arxiv_ids.extend(item["arxiv_ids"])
    # Format for the Semantic Scholar API
    return [format_arxiv_id_for_semantic_scholar(arxiv_id) for arxiv_id in arxiv_ids]
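
# A hypothetical worked example, excluding "dataset papers":
#
#     >>> format_ids(
#     ...     {"model papers": {"org/model": {"arxiv_ids": ["2106.09685"]}},
#     ...      "dataset papers": None},
#     ...     exclude_keys=["dataset papers"],
#     ... )
#     ['ArXiv:2106.09685']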


@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_recommendations_from_semantic_scholar(paper_ids: tuple[str, ...]):
    """Request up to 10 paper recommendations seeded by the given paper IDs."""
    paper_ids = list(paper_ids)
    print(paper_ids)
    r = httpx.post(
        "https://api.semanticscholar.org/recommendations/v1/papers/",
        json={
            "positivePaperIds": paper_ids,
        },
        params={"fields": "externalIds,title,year", "limit": 10},
        timeout=30,
    )
    print(r.text)
    return r.json()
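
# The JSON response is assumed to look roughly like this (only the fields
# requested above; shape inferred from how the functions below consume it):
#
#     {
#         "recommendedPapers": [
#             {
#                 "externalIds": {"ArXiv": "1234.56789"},
#                 "title": "An Example Paper Title",
#                 "year": 2023,
#             },
#         ]
#     }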


def is_arxiv_paper(recommendation: Dict[str, Any]) -> bool:
    """Whether a recommendation has an arXiv ID (and so a Hugging Face Papers page)."""
    return recommendation["externalIds"].get("ArXiv") is not None


def group_by_is_arxiv_paper(
    recommendations: List[Dict[str, Any]]
) -> Dict[bool, List[Dict[str, Any]]]:
    """Partition recommendations into arXiv and non-arXiv papers."""
    return groupby(is_arxiv_paper, recommendations)
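
# toolz.groupby keys the result dict by the grouping function's return value;
# a hypothetical illustration:
#
#     >>> groupby(is_arxiv_paper, [{"externalIds": {"ArXiv": "1234.56789"}},
#     ...                          {"externalIds": {"DOI": "10.1000/x"}}])
#     {True: [{'externalIds': {'ArXiv': '1234.56789'}}],
#      False: [{'externalIds': {'DOI': '10.1000/x'}}]}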


def format_recommendation_into_markdown(
    grouped_recommendations: Dict[bool, List[Dict[str, Any]]]
):
    """Render the grouped recommendations as a Markdown reading list."""
    comment = "The following papers were recommended by the Semantic Scholar API \n\n"
    arxiv_papers = grouped_recommendations.get(True)
    if arxiv_papers:
        comment += "## Papers available on Hugging Face Papers:\n\n"
        for r in arxiv_papers:
            hub_paper_url = f"https://huggingface.co/papers/{r['externalIds']['ArXiv']}"
            comment += f"* [{r['title']}]({hub_paper_url}) ({r['year']})\n"
    other_papers = grouped_recommendations.get(False)
    if other_papers:
        comment += "\n\n## Other papers:\n\n"
        for r in other_papers:
            comment += f"* {r['title']} ({r['year']})\n"
    return comment
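
# The rendered Markdown is along these lines (illustrative values):
#
#     The following papers were recommended by the Semantic Scholar API
#
#     ## Papers available on Hugging Face Papers:
#
#     * [An Example Paper Title](https://huggingface.co/papers/1234.56789) (2023)
#
#     ## Other papers:
#
#     * Another Paper Title (2021)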


def map_repo_name_to_api_key(repo_name: str) -> str:
    """Map a UI repo-type name to the key used in the extractor payload."""
    return {
        "datasets": "dataset papers",
        "models": "model papers",
        "papers": "papers",
    }[repo_name]


def get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[List[str]] = None
):
    """UI entry point: convert arguments to hashable types for the cached helper."""
    # tuple(None) raises TypeError, so guard against an empty/absent selection
    excluded_repo_types = tuple(excluded_repo_types) if excluded_repo_types else None
    return _get_recommendations_from_slug(slug, excluded_repo_types=excluded_repo_types)


@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def _get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[tuple[str, ...]] = None
):
    """Build the reading list for a collection slug, honoring any exclusions."""
    data = get_arxiv_ids_from_slug(slug)
    if excluded_repo_types:
        excluded_repo_types = [map_repo_name_to_api_key(k) for k in excluded_repo_types]
        print(f"excluded_repo_types_remapped={excluded_repo_types}")
    ids = format_ids(data, exclude_keys=excluded_repo_types)
    if not ids:
        return (
            "Based on your collection and exclusions"
            f" ({', '.join(excluded_repo_types or [])}), there are no papers to"
            " recommend. Try removing some excluded repo types or adding more"
            " items to your collection."
        )
    ids = tuple(ids)
    recommendations = get_recommendations_from_semantic_scholar(ids)
    recommendations = recommendations.get("recommendedPapers")
    if recommendations is None:
        raise gr.Error("Something went wrong with the Semantic Scholar API")
    grouped = group_by_is_arxiv_paper(recommendations)
    return format_recommendation_into_markdown(grouped)
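
# For a quick local sanity check outside Gradio, something like the following
# could be run (slug taken from the examples below):
#
#     print(get_recommendations_from_slug(
#         "merve/video-classification-models-6509edd0a6f657faa425e8c3",
#         excluded_repo_types=["datasets"],
#     ))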


title = """πŸ“š Collections Reading List Generator πŸ“š"""

description = """<img src="https://huggingface.co/datasets/librarian-bots/images/raw/main/Mascot%20Bookie.svg"
alt="Mascot Bookie" width="200" style="float:left; margin-right:20px; margin-bottom:20px;">
\n\n
Hugging Face Collections allow you to curate models, datasets, Spaces,
and papers from the Hugging Face Hub.

This Space generates a reading list based on the items in your collection.
This can be a great way to find papers related to the models and datasets in your collection and to dive more deeply into a topic!

The Space works by:

- finding any papers in your collection
- finding papers related to the models and datasets in your collection
- requesting recommendations from the [Semantic Scholar API](https://api.semanticscholar.org/api-docs/recommendations#tag/Paper-Recommendations/operation/post_papers) for these papers

You can optionally exclude certain repo types from consideration when generating the reading list.
"""

slug_input = gr.Textbox(
    lines=1,
    label="Collection Slug",
    placeholder="merve/video-classification-models-6509edd0a6f657faa425e8c3",
)

example_slugs = [
    ["merve/video-classification-models-6509edd0a6f657faa425e8c3", []],
    ["osanseviero/model-merging-65097893623330a3a51ead66", []],
    ["hf4h/clinical-language-models-64f9c1cd0cedc04f3caca264", []],
]

gr.Interface(
    get_recommendations_from_slug,
    inputs=[
        slug_input,
        gr.Dropdown(
            label="Repos to exclude from contributing to recommendations",
            choices=["datasets", "models", "papers"],
            multiselect=True,
        ),
    ],
    outputs="markdown",
    description=description,
    title=title,
    allow_flagging="never",
    examples=example_slugs,
).launch()