import json
from typing import Any, Dict, List, Optional, Union

import gradio as gr
import httpx
from cachetools import TTLCache, cached
from gradio_client import Client
from toolz import groupby

CACHE_TIME = 60 * 60 * 1  # 1 hour

client = Client("https://librarian-bots-collection-papers-extractor.hf.space/")


@cached(cache=TTLCache(maxsize=500, ttl=10))
def get_arxiv_ids_from_slug(
    slug: str,
) -> Dict[str, Union[None, Dict[str, Dict[str, List[str]]]]]:
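    """Call the remote collection-papers-extractor Space for `slug` and return the
    parsed JSON it writes to disk.

    Judging by how `format_ids` consumes the result, it maps repo-type keys such as
    "model papers", "dataset papers", and "papers" to per-repo dicts that each hold
    an "arxiv_ids" list.
    """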
    result = client.predict(slug, api_name="/predict")
    with open(result) as f:
        data = json.load(f)
    return data


def format_arxiv_id_for_semantic_scholar(arxiv_id: str) -> str:
    return f"ArXiv:{arxiv_id}"


def format_ids(data, exclude_keys: Optional[list[str]] = None) -> list[str]:
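    """Flatten the extractor output into Semantic Scholar paper IDs, skipping any
    repo types listed in `exclude_keys` and any repos with no papers."""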
    arxiv_ids = []
    if exclude_keys is not None:
        data = {k: v for k, v in data.items() if k not in exclude_keys}
        # check if dict now empty
        if not data:
            return []
    for repo in data.values():
        if repo is None:
            continue
        for item in repo.values():
            arxiv_ids.extend(item["arxiv_ids"])
    # format for semantic scholar
    return [format_arxiv_id_for_semantic_scholar(id) for id in arxiv_ids]


@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_recommendations_from_semantic_scholar(paper_ids: tuple[str, ...]):
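    """Request up to 10 recommendations from the Semantic Scholar recommendations
    API, using `paper_ids` as the positive seed papers."""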
    paper_ids = list(paper_ids)
    print(paper_ids)
    r = httpx.post(
        "https://api.semanticscholar.org/recommendations/v1/papers/",
        json={
            "positivePaperIds": paper_ids,
        },
        params={"fields": "externalIds,title,year", "limit": 10},
        timeout=30,
    )
    print(r.text)
    return r.json()


def is_arxiv_paper(recommendation: Dict[str, Any]) -> bool:
    return recommendation["externalIds"].get("ArXiv", None) is not None


def group_by_is_arxiv_paper(
    recommendations: List[Dict[str, Any]]
) -> Dict[bool, List[Dict[str, Any]]]:
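    """Split recommendations into arXiv papers (key True) and everything else (key False)."""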
    return groupby(is_arxiv_paper, recommendations)


def format_recommendation_into_markdown(
    grouped_recommendations: Dict[bool, List[Dict[str, Any]]]
):
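    """Render the grouped recommendations as Markdown, linking arXiv papers to
    their Hugging Face Papers pages and listing the rest as plain text."""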
    comment = "The following papers were recommended by the Semantic Scholar API \n\n"
    arxiv_papers = grouped_recommendations.get(True)
    if arxiv_papers:
        comment += "## Papers available on Hugging Face Papers:\n\n"
        for r in arxiv_papers:
            hub_paper_url = f"https://huggingface.co/papers/{r['externalIds']['ArXiv']}"
            comment += f"* [{r['title']}]({hub_paper_url}) ({r['year']})\n"
    other_papers = grouped_recommendations.get(False)
    if other_papers:
        comment += "\n\n## Other papers:\n\n"
        for r in other_papers:
            comment += f"* {r['title']} ({r['year']})\n"
    return comment


def map_repo_name_to_api_key(repo_name: str) -> str:
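    """Translate a repo type from the UI dropdown into the key used by the extractor output."""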
    return {
        "datasets": "dataset papers",
        "models": "model papers",
        "papers": "papers",
    }[repo_name]


def get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[list[str]] = None
):
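    """Entry point for the Gradio interface; normalises the excluded repo types
    before delegating to the cached helper below."""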
    # Convert to a hashable tuple (or None) so the TTL-cached helper can memoize the
    # call; tuple(None) would raise a TypeError when no repo types are excluded.
    excluded_repo_types = tuple(excluded_repo_types) if excluded_repo_types else None
    return _get_recommendations_from_slug(slug, excluded_repo_types=excluded_repo_types)


@cached(cache=TTLCache(maxsize=500, ttl=60))
def _get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[tuple[str, ...]] = None
):
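    """Build the reading list: fetch arXiv IDs for the collection, drop excluded
    repo types, and format Semantic Scholar recommendations as Markdown."""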
    data = get_arxiv_ids_from_slug(slug)
    if excluded_repo_types:
        excluded_repo_types = list(excluded_repo_types)
        excluded_repo_types = [map_repo_name_to_api_key(k) for k in excluded_repo_types]
        print(f"excluded_repo_types_remapped={excluded_repo_types}")
    ids = format_ids(data, exclude_keys=excluded_repo_types)
    if not ids:
        excluded = ", ".join(excluded_repo_types) if excluded_repo_types else "none"
        return (
            f"Based on your collection and exclusions ({excluded}), there are no"
            " papers to recommend. Try removing some excluded repo types or adding"
            " more items to your collection."
        )
    ids = tuple(ids)
    recommendations = get_recommendations_from_semantic_scholar(ids)
    recommendations = recommendations.get("recommendedPapers")
    if recommendations is None:
        raise gr.Error("Something went wrong with the Semantic Scholar API")
    grouped = group_by_is_arxiv_paper(recommendations)
    return format_recommendation_into_markdown(grouped)


title = """πŸ“š Collections Reading List Generator πŸ“š"""
description = """<img src="https://huggingface.co/datasets/librarian-bots/images/raw/main/Mascot%20Bookie.svg" 
alt="Mascot Bookie" width="200" style="float:left; margin-right:20px; margin-bottom:20px;"> 

\n\n 
Hugging Face Collections allow you to curate models, datasets, spaces, 
and papers from the Hugging Face Hub. 
This Space will generate a reading list based on the items in your collection. 
This can be a great way to find papers related to the models and datasets in your collection and to dive more deeply into a topic!

The Space works by:

- finding any papers in your collection 
- finding papers related to the models and datasets in your collection 
- requesting recommendations from the [Semantic Scholar API](https://api.semanticscholar.org/api-docs/recommendations#tag/Paper-Recommendations/operation/post_papers) for these papers.

You can optionally exclude certain repo types from consideration when generating the reading list.
"""

slug_input = gr.Textbox(
    lines=1,
    label="Collection Slug",
    placeholder="merve/video-classification-models-6509edd0a6f657faa425e8c3",
)
example_slugs = [
    ["merve/video-classification-models-6509edd0a6f657faa425e8c3", []],
    ["osanseviero/model-merging-65097893623330a3a51ead66", []],
    ["hf4h/clinical-language-models-64f9c1cd0cedc04f3caca264", []],
]

gr.Interface(
    get_recommendations_from_slug,
    inputs=[
        slug_input,
        gr.Dropdown(
            label="Repos to exclude from contributing to recommendations",
            choices=["datasets", "models", "papers"],
            multiselect=True,
        ),
    ],
    outputs="markdown",
    description=description,
    title=title,
    allow_flagging="never",
    examples=example_slugs,
).launch()