davanstrien (HF staff) committed
Commit 32c6187 • 1 Parent(s): 5112501

basic version

Files changed (3)
  1. app.py +174 -0
  2. requirements.in +4 -0
  3. requirements.txt +184 -0
app.py ADDED
@@ -0,0 +1,174 @@
+ import gradio as gr
+ from gradio_client import Client
+ import json
+ from cachetools import cached, TTLCache
+ from typing import Optional, Any, List, Dict
+ import httpx
+ from toolz import groupby
+
+ CACHE_TIME = 60 * 60 * 1  # 1 hour
+
+ client = Client("https://librarian-bots-collection-papers-extractor.hf.space/")
+
+
+ @cached(cache=TTLCache(maxsize=500, ttl=10))
+ def get_arxiv_ids_from_slug(
+     slug: str,
+ ) -> Dict[str, Optional[Dict[str, Dict[str, List[str]]]]]:
+     result = client.predict(slug, api_name="/predict")
+     with open(result) as f:
+         data = json.load(f)
+     return data
+
+
+ def format_arxiv_id_for_semantic_scholar(arxiv_id: str) -> str:
+     return f"ArXiv:{arxiv_id}"
+
+
+ def format_ids(data: Dict[str, Any], exclude_keys: Optional[list[str]] = None) -> list[str]:
+     arxiv_ids = []
+     if exclude_keys is not None:
+         data = {k: v for k, v in data.items() if k not in exclude_keys}
+         # the dict may now be empty
+         if not data:
+             return []
+     for repo in data.values():
+         if repo is None:
+             continue
+         for item in repo.values():
+             arxiv_ids.extend(item["arxiv_ids"])
+     # format for Semantic Scholar
+     return [format_arxiv_id_for_semantic_scholar(id) for id in arxiv_ids]
+
+
+ @cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
+ def get_recommendations_from_semantic_scholar(paper_ids: tuple[str, ...]):
+     r = httpx.post(
+         "https://api.semanticscholar.org/recommendations/v1/papers/",
+         json={
+             "positivePaperIds": list(paper_ids),
+         },
+         params={"fields": "externalIds,title,year", "limit": 10},
+         timeout=30,
+     )
+     return r.json()
+
+
+ def is_arxiv_paper(recommendation: Dict[str, Any]) -> bool:
+     return recommendation["externalIds"].get("ArXiv") is not None
+
+
+ def group_by_is_arxiv_paper(
+     recommendations: List[Dict[str, Any]]
+ ) -> Dict[bool, List[Dict[str, Any]]]:
+     return groupby(is_arxiv_paper, recommendations)
+
+
+ def format_recommendation_into_markdown(
+     grouped_recommendations: Dict[bool, List[Dict[str, Any]]]
+ ) -> str:
+     comment = "The following papers were recommended by the Semantic Scholar API\n\n"
+     arxiv_papers = grouped_recommendations.get(True)
+     if arxiv_papers:
+         comment += "## Papers available on Hugging Face Papers:\n\n"
+         for r in arxiv_papers:
+             hub_paper_url = f"https://huggingface.co/papers/{r['externalIds']['ArXiv']}"
+             comment += f"* [{r['title']}]({hub_paper_url}) ({r['year']})\n"
+     other_papers = grouped_recommendations.get(False)
+     if other_papers:
+         comment += "\n\n## Other papers:\n\n"
+         for r in other_papers:
+             comment += f"* {r['title']} ({r['year']})\n"
+     return comment
+
+
+ def map_repo_name_to_api_key(repo_name: str) -> str:
+     return {
+         "datasets": "dataset papers",
+         "models": "model papers",
+         "papers": "papers",
+     }[repo_name]
+
+
+ def get_recommendations_from_slug(
+     slug: str, excluded_repo_types: Optional[list[str]] = None
+ ):
+     # convert to a (hashable) tuple so the cached helper can use it as a key
+     excluded_repo_types = tuple(excluded_repo_types) if excluded_repo_types else None
+     return _get_recommendations_from_slug(slug, excluded_repo_types=excluded_repo_types)
+
+
+ @cached(cache=TTLCache(maxsize=500, ttl=60))
+ def _get_recommendations_from_slug(
+     slug: str, excluded_repo_types: Optional[tuple[str, ...]] = None
+ ):
+     data = get_arxiv_ids_from_slug(slug)
+     if excluded_repo_types:
+         excluded_repo_types = [map_repo_name_to_api_key(k) for k in excluded_repo_types]
+     ids = format_ids(data, exclude_keys=excluded_repo_types)
+     if not ids:
+         exclusions = ", ".join(excluded_repo_types) if excluded_repo_types else "none"
+         return (
+             f"Based on your collection and exclusions ({exclusions}), there are no"
+             " papers to recommend. Try removing some excluded repo types or adding"
+             " more items to your collection."
+         )
+     recommendations = get_recommendations_from_semantic_scholar(tuple(ids))
+     recommendations = recommendations.get("recommendedPapers")
+     if recommendations is None:
+         raise gr.Error("Something went wrong with the Semantic Scholar API")
+     grouped = group_by_is_arxiv_paper(recommendations)
+     return format_recommendation_into_markdown(grouped)
+
+
+ title = """📚 Collections Reading List Generator 📚"""
+ description = """<img src="https://huggingface.co/datasets/librarian-bots/images/raw/main/Mascot%20Bookie.svg"
+ alt="Mascot Bookie" width="200" style="float:left; margin-right:20px; margin-bottom:20px;">
+
+ Hugging Face Collections allow you to curate models, datasets, Spaces,
+ and papers from the Hugging Face Hub.
+ This Space will generate a reading list based on the items in your collection.
+ This can be a great way to find papers related to the models and datasets in your collection and to dive more deeply into a topic!
+
+ The Space works by:
+
+ - finding any papers in your collection
+ - finding papers related to the models and datasets in your collection
+ - requesting recommendations from the [Semantic Scholar API](https://api.semanticscholar.org/api-docs/recommendations#tag/Paper-Recommendations/operation/post_papers) for these papers
+
+ You can optionally exclude certain repo types from consideration when generating the reading list.
+ """
+
+ slug_input = gr.Textbox(
+     lines=1,
+     label="Collection Slug",
+     placeholder="merve/video-classification-models-6509edd0a6f657faa425e8c3",
+ )
+
+ example_slugs = [
+     ["merve/video-classification-models-6509edd0a6f657faa425e8c3", []],
+     ["osanseviero/model-merging-65097893623330a3a51ead66", []],
+     ["hf4h/clinical-language-models-64f9c1cd0cedc04f3caca264", []],
+ ]
+
+ gr.Interface(
+     get_recommendations_from_slug,
+     inputs=[
+         slug_input,
+         gr.Dropdown(
+             label="Repos to exclude from contributing to recommendations",
+             choices=["datasets", "models", "papers"],
+             multiselect=True,
+         ),
+     ],
+     outputs="markdown",
+     description=description,
+     title=title,
+     allow_flagging="never",
+     examples=example_slugs,
+ ).launch(debug=True)
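
For reference, a minimal standalone sketch of the Semantic Scholar recommendations request that app.py wraps. The endpoint, request body, fields, and the recommendedPapers response key are all taken from the code above; the arXiv ID shown is only an arbitrary placeholder, not something from this commit.

# Minimal sketch of the Semantic Scholar call wrapped by app.py above.
# The "ArXiv:<id>" prefix mirrors format_arxiv_id_for_semantic_scholar;
# the specific paper ID is a placeholder.
import httpx

r = httpx.post(
    "https://api.semanticscholar.org/recommendations/v1/papers/",
    json={"positivePaperIds": ["ArXiv:1706.03762"]},
    params={"fields": "externalIds,title,year", "limit": 10},
    timeout=30,
)
r.raise_for_status()
for paper in r.json().get("recommendedPapers", []):
    # each recommendation carries exactly the fields requested above
    print(paper["year"], paper["title"], paper["externalIds"].get("ArXiv"))
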
requirements.in ADDED
@@ -0,0 +1,4 @@
+ cachetools
+ gradio
+ gradio_client
+ httpx
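
The pinned requirements.txt below is the lockfile compiled from these top-level dependencies; per its own header, it was generated by running `pip-compile` (from pip-tools) under Python 3.11.
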
requirements.txt ADDED
@@ -0,0 +1,184 @@
+ #
+ # This file is autogenerated by pip-compile with Python 3.11
+ # by the following command:
+ #
+ #    pip-compile
+ #
+ aiofiles==23.2.1
+     # via gradio
+ altair==5.1.1
+     # via gradio
+ annotated-types==0.5.0
+     # via pydantic
+ anyio==3.7.1
+     # via
+     #   fastapi
+     #   httpcore
+     #   starlette
+ attrs==23.1.0
+     # via
+     #   jsonschema
+     #   referencing
+ cachetools==5.3.1
+     # via -r requirements.in
+ certifi==2023.7.22
+     # via
+     #   httpcore
+     #   httpx
+     #   requests
+ charset-normalizer==3.2.0
+     # via requests
+ click==8.1.7
+     # via uvicorn
+ contourpy==1.1.1
+     # via matplotlib
+ cycler==0.11.0
+     # via matplotlib
+ fastapi==0.103.1
+     # via gradio
+ ffmpy==0.3.1
+     # via gradio
+ filelock==3.12.4
+     # via huggingface-hub
+ fonttools==4.42.1
+     # via matplotlib
+ fsspec==2023.9.2
+     # via
+     #   gradio-client
+     #   huggingface-hub
+ gradio==3.45.1
+     # via -r requirements.in
+ gradio-client==0.5.2
+     # via
+     #   -r requirements.in
+     #   gradio
+ h11==0.14.0
+     # via
+     #   httpcore
+     #   uvicorn
+ httpcore==0.18.0
+     # via httpx
+ httpx==0.25.0
+     # via
+     #   -r requirements.in
+     #   gradio
+     #   gradio-client
+ huggingface-hub==0.17.3
+     # via
+     #   gradio
+     #   gradio-client
+ idna==3.4
+     # via
+     #   anyio
+     #   httpx
+     #   requests
+ importlib-resources==6.1.0
+     # via gradio
+ jinja2==3.1.2
+     # via
+     #   altair
+     #   gradio
+ jsonschema==4.19.1
+     # via altair
+ jsonschema-specifications==2023.7.1
+     # via jsonschema
+ kiwisolver==1.4.5
+     # via matplotlib
+ markupsafe==2.1.3
+     # via
+     #   gradio
+     #   jinja2
+ matplotlib==3.8.0
+     # via gradio
+ numpy==1.26.0
+     # via
+     #   altair
+     #   contourpy
+     #   gradio
+     #   matplotlib
+     #   pandas
+ orjson==3.9.7
+     # via gradio
+ packaging==23.1
+     # via
+     #   altair
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   matplotlib
+ pandas==2.1.1
+     # via
+     #   altair
+     #   gradio
+ pillow==10.0.1
+     # via
+     #   gradio
+     #   matplotlib
+ pydantic==2.4.2
+     # via
+     #   fastapi
+     #   gradio
+ pydantic-core==2.10.1
+     # via pydantic
+ pydub==0.25.1
+     # via gradio
+ pyparsing==3.1.1
+     # via matplotlib
+ python-dateutil==2.8.2
+     # via
+     #   matplotlib
+     #   pandas
+ python-multipart==0.0.6
+     # via gradio
+ pytz==2023.3.post1
+     # via pandas
+ pyyaml==6.0.1
+     # via
+     #   gradio
+     #   huggingface-hub
+ referencing==0.30.2
+     # via
+     #   jsonschema
+     #   jsonschema-specifications
+ requests==2.31.0
+     # via
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+ rpds-py==0.10.3
+     # via
+     #   jsonschema
+     #   referencing
+ semantic-version==2.10.0
+     # via gradio
+ six==1.16.0
+     # via python-dateutil
+ sniffio==1.3.0
+     # via
+     #   anyio
+     #   httpcore
+     #   httpx
+ starlette==0.27.0
+     # via fastapi
+ toolz==0.12.0
+     # via altair
+ tqdm==4.66.1
+     # via huggingface-hub
+ typing-extensions==4.8.0
+     # via
+     #   fastapi
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   pydantic
+     #   pydantic-core
+ tzdata==2023.3
+     # via pandas
+ urllib3==2.0.5
+     # via requests
+ uvicorn==0.23.2
+     # via gradio
+ websockets==11.0.3
+     # via
+     #   gradio
+     #   gradio-client