davanstrien HF staff committed on
Commit
310dfa4
•
1 Parent(s): 22be3a0
Files changed (3)
  1. app.py +107 -0
  2. requirements.in +5 -0
  3. requirements.txt +187 -0
app.py ADDED
@@ -0,0 +1,107 @@
+ from huggingface_hub import list_datasets, list_models
+ from cachetools import TTLCache, cached
+ import platform
+ import re
+ import gradio as gr
+ from huggingface_hub import get_collection
+ from cytoolz import groupby
+ from collections import defaultdict
+
+ is_macos = platform.system() == "Darwin"
+ LIMIT = None  # set to an int to cap how many repos are fetched from the Hub
+
+
+ @cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
+ def get_models():
+     return list(iter(list_models(full=True, limit=LIMIT)))
+
+
+ @cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
+ def get_datasets():
+     return list(iter(list_datasets(full=True, limit=LIMIT)))
+
+
+ def check_for_arxiv_id(model):
+     # Return any "arxiv:<id>" tags attached to a repo, or an empty list if there are none
+     return [tag for tag in model.tags if "arxiv" in tag] if model.tags else []
+
+
+ def extract_arxiv_id(input_string: str) -> str | None:
+     # Match tags of the form "arxiv:2302.13971" and capture the numeric part
+     pattern = re.compile(r"\barxiv:(\d+\.\d+)\b")
+
+     # Search for the pattern in the input string
+     match = pattern.search(input_string)
+
+     # If a match is found, return the numeric part of the arXiv ID, else return None
+     return match[1] if match else None
+
+
+ @cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
+ def create_model_to_arxiv_id_dict():
+     models = get_models()
+     model_to_arxiv_id = {}
+     for model in models:
+         if arxiv_papers := check_for_arxiv_id(model):
+             clean_arxiv_ids = []
+             for paper in arxiv_papers:
+                 if arxiv_id := extract_arxiv_id(paper):
+                     clean_arxiv_ids.append(arxiv_id)
+             model_to_arxiv_id[model.modelId] = clean_arxiv_ids
+     return model_to_arxiv_id
+
+
+ @cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
+ def create_dataset_to_arxiv_id_dict():
+     datasets = get_datasets()
+     dataset_to_arxiv_id = {}
+     for dataset in datasets:
+         if arxiv_papers := check_for_arxiv_id(dataset):
+             clean_arxiv_ids = []
+             for paper in arxiv_papers:
+                 if arxiv_id := extract_arxiv_id(paper):
+                     clean_arxiv_ids.append(arxiv_id)
+             dataset_to_arxiv_id[dataset.id] = clean_arxiv_ids
+     return dataset_to_arxiv_id
+
+
+ # Example collection slug for testing
+ url = "lunarflu/ai-podcasts-and-talks-65119866353a60593bf99c58"
+
+
+ def group_collection_items(collection_slug: str):
+     collection = get_collection(collection_slug)
+     items = collection.items
+     # Group collection items into "models", "datasets", etc. keyed by repo type
+     return groupby(lambda x: f"{x.repoType}s", items)
+
+
+ def get_papers_for_collection(collection_slug: str):
+     dataset_to_arxiv_id = create_dataset_to_arxiv_id_dict()
+     models_to_arxiv_id = create_model_to_arxiv_id_dict()
+     collection = group_collection_items(collection_slug)
+     collection_datasets = collection.get("datasets", None)
+     collection_models = collection.get("models", None)
+     dataset_papers = []
+     model_papers = []
+     if collection_datasets is not None:
+         for dataset in collection_datasets:
+             if arxiv_ids := dataset_to_arxiv_id.get(dataset.item_id, None):
+                 data = {
+                     "dataset": dataset.item_id,
+                     "arxiv_ids": arxiv_ids,
+                     "hub_paper_links": [
+                         f"https://huggingface.co/papers/{arxiv_id}"
+                         for arxiv_id in arxiv_ids
+                     ],
+                 }
+                 dataset_papers.append(data)
+     if collection_models is not None:
+         for model in collection_models:
+             if arxiv_ids := models_to_arxiv_id.get(model.item_id, None):
+                 data = {
+                     "model": model.item_id,
+                     "arxiv_ids": arxiv_ids,
+                     "hub_paper_links": [
+                         f"https://huggingface.co/papers/{arxiv_id}"
+                         for arxiv_id in arxiv_ids
+                     ],
+                 }
+                 model_papers.append(data)
+     return {"datasets": dataset_papers, "models": model_papers}
+
+
+ # Example collection slug for testing
+ url = "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"
+
+ gr.Interface(get_papers_for_collection, "text", "json").launch()
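A minimal local usage sketch (not part of this commit): it assumes app.py sits in the working directory and that the gr.Interface(...).launch() line is commented out or guarded with if __name__ == "__main__": so that importing it does not start the Gradio server. The collection slug is the example slug from app.py; the arXiv ID shown in the comments is illustrative, and the first call can take a while because it lists every model and dataset on the Hub.

# Usage sketch; assumes the .launch() call in app.py is guarded or removed,
# otherwise importing app will start the Gradio server.
from app import get_papers_for_collection

# Example collection slug taken from app.py; any public collection slug works.
papers = get_papers_for_collection(
    "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"
)

# Expected shape of the result (actual values depend on the collection):
# {
#   "datasets": [
#     {"dataset": "<repo id>", "arxiv_ids": ["2303.12345"],
#      "hub_paper_links": ["https://huggingface.co/papers/2303.12345"]}
#   ],
#   "models": [
#     {"model": "<repo id>", "arxiv_ids": ["2303.12345"],
#      "hub_paper_links": ["https://huggingface.co/papers/2303.12345"]}
#   ]
# }
print(papers)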
requirements.in ADDED
@@ -0,0 +1,5 @@
+ cachetools
+ git+https://github.com/huggingface/huggingface_hub
+ gradio
+ httpx
+ cytoolz
requirements.txt ADDED
@@ -0,0 +1,187 @@
+ #
+ # This file is autogenerated by pip-compile with Python 3.11
+ # by the following command:
+ #
+ #    pip-compile requirements.in
+ #
+ aiofiles==23.2.1
+     # via gradio
+ altair==5.1.1
+     # via gradio
+ annotated-types==0.5.0
+     # via pydantic
+ anyio==3.7.1
+     # via
+     #   fastapi
+     #   httpcore
+     #   starlette
+ attrs==23.1.0
+     # via
+     #   jsonschema
+     #   referencing
+ cachetools==5.3.1
+     # via -r requirements.in
+ certifi==2023.7.22
+     # via
+     #   httpcore
+     #   httpx
+     #   requests
+ charset-normalizer==3.2.0
+     # via requests
+ click==8.1.7
+     # via uvicorn
+ contourpy==1.1.1
+     # via matplotlib
+ cycler==0.11.0
+     # via matplotlib
+ cytoolz==0.12.2
+     # via -r requirements.in
+ fastapi==0.103.1
+     # via gradio
+ ffmpy==0.3.1
+     # via gradio
+ filelock==3.12.4
+     # via huggingface-hub
+ fonttools==4.42.1
+     # via matplotlib
+ fsspec==2023.9.2
+     # via
+     #   gradio-client
+     #   huggingface-hub
+ gradio==3.44.4
+     # via -r requirements.in
+ gradio-client==0.5.1
+     # via gradio
+ h11==0.14.0
+     # via
+     #   httpcore
+     #   uvicorn
+ httpcore==0.18.0
+     # via httpx
+ httpx==0.25.0
+     # via
+     #   -r requirements.in
+     #   gradio
+     #   gradio-client
+ huggingface-hub @ git+https://github.com/huggingface/huggingface_hub
+     # via
+     #   -r requirements.in
+     #   gradio
+     #   gradio-client
+ idna==3.4
+     # via
+     #   anyio
+     #   httpx
+     #   requests
+ importlib-resources==6.1.0
+     # via gradio
+ jinja2==3.1.2
+     # via
+     #   altair
+     #   gradio
+ jsonschema==4.19.1
+     # via altair
+ jsonschema-specifications==2023.7.1
+     # via jsonschema
+ kiwisolver==1.4.5
+     # via matplotlib
+ markupsafe==2.1.3
+     # via
+     #   gradio
+     #   jinja2
+ matplotlib==3.8.0
+     # via gradio
+ numpy==1.26.0
+     # via
+     #   altair
+     #   contourpy
+     #   gradio
+     #   matplotlib
+     #   pandas
+ orjson==3.9.7
+     # via gradio
+ packaging==23.1
+     # via
+     #   altair
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   matplotlib
+ pandas==2.1.1
+     # via
+     #   altair
+     #   gradio
+ pillow==10.0.1
+     # via
+     #   gradio
+     #   matplotlib
+ pydantic==2.4.1
+     # via
+     #   fastapi
+     #   gradio
+ pydantic-core==2.10.1
+     # via pydantic
+ pydub==0.25.1
+     # via gradio
+ pyparsing==3.1.1
+     # via matplotlib
+ python-dateutil==2.8.2
+     # via
+     #   matplotlib
+     #   pandas
+ python-multipart==0.0.6
+     # via gradio
+ pytz==2023.3.post1
+     # via pandas
+ pyyaml==6.0.1
+     # via
+     #   gradio
+     #   huggingface-hub
+ referencing==0.30.2
+     # via
+     #   jsonschema
+     #   jsonschema-specifications
+ requests==2.31.0
+     # via
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+ rpds-py==0.10.3
+     # via
+     #   jsonschema
+     #   referencing
+ semantic-version==2.10.0
+     # via gradio
+ six==1.16.0
+     # via python-dateutil
+ sniffio==1.3.0
+     # via
+     #   anyio
+     #   httpcore
+     #   httpx
+ starlette==0.27.0
+     # via fastapi
+ toolz==0.12.0
+     # via
+     #   altair
+     #   cytoolz
+ tqdm==4.66.1
+     # via huggingface-hub
+ typing-extensions==4.8.0
+     # via
+     #   fastapi
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   pydantic
+     #   pydantic-core
+ tzdata==2023.3
+     # via pandas
+ urllib3==2.0.5
+     # via requests
+ uvicorn==0.23.2
+     # via gradio
+ websockets==11.0.3
+     # via
+     #   gradio
+     #   gradio-client