Commit 310dfa4
Parent(s): 22be3a0
demo

Files changed:
- app.py +107 -0
- requirements.in +5 -0
- requirements.txt +187 -0
app.py
ADDED
@@ -0,0 +1,107 @@
from huggingface_hub import list_datasets, list_models
from cachetools import TTLCache, cached
import platform
import re
import gradio as gr
from huggingface_hub import get_collection
from cytoolz import groupby
from collections import defaultdict

is_macos = platform.system() == "Darwin"
LIMIT = None


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def get_models():
    return list(iter(list_models(full=True, limit=LIMIT)))


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def get_datasets():
    return list(iter(list_datasets(full=True, limit=LIMIT)))


def check_for_arxiv_id(model):
    return [tag for tag in model.tags if "arxiv" in tag] if model.tags else False


def extract_arxiv_id(input_string: str) -> str:
    # Define the regular expression pattern
    pattern = re.compile(r"\barxiv:(\d+\.\d+)\b")

    # Search for the pattern in the input string
    match = pattern.search(input_string)

    # If a match is found, return the numeric part of the ARXIV ID, else return None
    return match[1] if match else None


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def create_model_to_arxiv_id_dict():
    models = get_models()
    model_to_arxiv_id = {}
    for model in models:
        if arxiv_papers := check_for_arxiv_id(model):
            clean_arxiv_ids = []
            for paper in arxiv_papers:
                if arxiv_id := extract_arxiv_id(paper):
                    clean_arxiv_ids.append(arxiv_id)
            model_to_arxiv_id[model.modelId] = clean_arxiv_ids
    return model_to_arxiv_id


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
def create_dataset_to_arxiv_id_dict():
    datasets = get_datasets()
    dataset_to_arxiv_id = {}
    for dataset in datasets:
        if arxiv_papers := check_for_arxiv_id(dataset):
            clean_arxiv_ids = []
            for paper in arxiv_papers:
                if arxiv_id := extract_arxiv_id(paper):
                    clean_arxiv_ids.append(arxiv_id)
            dataset_to_arxiv_id[dataset.id] = clean_arxiv_ids
    return dataset_to_arxiv_id


url = "lunarflu/ai-podcasts-and-talks-65119866353a60593bf99c58"


def group_collection_items(collection_slug: str):
    collection = get_collection(collection_slug)
    items = collection.items
    return groupby(lambda x: f"{x.repoType}s", items)


def get_papers_for_collection(collection_slug: str):
    dataset_to_arxiv_id = create_dataset_to_arxiv_id_dict()
    models_to_arxiv_id = create_model_to_arxiv_id_dict()
    collection = group_collection_items(collection_slug)
    collection_datasets = collection.get("datasets", None)
    collection_models = collection.get("models", None)
    dataset_papers = []
    model_papers = []
    if collection_datasets is not None:
        for dataset in collection_datasets:
            if arxiv_id := dataset_to_arxiv_id.get(dataset.item_id, None):
                data = {
                    "dataset": dataset.item_id,
                    "arxiv_id": arxiv_id,
                    "hub_paper_link": f"https://huggingface.co/papers/{arxiv_id}",
                }
                dataset_papers.append(data)
    if collection_models is not None:
        for model in collection.get("models", []):
            if arxiv_id := models_to_arxiv_id.get(model.item_id, None):
                data = {
                    "model": model.item_id,
                    "arxiv_id": arxiv_id,
                    "hub_paper_link": f"https://huggingface.co/papers/{arxiv_id}",
                }
                model_papers.append(data)
    return {"datasets": dataset_papers, "models": model_papers}


url = "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"

gr.Interface(get_papers_for_collection, "text", "json").launch()
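A minimal, self-contained sketch of the tag parsing that extract_arxiv_id performs above. The regex is reproduced inline rather than imported, because importing app.py would also run the .launch() call at the bottom of the file; the example tags are made up for illustration.

import re

# same pattern as extract_arxiv_id: capture the numeric part of tags like "arxiv:2106.09685"
pattern = re.compile(r"\barxiv:(\d+\.\d+)\b")

for tag in ["arxiv:2106.09685", "license:apache-2.0"]:
    match = pattern.search(tag)
    print(tag, "->", match[1] if match else None)
# arxiv:2106.09685 -> 2106.09685
# license:apache-2.0 -> None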
requirements.in
ADDED
@@ -0,0 +1,5 @@
cachetools
git+https://github.com/huggingface/huggingface_hub
gradio
httpx
cytoolz
requirements.txt
ADDED
@@ -0,0 +1,187 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
#    pip-compile requirements.in
#
aiofiles==23.2.1
    # via gradio
altair==5.1.1
    # via gradio
annotated-types==0.5.0
    # via pydantic
anyio==3.7.1
    # via
    #   fastapi
    #   httpcore
    #   starlette
attrs==23.1.0
    # via
    #   jsonschema
    #   referencing
cachetools==5.3.1
    # via -r requirements.in
certifi==2023.7.22
    # via
    #   httpcore
    #   httpx
    #   requests
charset-normalizer==3.2.0
    # via requests
click==8.1.7
    # via uvicorn
contourpy==1.1.1
    # via matplotlib
cycler==0.11.0
    # via matplotlib
cytoolz==0.12.2
    # via -r requirements.in
fastapi==0.103.1
    # via gradio
ffmpy==0.3.1
    # via gradio
filelock==3.12.4
    # via huggingface-hub
fonttools==4.42.1
    # via matplotlib
fsspec==2023.9.2
    # via
    #   gradio-client
    #   huggingface-hub
gradio==3.44.4
    # via -r requirements.in
gradio-client==0.5.1
    # via gradio
h11==0.14.0
    # via
    #   httpcore
    #   uvicorn
httpcore==0.18.0
    # via httpx
httpx==0.25.0
    # via
    #   -r requirements.in
    #   gradio
    #   gradio-client
huggingface-hub @ git+https://github.com/huggingface/huggingface_hub
    # via
    #   -r requirements.in
    #   gradio
    #   gradio-client
idna==3.4
    # via
    #   anyio
    #   httpx
    #   requests
importlib-resources==6.1.0
    # via gradio
jinja2==3.1.2
    # via
    #   altair
    #   gradio
jsonschema==4.19.1
    # via altair
jsonschema-specifications==2023.7.1
    # via jsonschema
kiwisolver==1.4.5
    # via matplotlib
markupsafe==2.1.3
    # via
    #   gradio
    #   jinja2
matplotlib==3.8.0
    # via gradio
numpy==1.26.0
    # via
    #   altair
    #   contourpy
    #   gradio
    #   matplotlib
    #   pandas
orjson==3.9.7
    # via gradio
packaging==23.1
    # via
    #   altair
    #   gradio
    #   gradio-client
    #   huggingface-hub
    #   matplotlib
pandas==2.1.1
    # via
    #   altair
    #   gradio
pillow==10.0.1
    # via
    #   gradio
    #   matplotlib
pydantic==2.4.1
    # via
    #   fastapi
    #   gradio
pydantic-core==2.10.1
    # via pydantic
pydub==0.25.1
    # via gradio
pyparsing==3.1.1
    # via matplotlib
python-dateutil==2.8.2
    # via
    #   matplotlib
    #   pandas
python-multipart==0.0.6
    # via gradio
pytz==2023.3.post1
    # via pandas
pyyaml==6.0.1
    # via
    #   gradio
    #   huggingface-hub
referencing==0.30.2
    # via
    #   jsonschema
    #   jsonschema-specifications
requests==2.31.0
    # via
    #   gradio
    #   gradio-client
    #   huggingface-hub
rpds-py==0.10.3
    # via
    #   jsonschema
    #   referencing
semantic-version==2.10.0
    # via gradio
six==1.16.0
    # via python-dateutil
sniffio==1.3.0
    # via
    #   anyio
    #   httpcore
    #   httpx
starlette==0.27.0
    # via fastapi
toolz==0.12.0
    # via
    #   altair
    #   cytoolz
tqdm==4.66.1
    # via huggingface-hub
typing-extensions==4.8.0
    # via
    #   fastapi
    #   gradio
    #   gradio-client
    #   huggingface-hub
    #   pydantic
    #   pydantic-core
tzdata==2023.3
    # via pandas
urllib3==2.0.5
    # via requests
uvicorn==0.23.2
    # via gradio
websockets==11.0.3
    # via
    #   gradio
    #   gradio-client
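Two of the direct dependencies, cachetools and cytoolz, carry the core pattern of app.py: Hub listings are memoized for 30 minutes and collection items are grouped by repo type. A minimal, self-contained sketch of that combination, using made-up SimpleNamespace stand-ins instead of real Hub objects:

from types import SimpleNamespace

from cachetools import TTLCache, cached
from cytoolz import groupby


@cached(cache=TTLCache(maxsize=100, ttl=60 * 30))  # reuse the result for 30 minutes
def expensive_listing():
    # stand-in for list_models() / list_datasets(), which are slow with limit=None
    return [
        SimpleNamespace(repoType="model", item_id="org/model-a"),
        SimpleNamespace(repoType="dataset", item_id="org/dataset-b"),
    ]


# same grouping key as group_collection_items: {"models": [...], "datasets": [...]}
grouped = groupby(lambda x: f"{x.repoType}s", expensive_listing())
print(sorted(grouped))  # ['datasets', 'models']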