davanstrien HF staff committed on
Commit
310dfa4
•
1 Parent(s): 22be3a0
Files changed (3)
  1. app.py +107 -0
  2. requirements.in +5 -0
  3. requirements.txt +187 -0
app.py ADDED
@@ -0,0 +1,107 @@
+ from huggingface_hub import list_datasets, list_models
+ from cachetools import TTLCache, cached
+ import platform
+ import re
+ import gradio as gr
+ from huggingface_hub import get_collection
+ from cytoolz import groupby
+ from collections import defaultdict
+
+ is_macos = platform.system() == "Darwin"
+ LIMIT = None  # set to an int to cap how many repos are fetched from the Hub
+
+
+ @cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
+ def get_models():
+     return list(iter(list_models(full=True, limit=LIMIT)))
+
+
+ @cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
+ def get_datasets():
+     return list(iter(list_datasets(full=True, limit=LIMIT)))
+
+
+ def check_for_arxiv_id(model):
+     # Return any "arxiv:<id>" tags attached to a repo, or an empty list if there are none
+     return [tag for tag in model.tags if "arxiv" in tag] if model.tags else []
+
+
+ def extract_arxiv_id(input_string: str) -> str | None:
+     # Match tags of the form "arxiv:2302.13971" and capture the numeric part
+     pattern = re.compile(r"\barxiv:(\d+\.\d+)\b")
+
+     # Search for the pattern in the input string
+     match = pattern.search(input_string)
+
+     # If a match is found, return the numeric part of the arXiv ID, else return None
+     return match[1] if match else None
+
+
+ @cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
+ def create_model_to_arxiv_id_dict():
+     models = get_models()
+     model_to_arxiv_id = {}
+     for model in models:
+         if arxiv_papers := check_for_arxiv_id(model):
+             clean_arxiv_ids = []
+             for paper in arxiv_papers:
+                 if arxiv_id := extract_arxiv_id(paper):
+                     clean_arxiv_ids.append(arxiv_id)
+             model_to_arxiv_id[model.modelId] = clean_arxiv_ids
+     return model_to_arxiv_id
+
+
+ @cached(cache=TTLCache(maxsize=100, ttl=60 * 30))
+ def create_dataset_to_arxiv_id_dict():
+     datasets = get_datasets()
+     dataset_to_arxiv_id = {}
+     for dataset in datasets:
+         if arxiv_papers := check_for_arxiv_id(dataset):
+             clean_arxiv_ids = []
+             for paper in arxiv_papers:
+                 if arxiv_id := extract_arxiv_id(paper):
+                     clean_arxiv_ids.append(arxiv_id)
+             dataset_to_arxiv_id[dataset.id] = clean_arxiv_ids
+     return dataset_to_arxiv_id
+
+
+ # Example collection slug for testing
+ url = "lunarflu/ai-podcasts-and-talks-65119866353a60593bf99c58"
+
+
+ def group_collection_items(collection_slug: str):
+     collection = get_collection(collection_slug)
+     items = collection.items
+     # Group collection items into "models", "datasets", etc. keyed by repo type
+     return groupby(lambda x: f"{x.repoType}s", items)
+
+
+ def get_papers_for_collection(collection_slug: str):
+     dataset_to_arxiv_id = create_dataset_to_arxiv_id_dict()
+     models_to_arxiv_id = create_model_to_arxiv_id_dict()
+     collection = group_collection_items(collection_slug)
+     collection_datasets = collection.get("datasets", None)
+     collection_models = collection.get("models", None)
+     dataset_papers = []
+     model_papers = []
+     if collection_datasets is not None:
+         for dataset in collection_datasets:
+             if arxiv_ids := dataset_to_arxiv_id.get(dataset.item_id, None):
+                 data = {
+                     "dataset": dataset.item_id,
+                     "arxiv_ids": arxiv_ids,
+                     "hub_paper_links": [
+                         f"https://huggingface.co/papers/{arxiv_id}"
+                         for arxiv_id in arxiv_ids
+                     ],
+                 }
+                 dataset_papers.append(data)
+     if collection_models is not None:
+         for model in collection_models:
+             if arxiv_ids := models_to_arxiv_id.get(model.item_id, None):
+                 data = {
+                     "model": model.item_id,
+                     "arxiv_ids": arxiv_ids,
+                     "hub_paper_links": [
+                         f"https://huggingface.co/papers/{arxiv_id}"
+                         for arxiv_id in arxiv_ids
+                     ],
+                 }
+                 model_papers.append(data)
+     return {"datasets": dataset_papers, "models": model_papers}
+
+
+ # Example collection slug for testing
+ url = "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"
+
+ gr.Interface(get_papers_for_collection, "text", "json").launch()
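A minimal local usage sketch (not part of this commit): it assumes app.py sits in the working directory and that the gr.Interface(...).launch() line is commented out or guarded with if __name__ == "__main__": so that importing it does not start the Gradio server. The collection slug is the example slug from app.py; the arXiv ID shown in the comments is illustrative, and the first call can take a while because it lists every model and dataset on the Hub.

# Usage sketch; assumes the .launch() call in app.py is guarded or removed,
# otherwise importing app will start the Gradio server.
from app import get_papers_for_collection

# Example collection slug taken from app.py; any public collection slug works.
papers = get_papers_for_collection(
    "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"
)

# Expected shape of the result (actual values depend on the collection):
# {
#   "datasets": [
#     {"dataset": "<repo id>", "arxiv_ids": ["2303.12345"],
#      "hub_paper_links": ["https://huggingface.co/papers/2303.12345"]}
#   ],
#   "models": [
#     {"model": "<repo id>", "arxiv_ids": ["2303.12345"],
#      "hub_paper_links": ["https://huggingface.co/papers/2303.12345"]}
#   ]
# }
print(papers)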
requirements.in ADDED
@@ -0,0 +1,5 @@
+ cachetools
+ git+https://github.com/huggingface/huggingface_hub
+ gradio
+ httpx
+ cytoolz
requirements.txt ADDED
@@ -0,0 +1,187 @@
+ #
+ # This file is autogenerated by pip-compile with Python 3.11
+ # by the following command:
+ #
+ #    pip-compile requirements.in
+ #
+ aiofiles==23.2.1
+     # via gradio
+ altair==5.1.1
+     # via gradio
+ annotated-types==0.5.0
+     # via pydantic
+ anyio==3.7.1
+     # via
+     #   fastapi
+     #   httpcore
+     #   starlette
+ attrs==23.1.0
+     # via
+     #   jsonschema
+     #   referencing
+ cachetools==5.3.1
+     # via -r requirements.in
+ certifi==2023.7.22
+     # via
+     #   httpcore
+     #   httpx
+     #   requests
+ charset-normalizer==3.2.0
+     # via requests
+ click==8.1.7
+     # via uvicorn
+ contourpy==1.1.1
+     # via matplotlib
+ cycler==0.11.0
+     # via matplotlib
+ cytoolz==0.12.2
+     # via -r requirements.in
+ fastapi==0.103.1
+     # via gradio
+ ffmpy==0.3.1
+     # via gradio
+ filelock==3.12.4
+     # via huggingface-hub
+ fonttools==4.42.1
+     # via matplotlib
+ fsspec==2023.9.2
+     # via
+     #   gradio-client
+     #   huggingface-hub
+ gradio==3.44.4
+     # via -r requirements.in
+ gradio-client==0.5.1
+     # via gradio
+ h11==0.14.0
+     # via
+     #   httpcore
+     #   uvicorn
+ httpcore==0.18.0
+     # via httpx
+ httpx==0.25.0
+     # via
+     #   -r requirements.in
+     #   gradio
+     #   gradio-client
+ huggingface-hub @ git+https://github.com/huggingface/huggingface_hub
+     # via
+     #   -r requirements.in
+     #   gradio
+     #   gradio-client
+ idna==3.4
+     # via
+     #   anyio
+     #   httpx
+     #   requests
+ importlib-resources==6.1.0
+     # via gradio
+ jinja2==3.1.2
+     # via
+     #   altair
+     #   gradio
+ jsonschema==4.19.1
+     # via altair
+ jsonschema-specifications==2023.7.1
+     # via jsonschema
+ kiwisolver==1.4.5
+     # via matplotlib
+ markupsafe==2.1.3
+     # via
+     #   gradio
+     #   jinja2
+ matplotlib==3.8.0
+     # via gradio
+ numpy==1.26.0
+     # via
+     #   altair
+     #   contourpy
+     #   gradio
+     #   matplotlib
+     #   pandas
+ orjson==3.9.7
+     # via gradio
+ packaging==23.1
+     # via
+     #   altair
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   matplotlib
+ pandas==2.1.1
+     # via
+     #   altair
+     #   gradio
+ pillow==10.0.1
+     # via
+     #   gradio
+     #   matplotlib
+ pydantic==2.4.1
+     # via
+     #   fastapi
+     #   gradio
+ pydantic-core==2.10.1
+     # via pydantic
+ pydub==0.25.1
+     # via gradio
+ pyparsing==3.1.1
+     # via matplotlib
+ python-dateutil==2.8.2
+     # via
+     #   matplotlib
+     #   pandas
+ python-multipart==0.0.6
+     # via gradio
+ pytz==2023.3.post1
+     # via pandas
+ pyyaml==6.0.1
+     # via
+     #   gradio
+     #   huggingface-hub
+ referencing==0.30.2
+     # via
+     #   jsonschema
+     #   jsonschema-specifications
+ requests==2.31.0
+     # via
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+ rpds-py==0.10.3
+     # via
+     #   jsonschema
+     #   referencing
+ semantic-version==2.10.0
+     # via gradio
+ six==1.16.0
+     # via python-dateutil
+ sniffio==1.3.0
+     # via
+     #   anyio
+     #   httpcore
+     #   httpx
+ starlette==0.27.0
+     # via fastapi
+ toolz==0.12.0
+     # via
+     #   altair
+     #   cytoolz
+ tqdm==4.66.1
+     # via huggingface-hub
+ typing-extensions==4.8.0
+     # via
+     #   fastapi
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   pydantic
+     #   pydantic-core
+ tzdata==2023.3
+     # via pandas
+ urllib3==2.0.5
+     # via requests
+ uvicorn==0.23.2
+     # via gradio
+ websockets==11.0.3
+     # via
+     #   gradio
+     #   gradio-client