davanstrien (HF staff) committed
Commit 479b4a3
• 1 Parent(s): a0915df

add paper parsing

Files changed (2)
  1. app.py +34 -12
  2. requirements.in +2 -2
app.py CHANGED
@@ -6,12 +6,13 @@ from collections import defaultdict
 import gradio as gr
 from cachetools import TTLCache, cached
 from cytoolz import groupby
-from huggingface_hub import get_collection, list_datasets, list_models
+from huggingface_hub import (CollectionItem, get_collection, list_datasets,
+                             list_models)
 from tqdm.auto import tqdm

 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 is_macos = platform.system() == "Darwin"
-LIMIT = None
+LIMIT = 1000 if is_macos else None  # limit for local dev because slooow internet
 CACHE_TIME = 60 * 5  # 5 minutes


@@ -34,13 +35,8 @@ def check_for_arxiv_id(model):


 def extract_arxiv_id(input_string: str) -> str:
-    # Define the regular expression pattern
     pattern = re.compile(r"\barxiv:(\d+\.\d+)\b")
-
-    # Search for the pattern in the input string
     match = pattern.search(input_string)
-
-    # If a match is found, return the numeric part of the ARXIV ID, else return None
     return match[1] if match else None


@@ -72,13 +68,17 @@ def create_dataset_to_arxiv_id_dict():
     return dataset_to_arxiv_id


-placeholder_url = "lunarflu/ai-podcasts-and-talks-65119866353a60593bf99c58"
+def get_collection_type(collection_item: CollectionItem):
+    try:
+        return f"{collection_item.item_type}s"
+    except AttributeError:
+        return None


 def group_collection_items(collection_slug: str):
     collection = get_collection(collection_slug)
     items = collection.items
-    return groupby(lambda x: f"{x.repoType}s", items)
+    return groupby(get_collection_type, items)


 def get_papers_for_collection(collection_slug: str):
@@ -87,8 +87,10 @@ def get_papers_for_collection(collection_slug: str):
     collection = group_collection_items(collection_slug)
     collection_datasets = collection.get("datasets", None)
     collection_models = collection.get("models", None)
+    papers = collection.get("papers", None)
     dataset_papers = defaultdict(dict)
     model_papers = defaultdict(dict)
+    collection_papers = defaultdict(dict)
     if collection_datasets is not None:
         for dataset in collection_datasets:
             if arxiv_ids := dataset_to_arxiv_id.get(dataset.item_id, None):
@@ -111,7 +113,24 @@ def get_papers_for_collection(collection_slug: str):
                 ],
             }
             model_papers[model.item_id] = data
-    return {"dataset papers": dataset_papers, "model papers": model_papers}
+    if papers is not None:
+        for paper in papers:
+            data = {
+                "arxiv_ids": paper.item_id,
+                "hub_paper_links": [f"https://huggingface.co/papers/{paper.item_id}"],
+            }
+            collection_papers[paper.item_id] = data
+    if not dataset_papers:
+        dataset_papers = None
+    if not model_papers:
+        model_papers = None
+    if not collection_papers:
+        collection_papers = None
+    return {
+        "dataset papers": dataset_papers,
+        "model papers": model_papers,
+        "papers": collection_papers,
+    }


 placeholder_url = "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"
@@ -119,8 +138,10 @@ slug_input = gr.Textbox(
     placeholder=placeholder_url, interactive=True, label="Collection slug", max_lines=1
 )
 description = (
-    "Enter a collection slug to get a list of papers associated with models and"
-    " datasets in the collection."
+    "Enter a Collection slug to get the ArXiv IDs and Hugging Face Paper links for"
+    " papers associated with models and datasets in the collection. If the collection"
+    " includes papers the ArXiv IDs and Hugging Face Paper links will be returned for"
+    " those papers as well."
 )

 examples = [
@@ -131,6 +152,7 @@ gr.Interface(
     get_papers_for_collection,
     slug_input,
     "json",
+    title="📄🔗: Extract linked papers from a Hugging Face Collection",
     description=description,
     examples=examples,
     cache_examples=True,
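
For context, here is a minimal, hypothetical sketch (not part of the commit) of the behaviour the new code adds. It assumes the placeholder collection slug shown in the diff, and uses only huggingface_hub's get_collection plus the item_type/item_id attributes that the new get_collection_type helper and paper-parsing branch rely on:

    from cytoolz import groupby
    from huggingface_hub import get_collection

    # Fetch the collection and group its items by pluralised type
    # ("models", "datasets", "papers"), as group_collection_items now does.
    slug = "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"  # placeholder from the diff
    grouped = groupby(lambda item: f"{item.item_type}s", get_collection(slug).items)

    # For "paper" items, item_id is the arXiv ID itself, so the new branch in
    # get_papers_for_collection can build a Hub paper link without extra API calls.
    for paper in grouped.get("papers", []):
        print(paper.item_id, f"https://huggingface.co/papers/{paper.item_id}")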
requirements.in CHANGED
@@ -1,5 +1,5 @@
 cachetools
+cytoolz
 git+https://github.com/huggingface/huggingface_hub
 gradio
-httpx
-cytoolz
+httpx
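
Note that requirements.in installs huggingface_hub from GitHub rather than a PyPI release, presumably (an assumption, not stated in the commit) because the collections API that app.py imports was not yet in a stable release. A hypothetical post-install sanity check:

    # Hypothetical check, not part of the commit: confirm the git install
    # exposes the collections API names that app.py imports.
    from huggingface_hub import CollectionItem, get_collection  # noqa: F401

    print("collections API available")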