davanstrien (HF staff) committed
Commit 479b4a3
• 1 Parent(s): a0915df

add paper parsing

Files changed (2)
  1. app.py +34 -12
  2. requirements.in +2 -2
app.py CHANGED
@@ -6,12 +6,13 @@ from collections import defaultdict
 import gradio as gr
 from cachetools import TTLCache, cached
 from cytoolz import groupby
-from huggingface_hub import get_collection, list_datasets, list_models
+from huggingface_hub import (CollectionItem, get_collection, list_datasets,
+                             list_models)
 from tqdm.auto import tqdm

 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 is_macos = platform.system() == "Darwin"
-LIMIT = None
+LIMIT = 1000 if is_macos else None  # limit for local dev because slooow internet
 CACHE_TIME = 60 * 5  # 5 minutes


@@ -34,13 +35,8 @@ def check_for_arxiv_id(model):


 def extract_arxiv_id(input_string: str) -> str:
-    # Define the regular expression pattern
     pattern = re.compile(r"\barxiv:(\d+\.\d+)\b")
-
-    # Search for the pattern in the input string
     match = pattern.search(input_string)
-
-    # If a match is found, return the numeric part of the ARXIV ID, else return None
     return match[1] if match else None


@@ -72,13 +68,17 @@ def create_dataset_to_arxiv_id_dict():
     return dataset_to_arxiv_id


-placeholder_url = "lunarflu/ai-podcasts-and-talks-65119866353a60593bf99c58"
+def get_collection_type(collection_item: CollectionItem):
+    try:
+        return f"{collection_item.item_type}s"
+    except AttributeError:
+        return None


 def group_collection_items(collection_slug: str):
     collection = get_collection(collection_slug)
     items = collection.items
-    return groupby(lambda x: f"{x.repoType}s", items)
+    return groupby(get_collection_type, items)


 def get_papers_for_collection(collection_slug: str):
@@ -87,8 +87,10 @@ def get_papers_for_collection(collection_slug: str):
     collection = group_collection_items(collection_slug)
     collection_datasets = collection.get("datasets", None)
     collection_models = collection.get("models", None)
+    papers = collection.get("papers", None)
     dataset_papers = defaultdict(dict)
     model_papers = defaultdict(dict)
+    collection_papers = defaultdict(dict)
     if collection_datasets is not None:
         for dataset in collection_datasets:
             if arxiv_ids := dataset_to_arxiv_id.get(dataset.item_id, None):
@@ -111,7 +113,24 @@ def get_papers_for_collection(collection_slug: str):
                 ],
             }
             model_papers[model.item_id] = data
-    return {"dataset papers": dataset_papers, "model papers": model_papers}
+    if papers is not None:
+        for paper in papers:
+            data = {
+                "arxiv_ids": paper.item_id,
+                "hub_paper_links": [f"https://huggingface.co/papers/{paper.item_id}"],
+            }
+            collection_papers[paper.item_id] = data
+    if not dataset_papers:
+        dataset_papers = None
+    if not model_papers:
+        model_papers = None
+    if not collection_papers:
+        collection_papers = None
+    return {
+        "dataset papers": dataset_papers,
+        "model papers": model_papers,
+        "papers": collection_papers,
+    }


 placeholder_url = "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"
@@ -119,8 +138,10 @@ slug_input = gr.Textbox(
     placeholder=placeholder_url, interactive=True, label="Collection slug", max_lines=1
 )
 description = (
-    "Enter a collection slug to get a list of papers associated with models and"
-    " datasets in the collection."
+    "Enter a Collection slug to get the ArXiv IDs and Hugging Face Paper links for"
+    " papers associated with models and datasets in the collection. If the collection"
+    " includes papers the ArXiv IDs and Hugging Face Paper links will be returned for"
+    " those papers as well."
 )

 examples = [
@@ -131,6 +152,7 @@ gr.Interface(
     get_papers_for_collection,
     slug_input,
     "json",
+    title="📄🔗: Extract linked papers from a Hugging Face Collection",
     description=description,
     examples=examples,
     cache_examples=True,
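
For context, here is a minimal, hypothetical sketch (not part of the commit) of the behaviour the new code adds. It assumes the placeholder collection slug shown in the diff, and uses only huggingface_hub's get_collection plus the item_type/item_id attributes that the new get_collection_type helper and paper-parsing branch rely on:

    from cytoolz import groupby
    from huggingface_hub import get_collection

    # Fetch the collection and group its items by pluralised type
    # ("models", "datasets", "papers"), as group_collection_items now does.
    slug = "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"  # placeholder from the diff
    grouped = groupby(lambda item: f"{item.item_type}s", get_collection(slug).items)

    # For "paper" items, item_id is the arXiv ID itself, so the new branch in
    # get_papers_for_collection can build a Hub paper link without extra API calls.
    for paper in grouped.get("papers", []):
        print(paper.item_id, f"https://huggingface.co/papers/{paper.item_id}")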
requirements.in CHANGED
@@ -1,5 +1,5 @@
 cachetools
+cytoolz
 git+https://github.com/huggingface/huggingface_hub
 gradio
-httpx
-cytoolz
+httpx
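
Note that requirements.in installs huggingface_hub from GitHub rather than a PyPI release, presumably (an assumption, not stated in the commit) because the collections API that app.py imports was not yet in a stable release. A hypothetical post-install sanity check:

    # Hypothetical check, not part of the commit: confirm the git install
    # exposes the collections API names that app.py imports.
    from huggingface_hub import CollectionItem, get_collection  # noqa: F401

    print("collections API available")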