davanstrien (HF staff) committed
Commit 212995a • 1 Parent(s): adf5d9a

schedule refreshing of data

Files changed (3)
  1. app.py +14 -1
  2. requirements.in +2 -2
  3. requirements.txt +11 -4
app.py CHANGED
@@ -8,20 +8,25 @@ from cachetools import TTLCache, cached
 from cytoolz import groupby
 from huggingface_hub import CollectionItem, get_collection, list_datasets, list_models
 from tqdm.auto import tqdm
+from apscheduler.schedulers.background import BackgroundScheduler
+from apscheduler.triggers.cron import CronTrigger
 
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 is_macos = platform.system() == "Darwin"
+local = platform.system() == "Darwin"
 LIMIT = 1000 if is_macos else None  # limit for local dev because slooow internet
-CACHE_TIME = 60 * 5  # 5 minutes
+CACHE_TIME = 60 * 15  # 15 minutes
 
 
 @cached(cache=TTLCache(maxsize=100, ttl=CACHE_TIME))
 def get_models():
+    print("getting models...")
     return list(tqdm(iter(list_models(full=True, limit=LIMIT))))
 
 
 @cached(cache=TTLCache(maxsize=100, ttl=CACHE_TIME))
 def get_datasets():
+    print("getting datasets...")
     return list(tqdm(iter(list_datasets(full=True, limit=LIMIT))))
 
 
@@ -79,6 +84,7 @@ def group_collection_items(collection_slug: str):
     items = collection.items
     return groupby(get_collection_type, items)
 
+
 @cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
 def get_papers_for_collection(collection_slug: str):
     dataset_to_arxiv_id = create_dataset_to_arxiv_id_dict()
@@ -132,6 +138,11 @@ def get_papers_for_collection(collection_slug: str):
     }
 
 
+scheduler = BackgroundScheduler()
+scheduler.add_job(get_datasets, "interval", minutes=15)
+scheduler.add_job(get_models, "interval", minutes=15)
+scheduler.start()
+
 placeholder_url = "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"
 slug_input = gr.Textbox(
     placeholder=placeholder_url, interactive=True, label="Collection slug", max_lines=1
@@ -147,6 +158,8 @@ examples = [
     placeholder_url,
     "davanstrien/historic-language-modeling-64f99e243188ade79d7ad74b",
 ]
+
+
 gr.Interface(
     get_papers_for_collection,
     slug_input,
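
For context, the commit pairs the existing cachetools TTL cache with an APScheduler background job, so the expensive Hub listings are re-fetched on roughly the same interval as the cache TTL. A minimal self-contained sketch of that pattern follows; get_data, its dummy return value, and the print message are illustrative stand-ins, not code from this repo:

# Sketch: a TTL-cached fetch kept warm by a background scheduler.
from cachetools import TTLCache, cached
from apscheduler.schedulers.background import BackgroundScheduler

CACHE_TIME = 60 * 15  # seconds; the same 15-minute TTL the commit sets

@cached(cache=TTLCache(maxsize=100, ttl=CACHE_TIME))
def get_data():
    # stand-in for a slow call such as list_models(full=True)
    print("fetching fresh data...")
    return ["item-1", "item-2"]

scheduler = BackgroundScheduler()
# Re-invoke the cached function every 15 minutes; when the TTL entry
# has expired, this run repopulates the cache in the background.
scheduler.add_job(get_data, "interval", minutes=15)
scheduler.start()

get_data()  # first call fills the cache; calls within the TTL return instantly

Because the scheduled job calls the same @cached function, a run that finds a still-valid entry is effectively a no-op; once the 15-minute TTL lapses, the next run re-fetches, so user requests rarely pay the cost of a cold cache.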
requirements.in CHANGED
@@ -1,5 +1,5 @@
+apscheduler
 cachetools
 cytoolz
 git+https://github.com/huggingface/huggingface_hub
-gradio
-httpx
+gradio
requirements.txt CHANGED
@@ -2,7 +2,7 @@
 # This file is autogenerated by pip-compile with Python 3.11
 # by the following command:
 #
-#    pip-compile requirements.in
+#    pip-compile
 #
 aiofiles==23.2.1
     # via gradio
@@ -15,6 +15,8 @@ anyio==3.7.1
     #   fastapi
     #   httpcore
     #   starlette
+apscheduler==3.10.4
+    # via -r requirements.in
 attrs==23.1.0
     # via
     #   jsonschema
@@ -60,7 +62,6 @@ httpcore==0.18.0
     # via httpx
 httpx==0.25.0
     # via
-    #   -r requirements.in
     #   gradio
     #   gradio-client
 huggingface-hub @ git+https://github.com/huggingface/huggingface_hub
@@ -132,7 +133,9 @@ python-dateutil==2.8.2
 python-multipart==0.0.6
     # via gradio
 pytz==2023.3.post1
-    # via pandas
+    # via
+    #   apscheduler
+    #   pandas
 pyyaml==6.0.1
     # via
     #   gradio
@@ -153,7 +156,9 @@ rpds-py==0.10.3
 semantic-version==2.10.0
     # via gradio
 six==1.16.0
-    # via python-dateutil
+    # via
+    #   apscheduler
+    #   python-dateutil
 sniffio==1.3.0
     # via
     #   anyio
@@ -177,6 +182,8 @@ typing-extensions==4.8.0
     #   pydantic-core
 tzdata==2023.3
     # via pandas
+tzlocal==5.0.1
+    # via apscheduler
 urllib3==2.0.5
     # via requests
 uvicorn==0.23.2