hysts HF staff committed on
Commit
b4eb3ca
1 Parent(s): 2716ba4

Add scheduler

Browse files
Files changed (3) hide show
  1. app.py +12 -1
  2. requirements.txt +3 -1
  3. update_scheduler.py +114 -0
app.py CHANGED
@@ -1,12 +1,23 @@
1
  #!/usr/bin/env python
2
 
 
 
3
  import gradio as gr
4
 
5
  from papers import PaperList, get_df
 
 
 
6
 
7
  paper_list = PaperList(get_df('papers.csv'))
8
 
9
- DESCRIPTION = '''# list of [Daily Papers](https://huggingface.co/papers)'''
 
 
 
 
 
 
10
 
11
  with gr.Blocks(css='style.css') as demo:
12
  gr.Markdown(DESCRIPTION)
 
1
  #!/usr/bin/env python
2
 
3
+ import os
4
+
5
  import gradio as gr
6
 
7
  from papers import PaperList, get_df
8
+ from update_scheduler import UpdateScheduler
9
+
10
+ DESCRIPTION = '''# list of [Daily Papers](https://huggingface.co/papers)'''
11
 
12
  paper_list = PaperList(get_df('papers.csv'))
13
 
14
# Start the periodic update job only when a SPACE_ID is present in the
# environment (presumably set by the hosting Space -- confirm).
SPACE_ID = os.getenv('SPACE_ID')
if SPACE_ID is not None:
    # Cron fields for the update job; the defaults are hour '*/4', minute '0'.
    CRON_HOUR = os.getenv('CRON_HOUR', '*/4')
    CRON_MINUTE = os.getenv('CRON_MINUTE', '0')
    scheduler = UpdateScheduler(
        space_id=SPACE_ID,
        cron_hour=CRON_HOUR,
        cron_minute=CRON_MINUTE,
    )
    scheduler.start()
21
 
22
  with gr.Blocks(css='style.css') as demo:
23
  gr.Markdown(DESCRIPTION)
requirements.txt CHANGED
@@ -1,4 +1,6 @@
 
1
  gradio==3.39.0
 
2
  pandas==2.0.3
3
  requests==2.31.0
4
- tqdm==4.65.0
 
1
+ apscheduler==3.10.3
2
  gradio==3.39.0
3
+ huggingface_hub==0.16.4
4
  pandas==2.0.3
5
  requests==2.31.0
6
+ tqdm==4.66.1
update_scheduler.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import pathlib
3
+ import re
4
+ import tempfile
5
+
6
+ import pandas as pd
7
+ import requests
8
+ from apscheduler.schedulers.background import BackgroundScheduler
9
+ from huggingface_hub import HfApi, Repository
10
+ from huggingface_hub.utils import RepositoryNotFoundError
11
+
12
+
13
class SpaceRestarter:
    """Restart a Hugging Face Space through the Hub API."""

    def __init__(self, space_id: str):
        """Validate the token and the Space before storing the Space ID.

        Args:
            space_id: ID of the Space to restart.

        Raises:
            ValueError: If the token's permission is not ``'write'`` or the
                Space cannot be found.
        """
        self.api = HfApi()
        if self.api.get_token_permission() != 'write':
            raise ValueError('The HF token must have write permission.')
        try:
            self.api.space_info(repo_id=space_id)
        except RepositoryNotFoundError as err:
            # Chain explicitly so the traceback shows the Hub error as the
            # cause instead of an incidental "during handling" exception.
            raise ValueError('The Space ID does not exist.') from err
        self.space_id = space_id

    def restart(self) -> None:
        """Ask the Hub to restart the Space given at construction time."""
        self.api.restart_space(self.space_id)
26
+
27
+
28
# A GitHub repository URL, optionally followed by one
# ``/tree/<ref>/<segment>`` or ``/blob/<ref>/<segment>`` suffix.
# Whitespace of any kind ends the URL, so a link at the end of a line does not
# swallow the newline and the first word of the next line.
_GITHUB_LINK_PATTERN = re.compile(
    r'https://github\.com/[^/\s]+/[^/\s)},]+'
    r'(?:/(?:tree|blob)/[^/\s]+/[^/\s)},]+)?')


def find_github_links(summary: str) -> str:
    """Return the single GitHub link mentioned in ``summary``.

    Trailing periods (sentence punctuation) are stripped from each match, and
    repeated mentions of the same link count as one.

    Args:
        summary: Free text to scan, e.g. a paper abstract.

    Returns:
        The link, or ``''`` when the text contains no GitHub link.

    Raises:
        RuntimeError: If more than one distinct link is found.
    """
    matches = _GITHUB_LINK_PATTERN.findall(summary)
    # Normalise before de-duplicating so 'repo' and 'repo.' compare equal;
    # dict.fromkeys keeps first-seen order for a stable error message.
    links = list(dict.fromkeys(match.rstrip('.') for match in matches))
    if not links:
        return ''
    if len(links) != 1:
        raise RuntimeError(f'Found multiple GitHub links: {links}')
    return links[0]
41
+
42
+
43
class RepoUpdater:
    """Maintain a local clone of a Hub repo and add new papers to its CSV.

    The clone is placed in the system temporary directory, in a folder named
    after the last component of the repository ID.
    """

    # Seconds to wait on the daily-papers API so that a stalled connection
    # cannot block the calling thread indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, repo_id: str, repo_type: str):
        """Clone ``repo_id`` locally and pull the latest state.

        Args:
            repo_id: Hub repository ID; only the part after the last ``/``
                is used for the local folder name.
            repo_type: Repository type forwarded to ``Repository``.
        """
        api = HfApi()
        name = api.whoami()['name']

        # `tempfile.tempdir` is None until the tempfile module has been used,
        # which would make `pathlib.Path(...)` raise; `gettempdir()` always
        # returns a usable directory.
        self.repo_dir = pathlib.Path(
            tempfile.gettempdir()) / repo_id.split('/')[-1]
        self.repo = Repository(
            local_dir=self.repo_dir,
            clone_from=repo_id,
            repo_type=repo_type,
            git_user=name,
            git_email=f'{name}@users.noreply.huggingface.co')
        self.repo.git_pull()

    @staticmethod
    def _fetch_daily_papers(date: str) -> list:
        """Return the daily-papers API entries for ``date`` (``YYYY-MM-DD``).

        Raises:
            requests.RequestException: On a timeout, a connection problem, or
                an error HTTP status (instead of parsing an error payload as
                if it were a list of papers).
        """
        response = requests.get(
            f'https://huggingface.co/api/daily_papers?date={date}',
            timeout=RepoUpdater.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()

    def update(self) -> None:
        """Append yesterday's and today's unseen papers to ``papers.csv``.

        Papers whose summary contains several distinct GitHub links are
        reported with ``print`` and skipped.
        """
        # Take a single UTC timestamp: both dates derive from the same
        # instant and do not depend on the host's local timezone.
        now = datetime.datetime.now(datetime.timezone.utc)
        yesterday = (now - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        today = now.strftime('%Y-%m-%d')
        daily_papers = (self._fetch_daily_papers(yesterday) +
                        self._fetch_daily_papers(today))

        self.repo.git_pull()
        csv_path = self.repo_dir / 'papers.csv'
        df = pd.read_csv(csv_path, dtype=str).fillna('')
        known_ids = set(df['arxiv_id'])

        new_rows = []
        for paper in daily_papers:
            arxiv_id = paper['paper']['id']
            if arxiv_id in known_ids:
                continue
            try:
                github = find_github_links(paper['paper']['summary'])
            except RuntimeError as e:
                print(e)
                continue
            new_rows.append({'arxiv_id': arxiv_id, 'github': github})
            # Remember the ID so a paper listed under both dates is only
            # appended once.
            known_ids.add(arxiv_id)

        # Concatenate only when there is something to add; this keeps the
        # header intact even when the CSV has no data rows yet.
        if new_rows:
            df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
        df.to_csv(csv_path, index=False)

    def push(self) -> None:
        """Push the local clone's changes to the Hub."""
        self.repo.push_to_hub()
91
+
92
+
93
class UpdateScheduler:
    """Run the paper-list update as a cron job on a background scheduler."""

    def __init__(self, space_id: str, cron_hour: str, cron_minute: str):
        """Prepare the helpers and register the cron job (not yet started).

        Args:
            space_id: Space ID, used both as the Space to restart and as the
                repository to update.
            cron_hour: Cron expression for the hour field.
            cron_minute: Cron expression for the minute field.
        """
        self.space_restarter = SpaceRestarter(space_id=space_id)
        self.repo_updater = RepoUpdater(repo_id=space_id, repo_type='space')

        # The job fires at second 0 of every matching minute, evaluated in UTC.
        cron_fields = dict(hour=cron_hour,
                           minute=cron_minute,
                           second=0,
                           timezone='UTC')
        self.scheduler = BackgroundScheduler()
        self.scheduler.add_job(func=self._update,
                               trigger='cron',
                               **cron_fields)

    def _update(self) -> None:
        """Refresh the repo, then push it if it changed, else restart the Space."""
        updater = self.repo_updater
        updater.update()
        if not updater.repo.is_repo_clean():
            updater.push()
            return
        # Nothing to push: restart the Space instead.
        # NOTE(review): the reason for restarting on a clean repo is not
        # visible here -- confirm the intent.
        self.space_restarter.restart()

    def start(self) -> None:
        """Start the background scheduler."""
        self.scheduler.start()