import datetime
import pathlib
import re
import tempfile

import pandas as pd
import requests
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi, Repository
from huggingface_hub.utils import RepositoryNotFoundError

# Seconds to wait on the daily-papers endpoint; without a timeout a stalled
# connection would block the scheduler job indefinitely.
_REQUEST_TIMEOUT = 30

# GitHub repository URL, optionally followed by one /tree/<ref>/<path> or
# /blob/<ref>/<path> component. Whitespace of any kind terminates a segment
# so that a link followed by a newline does not swallow the next word.
_GITHUB_LINK_PATTERN = re.compile(
    r'https://github\.com/[^/]+/[^/)}\s,]+'
    r'(?:/(?:tree|blob)/[^/]+/[^/)}\s,]+)?')


class SpaceRestarter:
    """Restart a Hugging Face Space through ``HfApi``."""

    def __init__(self, space_id: str):
        """Validate the token and the Space, then remember the Space ID.

        Args:
            space_id: ID of the Space to restart.

        Raises:
            ValueError: If the token permission is not ``'write'`` or the
                Space lookup raises ``RepositoryNotFoundError``.
        """
        self.api = HfApi()
        if self.api.get_token_permission() != 'write':
            raise ValueError('The HF token must have write permission.')
        try:
            self.api.space_info(repo_id=space_id)
        except RepositoryNotFoundError as e:
            # Chain the cause so the original Hub error stays visible.
            raise ValueError('The Space ID does not exist.') from e
        self.space_id = space_id

    def restart(self) -> None:
        """Ask the Hub to restart the configured Space."""
        self.api.restart_space(self.space_id)


def find_github_links(summary: str) -> str:
    """Extract the single GitHub link mentioned in a paper summary.

    Args:
        summary: Free text to search.

    Returns:
        The matched URL with one trailing period removed, or ``''`` when
        the text contains no GitHub link.

    Raises:
        RuntimeError: If more than one link is found.
    """
    links = _GITHUB_LINK_PATTERN.findall(summary)
    if not links:
        return ''
    if len(links) != 1:
        raise RuntimeError(f'Found multiple GitHub links: {links}')
    link = links[0].strip()
    # A link that ends a sentence picks up the full stop; drop it.
    if link.endswith('.'):
        link = link[:-1]
    return link


class RepoUpdater:
    """Keep ``papers.csv`` in a cloned Hub repository up to date."""

    def __init__(self, repo_id: str, repo_type: str):
        """Clone the repository into the temp directory and pull it.

        Args:
            repo_id: Hub repository ID; its last ``/``-separated component
                is used as the local directory name.
            repo_type: Repository type forwarded to ``Repository``.

        Raises:
            ValueError: If the token permission is not ``'write'``.
        """
        api = HfApi()
        if api.get_token_permission() != 'write':
            raise ValueError('The HF token must have write permission.')

        name = api.whoami()['name']

        # tempfile.tempdir stays None until gettempdir() has been called,
        # so ask gettempdir() directly; it still honours a preset tempdir.
        repo_dir = pathlib.Path(tempfile.gettempdir()) / repo_id.split('/')[-1]
        self.csv_path = repo_dir / 'papers.csv'
        self.repo = Repository(
            local_dir=repo_dir,
            clone_from=repo_id,
            repo_type=repo_type,
            git_user=name,
            git_email=f'{name}@users.noreply.huggingface.co')
        self.repo.git_pull()

    @staticmethod
    def _fetch_daily_papers(date: str) -> list:
        """Return the decoded daily-papers API response for ``date``."""
        response = requests.get(
            f'https://huggingface.co/api/daily_papers?date={date}',
            timeout=_REQUEST_TIMEOUT)
        # Fail here rather than while indexing into an error payload.
        response.raise_for_status()
        return response.json()

    def update(self) -> None:
        """Append papers from yesterday and today that are not in the CSV.

        Papers whose summary contains several GitHub links are reported
        with ``print`` and skipped. The CSV is rewritten on every call.
        """
        # Take the clock once so both dates come from the same instant.
        # NOTE(review): this is naive local time while the cron trigger is
        # UTC -- confirm which timezone the API's ``date`` expects.
        now = datetime.datetime.now()
        yesterday = (now - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        today = now.strftime('%Y-%m-%d')
        daily_papers = [{
            'date': date,
            'papers': self._fetch_daily_papers(date),
        } for date in (yesterday, today)]

        self.repo.git_pull()
        df = pd.read_csv(self.csv_path, dtype=str).fillna('')
        if 'arxiv_id' in df.columns:
            arxiv_ids = set(df['arxiv_id'])
        else:
            arxiv_ids = set()

        new_rows = []
        for d in daily_papers:
            date = d['date']
            for paper in d['papers']:
                arxiv_id = paper['paper']['id']
                if arxiv_id in arxiv_ids:
                    continue
                try:
                    github = find_github_links(paper['paper']['summary'])
                except RuntimeError as e:
                    print(e)
                    continue
                # Record the ID so a paper listed under both dates is only
                # added once.
                arxiv_ids.add(arxiv_id)
                new_rows.append({
                    'date': date,
                    'arxiv_id': arxiv_id,
                    'github': github,
                })

        # Appending to the loaded frame keeps its header even when the CSV
        # has no data rows and nothing new was found.
        if new_rows:
            df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
        df.to_csv(self.csv_path, index=False)

    def push(self) -> None:
        """Push the local repository to the Hub."""
        self.repo.push_to_hub()


class UpdateScheduler:
    """Run the paper update on a daily cron schedule in the background."""

    def __init__(self, space_id: str, cron_hour: str, cron_minute: str):
        """Create the helpers and register the cron job.

        Args:
            space_id: Space ID, used both as the Space to restart and as
                the repository to update.
            cron_hour: Hour field of the cron trigger (UTC).
            cron_minute: Minute field of the cron trigger (UTC).
        """
        self.space_restarter = SpaceRestarter(space_id=space_id)
        self.repo_updater = RepoUpdater(repo_id=space_id, repo_type='space')

        self.scheduler = BackgroundScheduler()
        self.scheduler.add_job(func=self._update,
                               trigger='cron',
                               hour=cron_hour,
                               minute=cron_minute,
                               second=0,
                               timezone='UTC')

    def _update(self) -> None:
        """Refresh the CSV, then restart the Space or push the changes."""
        self.repo_updater.update()
        if self.repo_updater.repo.is_repo_clean():
            # Nothing changed on disk, so restart the Space explicitly.
            self.space_restarter.restart()
        else:
            # NOTE(review): presumably the push itself makes the Space
            # reload, which is why no restart is issued here -- confirm.
            self.repo_updater.push()

    def start(self) -> None:
        """Start the background scheduler."""
        self.scheduler.start()