hysts HF staff committed on
Commit
9fb4b90
1 Parent(s): 86d73cf

Migrate from yapf to black

Browse files
Files changed (6) hide show
  1. .pre-commit-config.yaml +49 -49
  2. .style.yapf +0 -5
  3. .vscode/settings.json +11 -8
  4. app.py +13 -13
  5. papers.py +22 -30
  6. update_scheduler.py +43 -49
.pre-commit-config.yaml CHANGED
@@ -1,50 +1,50 @@
1
  repos:
2
- - repo: https://github.com/pre-commit/pre-commit-hooks
3
- rev: v4.2.0
4
- hooks:
5
- - id: check-executables-have-shebangs
6
- - id: check-json
7
- - id: check-merge-conflict
8
- - id: check-shebang-scripts-are-executable
9
- - id: check-toml
10
- - id: check-yaml
11
- - id: double-quote-string-fixer
12
- - id: end-of-file-fixer
13
- - id: mixed-line-ending
14
- args: ["--fix=lf"]
15
- - id: requirements-txt-fixer
16
- - id: trailing-whitespace
17
- - repo: https://github.com/myint/docformatter
18
- rev: v1.4
19
- hooks:
20
- - id: docformatter
21
- args: ["--in-place"]
22
- - repo: https://github.com/pycqa/isort
23
- rev: 5.12.0
24
- hooks:
25
- - id: isort
26
- - repo: https://github.com/pre-commit/mirrors-mypy
27
- rev: v0.991
28
- hooks:
29
- - id: mypy
30
- args: ["--ignore-missing-imports"]
31
- additional_dependencies: ["types-python-slugify", "types-requests"]
32
- - repo: https://github.com/google/yapf
33
- rev: v0.32.0
34
- hooks:
35
- - id: yapf
36
- args: ["--parallel", "--in-place"]
37
- - repo: https://github.com/kynan/nbstripout
38
- rev: 0.6.0
39
- hooks:
40
- - id: nbstripout
41
- args:
42
- [
43
- "--extra-keys",
44
- "metadata.interpreter metadata.kernelspec cell.metadata.pycharm",
45
- ]
46
- - repo: https://github.com/nbQA-dev/nbQA
47
- rev: 1.7.0
48
- hooks:
49
- - id: nbqa-isort
50
- - id: nbqa-yapf
 
1
  repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.4.0
4
+ hooks:
5
+ - id: check-executables-have-shebangs
6
+ - id: check-json
7
+ - id: check-merge-conflict
8
+ - id: check-shebang-scripts-are-executable
9
+ - id: check-toml
10
+ - id: check-yaml
11
+ - id: end-of-file-fixer
12
+ - id: mixed-line-ending
13
+ args: ["--fix=lf"]
14
+ - id: requirements-txt-fixer
15
+ - id: trailing-whitespace
16
+ - repo: https://github.com/myint/docformatter
17
+ rev: v1.7.5
18
+ hooks:
19
+ - id: docformatter
20
+ args: ["--in-place"]
21
+ - repo: https://github.com/pycqa/isort
22
+ rev: 5.12.0
23
+ hooks:
24
+ - id: isort
25
+ args: ["--profile", "black"]
26
+ - repo: https://github.com/pre-commit/mirrors-mypy
27
+ rev: v1.5.1
28
+ hooks:
29
+ - id: mypy
30
+ args: ["--ignore-missing-imports"]
31
+ additional_dependencies: ["types-python-slugify", "types-requests", "types-PyYAML"]
32
+ - repo: https://github.com/psf/black
33
+ rev: 23.7.0
34
+ hooks:
35
+ - id: black
36
+ language_version: python3.10
37
+ args: ["--line-length", "119"]
38
+ - repo: https://github.com/kynan/nbstripout
39
+ rev: 0.6.1
40
+ hooks:
41
+ - id: nbstripout
42
+ args: ["--extra-keys", "metadata.interpreter metadata.kernelspec cell.metadata.pycharm"]
43
+ - repo: https://github.com/nbQA-dev/nbQA
44
+ rev: 1.7.0
45
+ hooks:
46
+ - id: nbqa-black
47
+ - id: nbqa-pyupgrade
48
+ args: ["--py37-plus"]
49
+ - id: nbqa-isort
50
+ args: ["--float-to-top"]
.style.yapf DELETED
@@ -1,5 +0,0 @@
1
- [style]
2
- based_on_style = pep8
3
- blank_line_before_nested_class_or_def = false
4
- spaces_before_comment = 2
5
- split_before_logical_operator = true
 
 
 
 
 
 
.vscode/settings.json CHANGED
@@ -1,18 +1,21 @@
1
  {
2
- "python.linting.enabled": true,
3
- "python.linting.flake8Enabled": true,
4
- "python.linting.pylintEnabled": false,
5
- "python.linting.lintOnSave": true,
6
- "python.formatting.provider": "yapf",
7
- "python.formatting.yapfArgs": [
8
- "--style={based_on_style: pep8, indent_width: 4, blank_line_before_nested_class_or_def: false, spaces_before_comment: 2, split_before_logical_operator: true}"
9
- ],
10
  "[python]": {
 
11
  "editor.formatOnType": true,
12
  "editor.codeActionsOnSave": {
13
  "source.organizeImports": true
14
  }
15
  },
 
 
 
 
 
 
 
 
 
 
16
  "editor.formatOnSave": true,
17
  "files.insertFinalNewline": true
18
  }
 
1
  {
 
 
 
 
 
 
 
 
2
  "[python]": {
3
+ "editor.defaultFormatter": "ms-python.black-formatter",
4
  "editor.formatOnType": true,
5
  "editor.codeActionsOnSave": {
6
  "source.organizeImports": true
7
  }
8
  },
9
+ "black-formatter.args": [
10
+ "--line-length=119"
11
+ ],
12
+ "isort.args": ["--profile", "black"],
13
+ "flake8.args": [
14
+ "--max-line-length=119"
15
+ ],
16
+ "ruff.args": [
17
+ "--line-length=119"
18
+ ],
19
  "editor.formatOnSave": true,
20
  "files.insertFinalNewline": true
21
  }
app.py CHANGED
@@ -7,22 +7,22 @@ import gradio as gr
7
  from papers import PaperList, get_df
8
  from update_scheduler import UpdateScheduler
9
 
10
- DESCRIPTION = '''# [Daily Papers](https://huggingface.co/papers)'''
11
 
12
- paper_list = PaperList(get_df('papers.csv'))
13
 
14
- if (SPACE_ID := os.getenv('SPACE_ID')) is not None:
15
- CRON_HOUR = os.getenv('CRON_HOUR', '*/4')
16
- CRON_MINUTE = os.getenv('CRON_MINUTE', '0')
17
- scheduler = UpdateScheduler(space_id=SPACE_ID,
18
- cron_hour=CRON_HOUR,
19
- cron_minute=CRON_MINUTE)
20
  scheduler.start()
21
 
22
- with gr.Blocks(css='style.css') as demo:
23
  gr.Markdown(DESCRIPTION)
24
- df = gr.Dataframe(value=paper_list.df_prettified,
25
- datatype=paper_list.column_datatype,
26
- type='pandas',
27
- interactive=False)
 
 
28
  demo.queue(api_open=False).launch()
 
7
  from papers import PaperList, get_df
8
  from update_scheduler import UpdateScheduler
9
 
10
+ DESCRIPTION = "# [Daily Papers](https://huggingface.co/papers)"
11
 
12
+ paper_list = PaperList(get_df("papers.csv"))
13
 
14
+ if (SPACE_ID := os.getenv("SPACE_ID")) is not None:
15
+ CRON_HOUR = os.getenv("CRON_HOUR", "*/4")
16
+ CRON_MINUTE = os.getenv("CRON_MINUTE", "0")
17
+ scheduler = UpdateScheduler(space_id=SPACE_ID, cron_hour=CRON_HOUR, cron_minute=CRON_MINUTE)
 
 
18
  scheduler.start()
19
 
20
+ with gr.Blocks(css="style.css") as demo:
21
  gr.Markdown(DESCRIPTION)
22
+ df = gr.Dataframe(
23
+ value=paper_list.df_prettified,
24
+ datatype=paper_list.column_datatype,
25
+ type="pandas",
26
+ interactive=False,
27
+ )
28
  demo.queue(api_open=False).launch()
papers.py CHANGED
@@ -19,31 +19,28 @@ class PaperInfo:
19
  published_at: str
20
 
21
  def __post_init__(self):
22
- object.__setattr__(self, 'published_at',
23
- PaperInfo.convert_timestamp(self.published_at))
24
 
25
  @staticmethod
26
  def convert_timestamp(timestamp: str) -> str:
27
  try:
28
- return datetime.datetime.strptime(
29
- timestamp,
30
- '%Y-%m-%dT%H:%M:%S.%fZ').strftime('%Y/%m/%d %H:%M:%S')
31
  except ValueError:
32
  return timestamp
33
 
34
 
35
  def get_df(path: pathlib.Path | str) -> pd.DataFrame:
36
- df = pd.read_csv(path, dtype=str).fillna('')
37
  paper_info = []
38
  for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
39
- res = requests.get(
40
- f'https://huggingface.co/api/papers/{row.arxiv_id}').json()
41
  info = PaperInfo(
42
  **row,
43
- title=res['title'],
44
- paper_page=f'https://huggingface.co/papers/{row.arxiv_id}',
45
- upvotes=res['upvotes'],
46
- published_at=res['publishedAt'])
 
47
  paper_info.append(info)
48
  return pd.DataFrame([dataclasses.asdict(info) for info in paper_info])
49
 
@@ -52,8 +49,8 @@ class Prettifier:
52
  @staticmethod
53
  def get_github_link(link: str) -> str:
54
  if not link:
55
- return ''
56
- return Prettifier.create_link('github', link)
57
 
58
  @staticmethod
59
  def create_link(text: str, url: str) -> str:
@@ -62,23 +59,18 @@ class Prettifier:
62
  @staticmethod
63
  def to_div(text: str | None, category_name: str) -> str:
64
  if text is None:
65
- text = ''
66
- class_name = f'{category_name}-{text.lower()}'
67
  return f'<div class="{class_name}">{text}</div>'
68
 
69
  def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
70
- df = df.sort_values('arxiv_id', ascending=False).reset_index(drop=True)
71
  new_rows = []
72
  for _, row in df.iterrows():
73
  new_row = dict(row) | {
74
- 'date':
75
- Prettifier.create_link(
76
- row.date,
77
- f'https://huggingface.co/papers?date={row.date}'),
78
- 'paper_page':
79
- Prettifier.create_link(row.arxiv_id, row.paper_page),
80
- 'github':
81
- self.get_github_link(row.github),
82
  }
83
  new_rows.append(new_row)
84
  return pd.DataFrame(new_rows, columns=df.columns)
@@ -86,11 +78,11 @@ class Prettifier:
86
 
87
  class PaperList:
88
  COLUMN_INFO = [
89
- ['date', 'markdown'],
90
- ['paper_page', 'markdown'],
91
- ['title', 'str'],
92
- ['github', 'markdown'],
93
- ['upvotes', 'number'],
94
  ]
95
 
96
  def __init__(self, df: pd.DataFrame):
 
19
  published_at: str
20
 
21
  def __post_init__(self):
22
+ object.__setattr__(self, "published_at", PaperInfo.convert_timestamp(self.published_at))
 
23
 
24
  @staticmethod
25
  def convert_timestamp(timestamp: str) -> str:
26
  try:
27
+ return datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ").strftime("%Y/%m/%d %H:%M:%S")
 
 
28
  except ValueError:
29
  return timestamp
30
 
31
 
32
  def get_df(path: pathlib.Path | str) -> pd.DataFrame:
33
+ df = pd.read_csv(path, dtype=str).fillna("")
34
  paper_info = []
35
  for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
36
+ res = requests.get(f"https://huggingface.co/api/papers/{row.arxiv_id}").json()
 
37
  info = PaperInfo(
38
  **row,
39
+ title=res["title"],
40
+ paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
41
+ upvotes=res["upvotes"],
42
+ published_at=res["publishedAt"],
43
+ )
44
  paper_info.append(info)
45
  return pd.DataFrame([dataclasses.asdict(info) for info in paper_info])
46
 
 
49
  @staticmethod
50
  def get_github_link(link: str) -> str:
51
  if not link:
52
+ return ""
53
+ return Prettifier.create_link("github", link)
54
 
55
  @staticmethod
56
  def create_link(text: str, url: str) -> str:
 
59
  @staticmethod
60
  def to_div(text: str | None, category_name: str) -> str:
61
  if text is None:
62
+ text = ""
63
+ class_name = f"{category_name}-{text.lower()}"
64
  return f'<div class="{class_name}">{text}</div>'
65
 
66
  def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
67
+ df = df.sort_values("arxiv_id", ascending=False).reset_index(drop=True)
68
  new_rows = []
69
  for _, row in df.iterrows():
70
  new_row = dict(row) | {
71
+ "date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
72
+ "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
73
+ "github": self.get_github_link(row.github),
 
 
 
 
 
74
  }
75
  new_rows.append(new_row)
76
  return pd.DataFrame(new_rows, columns=df.columns)
 
78
 
79
  class PaperList:
80
  COLUMN_INFO = [
81
+ ["date", "markdown"],
82
+ ["paper_page", "markdown"],
83
+ ["title", "str"],
84
+ ["github", "markdown"],
85
+ ["upvotes", "number"],
86
  ]
87
 
88
  def __init__(self, df: pd.DataFrame):
update_scheduler.py CHANGED
@@ -13,12 +13,12 @@ from huggingface_hub.utils import RepositoryNotFoundError
13
  class SpaceRestarter:
14
  def __init__(self, space_id: str):
15
  self.api = HfApi()
16
- if self.api.get_token_permission() != 'write':
17
- raise ValueError('The HF token must have write permission.')
18
  try:
19
  self.api.space_info(repo_id=space_id)
20
  except RepositoryNotFoundError:
21
- raise ValueError('The Space ID does not exist.')
22
  self.space_id = space_id
23
 
24
  def restart(self) -> None:
@@ -26,15 +26,13 @@ class SpaceRestarter:
26
 
27
 
28
  def find_github_links(summary: str) -> str:
29
- links = re.findall(
30
- r'https://github.com/[^/]+/[^/)}, ]+(?:/(?:tree|blob)/[^/]+/[^/)}, ]+)?',
31
- summary)
32
  if len(links) == 0:
33
- return ''
34
  if len(links) != 1:
35
- raise RuntimeError(f'Found multiple GitHub links: {links}')
36
  link = links[0]
37
- if link.endswith('.'):
38
  link = link[:-1]
39
  link = link.strip()
40
  return link
@@ -43,68 +41,62 @@ def find_github_links(summary: str) -> str:
43
  class RepoUpdater:
44
  def __init__(self, repo_id: str, repo_type: str):
45
  api = HfApi()
46
- if api.get_token_permission() != 'write':
47
- raise ValueError('The HF token must have write permission.')
48
 
49
- name = api.whoami()['name']
50
 
51
- repo_dir = pathlib.Path(
52
- tempfile.tempdir) / repo_id.split('/')[-1] # type: ignore
53
- self.csv_path = repo_dir / 'papers.csv'
54
  self.repo = Repository(
55
  local_dir=repo_dir,
56
  clone_from=repo_id,
57
  repo_type=repo_type,
58
  git_user=name,
59
- git_email=f'{name}@users.noreply.huggingface.co')
 
60
  self.repo.git_pull()
61
 
62
  def update(self) -> None:
63
- yesterday = (datetime.datetime.now() -
64
- datetime.timedelta(days=1)).strftime('%Y-%m-%d')
65
- today = datetime.datetime.now().strftime('%Y-%m-%d')
66
  daily_papers = [
67
  {
68
- 'date':
69
- yesterday,
70
- 'papers':
71
- requests.get(
72
- f'https://huggingface.co/api/daily_papers?date={yesterday}'
73
- ).json()
74
  },
75
  {
76
- 'date':
77
- today,
78
- 'papers':
79
- requests.get(
80
- f'https://huggingface.co/api/daily_papers?date={today}').
81
- json()
82
  },
83
  ]
84
 
85
  self.repo.git_pull()
86
- df = pd.read_csv(self.csv_path, dtype=str).fillna('')
87
  rows = [row for _, row in df.iterrows()]
88
  arxiv_ids = {row.arxiv_id for row in rows}
89
 
90
  for d in daily_papers:
91
- date = d['date']
92
- papers = d['papers']
93
  for paper in papers:
94
- arxiv_id = paper['paper']['id']
95
  if arxiv_id in arxiv_ids:
96
  continue
97
  try:
98
- github = find_github_links(paper['paper']['summary'])
99
  except RuntimeError as e:
100
  print(e)
101
  continue
102
  rows.append(
103
- pd.Series({
104
- 'date': date,
105
- 'arxiv_id': arxiv_id,
106
- 'github': github,
107
- }))
 
 
 
108
  df = pd.DataFrame(rows).reset_index(drop=True)
109
  df.to_csv(self.csv_path, index=False)
110
 
@@ -113,17 +105,19 @@ class RepoUpdater:
113
 
114
 
115
  class UpdateScheduler:
116
- def __init__(self, space_id: str, cron_hour: str, cron_minute: str):
117
  self.space_restarter = SpaceRestarter(space_id=space_id)
118
- self.repo_updater = RepoUpdater(repo_id=space_id, repo_type='space')
119
 
120
  self.scheduler = BackgroundScheduler()
121
- self.scheduler.add_job(func=self._update,
122
- trigger='cron',
123
- hour=cron_hour,
124
- minute=cron_minute,
125
- second=0,
126
- timezone='UTC')
 
 
127
 
128
  def _update(self) -> None:
129
  self.repo_updater.update()
 
13
  class SpaceRestarter:
14
  def __init__(self, space_id: str):
15
  self.api = HfApi()
16
+ if self.api.get_token_permission() != "write":
17
+ raise ValueError("The HF token must have write permission.")
18
  try:
19
  self.api.space_info(repo_id=space_id)
20
  except RepositoryNotFoundError:
21
+ raise ValueError("The Space ID does not exist.")
22
  self.space_id = space_id
23
 
24
  def restart(self) -> None:
 
26
 
27
 
28
  def find_github_links(summary: str) -> str:
29
+ links = re.findall(r"https://github.com/[^/]+/[^/)}, ]+(?:/(?:tree|blob)/[^/]+/[^/)}, ]+)?", summary)
 
 
30
  if len(links) == 0:
31
+ return ""
32
  if len(links) != 1:
33
+ raise RuntimeError(f"Found multiple GitHub links: {links}")
34
  link = links[0]
35
+ if link.endswith("."):
36
  link = link[:-1]
37
  link = link.strip()
38
  return link
 
41
  class RepoUpdater:
42
  def __init__(self, repo_id: str, repo_type: str):
43
  api = HfApi()
44
+ if api.get_token_permission() != "write":
45
+ raise ValueError("The HF token must have write permission.")
46
 
47
+ name = api.whoami()["name"]
48
 
49
+ repo_dir = pathlib.Path(tempfile.tempdir) / repo_id.split("/")[-1] # type: ignore
50
+ self.csv_path = repo_dir / "papers.csv"
 
51
  self.repo = Repository(
52
  local_dir=repo_dir,
53
  clone_from=repo_id,
54
  repo_type=repo_type,
55
  git_user=name,
56
+ git_email=f"{name}@users.noreply.huggingface.co",
57
+ )
58
  self.repo.git_pull()
59
 
60
  def update(self) -> None:
61
+ yesterday = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
62
+ today = datetime.datetime.now().strftime("%Y-%m-%d")
 
63
  daily_papers = [
64
  {
65
+ "date": yesterday,
66
+ "papers": requests.get(f"https://huggingface.co/api/daily_papers?date={yesterday}").json(),
 
 
 
 
67
  },
68
  {
69
+ "date": today,
70
+ "papers": requests.get(f"https://huggingface.co/api/daily_papers?date={today}").json(),
 
 
 
 
71
  },
72
  ]
73
 
74
  self.repo.git_pull()
75
+ df = pd.read_csv(self.csv_path, dtype=str).fillna("")
76
  rows = [row for _, row in df.iterrows()]
77
  arxiv_ids = {row.arxiv_id for row in rows}
78
 
79
  for d in daily_papers:
80
+ date = d["date"]
81
+ papers = d["papers"]
82
  for paper in papers:
83
+ arxiv_id = paper["paper"]["id"]
84
  if arxiv_id in arxiv_ids:
85
  continue
86
  try:
87
+ github = find_github_links(paper["paper"]["summary"])
88
  except RuntimeError as e:
89
  print(e)
90
  continue
91
  rows.append(
92
+ pd.Series(
93
+ {
94
+ "date": date,
95
+ "arxiv_id": arxiv_id,
96
+ "github": github,
97
+ }
98
+ )
99
+ )
100
  df = pd.DataFrame(rows).reset_index(drop=True)
101
  df.to_csv(self.csv_path, index=False)
102
 
 
105
 
106
 
107
  class UpdateScheduler:
108
+ def __init__(self, space_id: str, cron_hour: str, cron_minute: str, cron_second: str = "0"):
109
  self.space_restarter = SpaceRestarter(space_id=space_id)
110
+ self.repo_updater = RepoUpdater(repo_id=space_id, repo_type="space")
111
 
112
  self.scheduler = BackgroundScheduler()
113
+ self.scheduler.add_job(
114
+ func=self._update,
115
+ trigger="cron",
116
+ hour=cron_hour,
117
+ minute=cron_minute,
118
+ second=cron_second,
119
+ timezone="UTC",
120
+ )
121
 
122
  def _update(self) -> None:
123
  self.repo_updater.update()