papers / papers.py
hysts's picture
hysts HF staff
Migrate from yapf to black
9fb4b90
raw
history blame
No virus
3.05 kB
import dataclasses
import datetime
import operator
import pathlib
import pandas as pd
import requests
import tqdm.auto
@dataclasses.dataclass(frozen=True)
class PaperInfo:
    """Immutable record holding the metadata of a single paper.

    On construction ``published_at`` is passed through
    :meth:`convert_timestamp`, so the stored value is the reformatted
    timestamp whenever the input matches the expected layout.
    """

    date: str
    arxiv_id: str
    github: str
    title: str
    paper_page: str
    upvotes: int
    published_at: str

    def __post_init__(self):
        # The dataclass is frozen, so ordinary attribute assignment raises;
        # object.__setattr__ is used to store the normalised value.
        normalised = PaperInfo.convert_timestamp(self.published_at)
        object.__setattr__(self, "published_at", normalised)

    @staticmethod
    def convert_timestamp(timestamp: str) -> str:
        """Reformat a ``YYYY-MM-DDTHH:MM:SS.ffffffZ`` string as ``YYYY/MM/DD HH:MM:SS``.

        Args:
            timestamp: The timestamp text to convert.

        Returns:
            The reformatted timestamp, or ``timestamp`` unchanged when the
            conversion raises ``ValueError`` (e.g. the layout does not match).
        """
        source_format = "%Y-%m-%dT%H:%M:%S.%fZ"
        display_format = "%Y/%m/%d %H:%M:%S"
        try:
            parsed = datetime.datetime.strptime(timestamp, source_format)
            return parsed.strftime(display_format)
        except ValueError:
            return timestamp
def get_df(path: pathlib.Path | str, *, timeout: float = 30.0) -> pd.DataFrame:
    """Build a table of paper metadata from a CSV file and the Hugging Face papers API.

    The CSV is read with every column as a string and missing cells replaced
    by ``""``. For each row, ``https://huggingface.co/api/papers/<arxiv_id>``
    is queried and the row is combined with the returned ``title``,
    ``upvotes`` and ``publishedAt`` values into a ``PaperInfo``.

    Args:
        path: Location of the CSV file. Each row is unpacked into
            ``PaperInfo`` as keyword arguments, so its columns must be
            ``PaperInfo`` fields other than the ones filled in here
            (``title``, ``paper_page``, ``upvotes``, ``published_at``).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A DataFrame with one row per paper and one column per ``PaperInfo``
        field. The columns are present even when the CSV has no rows.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If a request exceeds ``timeout``.
    """
    df = pd.read_csv(path, dtype=str).fillna("")
    paper_info = []
    # A single session lets consecutive requests reuse the same connection.
    with requests.Session() as session:
        for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
            response = session.get(
                f"https://huggingface.co/api/papers/{row.arxiv_id}",
                timeout=timeout,
            )
            # Fail with the HTTP error itself rather than a KeyError on a
            # payload that lacks the expected keys.
            response.raise_for_status()
            res = response.json()
            info = PaperInfo(
                **row,
                title=res["title"],
                paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
                upvotes=res["upvotes"],
                published_at=res["publishedAt"],
            )
            paper_info.append(info)
    # Naming the columns explicitly keeps the schema intact for an empty CSV.
    columns = [field.name for field in dataclasses.fields(PaperInfo)]
    return pd.DataFrame([dataclasses.asdict(info) for info in paper_info], columns=columns)
class Prettifier:
@staticmethod
def get_github_link(link: str) -> str:
if not link:
return ""
return Prettifier.create_link("github", link)
@staticmethod
def create_link(text: str, url: str) -> str:
return f'<a href="{url}" target="_blank">{text}</a>'
@staticmethod
def to_div(text: str | None, category_name: str) -> str:
if text is None:
text = ""
class_name = f"{category_name}-{text.lower()}"
return f'<div class="{class_name}">{text}</div>'
def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
df = df.sort_values("arxiv_id", ascending=False).reset_index(drop=True)
new_rows = []
for _, row in df.iterrows():
new_row = dict(row) | {
"date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
"paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
"github": self.get_github_link(row.github),
}
new_rows.append(new_row)
return pd.DataFrame(new_rows, columns=df.columns)
class PaperList:
    """Hold a paper table in both raw and display-ready form.

    ``COLUMN_INFO`` lists the displayed columns as ``[name, datatype]`` pairs,
    in display order.
    """

    COLUMN_INFO = [
        ["date", "markdown"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["upvotes", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        """Store ``df`` and build its prettified view.

        Args:
            df: Raw paper table; it is passed to ``Prettifier`` and must
                provide every column named in ``COLUMN_INFO``.
        """
        self.df_raw = df
        self._prettifier = Prettifier()
        prettified = self._prettifier(df)
        # Keep only the displayed columns, in COLUMN_INFO order.
        self.df_prettified = prettified.loc[:, self.column_names]

    @property
    def column_names(self):
        """Names of the displayed columns, in ``COLUMN_INFO`` order."""
        return [entry[0] for entry in self.COLUMN_INFO]

    @property
    def column_datatype(self):
        """Datatype labels of the displayed columns, in ``COLUMN_INFO`` order."""
        return [entry[1] for entry in self.COLUMN_INFO]