File size: 3,086 Bytes
08080f2
 
 
 
 
 
 
 
 
 
 
 
e2797b8
08080f2
 
 
 
 
857ce49
08080f2
 
9fb4b90
08080f2
 
 
 
9fb4b90
08080f2
 
 
 
 
9fb4b90
08080f2
 
9fb4b90
08080f2
 
9fb4b90
 
 
 
 
08080f2
 
 
 
 
 
 
 
9fb4b90
 
08080f2
 
 
e2797b8
08080f2
 
 
 
9fb4b90
 
08080f2
 
 
9fb4b90
08080f2
 
 
9fb4b90
2b2c6a9
9fb4b90
08080f2
 
 
 
 
 
 
c0db5be
6414061
c0db5be
9fb4b90
6414061
08080f2
 
 
07aaba9
08080f2
07aaba9
08080f2
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import dataclasses
import datetime
import operator
import pathlib

import pandas as pd
import requests
import tqdm.auto


@dataclasses.dataclass(frozen=True)
class PaperInfo:
    """Immutable record holding the metadata of a single paper.

    ``published_at`` is normalized on construction: an ISO-8601 style UTC
    timestamp (``YYYY-MM-DDTHH:MM:SS[.ffffff]Z``) is rewritten as
    ``YYYY/MM/DD HH:MM:SS``. Any other string is stored unchanged.
    """

    date: str
    arxiv_id: str
    github: str
    title: str
    paper_page: str
    upvotes: int
    published_at: str

    def __post_init__(self):
        # The dataclass is frozen, so normal attribute assignment is blocked;
        # object.__setattr__ is the sanctioned way to set a field here.
        object.__setattr__(self, "published_at", PaperInfo.convert_timestamp(self.published_at))

    @staticmethod
    def convert_timestamp(timestamp: str) -> str:
        """Reformat a ``...Z`` timestamp as ``YYYY/MM/DD HH:MM:SS``.

        Both the fractional-second layout (``2024-01-02T03:04:05.678Z``) and
        the second-precision layout (``2024-01-02T03:04:05Z``) are accepted.
        The fractional part is dropped from the output.

        Args:
            timestamp: The timestamp string to convert.

        Returns:
            The reformatted timestamp, or ``timestamp`` unchanged when it
            matches neither layout (which also makes the conversion
            idempotent on already-converted values).
        """
        for fmt in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ"):
            try:
                parsed = datetime.datetime.strptime(timestamp, fmt)
            except ValueError:
                continue
            return parsed.strftime("%Y/%m/%d %H:%M:%S")
        return timestamp


def get_df(path: pathlib.Path | str, *, timeout: float = 30.0) -> pd.DataFrame:
    """Load a CSV of papers and enrich each row via the Hugging Face papers API.

    Every CSV row is unpacked as keyword arguments into ``PaperInfo``, so the
    CSV columns must be exactly the fields not filled in here (``date``,
    ``arxiv_id`` and ``github``). The title, upvote count and publication
    timestamp are read from the JSON returned by
    ``https://huggingface.co/api/papers/<arxiv_id>``.

    Args:
        path: Location of the CSV file; all columns are read as strings and
            missing cells become empty strings.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot block the whole load indefinitely.

    Returns:
        A DataFrame with one row per CSV row and one column per ``PaperInfo``
        field. The columns are present even when the CSV has no rows.
    """
    df = pd.read_csv(path, dtype=str).fillna("")
    paper_info = []
    # One session for the whole loop lets the per-row requests share a
    # connection instead of reconnecting every time.
    with requests.Session() as session:
        for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
            res = session.get(
                f"https://huggingface.co/api/papers/{row.arxiv_id}",
                timeout=timeout,
            ).json()
            info = PaperInfo(
                **row,
                title=res["title"],
                paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
                upvotes=res["upvotes"],
                published_at=res["publishedAt"],
            )
            paper_info.append(info)
    # Explicit columns (same order as asdict) keep the schema for empty input.
    columns = [field.name for field in dataclasses.fields(PaperInfo)]
    return pd.DataFrame([dataclasses.asdict(info) for info in paper_info], columns=columns)


class Prettifier:
    @staticmethod
    def get_github_link(link: str) -> str:
        if not link:
            return ""
        return Prettifier.create_link("github", link)

    @staticmethod
    def create_link(text: str, url: str) -> str:
        return f'<a href="{url}" target="_blank">{text}</a>'

    @staticmethod
    def to_div(text: str | None, category_name: str) -> str:
        if text is None:
            text = ""
        class_name = f"{category_name}-{text.lower()}"
        return f'<div class="{class_name}">{text}</div>'

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df.sort_values("arxiv_id", ascending=False).reset_index(drop=True)
        new_rows = []
        for _, row in df.iterrows():
            new_row = dict(row) | {
                "date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
                "paper_page": f'<a href="%s" target="_blank">%s</a>' % ("https://arxiv.org/abs/"+row.arxiv_id,row.arxiv_id),
                "github": self.get_github_link(row.github),
            }
            new_rows.append(new_row)
        return pd.DataFrame(new_rows, columns=df.columns)


class PaperList:
    """Hold a raw paper DataFrame together with its prettified display version."""

    # Each entry is [column name, display datatype]; the order here is the
    # column order of ``df_prettified``.
    COLUMN_INFO = [
        ["date", "markdown"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["upvotes", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        """Store ``df`` and build the prettified frame limited to the display columns."""
        self.df_raw = df
        self._prettifier = Prettifier()
        prettified = self._prettifier(df)
        self.df_prettified = prettified.loc[:, self.column_names]

    @property
    def column_names(self):
        """Names of the displayed columns, in ``COLUMN_INFO`` order."""
        return [entry[0] for entry in self.COLUMN_INFO]

    @property
    def column_datatype(self):
        """Display datatypes of the columns, in ``COLUMN_INFO`` order."""
        return [entry[1] for entry in self.COLUMN_INFO]