from __future__ import annotations import pandas as pd class PaperList: def __init__(self): self.organization_name = "ICML2023" self.table = pd.read_json("papers.json").fillna("") claim_info = pd.read_csv("claim_info.csv", dtype={"arxiv_id": str, "n_authors": int, "n_linked_authors": int}) self.table = pd.merge(self.table, claim_info, on="arxiv_id", how="left") self.table[["n_authors", "n_linked_authors"]] = ( self.table[["n_authors", "n_linked_authors"]].fillna(-1).astype(int) ) self._preprocess_table() self.table_header = """ Title Authors Type arXiv GitHub Paper pages Spaces Models Datasets Claimed """ def _preprocess_table(self) -> None: self.table["title_lowercase"] = self.table.title.str.lower() self.table["arxiv"] = self.table.arxiv_id.apply(lambda x: f"https://arxiv.org/abs/{x}" if x else "") self.table["hf_paper"] = self.table.arxiv_id.apply(lambda x: f"https://huggingface.co/papers/{x}" if x else "") self.table["authors"] = self.table.authors.apply(lambda x: ", ".join(x)) rows = [] for row in self.table.itertuples(): title = f'{row.title}' arxiv = f'arXiv' if row.arxiv else "" github = f'GitHub' if row.github else "" hf_paper = f'Paper page' if row.hf_paper else "" hf_space = f'Space' if row.hf_space else "" hf_model = f'Model' if row.hf_model else "" hf_dataset = f'Dataset' if row.hf_dataset else "" author_linked = "✅" if row.n_linked_authors > 0 else "" n_linked_authors = "" if row.n_linked_authors == -1 else row.n_linked_authors n_authors = "" if row.n_authors == -1 else row.n_authors claimed_paper = "" if n_linked_authors == "" else f"{n_linked_authors}/{n_authors} {author_linked}" row = f""" {title} {row.authors} {row.type} {arxiv} {github} {hf_paper} {hf_space} {hf_model} {hf_dataset} {claimed_paper} """ rows.append(row) self.table["html_table_content"] = rows def render( self, search_query: str, case_sensitive: bool, filter_names: list[str], presentation_type: str, ) -> tuple[str, str]: df = self.table if presentation_type != "(ALL)": df = df[df.type == presentation_type.lower()] if search_query: if case_sensitive: df = df[df.title.str.contains(search_query)] else: df = df[df.title_lowercase.str.contains(search_query.lower())] has_arxiv = "arXiv" in filter_names has_github = "GitHub" in filter_names has_hf_space = "Space" in filter_names has_hf_model = "Model" in filter_names has_hf_dataset = "Dataset" in filter_names df = self.filter_table(df, has_arxiv, has_github, has_hf_space, has_hf_model, has_hf_dataset) n_claimed = len(df[df.n_linked_authors > 0]) return f"{len(df)} ({n_claimed} claimed)", self.to_html(df, self.table_header) @staticmethod def filter_table( df: pd.DataFrame, has_arxiv: bool, has_github: bool, has_hf_space: bool, has_hf_model: bool, has_hf_dataset: bool, ) -> pd.DataFrame: if has_arxiv: df = df[df.arxiv != ""] if has_github: df = df[df.github != ""] if has_hf_space: df = df[df.hf_space != ""] if has_hf_model: df = df[df.hf_model != ""] if has_hf_dataset: df = df[df.hf_dataset != ""] return df @staticmethod def to_html(df: pd.DataFrame, table_header: str) -> str: table_data = "".join(df.html_table_content) html = f""" {table_header} {table_data}
""" return html