Spaces:

ICLR2024
/

ICLR2024-papers

Running on CPU Upgrade

App Files Files Community

hysts HF staff committed on Apr 2, 2024

Commit

25c0a98

1 Parent(s): 15ee8f1

Update

Browse files

Files changed (5) hide show

.gitignore +162 -0
app.py +137 -3
papers.py +137 -0
requirements.txt +6 -0
style.css +15 -5

.gitignore ADDED Viewed

	@@ -0,0 +1,162 @@

+.ragatouille/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

app.py CHANGED Viewed

@@ -1,9 +1,143 @@
 #!/usr/bin/env python
 import gradio as gr
-with gr.Blocks() as demo:
-    pass
 if __name__ == "__main__":
-    demo.queue().launch()

 #!/usr/bin/env python
 import gradio as gr
+import pandas as pd
+from papers import PaperList
# Markdown heading shown at the top of the page.
DESCRIPTION = "# ICLR 2024 Papers"

# Single shared paper table; constructing it loads the data once at import time.
paper_list = PaperList()

# Columns ticked by default in the "Columns" checkbox group.
DEFAULT_COLUMNS = [
    "Title", "Type",
    "Paper page", "OpenReview", "GitHub",
    "Spaces", "Models", "Datasets",
    "claimed",
]
def update_num_papers(df: pd.DataFrame, total: "int | None" = None) -> str:
    """Build the "shown / total (N claimed)" summary for the paper table.

    Args:
        df: The table currently displayed. It may lack the ``claimed`` column
            when the user has deselected it in the column picker.
        total: Size of the full paper list. Defaults to
            ``len(paper_list.df_raw)`` when omitted.

    Returns:
        ``"<shown> / <total> (<n> claimed)"``, or ``"<shown> / <total>"`` when
        the ``claimed`` column is not part of ``df``.
    """
    if total is None:
        total = len(paper_list.df_raw)
    summary = f"{len(df)} / {total}"
    if "claimed" not in df.columns:
        # The column is user-selectable; without it there is nothing to count,
        # and indexing it would raise KeyError.
        return summary
    # astype(str) keeps the .str accessor usable even if the column arrives
    # with a non-string dtype (e.g. an empty table).
    n_claimed = int(df["claimed"].astype(str).str.contains("✅", regex=False).sum())
    return f"{summary} ({n_claimed} claimed)"
def update_df(
    title_search_query: str,
    abstract_search_query: str,
    max_num_to_retrieve: int,
    filter_names: list[str],
    presentation_type: str,
    column_names: list[str],
) -> "gr.DataFrame":
    """Re-run the search and return an updated table component.

    Args:
        title_search_query: Text to look for in paper titles.
        abstract_search_query: Query for the abstract retriever; empty skips it.
        max_num_to_retrieve: Upper bound on abstract-search hits.
        filter_names: Selected entries of the "Filter" checkbox group.
        presentation_type: Selected presentation type, or ``"(ALL)"``.
        column_names: Display columns to include, in order.

    Returns:
        A ``gr.DataFrame`` carrying the filtered rows together with the
        per-column datatypes matching ``column_names``.
    """
    rows = paper_list.search(
        title_search_query,
        abstract_search_query,
        # NOTE(review): slider values may arrive as floats; the retriever's
        # ``k`` is presumably expected to be an int, so coerce defensively.
        int(max_num_to_retrieve),
        filter_names,
        presentation_type,
        column_names,
    )
    return gr.DataFrame(
        value=rows,
        datatype=paper_list.get_column_datatypes(column_names),
    )
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)

    # Search and filter controls.
    with gr.Group():
        search_title = gr.Textbox(label="Search title")
        with gr.Row():
            with gr.Column(scale=4):
                search_abstract = gr.Textbox(
                    label="Search abstract",
                    info="The result may not be accurate as the abstract does not contain all the information.",
                )
            with gr.Column(scale=1):
                max_num_to_retrieve = gr.Slider(
                    label="Max number to retrieve",
                    info="This is used only for search on abstracts.",
                    minimum=1,
                    maximum=len(paper_list.df_raw),
                    step=1,
                    value=100,
                )
        filter_names = gr.CheckboxGroup(
            label="Filter",
            choices=["Paper page", "GitHub", "Space", "Model", "Dataset"],
        )
        presentation_type = gr.Radio(
            label="Presentation Type",
            choices=["(ALL)", "Oral", "Spotlight Poster", "Poster"],
            value="(ALL)",
        )
        column_names = gr.CheckboxGroup(
            label="Columns",
            choices=paper_list.get_column_names(),
            value=DEFAULT_COLUMNS,
        )

    # Read-only summary line and the result table.
    num_papers = gr.Textbox(
        label="Number of papers",
        value=update_num_papers(paper_list.df_prettified),
        interactive=False,
    )
    df = gr.Dataframe(
        value=paper_list.df_prettified,
        datatype=paper_list.get_column_datatypes(paper_list.get_column_names()),
        type="pandas",
        row_count=(0, "dynamic"),
        interactive=False,
        height=1000,
        elem_id="table",
        wrap=True,
    )

    inputs = [
        search_title,
        search_abstract,
        max_num_to_retrieve,
        filter_names,
        presentation_type,
        column_names,
    ]

    # Both the user-driven refresh and the initial page load run the same two
    # steps: rebuild the table, then recompute the summary from the new table.
    refresh_table = dict(fn=update_df, inputs=inputs, outputs=df, api_name=False)
    refresh_count = dict(
        fn=update_num_papers,
        inputs=df,
        outputs=num_papers,
        queue=False,
        api_name=False,
    )

    gr.on(
        triggers=[
            search_title.submit,
            search_abstract.submit,
            filter_names.input,
            presentation_type.input,
            column_names.input,
        ],
        **refresh_table,
    ).then(**refresh_count)
    demo.load(**refresh_table).then(**refresh_count)

if __name__ == "__main__":
    demo.queue(api_open=False).launch(show_api=False)

papers.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import operator
+import datasets
+import pandas as pd
+from huggingface_hub import HfApi
+from ragatouille import RAGPretrainedModel
# Local directory the abstract index is downloaded into and loaded from.
INDEX_DIR_PATH = ".ragatouille/colbert/indexes/ICLR2024-papers-abstract-index/"

# Fetch the index dataset from the Hub at import time.
api = HfApi()
api.snapshot_download(
    repo_id="ICLR2024/ICLR2024-papers-abstract-index",
    repo_type="dataset",
    local_dir=INDEX_DIR_PATH,
)

ABSTRACT_RETRIEVER = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
# Run one throwaway query so the retriever is initialized before the first user search.
ABSTRACT_RETRIEVER.search("LLM")
class PaperList:
    """Table of papers with title/abstract search and link-rich display rows.

    ``df_raw`` holds the merged source data; ``df_prettified`` holds the same
    rows rendered into the display columns listed in ``COLUMN_INFO``.
    """

    # [display column name, datatype] pairs, in display order.
    COLUMN_INFO = [
        ["Title", "str"],
        ["Authors", "str"],
        ["Type", "str"],
        ["Paper page", "markdown"],
        ["OpenReview", "markdown"],
        ["GitHub", "markdown"],
        ["Spaces", "markdown"],
        ["Models", "markdown"],
        ["Datasets", "markdown"],
        ["claimed", "markdown"],
    ]

    # Filter labels that are also the names of list-valued raw columns.
    _LIST_FILTERS = ("GitHub", "Space", "Model", "Dataset")

    def __init__(self):
        self.df_raw = self.get_df()
        self.df_prettified = self.prettify(self.df_raw)

    @staticmethod
    def get_df() -> pd.DataFrame:
        """Load the paper table and left-join the claimed-author counts on ``id``.

        Missing ``n_authors`` / ``n_linked_authors`` values are stored as ``-1``.
        ``paper_page`` is the Hugging Face paper URL, or ``""`` when the row has
        no ``arxiv_id``.
        """
        df = pd.merge(
            left=datasets.load_dataset("ICLR2024/ICLR2024-papers", split="train").to_pandas(),
            right=datasets.load_dataset("ICLR2024/ICLR2024-num-claimed-papers", split="train").to_pandas(),
            on="id",
            how="left",
        )
        df[["n_authors", "n_linked_authors"]] = df[["n_authors", "n_linked_authors"]].fillna(-1).astype(int)
        df["paper_page"] = df["arxiv_id"].apply(
            lambda arxiv_id: f"https://huggingface.co/papers/{arxiv_id}" if arxiv_id else ""
        )
        return df

    @staticmethod
    def create_link(text: str, url: str) -> str:
        """Return an HTML anchor opening *url* in a new tab, or ``""`` if *url* is empty.

        Returning an empty string avoids rendering a dead link (empty ``href``)
        for papers that have no target page.
        """
        if not url:
            return ""
        return f'<a href="{url}" target="_blank">{text}</a>'

    @staticmethod
    def _create_links(text: str, urls) -> str:
        """Render one link per URL, newline-separated, all labelled *text*."""
        return "\n".join(PaperList.create_link(text, url) for url in urls)

    @staticmethod
    def prettify(df: pd.DataFrame) -> pd.DataFrame:
        """Convert raw rows into display rows with the ``COLUMN_INFO`` columns.

        The ``claimed`` cell is ``"<linked>/<total> "`` followed by a check mark
        when at least one author is linked, and empty when the linked-author
        count is ``-1`` (unknown).
        """
        rows = []
        for _, row in df.iterrows():
            if row.n_linked_authors == -1:
                claimed_paper = ""
            else:
                author_linked = "✅" if row.n_linked_authors > 0 else ""
                n_authors = "" if row.n_authors == -1 else row.n_authors
                claimed_paper = f"{row.n_linked_authors}/{n_authors} {author_linked}"
            rows.append(
                {
                    "Title": row["title"],
                    "Authors": ", ".join(row["authors"]),
                    "Type": row["type"],
                    "Paper page": PaperList.create_link(row["arxiv_id"], row["paper_page"]),
                    "OpenReview": PaperList.create_link("OpenReview", row["OpenReview"]),
                    "GitHub": PaperList._create_links("GitHub", row["GitHub"]),
                    "Spaces": PaperList._create_links("Space", row["Space"]),
                    "Models": PaperList._create_links("Model", row["Model"]),
                    "Datasets": PaperList._create_links("Dataset", row["Dataset"]),
                    "claimed": claimed_paper,
                }
            )
        return pd.DataFrame(rows, columns=PaperList.get_column_names())

    @staticmethod
    def get_column_names() -> list[str]:
        """Return the display column names in ``COLUMN_INFO`` order."""
        return list(map(operator.itemgetter(0), PaperList.COLUMN_INFO))

    def get_column_datatypes(self, column_names: list[str]) -> list[str]:
        """Return the datatype of each requested display column, in the same order.

        Raises:
            KeyError: If a name is not listed in ``COLUMN_INFO``.
        """
        mapping = dict(self.COLUMN_INFO)
        return [mapping[name] for name in column_names]

    def search(
        self,
        title_search_query: str,
        abstract_search_query: str,
        max_num_to_retrieve: int,
        filter_names: list[str],
        presentation_type: str,
        columns_names: list[str],
    ) -> pd.DataFrame:
        """Filter the papers and return the prettified rows for the chosen columns.

        Args:
            title_search_query: Case-insensitive literal substring to match in
                titles; an empty string matches everything.
            abstract_search_query: Query passed to ``ABSTRACT_RETRIEVER``; when
                non-empty, results are restricted to, and ordered by, its hits.
            max_num_to_retrieve: ``k`` passed to the abstract retriever.
            filter_names: Any of ``"Paper page"``, ``"GitHub"``, ``"Space"``,
                ``"Model"``, ``"Dataset"``; each keeps only rows having that resource.
            presentation_type: Exact ``type`` value to keep, or ``"(ALL)"``.
            columns_names: Display columns to return, in order.

        Returns:
            A DataFrame of display rows restricted to ``columns_names``.
        """
        df = self.df_raw.copy()
        # As ragatouille uses str for document_id
        df["id"] = df["id"].astype(str)

        # Filter by title. The query is user-typed text, so match it literally:
        # as a regex, input such as "(" or "C++" would raise re.error.
        df = df[df["title"].str.contains(title_search_query, case=False, regex=False, na=False)]

        # Filter by presentation type
        if presentation_type != "(ALL)":
            df = df[df["type"] == presentation_type]

        if "Paper page" in filter_names:
            # get_df stores "" (not null) when there is no paper page, so a
            # null check alone would keep every row.
            df = df[df["paper_page"].fillna("").ne("")]
        for name in self._LIST_FILTERS:
            if name in filter_names:
                df = df[df[name].apply(len) > 0]

        # Filter by abstract
        if abstract_search_query:
            results = ABSTRACT_RETRIEVER.search(abstract_search_query, k=max_num_to_retrieve)
            remaining_ids = set(df["id"])
            # dict.fromkeys de-duplicates while keeping the retriever's ranking order.
            found_ids = list(
                dict.fromkeys(x["document_id"] for x in results if x["document_id"] in remaining_ids)
            )
            df = df[df["id"].isin(found_ids)].set_index("id").reindex(index=found_ids).reset_index()

        df_prettified = self.prettify(df)
        return df_prettified.loc[:, columns_names]

requirements.txt CHANGED Viewed

	@@ -0,0 +1,6 @@

+datasets==2.18.0
+#gradio==4.24.0
+huggingface_hub==0.22.2
+pandas==2.2.1
+ragatouille==0.0.8.post2
+tqdm==4.66.2

style.css CHANGED Viewed

@@ -3,9 +3,19 @@ h1 {
   display: block;
 }
-#duplicate-button {
-  margin: auto;
-  color: #fff;
-  background: #1565c0;
-  border-radius: 100vh;
 }

   display: block;
 }
+body a,
+.contain a,
+#table a {
+  background-color: transparent;
+  color: #58a6ff;
+  text-decoration: none;
+}
+body a:active,
+body a:hover {
+  outline-width: 0;
+}
+body a:hover {
+  text-decoration: underline;
 }