hysts (HF Staff) committed
Commit fb9f067 · Parent(s): 93a0776
Files changed (11)
  1. .pre-commit-config.yaml +33 -0
  2. .python-version +1 -0
  3. README.md +5 -5
  4. app.py +204 -0
  5. app_mcp.py +127 -0
  6. pyproject.toml +58 -0
  7. requirements.txt +364 -0
  8. search.py +30 -0
  9. style.css +19 -0
  10. table.py +152 -0
  11. uv.lock +0 -0
.pre-commit-config.yaml ADDED
@@ -0,0 +1,33 @@
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v6.0.0
+     hooks:
+       - id: check-executables-have-shebangs
+       - id: check-json
+       - id: check-merge-conflict
+       - id: check-shebang-scripts-are-executable
+       - id: check-toml
+       - id: check-yaml
+       - id: end-of-file-fixer
+       - id: mixed-line-ending
+         args: ["--fix=lf"]
+       - id: requirements-txt-fixer
+       - id: trailing-whitespace
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     rev: v0.13.1
+     hooks:
+       - id: ruff-check
+         args: ["--fix"]
+       - id: ruff-format
+   - repo: https://github.com/pre-commit/mirrors-mypy
+     rev: v1.18.2
+     hooks:
+       - id: mypy
+         args: ["--ignore-missing-imports"]
+         additional_dependencies:
+           [
+             "types-python-slugify",
+             "types-pytz",
+             "types-PyYAML",
+             "types-requests",
+           ]
.python-version ADDED
@@ -0,0 +1 @@
+ 3.10
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
- title: ICCV2025
- emoji: 😻
- colorFrom: purple
- colorTo: yellow
+ title: ICCV 2025
+ emoji:
+ colorFrom: red
+ colorTo: purple
  sdk: gradio
- sdk_version: 5.47.0
+ sdk_version: 5.47.1
  app_file: app.py
  pinned: false
  ---
app.py ADDED
@@ -0,0 +1,204 @@
+ #!/usr/bin/env python
+
+ import gradio as gr
+ import polars as pl
+ from gradio_modal import Modal
+
+ from app_mcp import demo as demo_mcp
+ from search import search
+ from table import df_orig
+
+ DESCRIPTION = "# ICCV 2025"
+
+ df_main = df_orig.select(
+     "title",
+     "authors_str",
+     "paper_page_md",
+     "upvotes",
+     "num_comments",
+     "project_page_md",
+     "github_md",
+     "Spaces",
+     "Models",
+     "Datasets",
+     "claimed",
+     "abstract",
+     "paper_id",
+ )
+
+ # TODO: remove this once https://github.com/gradio-app/gradio/issues/10916 https://github.com/gradio-app/gradio/issues/11001 https://github.com/gradio-app/gradio/issues/11002 are fixed  # noqa: TD002, FIX002
+ df_main = df_main.with_columns(
+     [
+         pl.when(pl.col(col) == "").then(None).otherwise(pl.col(col)).cast(pl.Int64).fill_null(0).alias(col)
+         for col in ["upvotes", "num_comments"]
+     ]
+ )
+
+ df_main = df_main.rename(
+     {
+         "title": "Title",
+         "authors_str": "Authors",
+         "paper_page_md": "Paper page",
+         "upvotes": "👍",
+         "num_comments": "💬",
+         "project_page_md": "Project page",
+         "github_md": "GitHub",
+     }
+ )
+
+ COLUMN_INFO = {
+     "Title": ("str", "40%"),
+     "Authors": ("str", "20%"),
+     "Paper page": ("markdown", "135px"),
+     "👍": ("number", "50px"),
+     "💬": ("number", "50px"),
+     "Project page": ("markdown", None),
+     "GitHub": ("markdown", None),
+     "Spaces": ("markdown", None),
+     "Models": ("markdown", None),
+     "Datasets": ("markdown", None),
+     "claimed": ("markdown", None),
+ }
+
+
+ DEFAULT_COLUMNS = [
+     "Title",
+     "Paper page",
+     "👍",
+     "💬",
+     "Project page",
+     "GitHub",
+     "Spaces",
+     "Models",
+     "Datasets",
+ ]
+
+
+ def update_num_papers(df: pl.DataFrame) -> str:
+     if "claimed" in df.columns:
+         return f"{len(df)} / {len(df_main)} ({df.select(pl.col('claimed').str.contains('✅').sum()).item()} claimed)"
+     return f"{len(df)} / {len(df_main)}"
+
+
+ def update_df(
+     search_query: str,
+     candidate_pool_size: int,
+     num_results: int,
+     column_names: list[str],
+ ) -> gr.Dataframe:
+     if num_results > candidate_pool_size:
+         raise gr.Error("Number of results must be less than or equal to candidate pool size", print_exception=False)
+
+     df = df_main.clone()
+     column_names = ["Title", *column_names]
+
+     if search_query:
+         results = search(search_query, candidate_pool_size, num_results)
+         if not results:
+             df = df.head(0)
+         else:
+             df = pl.DataFrame(results).join(df, on="paper_id", how="inner")
+             df = df.sort("ce_score", descending=True).drop("ce_score")
+
+     sorted_column_names = [col for col in COLUMN_INFO if col in column_names]
+     df = df.select(sorted_column_names)
+     return gr.Dataframe(
+         value=df,
+         datatype=[COLUMN_INFO[col][0] for col in sorted_column_names],
+         column_widths=[COLUMN_INFO[col][1] for col in sorted_column_names],
+     )
+
+
+ def df_row_selected(
+     evt: gr.SelectData,
+ ) -> tuple[
+     Modal,
+     gr.Textbox,  # title
+     gr.Textbox,  # abstract
+ ]:
+     if evt.index[1] != 0:
+         return Modal(), gr.Textbox(), gr.Textbox()
+
+     title = evt.row_value[0]
+     row = df_main.filter(pl.col("Title") == title)
+     return (
+         Modal(visible=True),
+         gr.Textbox(value=row["Title"].item()),  # title
+         gr.Textbox(value=row["abstract"].item()),  # abstract
+     )
+
+
+ with gr.Blocks(css_paths="style.css") as demo:
+     gr.Markdown(DESCRIPTION)
+     search_query = gr.Textbox(label="Search", submit_btn=True, show_label=False, placeholder="Search...")
+     with gr.Accordion(label="Advanced Search Options", open=False) as advanced_search_options:
+         with gr.Row():
+             candidate_pool_size = gr.Slider(label="Candidate Pool Size", minimum=1, maximum=600, step=1, value=200)
+             num_results = gr.Slider(label="Number of Results", minimum=1, maximum=400, step=1, value=100)
+
+     column_names = gr.CheckboxGroup(
+         label="Columns",
+         choices=[col for col in COLUMN_INFO if col != "Title"],
+         value=[col for col in DEFAULT_COLUMNS if col != "Title"],
+     )
+
+     num_papers = gr.Textbox(label="Number of papers", value=update_num_papers(df_orig), interactive=False)
+
+     df = gr.Dataframe(
+         value=df_main,
+         datatype=[col_info[0] for col_info in COLUMN_INFO.values()],
+         type="polars",
+         row_count=(0, "dynamic"),
+         show_row_numbers=True,
+         interactive=False,
+         max_height=1000,
+         elem_id="table",
+         column_widths=[COLUMN_INFO[col][1] for col in COLUMN_INFO],
+     )
+     with Modal(visible=False, elem_id="abstract-modal") as abstract_modal:
+         title = gr.Textbox(label="Title")
+         abstract = gr.Textbox(label="Abstract")
+
+     df.select(fn=df_row_selected, outputs=[abstract_modal, title, abstract])
+
+     inputs = [
+         search_query,
+         candidate_pool_size,
+         num_results,
+         column_names,
+     ]
+     gr.on(
+         triggers=[
+             search_query.submit,
+             column_names.input,
+         ],
+         fn=update_df,
+         inputs=inputs,
+         outputs=df,
+         api_name=False,
+     ).then(
+         fn=update_num_papers,
+         inputs=df,
+         outputs=num_papers,
+         queue=False,
+         api_name=False,
+     )
+     demo.load(
+         fn=update_df,
+         inputs=inputs,
+         outputs=df,
+         api_name=False,
+     ).then(
+         fn=update_num_papers,
+         inputs=df,
+         outputs=num_papers,
+         queue=False,
+         api_name=False,
+     )
+
+     with gr.Row(visible=False):
+         demo_mcp.render()
+
+
+ if __name__ == "__main__":
+     demo.launch(mcp_server=True)
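Note on the `upvotes`/`num_comments` handling above: table.py stores those counts as strings (`""` for missing) to work around the linked Gradio dataframe issues, and app.py casts them back to integers so the table sorts numerically. A minimal, self-contained sketch of that round-trip, with made-up data:

```python
import polars as pl

# Counts arrive as strings ("" for missing); convert back to Int64 for sorting.
df = pl.DataFrame({"upvotes": ["12", "", "3"], "num_comments": ["", "5", ""]})
df = df.with_columns(
    pl.when(pl.col(col) == "").then(None).otherwise(pl.col(col)).cast(pl.Int64).fill_null(0).alias(col)
    for col in ["upvotes", "num_comments"]
)
print(df)  # upvotes: 12, 0, 3; num_comments: 0, 5, 0
```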
app_mcp.py ADDED
@@ -0,0 +1,127 @@
+ import gradio as gr
+ import polars as pl
+
+ from search import search
+ from table import df_orig
+
+ COLUMNS_MCP = [
+     "title",
+     "authors",
+     "abstract",
+     "arxiv_id",
+     "paper_page",
+     "space_ids",
+     "model_ids",
+     "dataset_ids",
+     "upvotes",
+     "num_comments",
+     "project_page",
+     "github",
+     "row_index",
+ ]
+ DEFAULT_COLUMNS_MCP = [
+     "title",
+     "authors",
+     "abstract",
+     "arxiv_id",
+     "project_page",
+     "github",
+     "row_index",
+ ]
+
+ df_mcp = df_orig.rename({"paper_id": "row_index"}).select(COLUMNS_MCP)
+
+
+ def search_papers(
+     search_query: str,
+     candidate_pool_size: int,
+     num_results: int,
+     columns: list[str],
+ ) -> list[dict]:
+     """Searches ICCV 2025 papers relevant to a user query in English.
+
+     This function performs a semantic search over ICCV 2025 papers.
+     It uses a dual-stage retrieval process:
+     - First, it retrieves `candidate_pool_size` papers using dense vector similarity.
+     - Then, it re-ranks them with a cross-encoder model to select the top `num_results` most relevant papers.
+     - The search results are returned as a list of dictionaries.
+
+     Note:
+         The search query must be written in English. Queries in other languages are not supported.
+
+     Args:
+         search_query (str): The natural language query input by the user. Must be in English.
+         candidate_pool_size (int): Number of candidate papers to retrieve using the dense vector model.
+         num_results (int): Final number of top-ranked papers to return after re-ranking.
+         columns (list[str]): The columns to select from the DataFrame.
+
+     Returns:
+         list[dict]: A list of dictionaries of the top-ranked papers matching the query, sorted by relevance.
+     """
+     if not search_query:
+         raise ValueError("Search query cannot be empty")
+     if num_results > candidate_pool_size:
+         raise ValueError("Number of results must be less than or equal to candidate pool size")
+
+     df = df_mcp.clone()
+     results = search(search_query, candidate_pool_size, num_results)
+     df = pl.DataFrame(results).rename({"paper_id": "row_index"}).join(df, on="row_index", how="inner")
+     df = df.sort("ce_score", descending=True)
+     return df.select(columns).to_dicts()
+
+
+ def get_metadata(row_index: int) -> dict:
+     """Returns a dictionary of metadata for an ICCV 2025 paper at the given table row index.
+
+     Args:
+         row_index (int): The index of the paper in the internal paper list table.
+
+     Returns:
+         dict: A dictionary containing metadata for the corresponding paper.
+     """
+     return df_mcp.filter(pl.col("row_index") == row_index).to_dicts()[0]
+
+
+ def get_table(columns: list[str]) -> list[dict]:
+     """Returns a list of dictionaries of all ICCV 2025 papers.
+
+     Args:
+         columns (list[str]): The columns to select from the DataFrame.
+
+     Returns:
+         list[dict]: A list of dictionaries of all ICCV 2025 papers.
+     """
+     return df_mcp.select(columns).to_dicts()
+
+
+ with gr.Blocks() as demo:
+     search_query = gr.Textbox(label="Search", submit_btn=True)
+     candidate_pool_size = gr.Slider(label="Candidate Pool Size", minimum=1, maximum=500, step=1, value=200)
+     num_results = gr.Slider(label="Number of Results", minimum=1, maximum=400, step=1, value=100)
+     column_names = gr.CheckboxGroup(label="Columns", choices=COLUMNS_MCP, value=DEFAULT_COLUMNS_MCP)
+     row_index = gr.Slider(label="Row Index", minimum=0, maximum=len(df_mcp) - 1, step=1, value=0)
+
+     out = gr.JSON()
+
+     search_papers_btn = gr.Button("Search Papers")
+     get_metadata_btn = gr.Button("Get Metadata")
+     get_table_btn = gr.Button("Get Table")
+
+     search_papers_btn.click(
+         fn=search_papers,
+         inputs=[search_query, candidate_pool_size, num_results, column_names],
+         outputs=out,
+     )
+     get_metadata_btn.click(
+         fn=get_metadata,
+         inputs=row_index,
+         outputs=out,
+     )
+     get_table_btn.click(
+         fn=get_table,
+         inputs=column_names,
+         outputs=out,
+     )
+
+ if __name__ == "__main__":
+     demo.launch(mcp_server=True)
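When the app is launched with `mcp_server=True`, the three functions above are exposed as MCP tools in addition to the hidden Gradio UI. A hedged sketch of exercising them with a direct Python call (the query string is a made-up example):

```python
# Direct-call sketch; an MCP client would invoke these as tools instead.
from app_mcp import DEFAULT_COLUMNS_MCP, get_metadata, search_papers

hits = search_papers(
    "open-vocabulary 3d scene understanding",  # hypothetical query
    candidate_pool_size=100,
    num_results=5,
    columns=DEFAULT_COLUMNS_MCP,
)
print(hits[0]["title"])

# "row_index" in each result keys into get_metadata for the full record.
print(get_metadata(hits[0]["row_index"]))
```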
pyproject.toml ADDED
@@ -0,0 +1,58 @@
+ [project]
+ name = "iccv2025"
+ version = "0.1.0"
+ description = ""
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "datasets>=4.1.1",
+     "faiss-cpu>=1.12.0",
+     "gradio[mcp]>=5.47.1",
+     "gradio-modal>=0.0.4",
+     "hf-transfer>=0.1.9",
+     "loguru>=0.7.3",
+     "polars>=1.33.1",
+     "sentence-transformers>=5.1.1",
+     "spaces>=0.42.1",
+     "torch==2.8.0",
+ ]
+
+ [tool.ruff]
+ line-length = 119
+
+ [tool.ruff.lint]
+ select = ["ALL"]
+ ignore = [
+     "COM812",  # missing-trailing-comma
+     "D203",  # one-blank-line-before-class
+     "D213",  # multi-line-summary-second-line
+     "E501",  # line-too-long
+     "SIM117",  # multiple-with-statements
+     #
+     "D100",  # undocumented-public-module
+     "D101",  # undocumented-public-class
+     "D102",  # undocumented-public-method
+     "D103",  # undocumented-public-function
+     "D104",  # undocumented-public-package
+     "D105",  # undocumented-magic-method
+     "D107",  # undocumented-public-init
+     "EM101",  # raw-string-in-exception
+     "FBT001",  # boolean-type-hint-positional-argument
+     "FBT002",  # boolean-default-value-positional-argument
+     "PGH003",  # blanket-type-ignore
+     "PLR0913",  # too-many-arguments
+     "PLR0915",  # too-many-statements
+     "TRY003",  # raise-vanilla-args
+ ]
+ unfixable = [
+     "F401",  # unused-import
+ ]
+
+ [tool.ruff.lint.pydocstyle]
+ convention = "google"
+
+ [tool.ruff.lint.per-file-ignores]
+ "*.ipynb" = ["T201", "T203"]
+
+ [tool.ruff.format]
+ docstring-code-format = true
requirements.txt ADDED
@@ -0,0 +1,364 @@
+ # This file was autogenerated by uv via the following command:
+ #    uv pip compile pyproject.toml -o requirements.txt
+ aiofiles==24.1.0
+     # via gradio
+ aiohappyeyeballs==2.6.1
+     # via aiohttp
+ aiohttp==3.12.15
+     # via fsspec
+ aiosignal==1.4.0
+     # via aiohttp
+ annotated-types==0.7.0
+     # via pydantic
+ anyio==4.11.0
+     # via
+     #   gradio
+     #   httpx
+     #   mcp
+     #   sse-starlette
+     #   starlette
+ async-timeout==5.0.1
+     # via aiohttp
+ attrs==25.3.0
+     # via
+     #   aiohttp
+     #   jsonschema
+     #   referencing
+ brotli==1.1.0
+     # via gradio
+ certifi==2025.8.3
+     # via
+     #   httpcore
+     #   httpx
+     #   requests
+ charset-normalizer==3.4.3
+     # via requests
+ click==8.3.0
+     # via
+     #   typer
+     #   uvicorn
+ datasets==4.1.1
+     # via iccv2025 (pyproject.toml)
+ dill==0.4.0
+     # via
+     #   datasets
+     #   multiprocess
+ exceptiongroup==1.3.0
+     # via anyio
+ faiss-cpu==1.12.0
+     # via iccv2025 (pyproject.toml)
+ fastapi==0.117.1
+     # via gradio
+ ffmpy==0.6.1
+     # via gradio
+ filelock==3.19.1
+     # via
+     #   datasets
+     #   huggingface-hub
+     #   torch
+     #   transformers
+ frozenlist==1.7.0
+     # via
+     #   aiohttp
+     #   aiosignal
+ fsspec==2025.9.0
+     # via
+     #   datasets
+     #   gradio-client
+     #   huggingface-hub
+     #   torch
+ gradio==5.47.1
+     # via
+     #   iccv2025 (pyproject.toml)
+     #   gradio-modal
+     #   spaces
+ gradio-client==1.13.2
+     # via gradio
+ gradio-modal==0.0.4
+     # via iccv2025 (pyproject.toml)
+ groovy==0.1.2
+     # via gradio
+ h11==0.16.0
+     # via
+     #   httpcore
+     #   uvicorn
+ hf-transfer==0.1.9
+     # via iccv2025 (pyproject.toml)
+ hf-xet==1.1.10
+     # via huggingface-hub
+ httpcore==1.0.9
+     # via httpx
+ httpx==0.28.1
+     # via
+     #   gradio
+     #   gradio-client
+     #   mcp
+     #   safehttpx
+     #   spaces
+ httpx-sse==0.4.1
+     # via mcp
+ huggingface-hub==0.35.1
+     # via
+     #   datasets
+     #   gradio
+     #   gradio-client
+     #   sentence-transformers
+     #   tokenizers
+     #   transformers
+ idna==3.10
+     # via
+     #   anyio
+     #   httpx
+     #   requests
+     #   yarl
+ jinja2==3.1.6
+     # via
+     #   gradio
+     #   torch
+ joblib==1.5.2
+     # via scikit-learn
+ jsonschema==4.25.1
+     # via mcp
+ jsonschema-specifications==2025.9.1
+     # via jsonschema
+ loguru==0.7.3
+     # via iccv2025 (pyproject.toml)
+ markdown-it-py==4.0.0
+     # via rich
+ markupsafe==3.0.2
+     # via
+     #   gradio
+     #   jinja2
+ mcp==1.10.1
+     # via gradio
+ mdurl==0.1.2
+     # via markdown-it-py
+ mpmath==1.3.0
+     # via sympy
+ multidict==6.6.4
+     # via
+     #   aiohttp
+     #   yarl
+ multiprocess==0.70.16
+     # via datasets
+ networkx==3.4.2
+     # via torch
+ numpy==2.2.6
+     # via
+     #   datasets
+     #   faiss-cpu
+     #   gradio
+     #   pandas
+     #   scikit-learn
+     #   scipy
+     #   transformers
+ nvidia-cublas-cu12==12.8.4.1
+     # via
+     #   nvidia-cudnn-cu12
+     #   nvidia-cusolver-cu12
+     #   torch
+ nvidia-cuda-cupti-cu12==12.8.90
+     # via torch
+ nvidia-cuda-nvrtc-cu12==12.8.93
+     # via torch
+ nvidia-cuda-runtime-cu12==12.8.90
+     # via torch
+ nvidia-cudnn-cu12==9.10.2.21
+     # via torch
+ nvidia-cufft-cu12==11.3.3.83
+     # via torch
+ nvidia-cufile-cu12==1.13.1.3
+     # via torch
+ nvidia-curand-cu12==10.3.9.90
+     # via torch
+ nvidia-cusolver-cu12==11.7.3.90
+     # via torch
+ nvidia-cusparse-cu12==12.5.8.93
+     # via
+     #   nvidia-cusolver-cu12
+     #   torch
+ nvidia-cusparselt-cu12==0.7.1
+     # via torch
+ nvidia-nccl-cu12==2.27.3
+     # via torch
+ nvidia-nvjitlink-cu12==12.8.93
+     # via
+     #   nvidia-cufft-cu12
+     #   nvidia-cusolver-cu12
+     #   nvidia-cusparse-cu12
+     #   torch
+ nvidia-nvtx-cu12==12.8.90
+     # via torch
+ orjson==3.11.3
+     # via gradio
+ packaging==25.0
+     # via
+     #   datasets
+     #   faiss-cpu
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   spaces
+     #   transformers
+ pandas==2.3.2
+     # via
+     #   datasets
+     #   gradio
+ pillow==11.3.0
+     # via
+     #   gradio
+     #   sentence-transformers
+ polars==1.33.1
+     # via iccv2025 (pyproject.toml)
+ propcache==0.3.2
+     # via
+     #   aiohttp
+     #   yarl
+ psutil==5.9.8
+     # via spaces
+ pyarrow==21.0.0
+     # via datasets
+ pydantic==2.11.9
+     # via
+     #   fastapi
+     #   gradio
+     #   mcp
+     #   pydantic-settings
+     #   spaces
+ pydantic-core==2.33.2
+     # via pydantic
+ pydantic-settings==2.11.0
+     # via mcp
+ pydub==0.25.1
+     # via gradio
+ pygments==2.19.2
+     # via rich
+ python-dateutil==2.9.0.post0
+     # via pandas
+ python-dotenv==1.1.1
+     # via pydantic-settings
+ python-multipart==0.0.20
+     # via
+     #   gradio
+     #   mcp
+ pytz==2025.2
+     # via pandas
+ pyyaml==6.0.3
+     # via
+     #   datasets
+     #   gradio
+     #   huggingface-hub
+     #   transformers
+ referencing==0.36.2
+     # via
+     #   jsonschema
+     #   jsonschema-specifications
+ regex==2025.9.18
+     # via transformers
+ requests==2.32.5
+     # via
+     #   datasets
+     #   huggingface-hub
+     #   spaces
+     #   transformers
+ rich==14.1.0
+     # via typer
+ rpds-py==0.27.1
+     # via
+     #   jsonschema
+     #   referencing
+ ruff==0.13.2
+     # via gradio
+ safehttpx==0.1.6
+     # via gradio
+ safetensors==0.6.2
+     # via transformers
+ scikit-learn==1.7.2
+     # via sentence-transformers
+ scipy==1.15.3
+     # via
+     #   scikit-learn
+     #   sentence-transformers
+ semantic-version==2.10.0
+     # via gradio
+ sentence-transformers==5.1.1
+     # via iccv2025 (pyproject.toml)
+ setuptools==80.9.0
+     # via triton
+ shellingham==1.5.4
+     # via typer
+ six==1.17.0
+     # via python-dateutil
+ sniffio==1.3.1
+     # via anyio
+ spaces==0.42.1
+     # via iccv2025 (pyproject.toml)
+ sse-starlette==3.0.2
+     # via mcp
+ starlette==0.48.0
+     # via
+     #   fastapi
+     #   gradio
+     #   mcp
+ sympy==1.14.0
+     # via torch
+ threadpoolctl==3.6.0
+     # via scikit-learn
+ tokenizers==0.22.1
+     # via transformers
+ tomlkit==0.13.3
+     # via gradio
+ torch==2.8.0
+     # via
+     #   iccv2025 (pyproject.toml)
+     #   sentence-transformers
+ tqdm==4.67.1
+     # via
+     #   datasets
+     #   huggingface-hub
+     #   sentence-transformers
+     #   transformers
+ transformers==4.56.2
+     # via sentence-transformers
+ triton==3.4.0
+     # via torch
+ typer==0.19.2
+     # via gradio
+ typing-extensions==4.15.0
+     # via
+     #   aiosignal
+     #   anyio
+     #   exceptiongroup
+     #   fastapi
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   multidict
+     #   pydantic
+     #   pydantic-core
+     #   referencing
+     #   sentence-transformers
+     #   spaces
+     #   starlette
+     #   torch
+     #   typer
+     #   typing-inspection
+     #   uvicorn
+ typing-inspection==0.4.1
+     # via
+     #   pydantic
+     #   pydantic-settings
+ tzdata==2025.2
+     # via pandas
+ urllib3==2.5.0
+     # via requests
+ uvicorn==0.37.0
+     # via
+     #   gradio
+     #   mcp
+ websockets==15.0.1
+     # via gradio-client
+ xxhash==3.5.0
+     # via datasets
+ yarl==1.20.1
+     # via aiohttp
search.py ADDED
@@ -0,0 +1,30 @@
+ import datasets
+ import numpy as np
+ import spaces
+ from sentence_transformers import CrossEncoder, SentenceTransformer
+
+ from table import BASE_REPO_ID
+
+ ds = datasets.load_dataset(BASE_REPO_ID, split="train")
+ ds.add_faiss_index(column="embedding")
+
+ bi_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
+ ce_model = CrossEncoder("BAAI/bge-reranker-base")
+
+
+ @spaces.GPU(duration=10)
+ def search(query: str, candidate_pool_size: int = 100, retrieval_k: int = 50) -> list[dict]:
+     prefix = "Represent this sentence for searching relevant passages: "
+     q_vec = bi_model.encode(prefix + query, normalize_embeddings=True)
+
+     _, retrieved_ds = ds.get_nearest_examples("embedding", q_vec, k=candidate_pool_size)
+
+     ce_inputs = [
+         (query, f"{retrieved_ds['title'][i]} {retrieved_ds['abstract'][i]}") for i in range(len(retrieved_ds["title"]))
+     ]
+     ce_scores = ce_model.predict(ce_inputs, batch_size=16)
+
+     sorted_idx = np.argsort(ce_scores)[::-1]
+     return [
+         {"paper_id": retrieved_ds["paper_id"][i], "ce_score": float(ce_scores[i])} for i in sorted_idx[:retrieval_k]
+     ]
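search() implements the two-stage retrieval (FAISS dense retrieval with a BGE bi-encoder, then BGE cross-encoder re-ranking) but returns only IDs and scores; app.py and app_mcp.py join those back onto their own frames. A minimal sketch of consuming it, with a made-up query:

```python
# Sketch only; requires the dataset and models loaded at module import above.
from search import search

# 100 dense-similarity candidates, top 5 kept after cross-encoder re-ranking.
results = search("neural radiance fields for driving scenes", candidate_pool_size=100, retrieval_k=5)
for r in results:
    print(r["paper_id"], round(r["ce_score"], 3))  # only these two keys are returned
```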
style.css ADDED
@@ -0,0 +1,19 @@
+ h1 {
+   text-align: center;
+   display: block;
+ }
+
+ #abstract-modal .modal-block {
+   position: fixed !important;
+   top: 50% !important;
+   left: 50% !important;
+   transform: translate(-50%, -50%) !important;
+   width: 80vw !important;
+   max-width: 900px !important;
+   margin: 0 !important;
+ }
+
+ #abstract-modal .modal-block,
+ #abstract-modal .modal-block * {
+   font-size: 1.0rem !important;
+ }
table.py ADDED
@@ -0,0 +1,152 @@
+ import datasets
+ import polars as pl
+ from loguru import logger
+ from polars import datatypes as pdt
+
+ BASE_REPO_ID = "ai-conferences/ICCV2025"
+ PATCH_REPO_ID = "ai-conferences/ICCV2025-patches"
+ PAPER_PAGE_REPO_ID = "hysts-bot-data/paper-pages-slim"
+
+
+ def get_patch_latest_values(
+     df: pl.DataFrame, all_columns: list[str], id_col: str, timestamp_col: str = "timestamp", delimiter: str = ","
+ ) -> pl.DataFrame:
+     df = df.sort(timestamp_col)
+
+     list_cols = [
+         col for col, dtype in df.schema.items() if col not in (id_col, timestamp_col) and dtype.base_type() is pdt.List
+     ]
+     df = df.with_columns(
+         [
+             pl.when(pl.col(c).is_not_null()).then(pl.col(c).list.join(delimiter)).otherwise(None).alias(c)
+             for c in list_cols
+         ]
+     )
+
+     update_columns = [col for col in df.columns if col not in (id_col, timestamp_col)]
+     melted = df.unpivot(on=update_columns, index=[timestamp_col, id_col]).drop_nulls()
+
+     latest_rows = (
+         melted.sort(timestamp_col)
+         .group_by([id_col, "variable"])
+         .agg(pl.col("value").last())
+         .pivot("variable", index=id_col, values="value")
+     )
+
+     latest_rows = latest_rows.with_columns(
+         [
+             pl.when(pl.col(c).is_not_null()).then(pl.col(c).str.split(delimiter)).otherwise(None).alias(c)
+             for c in list_cols
+         ]
+     )
+
+     missing_cols = [c for c in all_columns if c not in latest_rows.columns and c != id_col]
+     if missing_cols:
+         latest_rows = latest_rows.with_columns([pl.lit(None).alias(c) for c in missing_cols])
+
+     return latest_rows.select([id_col] + [col for col in all_columns if col != id_col])
+
+
+ def format_author_claim_ratio(row: dict) -> str:
+     n_linked_authors = row["n_linked_authors"]
+     n_authors = row["n_authors"]
+
+     if n_linked_authors is None or n_authors is None:
+         return ""
+
+     author_linked = "✅" if n_linked_authors > 0 else ""
+     return f"{n_linked_authors}/{n_authors} {author_linked}".strip()
+
+
+ df_orig = (
+     datasets.load_dataset(BASE_REPO_ID, split="train")
+     .to_polars()
+     .with_columns(
+         pl.lit([], dtype=pl.List(pl.Utf8)).alias(col_name) for col_name in ["space_ids", "model_ids", "dataset_ids"]
+     )
+ )
+ df_paper_page = (
+     datasets.load_dataset(PAPER_PAGE_REPO_ID, split="train")
+     .to_polars()
+     .drop(["summary", "author_names", "ai_keywords"])
+ )
+ df_orig = (
+     df_orig.join(df_paper_page, on="arxiv_id", how="left", suffix="_2")
+     .with_columns(
+         [
+             pl.when(pl.col("github_2").is_not_null())
+             .then(pl.col("github_2"))
+             .otherwise(pl.col("github"))
+             .alias("github")
+         ]
+     )
+     .drop(["github_2"])
+ )
+
+ try:
+     df_patches = (
+         datasets.load_dataset(PATCH_REPO_ID, split="train")
+         .to_polars()
+         .drop("diff")
+         .with_columns(pl.col("timestamp").str.strptime(pl.Datetime, "%+"))
+     )
+     df_patches = get_patch_latest_values(df_patches, df_orig.columns, id_col="paper_id", timestamp_col="timestamp")
+     df_orig = (
+         df_orig.join(df_patches, on="paper_id", how="left")
+         .with_columns(
+             [
+                 pl.coalesce([pl.col(col + "_right"), pl.col(col)]).alias(col)
+                 for col in df_orig.columns
+                 if col != "paper_id"
+             ]
+         )
+         .select(df_orig.columns)
+     )
+ except Exception as e:  # noqa: BLE001
+     logger.warning(e)
+
+ # format authors
+ df_orig = df_orig.with_columns(pl.col("authors").list.join(", ").alias("authors_str"))
+ # format links
+ df_orig = df_orig.with_columns(
+     [pl.format("[link]({})", pl.col(col)).fill_null("").alias(f"{col}_md") for col in ["project_page", "github"]]
+ )
+ # format paper page link
+ df_orig = df_orig.with_columns(
+     (pl.lit("https://huggingface.co/papers/") + pl.col("arxiv_id")).alias("paper_page")
+ ).with_columns(pl.format("[{}]({})", pl.col("arxiv_id"), pl.col("paper_page")).fill_null("").alias("paper_page_md"))
+
+ # count authors
+ df_orig = df_orig.with_columns(pl.col("authors").list.len().alias("n_authors"))
+ df_orig = df_orig.with_columns(
+     pl.col("author_usernames")
+     .map_elements(lambda lst: sum(x is not None for x in lst) if lst is not None else None, return_dtype=pl.Int64)
+     .alias("n_linked_authors")
+ )
+ df_orig = df_orig.with_columns(
+     pl.struct(["n_linked_authors", "n_authors"])
+     .map_elements(format_author_claim_ratio, return_dtype=pl.Utf8)
+     .alias("claimed")
+ )
+
+ # TODO: Fix this once https://github.com/gradio-app/gradio/issues/10916 is fixed  # noqa: FIX002, TD002
+ # format numbers as strings
+ df_orig = df_orig.with_columns(
+     [pl.col(col).cast(pl.Utf8).fill_null("").alias(col) for col in ["upvotes", "num_comments"]]
+ )
+
+ # format spaces, models, datasets
+ for repo_id_col, markdown_col, base_url in [
+     ("space_ids", "Spaces", "https://huggingface.co/spaces/"),
+     ("model_ids", "Models", "https://huggingface.co/"),
+     ("dataset_ids", "Datasets", "https://huggingface.co/datasets/"),
+ ]:
+     df_orig = df_orig.with_columns(
+         pl.col(repo_id_col)
+         .map_elements(
+             lambda lst: "\n".join([f"[link]({base_url}{x})" for x in lst]) if lst is not None else None,  # noqa: B023
+             return_dtype=pl.Utf8,
+         )
+         .fill_null("")
+         .alias(markdown_col)
+     )
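get_patch_latest_values does the heavy lifting for the patch overlay: it unpivots every patch row, keeps only the newest non-null value per (paper, column) pair, and pivots back. A toy illustration of that unpivot → last-per-key → pivot pattern on made-up data (it skips the list-column join/split round-trip the real function also performs):

```python
from datetime import datetime

import polars as pl

patches = pl.DataFrame(
    {
        "paper_id": [1, 1, 2],
        "timestamp": [datetime(2025, 1, 1), datetime(2025, 2, 1), datetime(2025, 1, 15)],
        "github": ["https://old.example", "https://new.example", None],
        "project_page": [None, None, "https://proj.example"],
    }
)
melted = patches.unpivot(on=["github", "project_page"], index=["timestamp", "paper_id"]).drop_nulls()
latest = (
    melted.sort("timestamp")
    .group_by(["paper_id", "variable"])
    .agg(pl.col("value").last())
    .pivot("variable", index="paper_id", values="value")
)
# paper 1 keeps only the *newer* github URL; paper 2 gains just a project_page.
print(latest)
```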
uv.lock ADDED
The diff for this file is too large to render. See raw diff