jbdel committed
Commit 4e925af (0 parents)

Initial commit
Files changed (6)
  1. app.py +329 -0
  2. constants.py +6 -0
  3. df/PaperCentral.py +443 -0
  4. requirements.txt +3 -0
  5. style.css +23 -0
  6. utils.py +16 -0
app.py ADDED
@@ -0,0 +1,329 @@
import gradio as gr
from df.PaperCentral import PaperCentral
from gradio_calendar import Calendar
from datetime import datetime, timedelta
from typing import Union, List

# Initialize the PaperCentral class instance
paper_central_df = PaperCentral()

# Create the Gradio Blocks app with custom CSS
with gr.Blocks(css="style.css") as demo:
    gr.Markdown("# Paper Central")

    # Create a row for navigation buttons and calendar
    with gr.Row():
        with gr.Column(scale=1):
            # Define the 'Next Day' and 'Previous Day' buttons
            next_day_btn = gr.Button("Next Day")
            prev_day_btn = gr.Button("Previous Day")
        with gr.Column(scale=4):
            # Define the calendar component for date selection
            calendar = Calendar(
                type="datetime",
                label="Select a date",
                info="Click the calendar icon to bring up the calendar.",
                value=datetime.today().strftime('%Y-%m-%d')  # Default to today's date
            )

    # Create a row for Category/Hugging Face options and Conference options
    with gr.Row():
        with gr.Column():
            # Define the checkbox group for arXiv category options
            cat_options = gr.CheckboxGroup(
                label="Category",
                choices=[
                    'cs.*',
                    'eess.*',
                    'econ.*',
                    'math.*',
                    'astro-ph.*',
                    'cond-mat.*',
                    'gr-qc',
                    'hep-ex',
                    'hep-lat',
                    'hep-ph',
                    'hep-th',
                    'math-ph',
                    'nlin.*',
                    'nucl-ex',
                    'nucl-th',
                    'physics.*',
                    'quant-ph',
                    'q-bio.*',
                    'q-fin.*',
                    'stat.*',
                ],
                value=["cs.*"]
            )
            hf_options = gr.CheckboxGroup(
                label="Hugging Face options",
                choices=["show_details", "datasets", "models", "spaces"]
            )

        with gr.Column():
            # Define the checkbox group for Conference options
            conference_options = gr.CheckboxGroup(
                label="Conference options",
                choices=["In proceedings"] + PaperCentral.CONFERENCES
            )

    # Define the Dataframe component to display paper data
    # List of columns in the DataFrame
    columns = paper_central_df.COLUMNS_START_PAPER_PAGE

    paper_central_component = gr.Dataframe(
        label="Paper Data",
        value=paper_central_df.df_prettified[columns],
        datatype=[
            paper_central_df.DATATYPES[column]
            for column in columns
        ],
        row_count=(0, "dynamic"),
        interactive=False,
        height=1000,
        elem_id="table",
    )


    # Define function to move to the next day
    def go_to_next_day(
            date: Union[str, datetime],
            cat_options_list: List[str],
            hf_options_list: List[str],
            conference_options_list: List[str]
    ) -> tuple:
        """
        Moves the selected date to the next day and updates the data.

        Args:
            date (Union[str, datetime]): The current date selected in the calendar.
            cat_options_list (List[str]): List of selected Category options.
            hf_options_list (List[str]): List of selected Hugging Face options.
            conference_options_list (List[str]): List of selected Conference options.

        Returns:
            tuple: The new date as a string and the updated Dataframe component.
        """
        # Ensure the date is in string format
        if isinstance(date, datetime):
            date_str = date.strftime('%Y-%m-%d')
        else:
            date_str = date

        # Parse the date string and add one day
        new_date = datetime.strptime(date_str, '%Y-%m-%d') + timedelta(days=1)
        new_date_str = new_date.strftime('%Y-%m-%d')

        # Update the Dataframe
        updated_data = paper_central_df.filter(
            selected_date=new_date_str,
            cat_options=cat_options_list,
            hf_options=hf_options_list,
            conference_options=conference_options_list
        )

        # Return the new date and updated Dataframe
        return new_date_str, updated_data


    # Define function to move to the previous day
    def go_to_previous_day(
            date: Union[str, datetime],
            cat_options_list: List[str],
            hf_options_list: List[str],
            conference_options_list: List[str]
    ) -> tuple:
        """
        Moves the selected date to the previous day and updates the data.

        Args:
            date (Union[str, datetime]): The current date selected in the calendar.
            cat_options_list (List[str]): List of selected Category options.
            hf_options_list (List[str]): List of selected Hugging Face options.
            conference_options_list (List[str]): List of selected Conference options.

        Returns:
            tuple: The new date as a string and the updated Dataframe component.
        """
        # Ensure the date is in string format
        if isinstance(date, datetime):
            date_str = date.strftime('%Y-%m-%d')
        else:
            date_str = date

        # Parse the date string and subtract one day
        new_date = datetime.strptime(date_str, '%Y-%m-%d') - timedelta(days=1)
        new_date_str = new_date.strftime('%Y-%m-%d')

        # Update the Dataframe
        updated_data = paper_central_df.filter(
            selected_date=new_date_str,
            cat_options=cat_options_list,
            hf_options=hf_options_list,
            conference_options=conference_options_list
        )

        # Return the new date and updated Dataframe
        return new_date_str, updated_data


    # Define function to update data when date or options change
    def update_data(
            date: Union[str, datetime],
            cat_options_list: List[str],
            hf_options_list: List[str],
            conference_options_list: List[str]
    ):
        """
        Updates the data displayed in the Dataframe based on the selected date and options.

        Args:
            date (Union[str, datetime]): The selected date.
            cat_options_list (List[str]): List of selected Category options.
            hf_options_list (List[str]): List of selected Hugging Face options.
            conference_options_list (List[str]): List of selected Conference options.

        Returns:
            gr.update: An update object for the Dataframe component.
        """
        return paper_central_df.filter(
            selected_date=date,
            cat_options=cat_options_list,
            hf_options=hf_options_list,
            conference_options=conference_options_list
        )


    # Function to handle conference options change
    def on_conference_options_change(
            date: Union[str, datetime],
            cat_options_list: List[str],
            hf_options_list: List[str],
            conference_options_list: List[str]
    ):
        cat_options_update = gr.update()
        paper_central_component_update = gr.update()
        visible = True

        # Some conference options are selected:
        # clear the category options and hide the date controls.
        if conference_options_list:
            cat_options_update = gr.update(value=[])
            paper_central_component_update = update_data(
                date,
                [],
                hf_options_list,
                conference_options_list,
            )
            visible = False

        calendar_update = gr.update(visible=visible)
        next_day_btn_update = gr.update(visible=visible)
        prev_day_btn_update = gr.update(visible=visible)

        return paper_central_component_update, cat_options_update, calendar_update, next_day_btn_update, prev_day_btn_update


    # Function to handle category options change
    def on_cat_options_change(
            date: Union[str, datetime],
            cat_options_list: List[str],
            hf_options_list: List[str],
            conference_options_list: List[str]
    ):
        conference_options_update = gr.update()
        paper_central_component_update = gr.update()
        visible = False

        # Some category options are selected:
        # clear the conference options and show the date controls.
        if cat_options_list:
            conference_options_update = gr.update(value=[])
            paper_central_component_update = update_data(
                date,
                cat_options_list,
                hf_options_list,
                [],
            )
            visible = True

        calendar_update = gr.update(visible=visible)
        next_day_btn_update = gr.update(visible=visible)
        prev_day_btn_update = gr.update(visible=visible)

        return paper_central_component_update, conference_options_update, calendar_update, next_day_btn_update, prev_day_btn_update


    # Set up the event listener for the 'Next Day' button
    next_day_btn.click(
        fn=go_to_next_day,
        inputs=[calendar, cat_options, hf_options, conference_options],
        outputs=[calendar, paper_central_component],
    )

    # Set up the event listener for the 'Previous Day' button
    prev_day_btn.click(
        fn=go_to_previous_day,
        inputs=[calendar, cat_options, hf_options, conference_options],
        outputs=[calendar, paper_central_component],
    )

    # Define the inputs for the filter function
    inputs = [
        calendar,
        cat_options,
        hf_options,
        conference_options,
    ]

    # Set up the event listener for the calendar date change
    calendar.change(
        fn=update_data,
        inputs=inputs,
        outputs=paper_central_component,
    )

    # Set up the event listener for the Hugging Face options change
    hf_options.change(
        fn=update_data,
        inputs=inputs,
        outputs=paper_central_component,
    )

    # Event chaining for conference options change
    conference_options.change(
        fn=on_conference_options_change,
        inputs=inputs,
        outputs=[paper_central_component, cat_options, calendar, next_day_btn, prev_day_btn],
    )

    # Event chaining for category options change
    cat_options.change(
        fn=on_cat_options_change,
        inputs=inputs,
        outputs=[paper_central_component, conference_options, calendar, next_day_btn, prev_day_btn],
    )

    # Load the initial data when the app starts
    demo.load(
        fn=update_data,
        inputs=inputs,
        outputs=paper_central_component,
        api_name=False,
    )


# Define the main function to launch the app
def main():
    """
    Launches the Gradio app.
    """
    demo.launch()


# Run the main function when the script is executed
if __name__ == "__main__":
    main()
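Note: for local experimentation, `demo.launch()` also accepts a few optional arguments; a minimal sketch is below (the values are illustrative and not part of this commit):

# Sketch only: launch locally with a couple of common, optional Gradio arguments.
if __name__ == "__main__":
    demo.launch(
        server_name="127.0.0.1",  # bind address; "0.0.0.0" exposes the app on the local network
        share=False,              # True requests a temporary public gradio.live URL
    )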
constants.py ADDED
@@ -0,0 +1,6 @@
NEURIPS_ICO = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAMAAABEpIrGAAAAeFBMVEVHcExoRIuHYZ5oRItoRItoRItoRItoRItSN2loRItoRIuDXpxoRItoRIuSbKWRa6SRa6QCAAsAAAAAAAAAAAAAAAAAAAAAAAAAAACRa6SRa6QAAAAAAAAAAACRa6SRa6SRa6QAAAAAAAAAAACRa6QAAACRa6SRa6TWTNGBAAAAKHRSTlMAQBVQ9mKR0AgzfiGvcypL4BBmP11VbIQvd5JKcSFaOv95io2qjsjs/SKIogAAAT9JREFUeAHMwbUBAzAMBMB/C2wLAvsvmyYMfe7wZ8gh+E3Np6lMfLV0h+bwHFLEJ90lEFGZvcUcH6ayIri5ucLF8Iq6AZl0aROAKcSzw8G3NFYJbohnx+3Ybbbxw+msBHJfqp2DHAdhGAqgX1khRSKY2GAwJtBhOve/4aTNoisO0GfJ9srfGDOLYGKaSfGxpBWNbVtmr20cDR9xL6iydpp9ooncoCKZjNRRHf0eOkBsPI3xOPVnlhlmjG0WVDEMJQLmnuHmlMndO8+v3dqFUK6AipjFTUXZXyWiag5g2VO5egA+8cZgwekmWSz/TmTAMqThwrD26PA+zLzZWOdJhEwAwrPfhxLCAZjpaIY8+eMUIzTLMy7laiECVYaKqrf85u/CuuOIuLOGNSUUpB53jvpFisOFe8d+rCHgXkQK+Fb/W30Y/tu4vyAAAAAASUVORK5CYII="
DATASET_ARXIV_SCAN_PAPERS = "IAMJB/scanned-arxiv-papers-id"
DATASET_CONFERENCE_PAPERS = "IAMJB/paper_conference_aggregate"
DATASET_DAILY_PAPERS = "hysts-bot-data/daily-papers"
DATASET_DAILY_PAPERS_STATS = "hysts-bot-data/daily-papers-stats"
DATASET_COMMUNITY_SCIENCE = "huggingface/community-science-paper-v2"
df/PaperCentral.py ADDED
@@ -0,0 +1,443 @@
import pandas as pd
from typing import List, Dict, Optional
from constants import (
    DATASET_ARXIV_SCAN_PAPERS,
    DATASET_CONFERENCE_PAPERS,
    DATASET_COMMUNITY_SCIENCE,
    NEURIPS_ICO,
)
import gradio as gr
from utils import load_and_process
import numpy as np


class PaperCentral:
    """
    A class to manage and process paper data for display in a Gradio Dataframe component.
    """

    CONFERENCES = [
        "ACL2023",
        "ACL2024",
        "COLING2024",
        "CVPR2023",
        "CVPR2024",
        "ECCV2024",
        "EMNLP2023",
        "NAACL2023",
        "NeurIPS2023",
        "NeurIPS2023 D&B",
    ]
    CONFERENCES_ICONS = {
        "ACL2023": 'https://aclanthology.org/aclicon.ico',
        "ACL2024": 'https://aclanthology.org/aclicon.ico',
        "COLING2024": 'https://aclanthology.org/aclicon.ico',
        "CVPR2023": "https://openaccess.thecvf.com/favicon.ico",
        "CVPR2024": "https://openaccess.thecvf.com/favicon.ico",
        "ECCV2024": "https://openaccess.thecvf.com/favicon.ico",
        "EMNLP2023": 'https://aclanthology.org/aclicon.ico',
        "NAACL2023": 'https://aclanthology.org/aclicon.ico',
        "NeurIPS2023": NEURIPS_ICO,
        "NeurIPS2023 D&B": NEURIPS_ICO,
    }

    # Class-level constants defining columns and their data types
    COLUMNS_START_PAPER_PAGE: List[str] = [
        'date',
        'arxiv_id',
        'paper_page',
        'title',
    ]

    COLUMNS_ORDER_PAPER_PAGE: List[str] = [
        'date',
        'arxiv_id',
        'paper_page',
        'num_models',
        'num_datasets',
        'num_spaces',
        'conference_name',
        'id',
        'type',
        'proceedings',
        'title',
        'upvotes',
        'num_comments',
    ]

    DATATYPES: Dict[str, str] = {
        'date': 'str',
        'arxiv_id': 'markdown',
        'paper_page': 'markdown',
        'upvotes': 'str',
        'num_comments': 'str',
        'num_models': 'markdown',
        'num_datasets': 'markdown',
        'num_spaces': 'markdown',
        'title': 'str',
        'proceedings': 'markdown',
        'conference_name': 'str',
        'id': 'str',
        'type': 'str',
    }

    # Mapping for renaming columns for display purposes
    COLUMN_RENAME_MAP: Dict[str, str] = {
        'num_models': 'models',
        'num_spaces': 'spaces',
        'num_datasets': 'datasets',
        'conference_name': 'venue',
    }

    def __init__(self):
        """
        Initialize the PaperCentral class by loading and processing the datasets.
        """
        self.df_raw: pd.DataFrame = self.get_df()
        self.df_prettified: pd.DataFrame = self.prettify(self.df_raw)

    @staticmethod
    def get_columns_order(columns: List[str]) -> List[str]:
        """
        Get columns ordered according to COLUMNS_ORDER_PAPER_PAGE.

        Args:
            columns (List[str]): List of column names to order.

        Returns:
            List[str]: Ordered list of column names.
        """
        return [c for c in PaperCentral.COLUMNS_ORDER_PAPER_PAGE if c in columns]

    @staticmethod
    def get_columns_datatypes(columns: List[str]) -> List[str]:
        """
        Get data types for the specified columns.

        Args:
            columns (List[str]): List of column names.

        Returns:
            List[str]: List of data types corresponding to the columns.
        """
        return [PaperCentral.DATATYPES[c] for c in columns]

    @staticmethod
    def get_df() -> pd.DataFrame:
        """
        Load and merge datasets to create the raw DataFrame.

        Returns:
            pd.DataFrame: The merged and processed DataFrame.
        """
        # Load datasets
        arxiv_scan_papers: pd.DataFrame = load_and_process(DATASET_ARXIV_SCAN_PAPERS)[
            ['arxiv_id', 'published_date', 'categories', 'title', 'primary_category',
             'huggingface_urls']
        ]
        arxiv_scan_papers['published_date'] = pd.to_datetime(arxiv_scan_papers['published_date']) + pd.DateOffset(
            days=1)

        community_science_papers: pd.DataFrame = load_and_process(DATASET_COMMUNITY_SCIENCE)[
            ['arxiv_id', 'date', 'upvotes', 'num_comments', 'github', 'num_models', 'num_datasets', 'num_spaces',
             'title']
        ]

        conference_papers: pd.DataFrame = load_and_process(DATASET_CONFERENCE_PAPERS)[
            ['id', 'proceedings', 'type', 'arxiv_id', 'title', 'conference_name']
        ]

        # Merge arxiv_scan_papers and community_science_papers on 'arxiv_id'
        merged_df: pd.DataFrame = pd.merge(arxiv_scan_papers, community_science_papers, on='arxiv_id', how='outer')
        merged_df['title'] = merged_df['title_x'].combine_first(merged_df['title_y'])
        merged_df = merged_df.drop(columns=['title_x', 'title_y'])

        final_merged_df: pd.DataFrame = pd.merge(
            merged_df,
            conference_papers,
            on='arxiv_id',
            how='outer'
        )

        # Combine the 'title' columns into one
        final_merged_df['title'] = final_merged_df['title_x'].combine_first(final_merged_df['title_y'])

        # Drop the redundant 'title_x' and 'title_y' columns
        final_merged_df = final_merged_df.drop(columns=['title_x', 'title_y'])

        # Use 'date' from community_science_papers if available; otherwise, use 'published_date'
        final_merged_df['date'] = final_merged_df['date'].combine_first(final_merged_df['published_date'])
        final_merged_df.drop(columns=['published_date'], inplace=True)

        # If 'arxiv_id' is in community_science_papers, set 'paper_page' to 'arxiv_id'
        final_merged_df.loc[
            final_merged_df['arxiv_id'].isin(community_science_papers['arxiv_id']), 'paper_page'
        ] = final_merged_df['arxiv_id']

        # Format the 'date' column
        final_merged_df = PaperCentral.format_df_date(final_merged_df, "date")
        final_merged_df['date'] = final_merged_df['date'].astype(str)

        print(final_merged_df.head())
        return final_merged_df

    @staticmethod
    def format_df_date(df: pd.DataFrame, date_column: str = "date") -> pd.DataFrame:
        """
        Format the date column in the DataFrame to 'YYYY-MM-DD'.

        Args:
            df (pd.DataFrame): The DataFrame to format.
            date_column (str): The name of the date column.

        Returns:
            pd.DataFrame: The DataFrame with the formatted date column.
        """
        df.loc[:, date_column] = pd.to_datetime(df[date_column]).dt.strftime('%Y-%m-%d')
        return df

    @staticmethod
    def prettify(df: pd.DataFrame) -> pd.DataFrame:
        """
        Prettify the DataFrame by adding markdown links and sorting.

        Args:
            df (pd.DataFrame): The DataFrame to prettify.

        Returns:
            pd.DataFrame: The prettified DataFrame.
        """

        def update_row(row: pd.Series) -> pd.Series:
            """
            Update a row by adding markdown links to 'paper_page' and 'arxiv_id' columns.

            Args:
                row (pd.Series): A row from the DataFrame.

            Returns:
                pd.Series: The updated row.
            """
            # Turn 'num_models' into a link to the models tagged with this arXiv id
            if (
                'num_models' in row and pd.notna(row['num_models']) and row["arxiv_id"]
                and float(row['num_models']) > 0
            ):
                num_models = int(float(row['num_models']))
                row['num_models'] = (
                    f"[{num_models}](https://huggingface.co/models?other=arxiv:{row['arxiv_id']})"
                )

            # Turn 'num_datasets' into a link to the datasets tagged with this arXiv id
            if (
                'num_datasets' in row and pd.notna(row['num_datasets']) and row["arxiv_id"]
                and float(row['num_datasets']) > 0
            ):
                num_datasets = int(float(row['num_datasets']))
                row['num_datasets'] = (
                    f"[{num_datasets}](https://huggingface.co/datasets?other=arxiv:{row['arxiv_id']})"
                )

            # Turn 'num_spaces' into a link to the Spaces tagged with this arXiv id
            if (
                'num_spaces' in row and pd.notna(row['num_spaces']) and row["arxiv_id"]
                and float(row['num_spaces']) > 0
            ):
                num_spaces = int(float(row['num_spaces']))
                row['num_spaces'] = (
                    f"[{num_spaces}](https://huggingface.co/spaces?other=arxiv:{row['arxiv_id']})"
                )

            # Turn 'proceedings' into a conference icon plus a link to the proceedings page
            if 'proceedings' in row and pd.notna(row['proceedings']) and row['proceedings']:
                image_url = PaperCentral.CONFERENCES_ICONS[row["conference_name"]]

                style = "display:inline-block; vertical-align:middle; width: 16px; height:16px"
                row['proceedings'] = (
                    f"<img src='{image_url}' style='{style}'/>"
                    f"<a href='{row['proceedings']}'>proc_page</a>"
                )

            ####
            ### This should be processed last :)
            ####
            # Add markdown link to 'paper_page' if it exists
            if 'paper_page' in row and pd.notna(row['paper_page']):
                row['paper_page'] = f"🤗[paper_page](https://huggingface.co/papers/{row['paper_page']})"

            # Add image and link to 'arxiv_id' if it exists
            if 'arxiv_id' in row and pd.notna(row['arxiv_id']):
                image_url = "https://arxiv.org/static/browse/0.3.4/images/icons/favicon-16x16.png"
                style = "display:inline-block; vertical-align:middle;"
                row['arxiv_id'] = (
                    f"<img src='{image_url}' style='{style}'/>"
                    f"<a href='https://arxiv.org/abs/{row['arxiv_id']}'>arxiv_page</a>"
                )

            return row

        df = df.copy()

        # Sort rows to display entries with 'paper_page' first
        if 'paper_page' in df.columns:
            df['has_paper_page'] = df['paper_page'].notna()
            df.sort_values(by='has_paper_page', ascending=False, inplace=True)
            df.drop(columns='has_paper_page', inplace=True)

        # Apply the update_row function to each row
        prettified_df: pd.DataFrame = df.apply(update_row, axis=1)
        return prettified_df

    def rename_columns_for_display(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Rename columns in the DataFrame according to COLUMN_RENAME_MAP for display purposes.

        Args:
            df (pd.DataFrame): The DataFrame whose columns need to be renamed.

        Returns:
            pd.DataFrame: The DataFrame with renamed columns.
        """
        return df.rename(columns=self.COLUMN_RENAME_MAP)

    def filter(
            self,
            selected_date: Optional[str] = None,
            cat_options: Optional[List[str]] = None,
            hf_options: Optional[List[str]] = None,
            conference_options: Optional[List[str]] = None
    ) -> gr.update:
        """
        Filter the DataFrame based on the selected date and options, and prepare it for display.

        Args:
            selected_date (Optional[str]): The date used to filter the DataFrame.
            cat_options (Optional[List[str]]): List of arXiv category options selected by the user.
            hf_options (Optional[List[str]]): List of Hugging Face options selected by the user.
            conference_options (Optional[List[str]]): List of conference options selected by the user.

        Returns:
            gr.update: An update object for the Gradio Dataframe component.
        """
        filtered_df: pd.DataFrame = self.df_raw.copy()

        # Start with the initial columns to display
        columns_to_show: List[str] = PaperCentral.COLUMNS_START_PAPER_PAGE.copy()

        # Category filtering
        if cat_options:
            options = [o.replace(".*", "") for o in cat_options]
            # Initialize filter series
            category_filter = pd.Series(False, index=filtered_df.index)
            for option in options:
                # Keep rows where 'primary_category' contains the category prefix (case-insensitive)
                category_filter |= (
                    filtered_df['primary_category'].notna() &
                    filtered_df['primary_category'].str.contains(option, case=False)
                )
            filtered_df = filtered_df[category_filter]

        # Date filtering
        if selected_date and not conference_options:
            selected_date = pd.to_datetime(selected_date).strftime('%Y-%m-%d')
            filtered_df = filtered_df[filtered_df['date'] == selected_date]

        # Hugging Face options
        if hf_options:
            if "show_details" in hf_options:
                # Keep rows where 'paper_page' is not empty or NaN
                filtered_df = filtered_df[
                    (filtered_df['paper_page'] != "") & (filtered_df['paper_page'].notna())
                ]

                # Add 'upvotes' column if not already in columns_to_show
                if 'upvotes' not in columns_to_show:
                    columns_to_show.append('upvotes')

                # Add the artifact-count columns if not already in columns_to_show
                if 'num_models' not in columns_to_show:
                    columns_to_show.append('num_models')
                if 'num_datasets' not in columns_to_show:
                    columns_to_show.append('num_datasets')
                if 'num_spaces' not in columns_to_show:
                    columns_to_show.append('num_spaces')

            if "datasets" in hf_options:
                if 'num_datasets' not in columns_to_show:
                    columns_to_show.append('num_datasets')
                filtered_df = filtered_df[filtered_df['num_datasets'] != 0]

            if "models" in hf_options:
                if 'num_models' not in columns_to_show:
                    columns_to_show.append('num_models')
                filtered_df = filtered_df[filtered_df['num_models'] != 0]

            if "spaces" in hf_options:
                if 'num_spaces' not in columns_to_show:
                    columns_to_show.append('num_spaces')
                filtered_df = filtered_df[filtered_df['num_spaces'] != 0]

        # Apply conference filtering
        if conference_options:

            columns_to_show.remove("date")
            columns_to_show.remove("arxiv_id")

            if 'conference_name' not in columns_to_show:
                columns_to_show.append('conference_name')

            if 'proceedings' not in columns_to_show:
                columns_to_show.append('proceedings')

            if 'type' not in columns_to_show:
                columns_to_show.append('type')

            if 'id' not in columns_to_show:
                columns_to_show.append('id')

            # If "In proceedings" is selected
            if "In proceedings" in conference_options:
                # Keep rows where 'conference_name' is not None, not NaN, and not empty
                filtered_df = filtered_df[
                    filtered_df['conference_name'].notna() & (filtered_df['conference_name'] != "")
                ]

            # For other conference options
            other_conferences = [conf for conf in conference_options if conf != "In proceedings"]
            if other_conferences:
                # Initialize filter series
                conference_filter = pd.Series(False, index=filtered_df.index)
                for conference in other_conferences:
                    # Keep rows where 'conference_name' matches the conference string (case-insensitive)
                    conference_filter |= (
                        filtered_df['conference_name'].notna() &
                        (filtered_df['conference_name'].str.lower() == conference.lower())
                    )
                filtered_df = filtered_df[conference_filter]

        # Prettify the DataFrame
        filtered_df = self.prettify(filtered_df)

        # Ensure columns are ordered according to COLUMNS_ORDER_PAPER_PAGE
        columns_in_order: List[str] = [col for col in PaperCentral.COLUMNS_ORDER_PAPER_PAGE if col in columns_to_show]

        # Select and reorder the columns
        filtered_df = filtered_df[columns_in_order]

        # Rename columns for display
        filtered_df = self.rename_columns_for_display(filtered_df)

        # Get the corresponding data types for the columns
        new_datatypes: List[str] = [
            PaperCentral.DATATYPES.get(self._get_original_column_name(col), 'str') for col in filtered_df.columns
        ]

        # Return an update object to modify the Dataframe component
        return gr.update(value=filtered_df, datatype=new_datatypes)

    def _get_original_column_name(self, display_column_name: str) -> str:
        """
        Retrieve the original column name given a display column name.

        Args:
            display_column_name (str): The display name of the column.

        Returns:
            str: The original name of the column.
        """
        inverse_map = {v: k for k, v in self.COLUMN_RENAME_MAP.items()}
        return inverse_map.get(display_column_name, display_column_name)
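Note: the class can also be exercised outside of Gradio. A minimal sketch, run from the repo root (it assumes network access and that the datasets named in constants.py are reachable; the date is illustrative only):

# Sketch only: build the merged table and run one filter pass without the UI.
from df.PaperCentral import PaperCentral

pc = PaperCentral()                                   # downloads and merges the three datasets
print(pc.df_prettified[pc.COLUMNS_START_PAPER_PAGE].head())

# filter() returns the gr.update(...) payload that app.py feeds to the Dataframe component.
table_update = pc.filter(
    selected_date="2024-09-26",                       # illustrative date, not from this commit
    cat_options=["cs.*"],
    hf_options=["show_details"],
)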
requirements.txt ADDED
@@ -0,0 +1,3 @@
gradio
gradio_calendar
datasets
style.css ADDED
@@ -0,0 +1,23 @@
h1 {
    text-align: center;
    display: block;
}

body a,
.contain a,
#table a {
    background-color: transparent;
    color: #58a6ff;
    text-decoration: none;
}

body a:active,
body a:hover {
    outline-width: 0;
}

body a:hover {
    text-decoration: underline;
}
utils.py ADDED
@@ -0,0 +1,16 @@
import re
from datasets import load_dataset


def arxiv_remove_version_suffix(arxiv_id):
    # Use a regex to remove a trailing version suffix (e.g., v1, v2) if present
    cleaned_id = re.sub(r'v\d+$', '', arxiv_id)
    return cleaned_id


# Load a dataset split from the Hugging Face Hub and normalize its arXiv ids
def load_and_process(dataset_name):
    data = load_dataset(dataset_name, split="train").to_pandas()
    if 'arxiv_id' in data.columns:
        data['arxiv_id'] = data['arxiv_id'].apply(arxiv_remove_version_suffix)
    return data
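For illustration, the version-stripping helper behaves as follows (the arXiv ids are hypothetical examples, not taken from this commit):

# Sketch only: expected behaviour of arxiv_remove_version_suffix.
assert arxiv_remove_version_suffix("2310.06825v2") == "2310.06825"
assert arxiv_remove_version_suffix("2310.06825") == "2310.06825"   # no suffix, unchanged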