IAMJB committed on
Commit
df43c05
·
1 Parent(s): ff6d03f

Add author search and GitHub link filter

Browse files
Files changed (5) hide show
  1. app.py +43 -19
  2. constants.py +2 -0
  3. df/PaperCentral.py +57 -53
  4. style.css +12 -0
  5. utils.py +10 -4
app.py CHANGED
@@ -58,7 +58,8 @@ with gr.Blocks(css="style.css") as demo:
58
  )
59
  hf_options = gr.CheckboxGroup(
60
  label="Hugging Face options",
61
- choices=["show_details", "datasets", "models", "spaces"]
 
62
  )
63
 
64
  with gr.Column():
@@ -68,6 +69,11 @@ with gr.Blocks(css="style.css") as demo:
68
  choices=["In proceedings"] + PaperCentral.CONFERENCES
69
  )
70
 
 
 
 
 
 
71
  # Define the Dataframe component to display paper data
72
  # List of columns in your DataFrame
73
  columns = paper_central_df.COLUMNS_START_PAPER_PAGE
@@ -91,7 +97,8 @@ with gr.Blocks(css="style.css") as demo:
91
  date: Union[str, datetime],
92
  cat_options_list: List[str],
93
  hf_options_list: List[str],
94
- conference_options_list: List[str]
 
95
  ) -> tuple:
96
  """
97
  Moves the selected date to the next day and updates the data.
@@ -120,7 +127,8 @@ with gr.Blocks(css="style.css") as demo:
120
  selected_date=new_date_str,
121
  cat_options=cat_options_list,
122
  hf_options=hf_options_list,
123
- conference_options=conference_options_list
 
124
  )
125
 
126
  # Return the new date and updated Dataframe
@@ -132,7 +140,8 @@ with gr.Blocks(css="style.css") as demo:
132
  date: Union[str, datetime],
133
  cat_options_list: List[str],
134
  hf_options_list: List[str],
135
- conference_options_list: List[str]
 
136
  ) -> tuple:
137
  """
138
  Moves the selected date to the previous day and updates the data.
@@ -161,7 +170,8 @@ with gr.Blocks(css="style.css") as demo:
161
  selected_date=new_date_str,
162
  cat_options=cat_options_list,
163
  hf_options=hf_options_list,
164
- conference_options=conference_options_list
 
165
  )
166
 
167
  # Return the new date and updated Dataframe
@@ -173,7 +183,8 @@ with gr.Blocks(css="style.css") as demo:
173
  date: Union[str, datetime],
174
  cat_options_list: List[str],
175
  hf_options_list: List[str],
176
- conference_options_list: List[str]
 
177
  ):
178
  """
179
  Updates the data displayed in the Dataframe based on the selected date and options.
@@ -191,7 +202,8 @@ with gr.Blocks(css="style.css") as demo:
191
  selected_date=date,
192
  cat_options=cat_options_list,
193
  hf_options=hf_options_list,
194
- conference_options=conference_options_list
 
195
  )
196
 
197
 
@@ -200,7 +212,8 @@ with gr.Blocks(css="style.css") as demo:
200
  date: Union[str, datetime],
201
  cat_options_list: List[str],
202
  hf_options_list: List[str],
203
- conference_options_list: List[str]
 
204
  ):
205
 
206
  cat_options_update = gr.update()
@@ -216,6 +229,7 @@ with gr.Blocks(css="style.css") as demo:
216
  [],
217
  hf_options_list,
218
  conference_options_list,
 
219
  )
220
  visible = False
221
 
@@ -231,7 +245,8 @@ with gr.Blocks(css="style.css") as demo:
231
  date: Union[str, datetime],
232
  cat_options_list: List[str],
233
  hf_options_list: List[str],
234
- conference_options_list: List[str]
 
235
  ):
236
  conference_options_update = gr.update()
237
  paper_central_component_update = gr.update()
@@ -246,6 +261,7 @@ with gr.Blocks(css="style.css") as demo:
246
  cat_options_list,
247
  hf_options_list,
248
  [],
 
249
  )
250
  visible = True
251
 
@@ -256,29 +272,36 @@ with gr.Blocks(css="style.css") as demo:
256
  return paper_central_component_update, conference_options_update, calendar_update, next_day_btn_update, prev_day_btn_update
257
 
258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
260
  # Set up the event listener for the 'Next Day' button
261
  next_day_btn.click(
262
  fn=go_to_next_day,
263
- inputs=[calendar, cat_options, hf_options, conference_options],
264
  outputs=[calendar, paper_central_component],
265
  )
266
 
267
  # Set up the event listener for the 'Previous Day' button
268
  prev_day_btn.click(
269
  fn=go_to_previous_day,
270
- inputs=[calendar, cat_options, hf_options, conference_options],
271
  outputs=[calendar, paper_central_component],
272
  )
273
 
274
- # Define the inputs for the filter function
275
- inputs = [
276
- calendar,
277
- cat_options,
278
- hf_options,
279
- conference_options,
280
- ]
281
-
282
  # Set up the event listener for the calendar date change
283
  calendar.change(
284
  fn=update_data,
@@ -293,6 +316,7 @@ with gr.Blocks(css="style.css") as demo:
293
  outputs=paper_central_component,
294
  )
295
 
 
296
  # Event chaining for conference options change
297
  conference_options.change(
298
  fn=on_conference_options_change,
 
58
  )
59
  hf_options = gr.CheckboxGroup(
60
  label="Hugging Face options",
61
+ choices=["🤗 paper-page", "datasets", "models", "spaces", "github"],
62
+ elem_id="hf_options"
63
  )
64
 
65
  with gr.Column():
 
69
  choices=["In proceedings"] + PaperCentral.CONFERENCES
70
  )
71
 
72
+ # Define a Textbox for author search
73
+ author_search = gr.Textbox(
74
+ label="Search Authors",
75
+ placeholder="Enter author name",
76
+ )
77
  # Define the Dataframe component to display paper data
78
  # List of columns in your DataFrame
79
  columns = paper_central_df.COLUMNS_START_PAPER_PAGE
 
97
  date: Union[str, datetime],
98
  cat_options_list: List[str],
99
  hf_options_list: List[str],
100
+ conference_options_list: List[str],
101
+ author_search_input: str,
102
  ) -> tuple:
103
  """
104
  Moves the selected date to the next day and updates the data.
 
127
  selected_date=new_date_str,
128
  cat_options=cat_options_list,
129
  hf_options=hf_options_list,
130
+ conference_options=conference_options_list,
131
+ author_search_input=author_search_input,
132
  )
133
 
134
  # Return the new date and updated Dataframe
 
140
  date: Union[str, datetime],
141
  cat_options_list: List[str],
142
  hf_options_list: List[str],
143
+ conference_options_list: List[str],
144
+ author_search_input: str,
145
  ) -> tuple:
146
  """
147
  Moves the selected date to the previous day and updates the data.
 
170
  selected_date=new_date_str,
171
  cat_options=cat_options_list,
172
  hf_options=hf_options_list,
173
+ conference_options=conference_options_list,
174
+ author_search_input=author_search_input,
175
  )
176
 
177
  # Return the new date and updated Dataframe
 
183
  date: Union[str, datetime],
184
  cat_options_list: List[str],
185
  hf_options_list: List[str],
186
+ conference_options_list: List[str],
187
+ author_search_input: str,
188
  ):
189
  """
190
  Updates the data displayed in the Dataframe based on the selected date and options.
 
202
  selected_date=date,
203
  cat_options=cat_options_list,
204
  hf_options=hf_options_list,
205
+ conference_options=conference_options_list,
206
+ author_search_input=author_search_input,
207
  )
208
 
209
 
 
212
  date: Union[str, datetime],
213
  cat_options_list: List[str],
214
  hf_options_list: List[str],
215
+ conference_options_list: List[str],
216
+ author_search_input: str,
217
  ):
218
 
219
  cat_options_update = gr.update()
 
229
  [],
230
  hf_options_list,
231
  conference_options_list,
232
+ author_search_input,
233
  )
234
  visible = False
235
 
 
245
  date: Union[str, datetime],
246
  cat_options_list: List[str],
247
  hf_options_list: List[str],
248
+ conference_options_list: List[str],
249
+ author_search_input: str,
250
  ):
251
  conference_options_update = gr.update()
252
  paper_central_component_update = gr.update()
 
261
  cat_options_list,
262
  hf_options_list,
263
  [],
264
+ author_search_input,
265
  )
266
  visible = True
267
 
 
272
  return paper_central_component_update, conference_options_update, calendar_update, next_day_btn_update, prev_day_btn_update
273
 
274
 
275
+ inputs = [
276
+ calendar,
277
+ cat_options,
278
+ hf_options,
279
+ conference_options,
280
+ author_search,
281
+ ]
282
+
283
+ # Set up the event listener for the author search
284
+ author_search.submit(
285
+ fn=update_data,
286
+ inputs=inputs,
287
+ outputs=paper_central_component,
288
+ )
289
+
290
 
291
  # Set up the event listener for the 'Next Day' button
292
  next_day_btn.click(
293
  fn=go_to_next_day,
294
+ inputs=inputs,
295
  outputs=[calendar, paper_central_component],
296
  )
297
 
298
  # Set up the event listener for the 'Previous Day' button
299
  prev_day_btn.click(
300
  fn=go_to_previous_day,
301
+ inputs=inputs,
302
  outputs=[calendar, paper_central_component],
303
  )
304
 
 
 
 
 
 
 
 
 
305
  # Set up the event listener for the calendar date change
306
  calendar.change(
307
  fn=update_data,
 
316
  outputs=paper_central_component,
317
  )
318
 
319
+
320
  # Event chaining for conference options change
321
  conference_options.change(
322
  fn=on_conference_options_change,
constants.py CHANGED
@@ -4,3 +4,5 @@ DATASET_CONFERENCE_PAPERS = "IAMJB/paper_conference_aggregate"
4
  DATASET_DAILY_PAPERS = "hysts-bot-data/daily-papers"
5
  DATASET_DAILY_PAPERS_STATS = "hysts-bot-data/daily-papers-stats"
6
  DATASET_COMMUNITY_SCIENCE = "huggingface/community-science-paper-v2"
 
 
 
4
  DATASET_DAILY_PAPERS = "hysts-bot-data/daily-papers"
5
  DATASET_DAILY_PAPERS_STATS = "hysts-bot-data/daily-papers-stats"
6
  DATASET_COMMUNITY_SCIENCE = "huggingface/community-science-paper-v2"
7
+ # DATASET_PAPER_CENTRAL = "huggingface/paper-central-data"
8
+ DATASET_PAPER_CENTRAL = "huggingface/paper-central-data-2"
df/PaperCentral.py CHANGED
@@ -5,6 +5,7 @@ from constants import (
5
  DATASET_CONFERENCE_PAPERS,
6
  DATASET_COMMUNITY_SCIENCE,
7
  NEURIPS_ICO,
 
8
  )
9
  import gradio as gr
10
  from utils import load_and_process
@@ -56,13 +57,15 @@ class PaperCentral:
56
  'num_models',
57
  'num_datasets',
58
  'num_spaces',
 
 
 
59
  'conference_name',
60
  'id',
61
  'type',
62
  'proceedings',
63
  'title',
64
- 'upvotes',
65
- 'num_comments',
66
  ]
67
 
68
  DATATYPES: Dict[str, str] = {
@@ -74,11 +77,13 @@ class PaperCentral:
74
  'num_models': 'markdown',
75
  'num_datasets': 'markdown',
76
  'num_spaces': 'markdown',
 
77
  'title': 'str',
78
  'proceedings': 'markdown',
79
  'conference_name': 'str',
80
  'id': 'str',
81
  'type': 'str',
 
82
  }
83
 
84
  # Mapping for renaming columns for display purposes
@@ -131,55 +136,13 @@ class PaperCentral:
131
  pd.DataFrame: The merged and processed DataFrame.
132
  """
133
  # Load datasets
134
- arxiv_scan_papers: pd.DataFrame = load_and_process(DATASET_ARXIV_SCAN_PAPERS)[
135
- ['arxiv_id', 'published_date', 'categories', 'title', 'primary_category',
136
- 'huggingface_urls']
 
137
  ]
138
- arxiv_scan_papers['published_date'] = pd.to_datetime(arxiv_scan_papers['published_date']) + pd.DateOffset(
139
- days=1)
140
-
141
- community_science_papers: pd.DataFrame = load_and_process(DATASET_COMMUNITY_SCIENCE)[
142
- ['arxiv_id', 'date', 'upvotes', 'num_comments', 'github', 'num_models', 'num_datasets', 'num_spaces',
143
- 'title']
144
- ]
145
-
146
- conference_papers: pd.DataFrame = load_and_process(DATASET_CONFERENCE_PAPERS)[
147
- ['id', 'proceedings', 'type', 'arxiv_id', 'title', 'conference_name']
148
- ]
149
-
150
- # Merge arxiv_scan_papers and community_science_papers on 'arxiv_id'
151
- merged_df: pd.DataFrame = pd.merge(arxiv_scan_papers, community_science_papers, on='arxiv_id', how='outer')
152
- merged_df['title'] = merged_df['title_x'].combine_first(merged_df['title_y'])
153
- merged_df = merged_df.drop(columns=['title_x', 'title_y'])
154
-
155
- final_merged_df: pd.DataFrame = pd.merge(
156
- merged_df,
157
- conference_papers,
158
- on='arxiv_id',
159
- how='outer'
160
- )
161
-
162
- # Combine the 'title' columns into one
163
- final_merged_df['title'] = final_merged_df['title_x'].combine_first(final_merged_df['title_y'])
164
-
165
- # Drop the redundant 'title_x' and 'title_y' columns
166
- final_merged_df = final_merged_df.drop(columns=['title_x', 'title_y'])
167
-
168
- # Use 'date' from community_science_papers if available; otherwise, use 'published_date'
169
- final_merged_df['date'] = final_merged_df['date'].combine_first(final_merged_df['published_date'])
170
- final_merged_df.drop(columns=['published_date'], inplace=True)
171
 
172
- # If 'arxiv_id' is in community_science_papers, set 'paper_page' to 'arxiv_id'
173
- final_merged_df.loc[
174
- final_merged_df['arxiv_id'].isin(community_science_papers['arxiv_id']), 'paper_page'
175
- ] = final_merged_df['arxiv_id']
176
-
177
- # Format the 'date' column
178
- final_merged_df = PaperCentral.format_df_date(final_merged_df, "date")
179
- final_merged_df['date'] = final_merged_df['date'].astype(str)
180
-
181
- print(final_merged_df.head())
182
- return final_merged_df
183
 
184
  @staticmethod
185
  def format_df_date(df: pd.DataFrame, date_column: str = "date") -> pd.DataFrame:
@@ -259,11 +222,11 @@ class PaperCentral:
259
  ### This should be processed last :)
260
  ####
261
  # Add markdown link to 'paper_page' if it exists
262
- if 'paper_page' in row and pd.notna(row['paper_page']):
263
  row['paper_page'] = f"🤗[paper_page](https://huggingface.co/papers/{row['paper_page']})"
264
 
265
  # Add image and link to 'arxiv_id' if it exists
266
- if 'arxiv_id' in row and pd.notna(row['arxiv_id']):
267
  image_url = "https://arxiv.org/static/browse/0.3.4/images/icons/favicon-16x16.png"
268
  style = "display:inline-block; vertical-align:middle;"
269
  row['arxiv_id'] = (
@@ -271,6 +234,15 @@ class PaperCentral:
271
  f"<a href='https://arxiv.org/abs/{row['arxiv_id']}'>arxiv_page</a>"
272
  )
273
 
 
 
 
 
 
 
 
 
 
274
  return row
275
 
276
  df = df.copy()
@@ -302,7 +274,8 @@ class PaperCentral:
302
  selected_date: Optional[str] = None,
303
  cat_options: Optional[List[str]] = None,
304
  hf_options: Optional[List[str]] = None,
305
- conference_options: Optional[List[str]] = None
 
306
  ) -> gr.update:
307
  """
308
  Filter the DataFrame based on selected date and options, and prepare it for display.
@@ -320,6 +293,32 @@ class PaperCentral:
320
  # Start with the initial columns to display
321
  columns_to_show: List[str] = PaperCentral.COLUMNS_START_PAPER_PAGE.copy()
322
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
  if cat_options:
324
  options = [o.replace(".*", "") for o in cat_options]
325
  # Initialize filter series
@@ -339,7 +338,7 @@ class PaperCentral:
339
 
340
  # HF options
341
  if hf_options:
342
- if "show_details" in hf_options:
343
  # Filter rows where 'paper_page' is not empty or NaN
344
  filtered_df = filtered_df[
345
  (filtered_df['paper_page'] != "") & (filtered_df['paper_page'].notna())
@@ -371,6 +370,11 @@ class PaperCentral:
371
  columns_to_show.append('num_spaces')
372
  filtered_df = filtered_df[filtered_df['num_spaces'] != 0]
373
 
 
 
 
 
 
374
  # Apply conference filtering
375
  if conference_options:
376
 
 
5
  DATASET_CONFERENCE_PAPERS,
6
  DATASET_COMMUNITY_SCIENCE,
7
  NEURIPS_ICO,
8
+ DATASET_PAPER_CENTRAL,
9
  )
10
  import gradio as gr
11
  from utils import load_and_process
 
57
  'num_models',
58
  'num_datasets',
59
  'num_spaces',
60
+ 'upvotes',
61
+ 'num_comments',
62
+ 'github',
63
  'conference_name',
64
  'id',
65
  'type',
66
  'proceedings',
67
  'title',
68
+ 'authors',
 
69
  ]
70
 
71
  DATATYPES: Dict[str, str] = {
 
77
  'num_models': 'markdown',
78
  'num_datasets': 'markdown',
79
  'num_spaces': 'markdown',
80
+ 'github': 'markdown',
81
  'title': 'str',
82
  'proceedings': 'markdown',
83
  'conference_name': 'str',
84
  'id': 'str',
85
  'type': 'str',
86
+ 'authors': 'str',
87
  }
88
 
89
  # Mapping for renaming columns for display purposes
 
136
  pd.DataFrame: The merged and processed DataFrame.
137
  """
138
  # Load datasets
139
+ paper_central_df: pd.DataFrame = load_and_process(DATASET_PAPER_CENTRAL)[
140
+ ['arxiv_id', 'categories', 'primary_category', 'date', 'upvotes', 'num_comments', 'github', 'num_models',
141
+ 'num_datasets', 'num_spaces', 'id', 'proceedings', 'type',
142
+ 'conference_name', 'title', 'paper_page', 'authors']
143
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
+ return paper_central_df
 
 
 
 
 
 
 
 
 
 
146
 
147
  @staticmethod
148
  def format_df_date(df: pd.DataFrame, date_column: str = "date") -> pd.DataFrame:
 
222
  ### This should be processed last :)
223
  ####
224
  # Add markdown link to 'paper_page' if it exists
225
+ if 'paper_page' in row and pd.notna(row['paper_page']) and row['paper_page']:
226
  row['paper_page'] = f"🤗[paper_page](https://huggingface.co/papers/{row['paper_page']})"
227
 
228
  # Add image and link to 'arxiv_id' if it exists
229
+ if 'arxiv_id' in row and pd.notna(row['arxiv_id']) and row['arxiv_id']:
230
  image_url = "https://arxiv.org/static/browse/0.3.4/images/icons/favicon-16x16.png"
231
  style = "display:inline-block; vertical-align:middle;"
232
  row['arxiv_id'] = (
 
234
  f"<a href='https://arxiv.org/abs/{row['arxiv_id']}'>arxiv_page</a>"
235
  )
236
 
237
+ # Add image and link to 'github' if it exists
238
+ if 'github' in row and pd.notna(row['github']) and row["github"]:
239
+ image_url = "https://github.githubassets.com/favicons/favicon.png"
240
+ style = "display:inline-block; vertical-align:middle;width:16px;"
241
+ row['github'] = (
242
+ f"<img src='{image_url}' style='{style}'/>"
243
+ f"<a href='{row['github']}'>github</a>"
244
+ )
245
+
246
  return row
247
 
248
  df = df.copy()
 
274
  selected_date: Optional[str] = None,
275
  cat_options: Optional[List[str]] = None,
276
  hf_options: Optional[List[str]] = None,
277
+ conference_options: Optional[List[str]] = None,
278
+ author_search_input: Optional[str] = None,
279
  ) -> gr.update:
280
  """
281
  Filter the DataFrame based on selected date and options, and prepare it for display.
 
293
  # Start with the initial columns to display
294
  columns_to_show: List[str] = PaperCentral.COLUMNS_START_PAPER_PAGE.copy()
295
 
296
+ if author_search_input:
297
+ if 'authors' not in columns_to_show:
298
+ columns_to_show.append('authors')
299
+
300
+ search_string = author_search_input.lower()
301
+
302
+ def author_matches(authors_list):
303
+ # Check if authors_list is None or empty
304
+ if authors_list is None or len(authors_list) == 0:
305
+ return False
306
+
307
+ # Check if authors_list is an iterable (list, tuple, Series, or ndarray)
308
+ if isinstance(authors_list, (list, tuple, pd.Series, np.ndarray)):
309
+ return any(
310
+ isinstance(author, str) and search_string in author.lower()
311
+ for author in authors_list
312
+ )
313
+ elif isinstance(authors_list, str):
314
+ # If authors_list is a single string
315
+ return search_string in authors_list.lower()
316
+ else:
317
+ # Handle unexpected data types
318
+ return False
319
+
320
+ filtered_df = filtered_df[filtered_df['authors'].apply(author_matches)]
321
+
322
  if cat_options:
323
  options = [o.replace(".*", "") for o in cat_options]
324
  # Initialize filter series
 
338
 
339
  # HF options
340
  if hf_options:
341
+ if "🤗 paper-page" in hf_options:
342
  # Filter rows where 'paper_page' is not empty or NaN
343
  filtered_df = filtered_df[
344
  (filtered_df['paper_page'] != "") & (filtered_df['paper_page'].notna())
 
370
  columns_to_show.append('num_spaces')
371
  filtered_df = filtered_df[filtered_df['num_spaces'] != 0]
372
 
373
+ if "github" in hf_options:
374
+ if 'github' not in columns_to_show:
375
+ columns_to_show.append('github')
376
+ filtered_df = filtered_df[(filtered_df['github'] != "") & (filtered_df['github'].notnull())]
377
+
378
  # Apply conference filtering
379
  if conference_options:
380
 
style.css CHANGED
@@ -21,3 +21,15 @@ body a:hover {
21
  }
22
 
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  }
22
 
23
 
24
+
25
+ #hf_options label[for='github']::before {
26
+ content: "";
27
+ background-image: url('https://github.githubassets.com/favicons/favicon.png');
28
+ background-size: contain;
29
+ display: inline-block;
30
+ width: 16px;
31
+ height: 16px;
32
+ vertical-align: middle;
33
+ margin-right: 5px;
34
+ }
35
+
utils.py CHANGED
@@ -1,11 +1,17 @@
1
- import re
2
  from datasets import load_dataset
 
3
 
4
 
5
  def arxiv_remove_version_suffix(arxiv_id):
6
- # Use regex to remove version suffix (e.g., v1, v2, etc.) if present
7
- cleaned_id = re.sub(r'v\d+$', '', arxiv_id)
8
- return cleaned_id
 
 
 
 
 
 
9
 
10
 
11
  # Load datasets
 
 
1
  from datasets import load_dataset
2
+ import re
3
 
4
 
5
  def arxiv_remove_version_suffix(arxiv_id):
6
+ if arxiv_id is None:
7
+ return None
8
+ # Ensure arxiv_id is a string before applying regex
9
+ elif isinstance(arxiv_id, str):
10
+ cleaned_id = re.sub(r'v\d+$', '', arxiv_id)
11
+ return cleaned_id
12
+ else:
13
+ # Handle unexpected types
14
+ return arxiv_id
15
 
16
 
17
  # Load datasets