rodrigomasini committed
Commit 7f2fc59
1 Parent(s): c5558c5

Update app.py

Files changed (1): app.py +87 -205
app.py CHANGED
@@ -142,211 +142,95 @@ def load_query(request: gr.Request):
     query = request.query_params.get("query") or ""
     return query
 
-
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("Hallucinations Benchmark",
-                        elem_id="llm-benchmark-tab-table",
-                        id=0):
-            with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        search_bar = gr.Textbox(placeholder=" 🔍 Model search (separate multiple queries with `;`)",
-                                                show_label=False,
-                                                elem_id="search-bar")
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[
-                                c.name
-                                for c in fields(AutoEvalColumn)
-                                if not c.hidden and not c.never_hidden and not c.dummy
-                            ],
-                            value=[
-                                c.name
-                                for c in fields(AutoEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
-                            interactive=True)
-
-                with gr.Column(min_width=320):
-                    filter_columns_type = gr.CheckboxGroup(
-                        label="Model types",
-                        choices=[t.to_str() for t in ModelType],
-                        value=[t.to_str() for t in ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type")
-
-                    filter_columns_precision = gr.CheckboxGroup(
-                        label="Precision",
-                        choices=[i.value.name for i in Precision],
-                        value=[i.value.name for i in Precision],
-                        interactive=True,
-                        elem_id="filter-columns-precision")
-
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size")
-
-            leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df[
-                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value + [AutoEvalColumn.dummy.name]
-                ] if leaderboard_df.empty is False else leaderboard_df,
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True)
-
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=original_df[COLS] if original_df.empty is False else original_df,
-                headers=COLS,
-                datatype=TYPES,
-                visible=False)
-
-            search_bar.submit(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    filter_columns_type,
-                    filter_columns_precision,
-                    filter_columns_size,
-                    search_bar,
-                ],
-                leaderboard_table)
-
-            # Check query parameter once at startup and update search bar
-            demo.load(load_query, inputs=[], outputs=[search_bar])
-
-            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
-                selector.change(
-                    update_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        filter_columns_type,
-                        filter_columns_precision,
-                        filter_columns_size,
-                        search_bar,
-                    ],
-                    leaderboard_table,
-                    queue=True)
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-            print(f'dataset df columns: {list(dataset_df.columns)}')
-            dataset_table = gr.components.Dataframe(
-                value=dataset_df,
-                headers=list(dataset_df.columns),
-                datatype=['str', 'markdown', 'str', 'str', 'str'],
-                elem_id="dataset-table",
-                interactive=False,
-                visible=True,
-                column_widths=["15%", "20%"]
-            )
-            gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
-            gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("Submit a model ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5)
-
-                    with gr.Accordion(f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5)
-
-                    with gr.Accordion(f"⏳ Scheduled Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5)
-
-            with gr.Row():
-                gr.Markdown("# Submit your model here", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True)
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float32",
-                        interactive=True)
-
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True)
-
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    private,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result)
-
-    with gr.Row():
-        with gr.Accordion("Citing this leaderboard", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True)
-
-scheduler = BackgroundScheduler()
-
-scheduler.add_job(restart_space, "interval", seconds=6 * 60 * 60)
-
+leaderboard_df = filter_models(
+    df=leaderboard_df,
+    type_query=[t.to_str(" : ") for t in ModelType],
+    size_query=list(NUMERIC_INTERVALS.keys()),
+    precision_query=[i.value.name for i in Precision],
+    show_deleted=False,
+)
+
+import unicodedata
+
+def is_valid_unicode(char):
+    try:
+        unicodedata.name(char)
+        return True  # Valid Unicode character
+    except ValueError:
+        return False  # Invalid Unicode character
+
+def remove_invalid_unicode(input_string):
+    if isinstance(input_string, str):
+        valid_chars = [char for char in input_string if is_valid_unicode(char)]
+        return ''.join(valid_chars)
+    else:
+        return input_string  # Return non-string values as is
+
+dummy1 = gr.Textbox(visible=False)
+
+hidden_leaderboard_table_for_search = gr.components.Dataframe(
+    headers=COLS,
+    datatype=TYPES,
+    visible=False,
+    line_breaks=False,
+    interactive=False
+)
+
+def display(x, y):
+    # Assuming df is your DataFrame
+    for column in leaderboard_df.columns:
+        if leaderboard_df[column].dtype == 'object':
+            leaderboard_df[column] = leaderboard_df[column].apply(remove_invalid_unicode)
+
+    subset_df = leaderboard_df[COLS]
+    return subset_df
+
+INTRODUCTION_TEXT = """
+This is a copied space from LLM Trustworthy Leaderboard. Instead of displaying
+the results as table this space was modified to simply provides a gradio API interface.
+Using the following python script below, users can access the full leaderboard data easily.
+Python on how to access the data:
+```python
+# Import dependencies
+from gradio_client import Client
+# Initialize the Gradio client with the API URL
+client = Client("https://rodrigomasini-data-only-llm-trustworthy-leaderboard.hf.space/")
+try:
+    # Perform the API call
+    response = client.predict("","", api_name='/predict')
+    # Check if response it's directly accessible
+    if len(response) > 0:
+        print("Response received!")
+        headers = response.get('headers', [])
+        data = response.get('data', [])
+        print(headers)
+        # Remove commenst if you want to download the dataset and save in csv format
+        # Specify the path to your CSV file
+        #csv_file_path = 'llm-trustworthy-benchmark.csv'
+        # Open the CSV file for writing
+        #with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
+        #    writer = csv.writer(file)
+        # Write the headers
+        #    writer.writerow(headers)
+        # Write the data
+        #    for row in data:
+        #        writer.writerow(row)
+        #print(f"Results saved to {csv_file_path}")
+    # If the above line prints a string that looks like JSON, you can parse it with json.loads(response)
+    # Otherwise, you might need to adjust based on the actual structure of `response`
+except Exception as e:
+    print(f"An error occurred: {e}")
+```
+"""
+
+interface = gr.Interface(
+    fn=display,
+    inputs=[gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text"), dummy1],
+    outputs=[hidden_leaderboard_table_for_search]
+)
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)
 
 def launch_backend():
     import subprocess
@@ -354,8 +238,6 @@ def launch_backend():
     if DEVICE not in {'cpu'}:
         _ = subprocess.run(["python", "backend-cli.py"])
 
-
-# scheduler.add_job(launch_backend, "interval", seconds=120)
-
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+
+interface.launch()
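
After this change the Space no longer renders the leaderboard tabs: the data is served through the single `/predict` endpoint of the new `gr.Interface`, whose `display(x, y)` function ignores both inputs and returns the Unicode-cleaned `leaderboard_df[COLS]`. Below is a minimal consumer sketch, assuming the `{"headers": [...], "data": [...]}` payload shape shown in the INTRODUCTION_TEXT snippet above; the exact structure can vary with the gradio/gradio_client versions, and the pandas step is purely illustrative.

```python
# Sketch: fetch the leaderboard exposed by /predict and load it into a DataFrame.
# Assumes the dict payload shape ({"headers": [...], "data": [...]}) from the
# snippet in INTRODUCTION_TEXT; adjust if your gradio_client returns another shape.
import pandas as pd
from gradio_client import Client

client = Client("https://rodrigomasini-data-only-llm-trustworthy-leaderboard.hf.space/")

# display(x, y) ignores its inputs, so two empty strings are enough.
response = client.predict("", "", api_name="/predict")

headers = response.get("headers", [])
rows = response.get("data", [])
df = pd.DataFrame(rows, columns=headers)
print(df.head())
```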