zhiminy committed on
Commit 19a995e · 1 Parent(s): 1e0fb78

add instability score

Files changed (1)
  1. app.py +23 -1
app.py CHANGED
@@ -366,6 +366,7 @@ def get_leaderboard_data(feedback_entry=None):
         "Rank",
         "Model",
         "Elo Score",
+        "Instability Score",
         "Average Win Rate",
         "Bradley-Terry Coefficient",
         "Eigenvector Centrality Value",
@@ -402,12 +403,31 @@ def get_leaderboard_data(feedback_entry=None):
     pagerank_result = evalica.pagerank(
         feedback_df["left"], feedback_df["right"], feedback_df["winner"]
     )
+
+    # Calculate instability score as a pandas Series aligned with other metrics
+    is_result = pd.Series(0.0, index=elo_result.scores.index)  # Initialize with zeros using same index
+
+    # Loop through models and update values
+    for model in is_result.index:
+        # Filter self-matches for this model
+        self_matches = feedback_df[
+            (feedback_df["left"] == model) &
+            (feedback_df["right"] == model)
+        ]
+        total = len(self_matches)
+
+        if total:
+            # Count non-draw outcomes (wins or losses)
+            non_draws = self_matches[self_matches["winner"] != evalica.Winner.Draw].shape[0]
+            # Store the non-draw fraction; converted to a percentage when the DataFrame is built
+            is_result[model] = non_draws / total
 
     # Combine all results into a single DataFrame
     leaderboard_data = pd.DataFrame(
         {
             "Model": elo_result.scores.index,
             "Elo Score": elo_result.scores.values,
+            "Instability Score": is_result.values * 100,
             "Average Win Rate": avr_result.scores.values * 100,
             "Bradley-Terry Coefficient": bt_result.scores.values,
             "Eigenvector Centrality Value": eigen_result.scores.values,
@@ -420,6 +440,7 @@ def get_leaderboard_data(feedback_entry=None):
     leaderboard_data = leaderboard_data.round(
         {
             "Elo Score": 2,
            "Instability Score": 2,
             "Average Win Rate": 2,
             "Bradley-Terry Coefficient": 2,
             "Eigenvector Centrality Value": 2,
@@ -471,11 +492,12 @@ with gr.Blocks() as app:
             "Rank",
             "Model",
             "Elo Score",
-            "Average Win Rate",
+            "Instability Score",
         ],
         search_columns=["Model"],
         filter_columns=[
             "Elo Score",
+            "Instability Score",
             "Average Win Rate",
             "Bradley-Terry Coefficient",
             "Eigenvector Centrality Value",