pratikbhavsar committed
Commit 137096d
1 Parent(s): 39e3785

reverted to working

Files changed (6)
  1. app.py +20 -14
  2. requirements.txt +1 -1
  3. tabs/data_exploration.py +371 -371
  4. tabs/leaderboard.py +48 -551
  5. tabs/model_comparison.py +23 -117
  6. visualization.py +256 -0
app.py CHANGED
@@ -1,3 +1,4 @@
+# Add this at the top of your script
 import warnings
 
 warnings.filterwarnings("ignore")
@@ -19,41 +20,46 @@ from tabs.data_exploration import create_exploration_tab, filter_and_display
 
 def create_app():
     df = load_data()
+
     MODELS = [x.strip() for x in df["Model"].unique().tolist()]
 
     with gr.Blocks(
         theme=gr.themes.Soft(font=[gr.themes.GoogleFont("sans-serif")])
     ) as app:
-        with gr.Tabs() as tabs:
-            with gr.Tab("Leaderboard", id=0) as tab1:
-                lb_output, lb_plot1, lb_plot2 = create_leaderboard_tab(
-                    df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS
-                )
+        with gr.Tabs():
+            # Create tabs
+            lb_output, lb_plot1, lb_plot2 = create_leaderboard_tab(
+                df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS
+            )
 
-            with gr.Tab("Model Comparison", id=1) as tab2:
-                mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
+            mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
 
-            with gr.Tab("Data Exploration", id=2) as tab3:
-                exp_outputs = create_exploration_tab(df)
+            exp_outputs = create_exploration_tab(df)
 
-        # Initial data loading
-        tab1.select(
+        # Initial loads
+        app.load(
             fn=lambda: filter_leaderboard(
                 df, "All", list(CATEGORIES.keys())[0], "Performance"
             ),
             outputs=[lb_output, lb_plot1, lb_plot2],
         )
 
-        tab2.select(
+        app.load(
             fn=lambda: compare_models(
                 df, [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
             ),
             outputs=[mc_info, mc_plot],
         )
 
-        tab3.select(
+        app.load(
             fn=lambda: filter_and_display(
-                MODELS[0], DATASETS[0], min(SCORES), max(SCORES), 0, 0, 0
+                MODELS[0],
+                DATASETS[0],
+                min(SCORES),
+                max(SCORES),
+                0,
+                0,
+                0,
             ),
             outputs=exp_outputs[:-1],
         )
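Note on the fix: the broken revision populated each tab lazily through per-tab select events, while the restored version registers app.load handlers, so all three tabs are filled once when the page first renders. A minimal sketch of the pattern (component and handler names here are illustrative, not from this repo):

    import gradio as gr

    with gr.Blocks() as demo:
        status = gr.HTML()
        # Blocks.load fires once when the page is first opened in the browser,
        # so the component has content before the user clicks anything.
        demo.load(fn=lambda: "<b>ready</b>", outputs=[status])

    demo.launch()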
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- gradio==5.20.0
2
  pandas
3
  matplotlib
4
  plotly
 
1
+ gradio==5.18.0
2
  pandas
3
  matplotlib
4
  plotly
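The only dependency change is pinning Gradio back from 5.20.0 to the last version the Space was known to work on. When behavior changes after a pin like this, a quick sanity check (a sketch, not part of the repo) is to log the resolved version at startup:

    import gradio as gr

    # Confirms which Gradio version the Space actually resolved at build time.
    print(f"Running Gradio {gr.__version__}")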
tabs/data_exploration.py CHANGED
@@ -395,292 +395,305 @@ def create_exploration_tab(df):
     """Create an enhanced data exploration tab with better UI and functionality."""
 
     # Main UI setup
-    # with gr.Tab("Data Exploration"):
-    # CSS styling (unchanged)
-    gr.HTML(
-        """
-        <style>
-        /* Custom styling for the exploration tab */
-        :root[data-theme="light"] {
-            --surface-color: #f8f9fa;
-            --surface-color-alt: #ffffff;
-            --text-color: #202124;
-            --text-muted: #666666;
-            --primary-text: #1a73e8;
-            --primary-text-light: rgba(26, 115, 232, 0.3);
-            --border-color: #e9ecef;
-            --border-color-light: #f1f3f5;
-            --shadow-color: rgba(0,0,0,0.05);
-            --message-bg-user: #E5F6FD;
-            --message-bg-assistant: #F7F7F8;
-            --message-bg-system: #FFF3E0;
-            --response-bg: #F0F7FF;
-            --score-high: #1a73e8;
-            --score-med: #f4b400;
-            --score-low: #ea4335;
-        }
-
-        :root[data-theme="dark"] {
-            --surface-color: #1e1e1e;
-            --surface-color-alt: #2d2d2d;
-            --text-color: #ffffff;
-            --text-muted: #a0a0a0;
-            --primary-text: #60a5fa;
-            --primary-text-light: rgba(96, 165, 250, 0.3);
-            --border-color: #404040;
-            --border-color-light: #333333;
-            --shadow-color: rgba(0,0,0,0.2);
-            --message-bg-user: #2d3748;
-            --message-bg-assistant: #1a1a1a;
-            --message-bg-system: #2c2516;
-            --response-bg: #1e2a3a;
-            --score-high: #60a5fa;
-            --score-med: #fbbf24;
-            --score-low: #ef4444;
-        }
-
-        #exploration-header {
-            margin-bottom: 1.5rem;
-            padding-bottom: 1rem;
-            border-bottom: 1px solid var(--border-color);
-        }
-
-        .filter-container {
-            background-color: var(--surface-color);
-            border-radius: 10px;
-            padding: 1rem;
-            margin-bottom: 1.5rem;
-            border: 1px solid var(--border-color);
-            box-shadow: 0 2px 6px var(--shadow-color);
-        }
-
-        .navigation-buttons button {
-            min-width: 120px;
-            font-weight: 500;
-        }
-
-        .content-panel {
-            margin-top: 1.5rem;
-        }
-
-        @media (max-width: 768px) {
-            .filter-row {
-                flex-direction: column;
-            }
-        }
-        </style>
-        """
-    )
-
-    # Header
-    with gr.Row(elem_id="exploration-header"):
-        gr.HTML(HEADER_CONTENT)
-
-    # Filters section
-    with gr.Column(elem_classes="filter-container"):
-        gr.Markdown("### 🔍 Filter Options")
-
-        with gr.Row(equal_height=True, elem_classes="filter-row"):
-            explore_model = gr.Dropdown(
-                choices=MODELS,
-                value=MODELS[0],
-                label="Model",
-                container=True,
-                scale=1,
-                info="Select AI model",
-            )
-            explore_dataset = gr.Dropdown(
-                choices=DATASETS,
-                value=DATASETS[0],
-                label="Dataset",
-                container=True,
-                scale=1,
-                info="Select evaluation dataset",
-            )
-
-        with gr.Row(equal_height=True, elem_classes="filter-row"):
-            min_score = gr.Slider(
-                minimum=float(min(SCORES)),
-                maximum=float(max(SCORES)),
-                value=float(min(SCORES)),
-                step=0.1,
-                label="Minimum TSQ Score",
-                container=True,
-                scale=1,
-                info="Filter responses with scores above this threshold",
-            )
-            max_score = gr.Slider(
-                minimum=float(min(SCORES)),
-                maximum=float(max(SCORES)),
-                value=float(max(SCORES)),
-                step=0.1,
-                label="Maximum TSQ Score",
-                container=True,
-                scale=1,
-                info="Filter responses with scores below this threshold",
-            )
-
-        # Get the data for initial ranges
-        df_chat = get_chat_and_score_df(explore_model.value, explore_dataset.value)
-
-        # Ensure columns exist and get ranges
-        n_turns_max = int(df_chat["n_turns"].max())
-        len_query_max = int(df_chat["len_query"].max())
-        n_tools_max = int(df_chat["n_tools"].max())
-
-        with gr.Row(equal_height=True, elem_classes="filter-row"):
-            n_turns_filter = gr.Slider(
-                minimum=0,
-                maximum=n_turns_max,
-                value=0,
-                step=1,
-                label="Minimum Turn Count",
-                container=True,
-                scale=1,
-                info="Filter by minimum number of conversation turns",
-            )
-
-            len_query_filter = gr.Slider(
-                minimum=0,
-                maximum=len_query_max,
-                value=0,
-                step=10,
-                label="Minimum Query Length",
-                container=True,
-                scale=1,
-                info="Filter by minimum length of query in characters",
-            )
-
-            n_tools_filter = gr.Slider(
-                minimum=0,
-                maximum=n_tools_max,
-                value=0,
-                step=1,
-                label="Minimum Tool Count",
-                container=True,
-                scale=1,
-                info="Filter by minimum number of tools used",
-            )
-
-        with gr.Row():
-            reset_btn = gr.Button("Reset Filters", size="sm", variant="secondary")
-
-    # Navigation row
-    with gr.Row(variant="panel"):
-        with gr.Column(scale=1):
-            prev_btn = gr.Button(
-                "← Previous",
-                size="lg",
-                variant="secondary",
-                elem_classes="navigation-buttons",
-            )
-
-        with gr.Column(scale=1, min_width=100):
-            # Get initial count from default data
-            df_initial = get_chat_and_score_df(MODELS[0], DATASETS[0])
-            initial_count = len(df_initial)
-
-            index_display = gr.HTML(
-                value=f"""<div style="
-                    display: flex;
-                    align-items: center;
-                    justify-content: center;
-                    font-weight: 500;
-                    color: var(--primary-text);
-                    background-color: var(--surface-color-alt);
-                    padding: 0.5rem 1rem;
-                    border-radius: 20px;
-                    font-size: 0.9rem;
-                    width: fit-content;
-                    margin: 0 auto;">
-                <span style="margin-right: 0.5rem;">📄</span>1/{initial_count}
-                </div>""",
-                elem_id="index-display",
-            )
-
-        with gr.Column(scale=1):
-            next_btn = gr.Button(
-                "Next →",
-                size="lg",
-                variant="secondary",
-                elem_classes="navigation-buttons",
-            )
-
-    # Content areas
-    with gr.Row(equal_height=True):
-        with gr.Column(scale=1):
-            chat_display = gr.HTML()
-        with gr.Column(scale=1):
-            metrics_display = gr.HTML()
-
-    with gr.Row():
-        tool_info_display = gr.HTML()
-
-    # State for tracking current index (simple integer state)
-    current_index = gr.State(value=0)
-
-    def reset_index():
-        """Reset the current index to 0"""
-        return 0
-
-    # Add these explicit event handlers for model and dataset changes
-    explore_model.change(
-        reset_index,
-        inputs=[],
-        outputs=[current_index],
-    )
-
-    explore_dataset.change(
-        reset_index,
-        inputs=[],
-        outputs=[current_index],
-    )
-
-    min_score.change(
-        reset_index,
-        inputs=[],
-        outputs=[current_index],
-    )
-
-    max_score.change(
-        reset_index,
-        inputs=[],
-        outputs=[current_index],
-    )
-
-    n_turns_filter.change(
-        reset_index,
-        inputs=[],
-        outputs=[current_index],
-    )
-
-    len_query_filter.change(
-        reset_index,
-        inputs=[],
-        outputs=[current_index],
-    )
-
-    n_tools_filter.change(
-        reset_index,
-        inputs=[],
-        outputs=[current_index],
-    )
-
-    # Reset filters
-    def reset_filters():
-        return (
-            MODELS[0],
-            DATASETS[0],
-            float(min(SCORES)),
-            float(max(SCORES)),
-            0,  # n_turns
-            0,  # len_query
-            0,  # n_tools
-        )
-
-    reset_btn.click(
-        reset_filters,
-        outputs=[
+    with gr.Tab("Data Exploration"):
+        # CSS styling (unchanged)
+        gr.HTML(
+            """
+            <style>
+            /* Custom styling for the exploration tab */
+            :root[data-theme="light"] {
+                --surface-color: #f8f9fa;
+                --surface-color-alt: #ffffff;
+                --text-color: #202124;
+                --text-muted: #666666;
+                --primary-text: #1a73e8;
+                --primary-text-light: rgba(26, 115, 232, 0.3);
+                --border-color: #e9ecef;
+                --border-color-light: #f1f3f5;
+                --shadow-color: rgba(0,0,0,0.05);
+                --message-bg-user: #E5F6FD;
+                --message-bg-assistant: #F7F7F8;
+                --message-bg-system: #FFF3E0;
+                --response-bg: #F0F7FF;
+                --score-high: #1a73e8;
+                --score-med: #f4b400;
+                --score-low: #ea4335;
+            }
+
+            :root[data-theme="dark"] {
+                --surface-color: #1e1e1e;
+                --surface-color-alt: #2d2d2d;
+                --text-color: #ffffff;
+                --text-muted: #a0a0a0;
+                --primary-text: #60a5fa;
+                --primary-text-light: rgba(96, 165, 250, 0.3);
+                --border-color: #404040;
+                --border-color-light: #333333;
+                --shadow-color: rgba(0,0,0,0.2);
+                --message-bg-user: #2d3748;
+                --message-bg-assistant: #1a1a1a;
+                --message-bg-system: #2c2516;
+                --response-bg: #1e2a3a;
+                --score-high: #60a5fa;
+                --score-med: #fbbf24;
+                --score-low: #ef4444;
+            }
+
+            #exploration-header {
+                margin-bottom: 1.5rem;
+                padding-bottom: 1rem;
+                border-bottom: 1px solid var(--border-color);
+            }
+
+            .filter-container {
+                background-color: var(--surface-color);
+                border-radius: 10px;
+                padding: 1rem;
+                margin-bottom: 1.5rem;
+                border: 1px solid var(--border-color);
+                box-shadow: 0 2px 6px var(--shadow-color);
+            }
+
+            .navigation-buttons button {
+                min-width: 120px;
+                font-weight: 500;
+            }
+
+            .content-panel {
+                margin-top: 1.5rem;
+            }
+
+            @media (max-width: 768px) {
+                .filter-row {
+                    flex-direction: column;
+                }
+            }
+            </style>
+            """
+        )
+
+        # Header
+        with gr.Row(elem_id="exploration-header"):
+            gr.HTML(HEADER_CONTENT)
+
+        # Filters section
+        with gr.Column(elem_classes="filter-container"):
+            gr.Markdown("### 🔍 Filter Options")
+
+            with gr.Row(equal_height=True, elem_classes="filter-row"):
+                explore_model = gr.Dropdown(
+                    choices=MODELS,
+                    value=MODELS[0],
+                    label="Model",
+                    container=True,
+                    scale=1,
+                    info="Select AI model",
+                )
+                explore_dataset = gr.Dropdown(
+                    choices=DATASETS,
+                    value=DATASETS[0],
+                    label="Dataset",
+                    container=True,
+                    scale=1,
+                    info="Select evaluation dataset",
+                )
+
+            with gr.Row(equal_height=True, elem_classes="filter-row"):
+                min_score = gr.Slider(
+                    minimum=float(min(SCORES)),
+                    maximum=float(max(SCORES)),
+                    value=float(min(SCORES)),
+                    step=0.1,
+                    label="Minimum TSQ Score",
+                    container=True,
+                    scale=1,
+                    info="Filter responses with scores above this threshold",
+                )
+                max_score = gr.Slider(
+                    minimum=float(min(SCORES)),
+                    maximum=float(max(SCORES)),
+                    value=float(max(SCORES)),
+                    step=0.1,
+                    label="Maximum TSQ Score",
+                    container=True,
+                    scale=1,
+                    info="Filter responses with scores below this threshold",
+                )
+
+            # Get the data for initial ranges
+            df_chat = get_chat_and_score_df(explore_model.value, explore_dataset.value)
+
+            # Ensure columns exist and get ranges
+            n_turns_max = int(df_chat["n_turns"].max())
+            len_query_max = int(df_chat["len_query"].max())
+            n_tools_max = int(df_chat["n_tools"].max())
+
+            with gr.Row(equal_height=True, elem_classes="filter-row"):
+                n_turns_filter = gr.Slider(
+                    minimum=0,
+                    maximum=n_turns_max,
+                    value=0,
+                    step=1,
+                    label="Minimum Turn Count",
+                    container=True,
+                    scale=1,
+                    info="Filter by minimum number of conversation turns",
+                )
+
+                len_query_filter = gr.Slider(
+                    minimum=0,
+                    maximum=len_query_max,
+                    value=0,
+                    step=10,
+                    label="Minimum Query Length",
+                    container=True,
+                    scale=1,
+                    info="Filter by minimum length of query in characters",
+                )
+
+                n_tools_filter = gr.Slider(
+                    minimum=0,
+                    maximum=n_tools_max,
+                    value=0,
+                    step=1,
+                    label="Minimum Tool Count",
+                    container=True,
+                    scale=1,
+                    info="Filter by minimum number of tools used",
+                )
+
+            with gr.Row():
+                reset_btn = gr.Button("Reset Filters", size="sm", variant="secondary")
+
+        # Navigation row
+        with gr.Row(variant="panel"):
+            with gr.Column(scale=1):
+                prev_btn = gr.Button(
+                    "← Previous",
+                    size="lg",
+                    variant="secondary",
+                    elem_classes="navigation-buttons",
+                )
+
+            with gr.Column(scale=1, min_width=100):
+                # Get initial count from default data
+                df_initial = get_chat_and_score_df(MODELS[0], DATASETS[0])
+                initial_count = len(df_initial)
+
+                index_display = gr.HTML(
+                    value=f"""<div style="
+                        display: flex;
+                        align-items: center;
+                        justify-content: center;
+                        font-weight: 500;
+                        color: var(--primary-text);
+                        background-color: var(--surface-color-alt);
+                        padding: 0.5rem 1rem;
+                        border-radius: 20px;
+                        font-size: 0.9rem;
+                        width: fit-content;
+                        margin: 0 auto;">
+                    <span style="margin-right: 0.5rem;">📄</span>1/{initial_count}
+                    </div>""",
+                    elem_id="index-display",
+                )
+
+            with gr.Column(scale=1):
+                next_btn = gr.Button(
+                    "Next →",
+                    size="lg",
+                    variant="secondary",
+                    elem_classes="navigation-buttons",
+                )
+
+        # Content areas
+        with gr.Row(equal_height=True):
+            with gr.Column(scale=1):
+                chat_display = gr.HTML()
+            with gr.Column(scale=1):
+                metrics_display = gr.HTML()
+
+        with gr.Row():
+            tool_info_display = gr.HTML()
+
+        # State for tracking current index (simple integer state)
+        current_index = gr.State(value=0)
+
+        def reset_index():
+            """Reset the current index to 0"""
+            return 0
+
+        # Add these explicit event handlers for model and dataset changes
+        explore_model.change(
+            reset_index,
+            inputs=[],
+            outputs=[current_index],
+        )
+
+        explore_dataset.change(
+            reset_index,
+            inputs=[],
+            outputs=[current_index],
+        )
+
+        min_score.change(
+            reset_index,
+            inputs=[],
+            outputs=[current_index],
+        )
+
+        max_score.change(
+            reset_index,
+            inputs=[],
+            outputs=[current_index],
+        )
+
+        n_turns_filter.change(
+            reset_index,
+            inputs=[],
+            outputs=[current_index],
+        )
+
+        len_query_filter.change(
+            reset_index,
+            inputs=[],
+            outputs=[current_index],
+        )
+
+        n_tools_filter.change(
+            reset_index,
+            inputs=[],
+            outputs=[current_index],
+        )
+
+        # Reset filters
+        def reset_filters():
+            return (
+                MODELS[0],
+                DATASETS[0],
+                float(min(SCORES)),
+                float(max(SCORES)),
+                0,  # n_turns
+                0,  # len_query
+                0,  # n_tools
+            )
+
+        reset_btn.click(
+            reset_filters,
+            outputs=[
+                explore_model,
+                explore_dataset,
+                min_score,
+                max_score,
+                n_turns_filter,
+                len_query_filter,
+                n_tools_filter,
+            ],
+        )
+
+        # Connect filter changes
+        # Replace the existing filter connections with this:
+        for control in [
             explore_model,
             explore_dataset,
             min_score,
@@ -688,23 +701,31 @@ def create_exploration_tab(df):
             n_turns_filter,
             len_query_filter,
             n_tools_filter,
-        ],
-    )
-
-    # Connect filter changes
-    # Replace the existing filter connections with this:
-    for control in [
-        explore_model,
-        explore_dataset,
-        min_score,
-        max_score,
-        n_turns_filter,
-        len_query_filter,
-        n_tools_filter,
-    ]:
-        control.change(
-            on_filter_change,
+        ]:
+            control.change(
+                on_filter_change,
+                inputs=[
+                    explore_model,
+                    explore_dataset,
+                    min_score,
+                    max_score,
+                    n_turns_filter,
+                    len_query_filter,
+                    n_tools_filter,
+                ],
+                outputs=[
+                    chat_display,
+                    metrics_display,
+                    tool_info_display,
+                    index_display,
+                ],
+            )
+
+        # Connect navigation buttons with necessary filter parameters
+        prev_btn.click(
+            navigate_prev,
             inputs=[
+                current_index,
                 explore_model,
                 explore_dataset,
                 min_score,
@@ -718,93 +739,72 @@ def create_exploration_tab(df):
                 metrics_display,
                 tool_info_display,
                 index_display,
+                current_index,
             ],
         )
 
-    # Connect navigation buttons with necessary filter parameters
-    prev_btn.click(
-        navigate_prev,
-        inputs=[
-            current_index,
-            explore_model,
-            explore_dataset,
-            min_score,
-            max_score,
-            n_turns_filter,
-            len_query_filter,
-            n_tools_filter,
-        ],
-        outputs=[
-            chat_display,
-            metrics_display,
-            tool_info_display,
-            index_display,
-            current_index,
-        ],
-    )
-
-    next_btn.click(
-        navigate_next,
-        inputs=[
-            current_index,
-            explore_model,
-            explore_dataset,
-            min_score,
-            max_score,
-            n_turns_filter,
-            len_query_filter,
-            n_tools_filter,
-        ],
-        outputs=[
-            chat_display,
-            metrics_display,
-            tool_info_display,
-            index_display,
-            current_index,
-        ],
-    )
-
-    def update_slider_ranges(model, dataset):
-        df_chat = get_chat_and_score_df(model, dataset)
-
-        # Make sure columns are numeric first
-        df_chat["n_turns"] = pd.to_numeric(df_chat["n_turns"], errors="coerce").fillna(
-            0
-        )
-        df_chat["len_query"] = pd.to_numeric(
-            df_chat["len_query"], errors="coerce"
-        ).fillna(0)
-        df_chat["n_tools"] = pd.to_numeric(df_chat["n_tools"], errors="coerce").fillna(
-            0
-        )
-
-        # Calculate maximums with safety buffers
-        n_turns_max = max(1, int(df_chat["n_turns"].max()))
-        len_query_max = max(10, int(df_chat["len_query"].max()))
-        n_tools_max = max(1, int(df_chat["n_tools"].max()))
-
-        # Return updated sliders using gr.update()
-        return (
-            gr.update(maximum=n_turns_max, value=0),
-            gr.update(maximum=len_query_max, value=0),
-            gr.update(maximum=n_tools_max, value=0),
-        )
-
-    # Connect model and dataset changes to slider range updates
-    explore_model.change(
-        update_slider_ranges,
-        inputs=[explore_model, explore_dataset],
-        outputs=[n_turns_filter, len_query_filter, n_tools_filter],
-    )
-    explore_dataset.change(
-        update_slider_ranges,
-        inputs=[explore_model, explore_dataset],
-        outputs=[n_turns_filter, len_query_filter, n_tools_filter],
-    )
-
-    return [
-        chat_display,
-        metrics_display,
-        tool_info_display,
-        index_display,
-    ]
+            next_btn.click(
+                navigate_next,
+                inputs=[
+                    current_index,
+                    explore_model,
+                    explore_dataset,
+                    min_score,
+                    max_score,
+                    n_turns_filter,
+                    len_query_filter,
+                    n_tools_filter,
+                ],
+                outputs=[
+                    chat_display,
+                    metrics_display,
+                    tool_info_display,
+                    index_display,
+                    current_index,
+                ],
+            )
+
+        def update_slider_ranges(model, dataset):
+            df_chat = get_chat_and_score_df(model, dataset)
+
+            # Make sure columns are numeric first
+            df_chat["n_turns"] = pd.to_numeric(
+                df_chat["n_turns"], errors="coerce"
+            ).fillna(0)
+            df_chat["len_query"] = pd.to_numeric(
+                df_chat["len_query"], errors="coerce"
+            ).fillna(0)
+            df_chat["n_tools"] = pd.to_numeric(
+                df_chat["n_tools"], errors="coerce"
+            ).fillna(0)
+
+            # Calculate maximums with safety buffers
+            n_turns_max = max(1, int(df_chat["n_turns"].max()))
+            len_query_max = max(10, int(df_chat["len_query"].max()))
+            n_tools_max = max(1, int(df_chat["n_tools"].max()))
+
+            # Return updated sliders using gr.update()
+            return (
+                gr.update(maximum=n_turns_max, value=0),
+                gr.update(maximum=len_query_max, value=0),
+                gr.update(maximum=n_tools_max, value=0),
+            )
+
+        # Connect model and dataset changes to slider range updates
+        explore_model.change(
+            update_slider_ranges,
+            inputs=[explore_model, explore_dataset],
+            outputs=[n_turns_filter, len_query_filter, n_tools_filter],
+        )
+        explore_dataset.change(
+            update_slider_ranges,
+            inputs=[explore_model, explore_dataset],
+            outputs=[n_turns_filter, len_query_filter, n_tools_filter],
+        )
+
+        return [
+            chat_display,
+            metrics_display,
+            tool_info_display,
+            index_display,
+        ]
tabs/leaderboard.py CHANGED
@@ -1,329 +1,16 @@
 import gradio as gr
 
 from data_loader import CATEGORIES, DESCRIPTION_HTML, CARDS
+from visualization import (
+    get_performance_chart,
+    get_performance_cost_chart,
+)
 from utils import (
     get_rank_badge,
     get_score_bar,
     get_type_badge,
 )
 
-from utils import get_chart_colors
-import matplotlib
-import matplotlib.pyplot as plt
-import numpy as np
-import plotly.graph_objects as go
-from plotly.subplots import make_subplots
-import plotly.express as px
-from matplotlib.colors import LinearSegmentedColormap
-
-
-def get_performance_chart(df, category_name="Overall"):
-    plt.close("all")
-    score_column = "Category Score"
-    # Sort in ascending order (lowest scores at top, highest at bottom) to match the screenshot
-    df_sorted = df.sort_values(score_column, ascending=True)
-
-    # Create a Plotly figure
-    fig = go.Figure()
-
-    # Define colors for model types - these match the image exactly
-    color_map = {
-        "Private": "#4a9bf7",  # Blue for closed source
-        "Open source": "#b56ad7",  # Purple for open source
-    }
-
-    # Add horizontal bars
-    for i, row in df_sorted.iterrows():
-        model_type = row["Model Type"]
-
-        fig.add_trace(
-            go.Bar(
-                x=[row[score_column]],
-                y=[row["Model"] + " "],
-                orientation="h",
-                marker=dict(
-                    color=color_map[model_type],
-                    line=dict(width=0),
-                ),
-                text=f"{row[score_column]:.3f}",
-                textposition="outside",
-                textfont=dict(
-                    size=16, color="white", family="Arial, sans-serif"
-                ),  # Improved visibility
-                hoverinfo="text",
-                hovertext=f"{row['Model']}: {row[score_column]:.3f}",
-                showlegend=False,
-                width=0.65,  # Make bars thinner for cleaner appearance
-            )
-        )
-
-    # Create a custom legend
-    for model_type, color in color_map.items():
-        display_name = "Closed source" if model_type == "Private" else model_type
-        fig.add_trace(
-            go.Bar(
-                x=[None],
-                y=[None],
-                orientation="h",
-                marker=dict(color=color),
-                showlegend=True,
-                name=display_name,
-            )
-        )
-
-    # Theme colors - will be set by CSS
-    plot_bg = "rgb(25, 28, 38)"  # Default dark theme
-    paper_bg = "rgb(25, 28, 38)"
-    text_color = "white"
-    grid_color = "rgba(150, 150, 150, 0.2)"
-    legend_bg = "rgba(25, 28, 38, 0.7)"
-
-    # Calculate a generous height based on the number of items
-    # Use a minimum height and a larger per-item height factor
-    min_height = 600
-    height_per_item = 50  # Increased spacing between bars
-    chart_height = max(min_height, len(df_sorted) * height_per_item)
-
-    fig.update_layout(
-        title=dict(
-            text=f"Ranking - {category_name}",
-            font=dict(size=28, color=text_color),
-            x=0.5,
-            y=0.98,
-            xanchor="center",
-        ),
-        xaxis=dict(
-            title=dict(
-                text="Average Score (Tool Selection Quality)",
-                font=dict(size=16, color=text_color),
-            ),
-            range=[0, 1.05],
-            gridcolor=grid_color,
-            gridwidth=1,
-            tickfont=dict(size=16, color=text_color),
-            zeroline=False,
-            tickformat=".1f",
-            showgrid=True,
-            dtick=0.2,  # Set tick spacing to match image
-        ),
-        yaxis=dict(
-            tickfont=dict(size=16, color=text_color),
-            automargin=True,
-        ),
-        margin=dict(l=30, r=50, t=100, b=80),
-        height=chart_height,
-        autosize=True,  # Enable autosize for responsiveness
-        bargap=0.15,
-        bargroupgap=0.1,
-        barmode="group",
-        legend=dict(
-            title=dict(text="Model Type", font=dict(size=18, color=text_color)),
-            font=dict(size=16, color=text_color),
-            x=0.4,
-            y=-0.15,
-            xanchor="center",
-            yanchor="top",
-            orientation="h",
-            bgcolor=legend_bg,
-        ),
-        plot_bgcolor=plot_bg,
-        paper_bgcolor=paper_bg,
-        font=dict(color=text_color),
-    )
-
-    # Add grid lines that match the image
-    for x in [0.2, 0.4, 0.6, 0.8]:
-        fig.add_shape(
-            type="line",
-            x0=x,
-            y0=0,
-            x1=x,
-            y1=1,
-            yref="paper",
-            line=dict(color=grid_color, width=1),
-        )
-
-    return fig
-
-
-def get_performance_cost_chart(df, category_name="Overall"):
-    plt.close("all")
-    score_column = "Category Score"
-
-    # Create a Plotly figure
-    fig = go.Figure()
-
-    # Define colors for model types
-    color_map = {
-        "Private": "#4a9bf7",  # Blue for closed source
-        "Open source": "#b56ad7",  # Purple for open source
-    }
-
-    # Dark theme colors
-    plot_bg = "rgb(25, 28, 38)"
-    paper_bg = "rgb(25, 28, 38)"
-    text_color = "white"
-    grid_color = "rgba(150, 150, 150, 0.2)"
-    legend_bg = "rgba(25, 28, 38, 0.7)"
-
-    # Add scatter points for each model
-    for _, row in df.iterrows():
-        model_type = row["Model Type"]
-
-        # Add model point
-        fig.add_trace(
-            go.Scatter(
-                x=[row["IO Cost"]],
-                y=[row[score_column] * 100],  # Convert to percentage scale
-                mode="markers",
-                marker=dict(
-                    color=color_map[model_type],
-                    size=15,
-                    line=dict(width=1, color="white"),
-                    opacity=0.9,
-                ),
-                name=row["Model"],
-                text=f"{row['Model']}<br>${row['IO Cost']:.2f}<br>{row[score_column]:.3f}",
-                hoverinfo="text",
-                showlegend=False,
-            )
-        )
-
-        # Add model label
-        fig.add_trace(
-            go.Scatter(
-                x=[row["IO Cost"]],
-                y=[row[score_column] * 100 + 0.8],
-                mode="text",
-                text=row["Model"],  # + f" (${row['IO Cost']:.2f})",
-                textposition="top center",
-                textfont=dict(color=text_color, size=10),
-                hoverinfo="none",
-                showlegend=False,
-            )
-        )
-
-    # Create a custom legend
-    for model_type, color in color_map.items():
-        display_name = "Closed source" if model_type == "Private" else model_type
-        fig.add_trace(
-            go.Scatter(
-                x=[None],
-                y=[None],
-                mode="markers",
-                marker=dict(color=color, size=10, line=dict(width=1, color="white")),
-                name=display_name,
-            )
-        )
-
-    # Add performance bands
-    performance_bands = [
-        {
-            "range": [85, 100],
-            "color": "rgba(52, 211, 153, 0.2)",
-            "label": "Reliable Zone",
-        },
-        {"range": [75, 85], "color": "rgba(251, 191, 36, 0.2)", "label": "Good Zone"},
-        {"range": [60, 75], "color": "rgba(239, 68, 68, 0.2)", "label": "Risky Zone"},
-    ]
-
-    for band in performance_bands:
-        fig.add_trace(
-            go.Scatter(
-                x=[0.05, 100],
-                y=[band["range"][0], band["range"][0]],
-                mode="lines",
-                line=dict(color="rgba(255, 255, 255, 0.3)", width=1, dash="dash"),
-                showlegend=False,
-            )
-        )
-
-        fig.add_shape(
-            type="rect",
-            x0=0.08,
-            x1=1000,
-            y0=band["range"][0],
-            y1=band["range"][1],
-            fillcolor=band["color"],
-            line=dict(width=0),
-            layer="below",
-        )
-
-    # Update layout
-    fig.update_layout(
-        title=dict(
-            text=f"Performance vs. Cost - {category_name}",
-            font=dict(size=28, color=text_color),
-            x=0.5,
-            y=0.98,
-            xanchor="center",
-        ),
-        xaxis=dict(
-            title=dict(
-                text="I/O Cost per Million Tokens ($)",
-                font=dict(size=14, color=text_color),
-            ),
-            type="log",
-            range=[-1.2, 2.1],  # log10 scale from 0.08 to 100
-            gridcolor=grid_color,
-            gridwidth=1,
-            tickfont=dict(size=12, color=text_color),
-            zeroline=False,
-            showgrid=True,
-        ),
-        yaxis=dict(
-            title=dict(
-                text="Average Score (Tool Selection Quality)",
-                font=dict(size=14, color=text_color),
-            ),
-            range=[60, 100],
-            gridcolor=grid_color,
-            gridwidth=1,
-            tickfont=dict(size=12, color=text_color),
-            zeroline=False,
-            showgrid=True,
-        ),
-        margin=dict(l=20, r=20, t=80, b=80),  # Increased bottom margin for legend
-        autosize=True,
-        height=900,  # Increased height
-        # width=1600,
-        legend=dict(
-            title=dict(text="Model Type", font=dict(size=14, color=text_color)),
-            font=dict(size=12, color=text_color),
-            x=0.5,
-            y=-0.15,
-            xanchor="center",
-            yanchor="top",
-            orientation="h",
-            bgcolor=legend_bg,
-        ),
-        plot_bgcolor=plot_bg,
-        paper_bgcolor=paper_bg,
-        font=dict(color=text_color),
-        hovermode="closest",
-    )
-
-    # Add annotations for performance bands
-    for i, band in enumerate(performance_bands):
-        fig.add_annotation(
-            x=1.5,
-            y=(band["range"][0] + band["range"][1]) / 2 + 3,
-            text=band["label"],
-            showarrow=False,
-            font=dict(size=15, color=text_color),
-            xanchor="left",
-            yanchor="middle",
-            xshift=5,
-        )
-
-    # Keep only dark theme - remove theme detection and switching
-    fig.update_layout(
-        autosize=True,
-    )
-
-    return fig
-
-
 def filter_leaderboard(df, model_type, category, sort_by):
     filtered_df = df.copy()
     if model_type != "All":
@@ -338,14 +25,9 @@ def filter_leaderboard(df, model_type, category, sort_by):
         filtered_df = filtered_df.sort_values(by="IO Cost", ascending=True)
 
     filtered_df["Rank"] = range(1, len(filtered_df) + 1)
-
-    # Get charts
     perf_chart = get_performance_chart(filtered_df, category)
     cost_chart = get_performance_cost_chart(filtered_df, category)
 
-    # Don't override the chart settings here - this was causing conflicts
-    # The responsiveness is now handled in the chart creation functions
-
     # Generate styled table HTML
     table_html = f"""
     <style>
@@ -470,240 +152,55 @@ def filter_leaderboard(df, model_type, category, sort_by):
         </tr>
         """
 
-    table_html += """
-        </tbody>
-        </table>
-    </div>
-    """
-
    return table_html, perf_chart, cost_chart
 
 
 def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
-    chart_container_css = """
-    <style>
-    /* Chart container styling */
-    .chart-container {
-        display: flex;
-        justify-content: center;
-        align-items: center;
-        width: 100%;
-        margin: 20px 0;
-        position: relative;
-        /* Don't fix the height in CSS */
-    }
-
-    /* Plotly responsive container - use relative width */
-    .js-plotly-plot, .plot-container, .plotly {
-        width: 100% !important;
-        max-width: 1200px !important;
-        margin: 0 auto !important;
-    }
-
-    /* SVG container - make it fully responsive */
-    .js-plotly-plot .svg-container {
-        width: 100% !important;
-    }
-
-    /* Dark mode styles */
-    .dark-theme .chart-title {
-        color: white;
-        text-align: center;
-        font-size: 24px;
-        margin-top: 40px;
-        margin-bottom: 15px;
-    }
-
-    /* Ensure chart text is visible */
-    .js-plotly-plot text {
-        font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif !important;
-        font-size: 14px !important;
-    }
-
-    /* Responsive adjustments */
-    @media (max-width: 768px) {
-        .js-plotly-plot text {
-            font-size: 12px !important;
-        }
-    }
-
-    /* Apply font styling to non-title text elements */
-    .js-plotly-plot text:not(.gtitle) {
-        font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif !important;
-        font-size: 14px !important;
-    }
-
-    /* Specific styling for chart titles */
-    .js-plotly-plot .gtitle {
-        font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif !important;
-        font-size: 28px !important;
-    }
-    </style>
-    """
-
-    # Start content directly
-    gr.HTML(HEADER_CONTENT + CARDS)
-    gr.HTML(DESCRIPTION_HTML)
-
-    # Add our custom CSS
-    gr.HTML(chart_container_css)
-
-    # Filters row
-    with gr.Row(equal_height=True):
-        with gr.Column(scale=1):
-            model_type = gr.Dropdown(
-                choices=["All"] + df["Model Type"].unique().tolist(),
-                value="All",
-                label="Model Type",
-            )
-        with gr.Column(scale=1):
-            category = gr.Dropdown(
-                choices=list(CATEGORIES.keys()),
-                value=list(CATEGORIES.keys())[0],
-                label="Category",
-            )
-        with gr.Column(scale=1):
-            sort_by = gr.Radio(
-                choices=["Performance", "Cost"],
-                value="Performance",
-                label="Sort by",
-            )
-
-    # Content
-    output = gr.HTML()
-
-    # Performance chart - don't specify height in HTML
-    with gr.Row():
-        with gr.Column():
-            gr.HTML('<div class="chart-container">')
-            plot1 = gr.Plot(elem_id="plot1")
-            gr.HTML("</div>")
-
-    # Cost performance chart - don't specify height in HTML
-    with gr.Row():
-        with gr.Column():
-            gr.HTML('<div class="chart-container">')
-            plot2 = gr.Plot(elem_id="plot2")
-            gr.HTML("</div>")
-
-    gr.HTML(
-        """<div class="note-box">
-        <p style="margin: 0; font-size: 1em;">
-            Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation.
-        </p>
-        </div>"""
-    )
-
-    gr.HTML(METHODOLOGY)
-
-    # Enhanced resize script - improved to be more responsive
-    resize_js = """
-    <script>
-    // Improved function to handle responsive Plotly charts
-    function resizePlots() {
-        // Find all plot containers
-        const plotContainers = document.querySelectorAll('.js-plotly-plot');
-        if (!plotContainers.length) {
-            // If containers aren't ready yet, retry shortly
-            setTimeout(resizePlots, 100);
-            return;
-        }
-
-        // Get the available width for the container
-        const containerWidth = document.querySelector('.chart-container').offsetWidth;
-
-        plotContainers.forEach(container => {
-            // Calculate appropriate dimensions based on container width
-            let containerHeight;
-
-            // Different height calculation based on chart type
-            if (container.id.includes('plot1')) {
-                // Performance chart - use sizing from reference code
-                const barCount = container.querySelectorAll('.bars .point').length || 20; // Default if can't detect
-                // Convert from matplotlib sizing approach: height = max(8, len(df_sorted) * 0.8) in inches * pixels per inch
-                const heightInInches = Math.max(8, barCount * 0.8);
-                containerHeight = heightInInches * 80; // Convert inches to pixels (approx)
-            } else {
-                // Cost chart - use fixed size from reference code (12x8 inches)
-                containerHeight = 640; // 8 inches * 80 pixels per inch
-                // Keep width proportional to container up to max width
-                const maxWidth = 960; // 12 inches * 80 pixels per inch
-                container.style.maxWidth = maxWidth + 'px';
-            }
-
-            // Apply dimensions
-            container.style.width = '100%';
-            container.style.height = containerHeight + 'px';
-
-            // Find and resize the SVG elements
-            const svgElements = container.querySelectorAll('svg');
-            svgElements.forEach(svg => {
-                svg.style.width = '100%';
-                svg.style.height = containerHeight + 'px';
-            });
-
-            // Find the main SVG container and resize it
-            const svgContainer = container.querySelector('.svg-container');
-            if (svgContainer) {
-                svgContainer.style.width = '100%';
-                svgContainer.style.height = containerHeight + 'px';
-            }
-        });
-
-        // Trigger window resize to make Plotly redraw
-        window.dispatchEvent(new Event('resize'));
-    }
-
-    // Functions to run when content changes or window resizes
-    function setupResizeHandlers() {
-        // Initial resize
-        resizePlots();
-
-        // Handle window resize
-        window.addEventListener('resize', function() {
-            resizePlots();
-        });
-
-        // Set up a mutation observer to detect when plots are added/changed
-        const observer = new MutationObserver(function(mutations) {
-            mutations.forEach(function(mutation) {
-                if (mutation.addedNodes.length ||
-                    mutation.type === 'attributes' &&
-                    mutation.target.classList.contains('js-plotly-plot')) {
-                    resizePlots();
-                }
-            });
-        });
-
-        // Observe the entire document for changes
-        observer.observe(document.body, {
-            childList: true,
-            subtree: true,
-            attributes: true,
-            attributeFilter: ['style', 'class']
-        });
-    }
-
-    // Run when DOM is fully loaded
-    if (document.readyState === 'loading') {
-        document.addEventListener('DOMContentLoaded', setupResizeHandlers);
-    } else {
-        setupResizeHandlers();
-    }
-
-    // Also resize periodically for a bit after initial load to ensure everything renders properly
-    for (let i = 1; i <= 10; i++) {
-        setTimeout(resizePlots, i * 500);
-    }
-    </script>
-    """
-    gr.HTML(resize_js)
-
-    for input_comp in [model_type, category, sort_by]:
-        input_comp.change(
-            fn=lambda m, c, s: filter_leaderboard(df, m, c, s),
-            inputs=[model_type, category, sort_by],
-            outputs=[output, plot1, plot2],
-        )
-
-    return output, plot1, plot2
+    with gr.Tab("Leaderboard"):
+        gr.HTML(HEADER_CONTENT + CARDS)
+        gr.HTML(DESCRIPTION_HTML)
+
+        # Filters row
+        with gr.Row(equal_height=True):
+            with gr.Column(scale=1):
+                model_type = gr.Dropdown(
+                    choices=["All"] + df["Model Type"].unique().tolist(),
+                    value="All",
+                    label="Model Type",
+                )
+            with gr.Column(scale=1):
+                category = gr.Dropdown(
+                    choices=list(CATEGORIES.keys()),
+                    value=list(CATEGORIES.keys())[0],
+                    label="Category",
+                )
+            with gr.Column(scale=1):
+                sort_by = gr.Radio(
+                    choices=["Performance", "Cost"],
+                    value="Performance",
+                    label="Sort by",
+                )
+
+        # Content
+        output = gr.HTML()
+        plot1 = gr.Plot()
+        plot2 = gr.Plot()
+
+        gr.HTML(
+            """<div class="note-box">
+            <p style="margin: 0; font-size: 1em;">
+                Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation.
+            </p>
+            </div>"""
+        )
+
+        gr.HTML(METHODOLOGY)
+
+        for input_comp in [model_type, category, sort_by]:
+            input_comp.change(
+                fn=lambda m, c, s: filter_leaderboard(df, m, c, s),
+                inputs=[model_type, category, sort_by],
+                outputs=[output, plot1, plot2],
+            )
+
+        return output, plot1, plot2
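This diff is mostly deletion: the Plotly chart builders and the JavaScript resize shim leave this file (their replacements live in the new visualization.py), and the remaining UI is wrapped in with gr.Tab("Leaderboard"):. The closing loop attaches one handler to every filter control; a sketch of that wiring idea (names here are illustrative):

    import gradio as gr

    def render(kind, order):
        return f"<p>{kind} models sorted by {order}</p>"

    with gr.Blocks() as demo:
        kind = gr.Dropdown(choices=["All", "Open source"], value="All", label="Model Type")
        order = gr.Radio(choices=["Performance", "Cost"], value="Performance", label="Sort by")
        out = gr.HTML()
        # One handler re-registered per control: any filter change re-renders the output.
        for control in [kind, order]:
            control.change(fn=render, inputs=[kind, order], outputs=[out])

    demo.launch()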
tabs/model_comparison.py CHANGED
@@ -1,96 +1,5 @@
 import gradio as gr
-from utils import get_chart_colors
-import matplotlib
-import matplotlib.pyplot as plt
-import numpy as np
-import plotly.graph_objects as go
-from plotly.subplots import make_subplots
-import plotly.express as px
-from matplotlib.colors import LinearSegmentedColormap
-
-
-def create_radar_plot(df, model_names):
-    datasets = [col for col in df.columns[7:] if col != "IO Cost"]
-    fig = go.Figure()
-
-    # Dark theme colors - match other charts
-    plot_bg = "rgb(25, 28, 38)"
-    paper_bg = "rgb(25, 28, 38)"
-    text_color = "white"
-    grid_color = "rgba(150, 150, 150, 0.2)"
-    legend_bg = "rgba(25, 28, 38, 0.7)"
-
-    # Update colors for dark theme - more vibrant with better contrast
-    colors = [
-        "rgba(74, 155, 247, 0.3)",
-        "rgba(181, 106, 215, 0.3)",
-    ]  # Match color_map from other charts
-    line_colors = ["#4a9bf7", "#b56ad7"]  # Match color_map from other charts
-
-    for idx, model_name in enumerate(model_names):
-        model_data = df[df["Model"] == model_name].iloc[0]
-        values = [model_data[m] for m in datasets]
-        values.append(values[0])
-        datasets_plot = datasets + [datasets[0]]
-
-        fig.add_trace(
-            go.Scatterpolar(
-                r=values,
-                theta=datasets_plot,
-                fill="toself",
-                fillcolor=colors[idx % len(colors)],
-                line=dict(color=line_colors[idx % len(line_colors)], width=2),
-                name=model_name,
-                text=[f"{val:.3f}" for val in values],
-                textposition="middle right",
-                mode="lines+markers+text",
-                textfont=dict(color=text_color),  # Set text color to match theme
-            )
-        )
-
-    # Create a more balanced layout optimized for Gradio display
-    fig.update_layout(
-        polar=dict(
-            radialaxis=dict(
-                visible=True,
-                range=[0, 1],
-                showline=False,
-                tickfont=dict(size=12, color=text_color),
-                gridcolor=grid_color,
-            ),
-            angularaxis=dict(
-                tickfont=dict(size=13, color=text_color),
-                rotation=90,
-                direction="clockwise",
-                gridcolor=grid_color,
-            ),
-            bgcolor=plot_bg,  # Set polar background color
-        ),
-        showlegend=True,
-        legend=dict(
-            orientation="h",
-            yanchor="bottom",
-            y=-0.15,
-            xanchor="center",
-            x=0.5,
-            font=dict(size=14, color=text_color),
-            bgcolor=legend_bg,
-        ),
-        title=dict(
-            text="Model Comparison",
-            x=0.5,
-            y=0.98,
-            font=dict(size=24, color=text_color),
-        ),
-        paper_bgcolor=paper_bg,
-        plot_bgcolor=plot_bg,
-        height=700,
-        width=1200,  # Make it perfectly square
-        margin=dict(l=0, r=0, t=80, b=80),  # Remove horizontal margins completely
-        font=dict(color=text_color),
-    )
-
-    return fig
+from visualization import create_radar_plot
 
 
 def compare_models(df, model_names=None):
@@ -139,29 +48,26 @@ def compare_models(df, model_names=None):
 
 
 def create_model_comparison_tab(df, HEADER_CONTENT):
-    # with gr.Tab("Model Comparison"):
-    gr.HTML(HEADER_CONTENT)
-    with gr.Column():
-        # Filters row
-        with gr.Row(equal_height=True):
-            model_selector = gr.Dropdown(
-                choices=df["Model"].unique().tolist(),
-                value=df.sort_values("Model Avg", ascending=False).iloc[0]["Model"],
-                multiselect=True,
-                label="Select Models to Compare",
-            )
-
-        model_info = gr.HTML()
-        with gr.Row():
-            with gr.Column(scale=1, min_width=800):
-                gr.HTML('<div class="full-width-plot-container" style="width:100%;">')
-                radar_plot = gr.Plot(elem_id="plot", container=False)
-                gr.HTML("</div>")
-
-        model_selector.change(
-            fn=lambda m: compare_models(df, m),
-            inputs=[model_selector],
-            outputs=[model_info, radar_plot],
-        )
+    with gr.Tab("Model Comparison"):
+        gr.HTML(HEADER_CONTENT)
+        with gr.Column():
+            # Filters row
+            with gr.Row(equal_height=True):
+                model_selector = gr.Dropdown(
+                    choices=df["Model"].unique().tolist(),
+                    value=df.sort_values("Model Avg", ascending=False).iloc[0]["Model"],
+                    multiselect=True,
+                    label="Select Models to Compare",
+                )
+
+            # Content
+            model_info = gr.HTML()
+            radar_plot = gr.Plot()
+
+            model_selector.change(
+                fn=lambda m: compare_models(df, m),
+                inputs=[model_selector],
+                outputs=[model_info, radar_plot],
+            )
 
-    return model_info, radar_plot
+        return model_info, radar_plot
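Because the dropdown is created with multiselect=True, its value is a list of model names; that is why compare_models takes model_names and why app.py wraps the top-ranked model in a one-element list for the initial load. A minimal sketch (model names are made up):

    import gradio as gr

    with gr.Blocks() as demo:
        # With multiselect=True the component's value is a list of strings,
        # so the handler must accept a list even for a single selection.
        models = gr.Dropdown(
            choices=["model-a", "model-b"], value=["model-a"], multiselect=True
        )
        out = gr.HTML()
        models.change(fn=lambda ms: ", ".join(ms), inputs=[models], outputs=[out])

    demo.launch()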
visualization.py ADDED
@@ -0,0 +1,256 @@
+from utils import get_chart_colors
+import matplotlib
+import matplotlib.pyplot as plt
+import numpy as np
+import plotly.graph_objects as go
+
+
+def setup_matplotlib():
+    matplotlib.use("Agg")
+    plt.close("all")
+
+
+def get_performance_chart(df, category_name="Overall"):
+    plt.close("all")
+    colors = get_chart_colors()
+    score_column = "Category Score"
+    df_sorted = df.sort_values(score_column, ascending=True)
+
+    height = max(8, len(df_sorted) * 0.8)
+    fig, ax = plt.subplots(figsize=(16, height))
+    plt.rcParams.update({"font.size": 12})
+
+    fig.patch.set_facecolor(colors["background"])
+    ax.set_facecolor(colors["background"])
+
+    try:
+        bars = ax.barh(
+            np.arange(len(df_sorted)),
+            df_sorted[score_column],
+            height=0.4,
+            capstyle="round",
+            color=[colors[t] for t in df_sorted["Model Type"]],
+        )
+
+        ax.set_title(
+            f"Model Performance - {category_name}",
+            pad=20,
+            fontsize=20,
+            fontweight="bold",
+            color=colors["text"],
+        )
+        ax.set_xlabel(
+            "Average Score (Tool Selection Quality)",
+            fontsize=14,
+            fontweight="bold",
+            labelpad=10,
+            color=colors["text"],
+        )
+        ax.set_xlim(0.0, 1.0)
+
+        ax.set_yticks(np.arange(len(df_sorted)))
+        ax.set_yticklabels(
+            df_sorted["Model"], fontsize=12, fontweight="bold", color=colors["text"]
+        )
+
+        plt.subplots_adjust(left=0.35)
+
+        for i, v in enumerate(df_sorted[score_column]):
+            ax.text(
+                v + 0.01,
+                i,
+                f"{v:.3f}",
+                va="center",
+                fontsize=12,
+                fontweight="bold",
+                color=colors["text"],
+            )
+
+        ax.grid(True, axis="x", linestyle="--", alpha=0.2, color=colors["grid"])
+        ax.spines[["top", "right"]].set_visible(False)
+        ax.spines[["bottom", "left"]].set_color(colors["grid"])
+        ax.tick_params(colors=colors["text"])
+
+        legend_elements = [
+            plt.Rectangle((0, 0), 1, 1, facecolor=color, label=label)
+            for label, color in {
+                k: colors[k] for k in ["Private", "Open source"]
+            }.items()
+        ]
+        ax.legend(
+            handles=legend_elements,
+            title="Model Type",
+            loc="lower right",
+            fontsize=12,
+            title_fontsize=14,
+            facecolor=colors["background"],
+            labelcolor=colors["text"],
+        )
+
+        plt.tight_layout()
+        return fig
+    finally:
+        plt.close(fig)
+
+def create_radar_plot(df, model_names):
+    datasets = [col for col in df.columns[7:] if col != "IO Cost"]
+    fig = go.Figure()
+
+    colors = ["rgba(99, 102, 241, 0.3)", "rgba(34, 197, 94, 0.3)"]
+    line_colors = ["#4F46E5", "#16A34A"]
+
+    for idx, model_name in enumerate(model_names):
+        model_data = df[df["Model"] == model_name].iloc[0]
+        values = [model_data[m] for m in datasets]
+        values.append(values[0])
+        datasets_plot = datasets + [datasets[0]]
+
+        fig.add_trace(
+            go.Scatterpolar(
+                r=values,
+                theta=datasets_plot,
+                fill="toself",
+                fillcolor=colors[idx % len(colors)],
+                line=dict(color=line_colors[idx % len(line_colors)], width=2),
+                name=model_name,
+                text=[f"{val:.3f}" for val in values],
+                textposition="middle right",
+                mode="lines+markers+text",
+            )
+        )
+
+    fig.update_layout(
+        polar=dict(
+            radialaxis=dict(
+                visible=True, range=[0, 1], showline=False, tickfont=dict(size=12)
+            ),
+            angularaxis=dict(
+                tickfont=dict(size=13, family="Arial"),
+                rotation=90,
+                direction="clockwise",
+            ),
+        ),
+        showlegend=True,
+        legend=dict(
+            orientation="h",
+            yanchor="bottom",
+            y=-0.2,
+            xanchor="center",
+            x=0.5,
+            font=dict(size=14),
+        ),
+        title=dict(
+            text="Model Comparison",
+            x=0.5,
+            y=0.95,
+            font=dict(size=24, family="Arial", color="#1F2937"),
+        ),
+        paper_bgcolor="white",
+        plot_bgcolor="white",
+        height=700,
+        width=900,
+        margin=dict(t=100, b=100, l=80, r=80),
+    )
+
+    return fig
+
+
+def get_performance_cost_chart(df, category_name="Overall"):
+    colors = get_chart_colors()
+    fig, ax = plt.subplots(figsize=(12, 8), dpi=300)
+
+    fig.patch.set_facecolor(colors["background"])
+    ax.set_facecolor(colors["background"])
+    ax.grid(True, linestyle="--", alpha=0.15, which="both", color=colors["grid"])
+
+    score_column = "Category Score"
+
+    for _, row in df.iterrows():
+        color = colors[row["Model Type"]]
+        size = 100 if row[score_column] > 0.85 else 80
+        edge_color = (
+            colors["Private"]
+            if row["Model Type"] == "Private"
+            else colors["Open source"]
+        )
+
+        ax.scatter(
+            row["IO Cost"],
+            row[score_column] * 100,
+            c=color,
+            s=size,
+            alpha=0.9,
+            edgecolor=edge_color,
+            linewidth=1,
+            zorder=5,
+        )
+
+        bbox_props = dict(
+            boxstyle="round,pad=0.3", fc=colors["background"], ec="none", alpha=0.8
+        )
+
+        ax.annotate(
+            f"{row['Model']}\n(${row['IO Cost']:.2f})",
+            (row["IO Cost"], row[score_column] * 100),
+            xytext=(5, 5),
+            textcoords="offset points",
+            fontsize=8,
+            fontweight="bold",
+            color=colors["text"],
+            bbox=bbox_props,
+            zorder=6,
+        )
+
+    ax.set_xscale("log")
+    ax.set_xlim(0.08, 1000)
+    ax.set_ylim(60, 100)
+
+    ax.set_xlabel(
+        "I/O Cost per Million Tokens ($)",
+        fontsize=10,
+        fontweight="bold",
+        labelpad=10,
+        color=colors["text"],
+    )
+    ax.set_ylabel(
+        "Model Performance Score",
+        fontsize=10,
+        fontweight="bold",
+        labelpad=10,
+        color=colors["text"],
+    )
+
+    legend_elements = [
+        plt.scatter([], [], c=colors[label], label=label, s=80)
+        for label in ["Private", "Open source"]
+    ]
+    ax.legend(
+        handles=legend_elements,
+        loc="upper right",
+        frameon=True,
+        facecolor=colors["background"],
+        edgecolor="none",
+        fontsize=9,
+        labelcolor=colors["text"],
+    )
+
+    ax.set_title(
+        f"Performance vs. Cost - {category_name}",
+        fontsize=14,
+        pad=15,
+        fontweight="bold",
+        color=colors["text"],
+    )
+
+    for y1, y2, color in zip([85, 75, 60], [100, 85, 75], colors["performance_bands"]):
+        ax.axhspan(y1, y2, alpha=0.2, color=color, zorder=1)
+
+    ax.tick_params(axis="both", which="major", labelsize=9, colors=colors["text"])
+    ax.tick_params(axis="both", which="minor", labelsize=8, colors=colors["text"])
+    ax.xaxis.set_minor_locator(plt.LogLocator(base=10.0, subs=np.arange(2, 10) * 0.1))
+
+    for spine in ax.spines.values():
+        spine.set_color(colors["grid"])
+
+    plt.tight_layout()
+    return fig
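The new module collects the matplotlib chart builders (plus the Plotly radar) that leaderboard.py and model_comparison.py now import. Two details worth flagging: setup_matplotlib selects the non-interactive Agg backend a headless Space needs, and get_performance_chart closes its figure in a finally block — plt.close(fig) only deregisters the figure from pyplot's global manager, so the returned object can still be rendered by gr.Plot while the process avoids accumulating open figures. A minimal sketch of handing a matplotlib figure to Gradio under those assumptions:

    import matplotlib

    matplotlib.use("Agg")  # non-interactive backend for headless servers
    import matplotlib.pyplot as plt
    import gradio as gr

    def make_fig():
        fig, ax = plt.subplots()
        ax.barh(["model-a", "model-b"], [0.91, 0.84])
        try:
            return fig
        finally:
            # Deregister from pyplot so figures don't accumulate; the object
            # itself stays alive and gr.Plot can still render it.
            plt.close(fig)

    with gr.Blocks() as demo:
        plot = gr.Plot()
        demo.load(fn=make_fig, outputs=[plot])

    demo.launch()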