Huanzhi Mao committed on
Commit
57013a0
1 Parent(s): 469201a

update leaderboard data to include new models

Browse files
Files changed (1) hide show
  1. app.py +181 -133
app.py CHANGED
@@ -630,158 +630,190 @@ COLUMNS = [
630
  DATA = [
631
  (
632
  1,
633
- 85.24,
634
  "GPT-4-1106-Preview",
635
  "OpenAI",
636
  "Proprietary",
637
- 81.64,
638
- 89.50,
639
- 92.00,
640
- 92.00,
 
641
  70.00,
642
- 62.00,
643
- 72.00,
644
  50.00,
645
  88.75,
646
  ),
647
  (
648
  2,
649
- 85.12,
650
  "GPT-4-0125-Preview",
651
  "OpenAI",
652
  "Proprietary",
653
- 82.18,
654
- 90.00,
655
- 90.00,
656
- 91.00,
657
- 67.06,
658
- 70.00,
659
- 76.00,
660
- 55.00,
661
  87.50,
662
  ),
663
  (
664
  3,
665
- 83.67,
666
  "Gorilla-OpenFunctions-v2",
667
  "Gorilla LLM",
668
  "Apache 2.0",
669
- 88.73,
670
- 89.50,
671
- 79.50,
672
  78.00,
673
- 80.00,
674
- 74.00,
675
- 76.00,
676
- 60.00,
677
  71.67,
678
  ),
679
  (
680
  4,
681
- 82.23,
682
- "GPT-3.5-Turbo-0125",
683
- "OpenAI",
684
  "Proprietary",
685
- 81.27,
686
- 88.00,
687
- 87.50,
688
- 88.00,
 
689
  80.00,
690
- 74.00,
691
- 70.00,
692
- 47.50,
693
- 68.33,
694
  ),
695
  (
696
  5,
697
- 79.70,
698
  "Mistral-Medium-2312",
699
  "Mistral AI",
700
  "Proprietary",
701
  80.18,
702
  84.50,
703
- 71.00,
704
- 68.00,
705
- 78.24,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
706
  72.00,
707
- 62.00,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
708
  47.50,
 
 
 
 
 
 
 
 
 
709
  90.00,
 
 
 
 
 
 
 
710
  ),
711
  (
712
- 6,
713
- 76.02,
714
  "Claude-2.1",
715
  "Anthropic",
716
  "Proprietary",
717
  85.64,
718
  83.00,
719
- 72.00,
720
- 56.50,
721
- 63.53,
722
  48.00,
723
- 60.00,
724
- 45.00,
725
  78.33,
726
  ),
727
  (
728
- 7,
729
- 60.06,
730
  "Mistral-tiny-2312",
731
  "Mistral AI",
732
  "Proprietary",
733
- 59.27,
734
- 59.50,
735
- 53.50,
736
- 41.50,
737
- 63.53,
738
- 64.00,
739
- 42.00,
740
- 40.00,
741
  77.08,
742
  ),
743
  (
744
- 8,
745
- 59.70,
746
  "Claude-instant-1.2",
747
  "Anthropic",
748
  "Proprietary",
749
  68.73,
750
  59.00,
751
- 53.00,
752
- 39.50,
753
- 56.47,
 
754
  52.00,
755
  50.00,
756
- 37.50,
757
  61.67,
758
  ),
759
  (
760
- 9,
761
- 56.39,
762
- "Mistral-large-2402",
763
- "Mistral AI",
764
- "Proprietary",
765
- 71.82,
766
- 90.50,
767
- 4.00,
768
- 0.00,
769
- 67.06,
770
- 66.00,
771
- 0.00,
772
- 5.00,
773
- 84.58,
774
- ),
775
- (
776
- 10,
777
- 55.72,
778
  "Mistral-small-2312",
779
  "Mistral AI",
780
  "Proprietary",
781
  46.55,
782
  68.00,
783
- 48.50,
784
- 58.00,
785
  32.35,
786
  30.00,
787
  40.00,
@@ -789,79 +821,95 @@ DATA = [
789
  89.58,
790
  ),
791
  (
792
- 11,
793
- 55.72,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
794
  "Nexusflow-Raven-v2",
795
  "Nexusflow",
796
  "Apache 2.0",
797
  76.55,
798
  83.50,
799
  39.50,
800
- 34.00,
801
- 58.24,
802
- 78.00,
803
- 68.00,
804
- 45.00,
805
- 0.00,
806
- ),
807
- (
808
- 12,
809
- 55.68,
810
- "Gemini-1.0-Pro",
811
- "Google",
812
- "Proprietary",
813
- 79.71,
814
- 89.00,
815
- 4.00,
816
- 0.00,
817
- 47.66,
818
  62.00,
 
819
  0.00,
820
- 0.00,
821
- 78.30,
822
  ),
823
  (
824
- 13,
825
- 55.33,
826
  "FireFunction-v1",
827
  "Fireworks",
828
  "Apache 2.0",
829
  73.19,
830
  87.00,
831
- 4.00,
832
  0.00,
833
- 61.76,
834
- 64.00,
 
835
  0.00,
836
  5.00,
837
  81.25,
838
  ),
839
  (
840
- 14,
841
- 54.16,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
842
  "GPT-4-0613",
843
  "OpenAI",
844
  "Proprietary",
845
  74.55,
846
  86.00,
847
- 4.00,
848
  0.00,
849
- 44.12,
850
- 50.00,
851
  0.00,
 
 
852
  0.00,
 
853
  87.08,
854
  ),
855
  (
856
- 15,
857
- 45.18,
858
  "Deepseek-v1.5",
859
  "Deepseek",
860
  "Deepseek License",
861
  48.36,
862
  61.00,
863
- 35.00,
864
- 43.50,
865
  24.70,
866
  2.00,
867
  0.00,
@@ -869,8 +917,8 @@ DATA = [
869
  66.25,
870
  ),
871
  (
872
- 16,
873
- 44.34,
874
  "Gemma",
875
  "Google",
876
  "gemma-terms-of-use",
@@ -879,39 +927,39 @@ DATA = [
879
  41.00,
880
  32.00,
881
  44.71,
882
- 46.00,
883
  44.00,
884
- 25.00,
885
  0.42,
886
  ),
887
  (
888
- 17,
889
- 33.61,
890
  "Gorilla-OpenFunctions-v0",
891
  "Gorilla LLM",
892
  "Apache 2.0",
893
  60.00,
894
  56.00,
895
- 1.00,
896
- 2.50,
897
- 39.41,
898
- 62.00,
899
  0.00,
900
  0.00,
901
  4.58,
902
  ),
903
  (
904
- 18,
905
- 24.76,
906
  "Glaive-v1",
907
  "Glaive",
908
  "cc-by-sa-4.0",
909
  34.55,
910
  26.00,
911
- 2.00,
912
  0.00,
913
  21.18,
914
- 34.00,
915
  0.00,
916
  2.50,
917
  46.25,
 
630
  DATA = [
631
  (
632
  1,
633
+ 84.28,
634
  "GPT-4-1106-Preview",
635
  "OpenAI",
636
  "Proprietary",
637
+ 80.73,
638
+ 88.50,
639
+ 90.50,
640
+ 84.50,
641
+ 74.12,
642
  70.00,
643
+ 68.00,
 
644
  50.00,
645
  88.75,
646
  ),
647
  (
648
  2,
649
+ 84.16,
650
  "GPT-4-0125-Preview",
651
  "OpenAI",
652
  "Proprietary",
653
+ 81.45,
654
+ 89.00,
655
+ 88.00,
656
+ 83.50,
657
+ 72.94,
658
+ 78.00,
659
+ 68.00,
660
+ 50.00,
661
  87.50,
662
  ),
663
  (
664
  3,
665
+ 84.16,
666
  "Gorilla-OpenFunctions-v2",
667
  "Gorilla LLM",
668
  "Apache 2.0",
669
+ 87.82,
670
+ 88.50,
671
+ 82.50,
672
  78.00,
673
+ 85.88,
674
+ 82.00,
675
+ 68.00,
676
+ 55.00,
677
  71.67,
678
  ),
679
  (
680
  4,
681
+ 83.67,
682
+ "Claude-3-Opus-20240229",
683
+ "Anthropic",
684
  "Proprietary",
685
+ 85.27,
686
+ 83.00,
687
+ 79.00,
688
+ 72.00,
689
+ 89.41,
690
  80.00,
691
+ 68.00,
692
+ 57.50,
693
+ 84.58,
 
694
  ),
695
  (
696
  5,
697
+ 81.75,
698
  "Mistral-Medium-2312",
699
  "Mistral AI",
700
  "Proprietary",
701
  80.18,
702
  84.50,
703
+ 76.50,
704
+ 73.50,
705
+ 84.71,
706
+ 86.00,
707
+ 76.00,
708
+ 62.50,
709
+ 90.00,
710
+ ),
711
+ (
712
+ 6,
713
+ 80.30,
714
+ "Claude-3-Sonnet-20240229",
715
+ "Anthropic",
716
+ "Proprietary",
717
+ 85.64,
718
+ 87.50,
719
+ 83.50,
720
+ 83.00,
721
+ 90.59,
722
+ 82.00,
723
  72.00,
724
+ 60.00,
725
+ 41.25,
726
+ ),
727
+ (
728
+ 7,
729
+ 80.30,
730
+ "GPT-3.5-Turbo-0125",
731
+ "OpenAI",
732
+ "Proprietary",
733
+ 80.18,
734
+ 84.50,
735
+ 82.50,
736
+ 79.00,
737
+ 84.71,
738
+ 80.00,
739
+ 68.00,
740
  47.50,
741
+ 45.33,
742
+ ),
743
+ (
744
+ 8,
745
+ 79.07,
746
+ "Functionary-Medium-v2.2",
747
+ "MeetKai",
748
+ "N/A",
749
+ 79.17,
750
  90.00,
751
+ 85.00,
752
+ 78.00,
753
+ 65.88,
754
+ 62.00,
755
+ 70.00,
756
+ 50.00,
757
+ 79.17,
758
  ),
759
  (
760
+ 9,
761
+ 77.41,
762
  "Claude-2.1",
763
  "Anthropic",
764
  "Proprietary",
765
  85.64,
766
  83.00,
767
+ 77.00,
768
+ 60.50,
769
+ 68.23,
770
  48.00,
771
+ 52.00,
772
+ 47.00,
773
  78.33,
774
  ),
775
  (
776
+ 10,
777
+ 61.75,
778
  "Mistral-tiny-2312",
779
  "Mistral AI",
780
  "Proprietary",
781
+ 59.64,
782
+ 62.50,
783
+ 56.00,
784
+ 43.00,
785
+ 71.17,
786
+ 84.00,
787
+ 74.00,
788
+ 36.00,
789
  77.08,
790
  ),
791
  (
792
+ 11,
793
+ 61.02,
794
  "Claude-instant-1.2",
795
  "Anthropic",
796
  "Proprietary",
797
  68.73,
798
  59.00,
799
+ 56.00,
800
+ 44.00,
801
+ 60.00,
802
+ 51.00,
803
  52.00,
804
  50.00,
 
805
  61.67,
806
  ),
807
  (
808
+ 12,
809
+ 56.87,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
810
  "Mistral-small-2312",
811
  "Mistral AI",
812
  "Proprietary",
813
  46.55,
814
  68.00,
815
+ 50.00,
816
+ 63.00,
817
  32.35,
818
  30.00,
819
  40.00,
 
821
  89.58,
822
  ),
823
  (
824
+ 13,
825
+ 56.81,
826
+ "Mistral-large-2402",
827
+ "Mistral AI",
828
+ "Proprietary",
829
+ 71.82,
830
+ 90.50,
831
+ 0.00,
832
+ 0.00,
833
+ 72.94,
834
+ 76.00,
835
+ 0.00,
836
+ 5.00,
837
+ 84.58,
838
+ ),
839
+ (
840
+ 14,
841
+ 55.90,
842
  "Nexusflow-Raven-v2",
843
  "Nexusflow",
844
  "Apache 2.0",
845
  76.55,
846
  83.50,
847
  39.50,
848
+ 32.50,
849
+ 61.18,
850
+ 84.00,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
851
  62.00,
852
+ 47.00,
853
  0.00,
 
 
854
  ),
855
  (
856
+ 15,
857
+ 55.87,
858
  "FireFunction-v1",
859
  "Fireworks",
860
  "Apache 2.0",
861
  73.19,
862
  87.00,
 
863
  0.00,
864
+ 0.00,
865
+ 68.23,
866
+ 76.00,
867
  0.00,
868
  5.00,
869
  81.25,
870
  ),
871
  (
872
+ 16,
873
+ 55.68,
874
+ "Gemini-1.0-Pro",
875
+ "Google",
876
+ "Proprietary",
877
+ 79.71,
878
+ 89.00,
879
+ 0.00,
880
+ 0.00,
881
+ 51.19,
882
+ 66.00,
883
+ 0.00,
884
+ 0.00,
885
+ 78.30,
886
+ ),
887
+ (
888
+ 17,
889
+ 54.52,
890
  "GPT-4-0613",
891
  "OpenAI",
892
  "Proprietary",
893
  74.55,
894
  86.00,
 
895
  0.00,
 
 
896
  0.00,
897
+ 50.00,
898
+ 56.00,
899
  0.00,
900
+ 2.00,
901
  87.08,
902
  ),
903
  (
904
+ 18,
905
+ 45.96,
906
  "Deepseek-v1.5",
907
  "Deepseek",
908
  "Deepseek License",
909
  48.36,
910
  61.00,
911
+ 37.00,
912
+ 47.50,
913
  24.70,
914
  2.00,
915
  0.00,
 
917
  66.25,
918
  ),
919
  (
920
+ 19,
921
+ 44.40,
922
  "Gemma",
923
  "Google",
924
  "gemma-terms-of-use",
 
927
  41.00,
928
  32.00,
929
  44.71,
930
+ 48.00,
931
  44.00,
932
+ 25.50,
933
  0.42,
934
  ),
935
  (
936
+ 20,
937
+ 33.37,
938
  "Gorilla-OpenFunctions-v0",
939
  "Gorilla LLM",
940
  "Apache 2.0",
941
  60.00,
942
  56.00,
943
+ 0.00,
944
+ 3.50,
945
+ 38.24,
946
+ 65.00,
947
  0.00,
948
  0.00,
949
  4.58,
950
  ),
951
  (
952
+ 21,
953
+ 24.58,
954
  "Glaive-v1",
955
  "Glaive",
956
  "cc-by-sa-4.0",
957
  34.55,
958
  26.00,
959
+ 0.00,
960
  0.00,
961
  21.18,
962
+ 36.00,
963
  0.00,
964
  2.50,
965
  46.25,