Huanzhi Mao committed on
Commit 67249b1
1 Parent(s): 57013a0

Support auto-populated leaderboard from csv file.

Files changed (2)
  1. app.py +79 -342
  2. data.csv +22 -0
app.py CHANGED
@@ -4,6 +4,7 @@ import webbrowser
 import os
 import re
 import pandas as pd
+import csv
 # from anthropic import Anthropic
 from openai import OpenAI
 from mistralai.client import MistralClient
@@ -627,344 +628,31 @@ COLUMNS = [
     "Parallel Multiple Exec",
     "Relevance Detection",
 ]
-DATA = [
-    (1, 84.28, "GPT-4-1106-Preview", "OpenAI", "Proprietary", 80.73, 88.50, 90.50, 84.50, 74.12, 70.00, 68.00, 50.00, 88.75),
-    (2, 84.16, "GPT-4-0125-Preview", "OpenAI", "Proprietary", 81.45, 89.00, 88.00, 83.50, 72.94, 78.00, 68.00, 50.00, 87.50),
-    (3, 84.16, "Gorilla-OpenFunctions-v2", "Gorilla LLM", "Apache 2.0", 87.82, 88.50, 82.50, 78.00, 85.88, 82.00, 68.00, 55.00, 71.67),
-    (4, 83.67, "Claude-3-Opus-20240229", "Anthropic", "Proprietary", 85.27, 83.00, 79.00, 72.00, 89.41, 80.00, 68.00, 57.50, 84.58),
-    (5, 81.75, "Mistral-Medium-2312", "Mistral AI", "Proprietary", 80.18, 84.50, 76.50, 73.50, 84.71, 86.00, 76.00, 62.50, 90.00),
-    (6, 80.30, "Claude-3-Sonnet-20240229", "Anthropic", "Proprietary", 85.64, 87.50, 83.50, 83.00, 90.59, 82.00, 72.00, 60.00, 41.25),
-    (7, 80.30, "GPT-3.5-Turbo-0125", "OpenAI", "Proprietary", 80.18, 84.50, 82.50, 79.00, 84.71, 80.00, 68.00, 47.50, 45.33),
-    (8, 79.07, "Functionary-Medium-v2.2", "MeetKai", "N/A", 79.17, 90.00, 85.00, 78.00, 65.88, 62.00, 70.00, 50.00, 79.17),
-    (9, 77.41, "Claude-2.1", "Anthropic", "Proprietary", 85.64, 83.00, 77.00, 60.50, 68.23, 48.00, 52.00, 47.00, 78.33),
-    (10, 61.75, "Mistral-tiny-2312", "Mistral AI", "Proprietary", 59.64, 62.50, 56.00, 43.00, 71.17, 84.00, 74.00, 36.00, 77.08),
-    (11, 61.02, "Claude-instant-1.2", "Anthropic", "Proprietary", 68.73, 59.00, 56.00, 44.00, 60.00, 51.00, 52.00, 50.00, 61.67),
-    (12, 56.87, "Mistral-small-2312", "Mistral AI", "Proprietary", 46.55, 68.00, 50.00, 63.00, 32.35, 30.00, 40.00, 37.50, 89.58),
-    (13, 56.81, "Mistral-large-2402", "Mistral AI", "Proprietary", 71.82, 90.50, 0.00, 0.00, 72.94, 76.00, 0.00, 5.00, 84.58),
-    (14, 55.90, "Nexusflow-Raven-v2", "Nexusflow", "Apache 2.0", 76.55, 83.50, 39.50, 32.50, 61.18, 84.00, 62.00, 47.00, 0.00),
-    (15, 55.87, "FireFunction-v1", "Fireworks", "Apache 2.0", 73.19, 87.00, 0.00, 0.00, 68.23, 76.00, 0.00, 5.00, 81.25),
-    (16, 55.68, "Gemini-1.0-Pro", "Google", "Proprietary", 79.71, 89.00, 0.00, 0.00, 51.19, 66.00, 0.00, 0.00, 78.30),
-    (17, 54.52, "GPT-4-0613", "OpenAI", "Proprietary", 74.55, 86.00, 0.00, 0.00, 50.00, 56.00, 0.00, 2.00, 87.08),
-    (18, 45.96, "Deepseek-v1.5", "Deepseek", "Deepseek License", 48.36, 61.00, 37.00, 47.50, 24.70, 2.00, 0.00, 7.50, 66.25),
-    (19, 44.40, "Gemma", "Google", "gemma-terms-of-use", 61.45, 60.00, 41.00, 32.00, 44.71, 48.00, 44.00, 25.50, 0.42),
-    (20, 33.37, "Gorilla-OpenFunctions-v0", "Gorilla LLM", "Apache 2.0", 60.00, 56.00, 0.00, 3.50, 38.24, 65.00, 0.00, 0.00, 4.58),
-    (21, 24.58, "Glaive-v1", "Glaive", "cc-by-sa-4.0", 34.55, 26.00, 0.00, 0.00, 21.18, 36.00, 0.00, 2.50, 46.25),
-]
+
+def parse_csv(text):
+    lines = text.split('\n')
+    result = []
+    for i in range(len(lines)):
+        row = lines[i].split(',')[:15]
+        row = [parse_value(value) for value in row]
+        row.insert(0, i)
+        overall_acc = row.pop(4)
+        row.insert(1, overall_acc)
+        row.pop(5)
+        row.pop(5)
+        result.append(row)
+    return result
+
+def parse_value(value):
+    if value.endswith('%'):
+        return float(value[:-1])
+    return value
+
+
+with open('./data.csv', 'r') as file:
+    csv_text = file.read()
+DATA = parse_csv(csv_text)
+
 
 MODELS = [
     "gorilla-openfunctions-v2",
@@ -977,7 +665,6 @@ MODELS = [

 def send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, vote):
     # Login and get access token
-    print("Sending feedback")
     login_url = 'https://us-west-2.aws.realm.mongodb.com/api/client/v2.0/app/data-onwzq/auth/providers/local-userpass/login'
     headers = {'Content-Type': 'application/json'}
     login_data = {
@@ -1020,6 +707,52 @@ def send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, vote):
     else:
         print('Error:', response.text)

+def get_voting_result():
+    login_url = 'https://us-west-2.aws.realm.mongodb.com/api/client/v2.0/app/data-onwzq/auth/providers/local-userpass/login'
+    headers = {'Content-Type': 'application/json'}
+    login_data = {
+        'username': 'website',
+        'password': mongoDBPassword
+    }
+    response = requests.post(login_url, headers=headers, json=login_data)
+    access_token = response.json()['access_token']
+
+    # Scanning the database
+    url = 'https://us-west-2.aws.data.mongodb-api.com/app/data-onwzq/endpoint/data/v1/action/find'
+    headers = {
+        'Content-Type': 'application/json',
+        'Access-Control-Request-Headers': '*',
+        'Authorization': f'Bearer {access_token}'
+    }
+    body = {
+        'collection': "vote",
+        'database': "gorilla-feedback",
+        'dataSource': "gorilla",
+    }
+    response = requests.post(url, headers=headers, json=body)
+    if response.ok:
+        data = response.json()
+        votes = data['documents']
+        votes = [vote for vote in votes if vote['result'] in ['positive', 'negative']]
+        # extract only the model, positive count, negative count
+        model_votes = {}
+        for vote in votes:
+            model = vote['model']
+            if model not in model_votes:
+                model_votes[model] = {'positive': 0, 'negative': 0}
+            model_votes[model][vote['result']] += 1
+        for model in model_votes:
+            model_votes[model]['accuracy'] = model_votes[model]['positive'] / (model_votes[model]['positive'] + model_votes[model]['negative'])
+
+        result = []
+        for model in model_votes:
+            result.append([model, model_votes[model]['accuracy'], model_votes[model]['positive'], model_votes[model]['negative']])
+        result = sorted(result, key=lambda x: x[1], reverse=True)
+        return pd.DataFrame(result, columns=['Model', 'Accuracy', 'Positive', 'Negative'])
+    else:
+        print('Error:', response.text)
+        return []
+
 def send_feedback_negative(prompt, function, model, temperature, codeOutput, jsonOutput):
     send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, "negative")
     return "Thank you for your feedback. We will use this to improve our service."
@@ -1221,12 +954,10 @@ def distribute_task(prompt, function, model, temperature):
 def get_leaderboard():
     # Convert the leaderboard data to a pandas DataFrame for easier handling and display
     leaderboard_df = pd.DataFrame(DATA, columns=COLUMNS)
+    leaderboard_df = leaderboard_df.sort_values(by="Rank")
     return leaderboard_df


-# Initialize the leaderboard data so it's loaded when the page is opened
-initial_leaderboard_data = get_leaderboard()
-
 prompt = gr.Textbox(label="Prompt", placeholder="Type your prompt here...", lines=4)
 funcDescription = gr.Textbox(
     label="Function Description", placeholder="Describe the function...", lines=20
@@ -1324,4 +1055,10 @@ with gr.Blocks() as demo:
             outputs=[feedbackMsg],
         )

+        with gr.TabItem("Voting Leaderboard"):
+            gr.Markdown("## This is a live leaderboard where you can see user's voting result on the agent's response.")
+            leaderboard_data = gr.Dataframe(
+                value=get_voting_result(), wrap=True
+            )
+
 demo.launch()
 
data.csv ADDED
@@ -0,0 +1,22 @@
+GPT-4-1106-Preview (FC),OpenAI,Proprietary,84.28%,86.06%,65.53%,80.73%,88.50%,90.50%,84.50%,74.12%,70.00%,68.00%,50.00%,88.75%
+GPT-4-0125-Preview (FC),OpenAI,Proprietary,84.16%,85.61%,67.24%,81.45%,89.00%,88.50%,83.50%,72.94%,78.00%,68.00%,50.00%,87.50%
+Gorilla-OpenFunctions-v2 (FC),Gorilla LLM,Apache 2.0,84.16%,84.33%,72.72%,87.82%,89.00%,82.50%,78.00%,85.88%,82.00%,68.00%,55.00%,71.67%
+Claude-3-Opus-20240229 (Prompt),Anthropic,Proprietary,83.67%,79.82%,73.73%,85.27%,83.00%,79.00%,72.00%,89.41%,80.00%,68.00%,57.50%,84.58%
+Mistral-Medium-2312 (Prompt),Mistral AI,Proprietary,81.75%,78.67%,66.93%,80.18%,84.50%,76.50%,73.50%,84.71%,76.00%,62.00%,45.00%,90.00%
+Claude-3-Sonnet-20240229 (Prompt),Anthropic,Proprietary,80.30%,84.91%,76.15%,85.64%,87.50%,83.50%,83.00%,90.59%,82.00%,72.00%,60.00%,41.25%
+GPT-3.5-Turbo-0125 (FC),OpenAI,Proprietary,80.30%,81.55%,69.43%,80.18%,84.50%,82.50%,79.00%,84.71%,80.00%,68.00%,45.00%,68.33%
+Functionary-Small (FC),MeetKai,N/A,79.07%,82.31%,64.40%,75.75%,89.50%,82.50%,81.50%,64.12%,78.00%,68.00%,47.50%,78.33%
+Functionary-Medium-v2.2 (FC),MeetKai,N/A,79.03%,82.25%,61.97%,76.00%,90.00%,85.00%,77.99%,65.88%,62.00%,70.00%,50.00%,79.17%
+Claude-2.1 (Prompt),Anthropic,Proprietary,77.41%,76.53%,53.93%,85.64%,83.00%,77.00%,60.50%,68.23%,48.00%,52.00%,47.50%,78.33%
+Mistral-tiny-2312 (Prompt),Mistral AI,Proprietary,61.75%,55.28%,53.42%,59.64%,62.50%,56.00%,43.00%,71.17%,74.00%,36.00%,32.50%,77.08%
+Claude-instant-1.2 (Prompt),Anthropic,Proprietary,61.02%,57.06%,49.88%,68.73%,59.00%,56.50%,44.00%,60.00%,52.00%,50.00%,37.50%,61.67%
+Mistral-small-2312 (Prompt),Mistral AI,Proprietary,56.87%,57.01%,36.18%,46.55%,68.00%,50.50%,63.00%,34.71%,32.00%,38.00%,40.00%,89.58%
+Mistral-large-2402 (FC),Mistral AI,Proprietary,56.81%,40.58%,38.49%,71.82%,90.50%,0.00%,0.00%,72.94%,76.00%,0.00%,5.00%,84.58%
+Nexusflow-Raven-v2 (FC),Nexusflow,Apache 2.0,55.90%,58.01%,63.67%,76.55%,83.50%,39.50%,32.50%,61.18%,84.00%,62.00%,47.50%,0.00%
+FireFunction-v1 (FC),Fireworks,Apache 2.0,55.87%,40.05%,37.31%,73.19%,87.00%,0.00%,0.00%,68.23%,76.00%,0.00%,5.00%,81.25%
+Gemini-1.0-Pro (FC),Google,Proprietary,55.68%,42.18%,29.30%,79.71%,89.00%,0.00%,0.00%,51.19%,66.00%,0.00%,0.00%,78.30%
+GPT-4-0613 (FC),OpenAI,Proprietary,54.52%,40.14%,27.12%,74.55%,86.00%,0.00%,0.00%,50.00%,56.00%,0.00%,2.50%,87.08%
+Deepseek-v1.5 (Prompt),Deepseek,Deepseek License,45.96%,48.59%,8.55%,48.36%,61.00%,37.50%,47.50%,24.70%,2.00%,0.00%,7.50%,66.25%
+Gemma,Google,gemma-terms-of-use,44.40%,48.61%,40.43%,61.45%,60.00%,41.00%,32.00%,44.71%,48.00%,44.00%,25.00%,0.42%
+Gorilla-OpenFunctions-v0 (FC),Gorilla LLM,Apache 2.0,33.37%,29.88%,24.06%,60.00%,56.00%,0.00%,3.50%,38.24%,58.00%,0.00%,0.00%,4.58%
+Glaive-v1 (FC),Glaive,cc-by-sa-4.0,24.58%,15.14%,14.92%,34.55%,26.00%,0.00%,0.00%,21.18%,36.00%,0.00%,2.50%,46.25%
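
For a quick sanity check of the new file, it can also be loaded directly with pandas; a minimal sketch, assuming the layout shown above (no header row, three text fields followed by twelve percentage-formatted scores):

import pandas as pd

# data.csv ships without a header row: model, organization, license,
# then twelve scores formatted as percentages.
df = pd.read_csv("data.csv", header=None)
score_cols = df.columns[3:]
df[score_cols] = df[score_cols].apply(lambda col: col.str.rstrip('%').astype(float))

print(df.shape)       # expected: (22, 15)
print(df.iloc[0, 0])  # expected: "GPT-4-1106-Preview (FC)"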