Huanzhi Mao committed
Commit c94dd2f
1 Parent(s): 23ba85c

update description

Files changed (1)
  1. app.py +166 -79
app.py CHANGED
@@ -5,6 +5,7 @@ import os
 import re
 import pandas as pd
 import csv
+
 # from anthropic import Anthropic
 from openai import OpenAI
 from mistralai.client import MistralClient
@@ -632,12 +633,26 @@ COLUMNS = [
     "Latency Standard Deviation (s)",
 ]
 
+COLUMNS_SUMMARY = [
+    "Rank",
+    "Overall Acc",
+    "Model",
+    "Organization",
+    "License",
+    "AST Summary",
+    "Exec Summary",
+    "Relevance Detection",
+    "Cost ($ Per 1k Function Calls)",
+    "Latency Mean (s)",
+]
+
+
 def parse_csv(text):
-    lines = text.split('\n')
+    lines = text.split("\n")
     lines = lines[1:]
     result = []
     for i in range(len(lines)):
-        row = lines[i].split(',')
+        row = lines[i].split(",")
         row = [parse_value(value) for value in row]
         row.pop(3)
         row.pop(5)
@@ -647,12 +662,13 @@ def parse_csv(text):
         row.pop(6)
         row.pop(10)
         row.pop(10)
-
+
         result.append(row)
     return result
 
+
 def parse_value(value):
-    if value.endswith('%'):
+    if value.endswith("%"):
         return float(value[:-1])
     try:
         return float(value)
@@ -660,54 +676,57 @@ def parse_value(value):
         return value
 
 
-with open('./data.csv', 'r') as file:
+with open("./data.csv", "r") as file:
     csv_text = file.read()
 DATA = parse_csv(csv_text)
+DATA_SUMMARY = [
+    row[:5]
+    + [round((row[5] + row[6] + row[7] + row[8]) / 4, 2)]
+    + [round((row[9] + row[10] + row[11] + row[12]) / 4, 2)]
+    + row[13:16]
+    for row in DATA
+]
 
 MODELS = [
     "gorilla-openfunctions-v2",
     "gpt-4-1106-preview-fc",
     "gpt-4-0125-preview-fc",
     "gpt-3.5-turbo-0125-fc",
-    "mistral-large-fc"
+    "mistral-large-fc",
 ]
 
 
 def send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, vote):
     # Login and get access token
-    login_url = 'https://us-west-2.aws.realm.mongodb.com/api/client/v2.0/app/data-onwzq/auth/providers/local-userpass/login'
-    headers = {'Content-Type': 'application/json'}
-    login_data = {
-        'username': 'website',
-        'password': mongoDBPassword
-    }
+    login_url = "https://us-west-2.aws.realm.mongodb.com/api/client/v2.0/app/data-onwzq/auth/providers/local-userpass/login"
+    headers = {"Content-Type": "application/json"}
+    login_data = {"username": "website", "password": mongoDBPassword}
     response = requests.post(login_url, headers=headers, json=login_data)
-    access_token = response.json()['access_token']
+    access_token = response.json()["access_token"]
     # Prepare data for sending feedback
-    url = 'https://us-west-2.aws.data.mongodb-api.com/app/data-onwzq/endpoint/data/v1/action/insertOne'
+    url = "https://us-west-2.aws.data.mongodb-api.com/app/data-onwzq/endpoint/data/v1/action/insertOne"
     headers = {
-        'Content-Type': 'application/json',
-        'Access-Control-Request-Headers': '*',
-        'Authorization': f'Bearer {access_token}'
+        "Content-Type": "application/json",
+        "Access-Control-Request-Headers": "*",
+        "Authorization": f"Bearer {access_token}",
     }
 
-
     if not prompt or not function:
         return
 
     body = {
-        'collection': "vote",
-        'database': "gorilla-feedback",
-        'dataSource': "gorilla",
-        'document': {
-            'prompt': prompt,
-            'funcDef': function,
-            'temperature': temperature,
-            'model': model,
-            'codeOutput': codeOutput,
-            'jsonOutput': jsonOutput,
-            'result': vote
-        }
+        "collection": "vote",
+        "database": "gorilla-feedback",
+        "dataSource": "gorilla",
+        "document": {
+            "prompt": prompt,
+            "funcDef": function,
+            "temperature": temperature,
+            "model": model,
+            "codeOutput": codeOutput,
+            "jsonOutput": jsonOutput,
+            "result": vote,
+        },
     }
 
     # Send feedback
@@ -715,60 +734,79 @@ def send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput,
     if response.ok:
         print("Document inserted:", response.json())
     else:
-        print('Error:', response.text)
+        print("Error:", response.text)
+
 
 def get_voting_result():
-    login_url = 'https://us-west-2.aws.realm.mongodb.com/api/client/v2.0/app/data-onwzq/auth/providers/local-userpass/login'
-    headers = {'Content-Type': 'application/json'}
-    login_data = {
-        'username': 'website',
-        'password': mongoDBPassword
-    }
+    login_url = "https://us-west-2.aws.realm.mongodb.com/api/client/v2.0/app/data-onwzq/auth/providers/local-userpass/login"
+    headers = {"Content-Type": "application/json"}
+    login_data = {"username": "website", "password": mongoDBPassword}
     response = requests.post(login_url, headers=headers, json=login_data)
-    access_token = response.json()['access_token']
-
+    access_token = response.json()["access_token"]
+
     # Scanning the database
-    url = 'https://us-west-2.aws.data.mongodb-api.com/app/data-onwzq/endpoint/data/v1/action/find'
+    url = "https://us-west-2.aws.data.mongodb-api.com/app/data-onwzq/endpoint/data/v1/action/find"
     headers = {
-        'Content-Type': 'application/json',
-        'Access-Control-Request-Headers': '*',
-        'Authorization': f'Bearer {access_token}'
+        "Content-Type": "application/json",
+        "Access-Control-Request-Headers": "*",
+        "Authorization": f"Bearer {access_token}",
     }
     body = {
-        'collection': "vote",
-        'database': "gorilla-feedback",
-        'dataSource': "gorilla",
+        "collection": "vote",
+        "database": "gorilla-feedback",
+        "dataSource": "gorilla",
     }
     response = requests.post(url, headers=headers, json=body)
     if response.ok:
         data = response.json()
-        votes = data['documents']
-        votes = [vote for vote in votes if vote['result'] in ['positive', 'negative']]
+        votes = data["documents"]
+        votes = [vote for vote in votes if vote["result"] in ["positive", "negative"]]
         # extract only the model, positive count, negative count
         model_votes = {}
        for vote in votes:
-            model = vote['model']
+            model = vote["model"]
             if model not in model_votes:
-                model_votes[model] = {'positive': 0, 'negative': 0}
-            model_votes[model][vote['result']] += 1
+                model_votes[model] = {"positive": 0, "negative": 0}
+            model_votes[model][vote["result"]] += 1
         for model in model_votes:
-            model_votes[model]['accuracy'] = model_votes[model]['positive'] / (model_votes[model]['positive'] + model_votes[model]['negative'])
-
+            model_votes[model]["accuracy"] = model_votes[model]["positive"] / (
+                model_votes[model]["positive"] + model_votes[model]["negative"]
+            )
+
         result = []
         for model in model_votes:
-            result.append([model, model_votes[model]['accuracy'], model_votes[model]['positive'], model_votes[model]['negative']])
+            result.append(
+                [
+                    model,
+                    model_votes[model]["accuracy"],
+                    model_votes[model]["positive"],
+                    model_votes[model]["negative"],
+                ]
+            )
         result = sorted(result, key=lambda x: x[1], reverse=True)
-        return pd.DataFrame(result, columns=['Model', 'Accuracy', 'Positive', 'Negative'])
+        return pd.DataFrame(
+            result, columns=["Model", "Accuracy", "Positive", "Negative"]
+        )
     else:
-        print('Error:', response.text)
+        print("Error:", response.text)
         return []
-
-def send_feedback_negative(prompt, function, model, temperature, codeOutput, jsonOutput):
-    send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, "negative")
+
+
+def send_feedback_negative(
+    prompt, function, model, temperature, codeOutput, jsonOutput
+):
+    send_feedback(
+        prompt, function, model, temperature, codeOutput, jsonOutput, "negative"
+    )
     return "Thank you for your feedback. We will use this to improve our service."
 
-def send_feedback_positive(prompt, function, model, temperature, codeOutput, jsonOutput):
-    send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, "positive")
+
+def send_feedback_positive(
+    prompt, function, model, temperature, codeOutput, jsonOutput
+):
+    send_feedback(
+        prompt, function, model, temperature, codeOutput, jsonOutput, "positive"
    )
     return "Thank you for your feedback. We will use this to improve our service."
 
 
@@ -905,7 +943,7 @@ def get_openai_response(prompt, function, model, temperature):
 
 
 def get_mistral_response(prompt, function, model, temperature):
-    client = MistralClient(api_key= mistralKey)
+    client = MistralClient(api_key=mistralKey)
     oai_tool = []
     function = json.loads(function)
     item = function  # use item in the later code
@@ -913,7 +951,9 @@ def get_mistral_response(prompt, function, model, temperature):
     item["name"] = re.sub(
         r"\.", "_", item["name"]
     )  # OAI does not support "." in the function name so we replace it with "_". ^[a-zA-Z0-9_-]{1,64}$ is the regex for the name.
-    item["parameters"]["type"] = "object" # If typing is missing, we assume it is an object since OAI requires a type.
+    item["parameters"][
+        "type"
+    ] = "object"  # If typing is missing, we assume it is an object since OAI requires a type.
     if "properties" not in item["parameters"]:
         item["parameters"]["properties"] = item["parameters"].copy()
         item["parameters"]["type"] = "object"
@@ -928,12 +968,12 @@ def get_mistral_response(prompt, function, model, temperature):
         )
     oai_tool.append({"type": "function", "function": item})
     message = [
-        ChatMessage(role="user", content=prompt),
-        ]
+        ChatMessage(role="user", content=prompt),
+    ]
     chat_response = client.chat(
         model="mistral-large-latest",
         messages=message,
-        tools = oai_tool,
+        tools=oai_tool,
         temperature=temperature,
     )
     try:
@@ -949,8 +989,8 @@ def get_mistral_response(prompt, function, model, temperature):
     except:
         result = chat_response.choices[0].message.content
         return result, "The model failed to return a JSON output."
-
-
+
+
 def distribute_task(prompt, function, model, temperature):
     if "gpt" in model:
         return get_openai_response(prompt, function, model, temperature)
@@ -968,6 +1008,13 @@ def get_leaderboard():
     return leaderboard_df
 
 
+def get_summary():
+    # Convert the leaderboard data to a pandas DataFrame for easier handling and display
+    leaderboard_df = pd.DataFrame(DATA_SUMMARY, columns=COLUMNS_SUMMARY)
+    leaderboard_df = leaderboard_df.sort_values(by="Rank")
+    return leaderboard_df
+
+
 prompt = gr.Textbox(label="Prompt", placeholder="Type your prompt here...", lines=4)
 funcDescription = gr.Textbox(
     label="Function Description", placeholder="Describe the function...", lines=20
@@ -977,14 +1024,40 @@ model = gr.Dropdown(label="Model", choices=MODELS)
 
 with gr.Blocks() as demo:
     with gr.Tabs():
-        with gr.TabItem("Leaderboard"):
-            gr.Markdown("**This live leaderboard evaluates the LLM's ability to call functions (aka tools) accurately. This leaderboard consists of real-world data and will be updated periodically. For more information on the evaluation dataset and methodology, please refer to our [blog](https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html) and [code](https://github.com/ShishirPatil/gorilla).**")
+        with gr.TabItem("Summary Leaderboard"):
+            gr.Markdown(
+                "**This live leaderboard evaluates the LLM's ability to call functions (aka tools) accurately. This leaderboard consists of real-world data and will be updated periodically. For more information on the evaluation dataset and methodology, please refer to our [blog](https://gorilla.cs.berkeley.edu/blogs/10_checker_manual.html) and [code](https://github.com/ShishirPatil/gorilla).**"
+            )
+            gr.Markdown(
+                """**AST means evaluation through Abstract Syntax Tree and Exec means evaluation through execution.**
+
+                **FC = native support for function/tool calling.**
+
+                **Cost is calculated as an estimate of the cost per 1000 function calls, in USD. Latency is measured in seconds.**
+
+                **AST Summary is the unweighted average of the four test categories under AST Evaluation. Exec Summary is the unweighted average of the four test categories under Exec Evaluation.**
+
+                **Click on column header to sort. If you would like to add your model or contribute test-cases, please contact us via [discord](https://discord.gg/SwTyuTAxX3).**
+                """
+            )
+            leaderboard_data = gr.Dataframe(value=get_summary(), wrap=True)
+        with gr.TabItem("Full Leaderboard"):
             gr.Markdown(
-                "**Note: AST means evaluation through Abstract Syntax Tree and Exec means evaluation through execution.**"
+                "**This live leaderboard evaluates the LLM's ability to call functions (aka tools) accurately. This leaderboard consists of real-world data and will be updated periodically. For more information on the evaluation dataset and methodology, please refer to our [blog](https://gorilla.cs.berkeley.edu/blogs/10_checker_manual.html) and [code](https://github.com/ShishirPatil/gorilla).**"
             )
-            leaderboard_data = gr.Dataframe(
-                value=get_leaderboard(), wrap=True
+            gr.Markdown(
+                """**AST means evaluation through Abstract Syntax Tree and Exec means evaluation through execution.**
+
+                **FC = native support for function/tool calling.**
+
+                **Cost is calculated as an estimate of the cost per 1000 function calls, in USD. Latency is measured in seconds.**
+
+                **AST Summary is the unweighted average of the four test categories under AST Evaluation. Exec Summary is the unweighted average of the four test categories under Exec Evaluation.**
+
+                **Click on column header to sort. If you would like to add your model or contribute test-cases, please contact us via [discord](https://discord.gg/SwTyuTAxX3).**
+                """
             )
+            leaderboard_data = gr.Dataframe(value=get_leaderboard(), wrap=True)
 
         with gr.TabItem("Try It Out"):
             with gr.Row():
@@ -1050,18 +1123,32 @@ with gr.Blocks() as demo:
                 fn=None,
                 inputs=[prompt, model, temperature, codeOutput, jsonOutput],
                 outputs=[],
-                js='(prompt, model, temperature, codeOutput, jsonOutput) => window.open(`https://github.com/ShishirPatil/gorilla/issues/new?assignees=&labels=hosted-openfunctions-v2&projects=&template=hosted-openfunctions-v2.md&title=[bug] OpenFunctions-v2: &body=**Issue Description**%0A%0APrompt: ${prompt}%0A%0AModel: ${model}%0A%0ATemperature: ${temperature}%0A%0AOutput (or Error if request failed): ${codeOutput} %0A%0A ${jsonOutput}%0A%0A**Additional Information**\n`, "_blank")'
+                js='(prompt, model, temperature, codeOutput, jsonOutput) => window.open(`https://github.com/ShishirPatil/gorilla/issues/new?assignees=&labels=hosted-openfunctions-v2&projects=&template=hosted-openfunctions-v2.md&title=[bug] OpenFunctions-v2: &body=**Issue Description**%0A%0APrompt: ${prompt}%0A%0AModel: ${model}%0A%0ATemperature: ${temperature}%0A%0AOutput (or Error if request failed): ${codeOutput} %0A%0A ${jsonOutput}%0A%0A**Additional Information**\n`, "_blank")',
             )
-
+
             thumbs_up.click(
                 fn=send_feedback_positive,
-                inputs=[prompt, funcDescription, model, temperature, codeOutput, jsonOutput],
+                inputs=[
+                    prompt,
+                    funcDescription,
+                    model,
+                    temperature,
+                    codeOutput,
+                    jsonOutput,
+                ],
                 outputs=[feedbackMsg],
             )
-
+
             thumbs_down.click(
                 fn=send_feedback_negative,
-                inputs=[prompt, funcDescription, model, temperature, codeOutput, jsonOutput],
+                inputs=[
+                    prompt,
+                    funcDescription,
+                    model,
+                    temperature,
+                    codeOutput,
+                    jsonOutput,
+                ],
                 outputs=[feedbackMsg],
             )
 
@@ -1070,5 +1157,5 @@ with gr.Blocks() as demo:
     # leaderboard_data = gr.Dataframe(
    #     value=get_voting_result(), wrap=True
     # )
-
+
 demo.launch()
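
For context on the new summary tab, the added `DATA_SUMMARY` comprehension collapses each leaderboard row by replacing the four AST-evaluation columns and the four Exec-evaluation columns with their unweighted means, matching the "AST Summary" / "Exec Summary" wording in the tab text. A minimal sketch of that aggregation on a single made-up row (the column layout is assumed from the comprehension and `COLUMNS_SUMMARY` above; the numbers are illustrative only):

```python
# Hypothetical full-leaderboard row, laid out as the comprehension assumes:
# [Rank, Overall Acc, Model, Organization, License,
#  four AST categories, four Exec categories,
#  Relevance Detection, Cost ($ per 1k calls), Latency Mean (s)]
row = [
    1, 85.0, "gorilla-openfunctions-v2", "Gorilla LLM", "Apache 2.0",
    90.0, 88.0, 86.0, 84.0,   # AST test categories
    80.0, 82.0, 78.0, 76.0,   # Exec test categories
    70.0, 0.52, 1.10,
]

summary_row = (
    row[:5]
    + [round((row[5] + row[6] + row[7] + row[8]) / 4, 2)]      # AST Summary -> 87.0
    + [round((row[9] + row[10] + row[11] + row[12]) / 4, 2)]   # Exec Summary -> 79.0
    + row[13:16]                                               # Relevance, Cost, Latency Mean
)
# summary_row now has the 10 fields listed in COLUMNS_SUMMARY.
```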