Huanzhi Mao committed on
Commit 67249b1
1 Parent(s): 57013a0

Support auto-populated leaderboard from csv file.

Files changed (2)
  1. app.py +79 -342
  2. data.csv +22 -0
app.py CHANGED
@@ -4,6 +4,7 @@ import webbrowser
 import os
 import re
 import pandas as pd
+import csv
 # from anthropic import Anthropic
 from openai import OpenAI
 from mistralai.client import MistralClient
@@ -627,344 +628,31 @@ COLUMNS = [
     "Parallel Multiple Exec",
     "Relevance Detection",
 ]
-DATA = [
-    (1, 84.28, "GPT-4-1106-Preview", "OpenAI", "Proprietary", 80.73, 88.50, 90.50, 84.50, 74.12, 70.00, 68.00, 50.00, 88.75),
-    (2, 84.16, "GPT-4-0125-Preview", "OpenAI", "Proprietary", 81.45, 89.00, 88.00, 83.50, 72.94, 78.00, 68.00, 50.00, 87.50),
-    (3, 84.16, "Gorilla-OpenFunctions-v2", "Gorilla LLM", "Apache 2.0", 87.82, 88.50, 82.50, 78.00, 85.88, 82.00, 68.00, 55.00, 71.67),
-    (4, 83.67, "Claude-3-Opus-20240229", "Anthropic", "Proprietary", 85.27, 83.00, 79.00, 72.00, 89.41, 80.00, 68.00, 57.50, 84.58),
-    (5, 81.75, "Mistral-Medium-2312", "Mistral AI", "Proprietary", 80.18, 84.50, 76.50, 73.50, 84.71, 86.00, 76.00, 62.50, 90.00),
-    (6, 80.30, "Claude-3-Sonnet-20240229", "Anthropic", "Proprietary", 85.64, 87.50, 83.50, 83.00, 90.59, 82.00, 72.00, 60.00, 41.25),
-    (7, 80.30, "GPT-3.5-Turbo-0125", "OpenAI", "Proprietary", 80.18, 84.50, 82.50, 79.00, 84.71, 80.00, 68.00, 47.50, 45.33),
-    (8, 79.07, "Functionary-Medium-v2.2", "MeetKai", "N/A", 79.17, 90.00, 85.00, 78.00, 65.88, 62.00, 70.00, 50.00, 79.17),
-    (9, 77.41, "Claude-2.1", "Anthropic", "Proprietary", 85.64, 83.00, 77.00, 60.50, 68.23, 48.00, 52.00, 47.00, 78.33),
-    (10, 61.75, "Mistral-tiny-2312", "Mistral AI", "Proprietary", 59.64, 62.50, 56.00, 43.00, 71.17, 84.00, 74.00, 36.00, 77.08),
-    (11, 61.02, "Claude-instant-1.2", "Anthropic", "Proprietary", 68.73, 59.00, 56.00, 44.00, 60.00, 51.00, 52.00, 50.00, 61.67),
-    (12, 56.87, "Mistral-small-2312", "Mistral AI", "Proprietary", 46.55, 68.00, 50.00, 63.00, 32.35, 30.00, 40.00, 37.50, 89.58),
-    (13, 56.81, "Mistral-large-2402", "Mistral AI", "Proprietary", 71.82, 90.50, 0.00, 0.00, 72.94, 76.00, 0.00, 5.00, 84.58),
-    (14, 55.90, "Nexusflow-Raven-v2", "Nexusflow", "Apache 2.0", 76.55, 83.50, 39.50, 32.50, 61.18, 84.00, 62.00, 47.00, 0.00),
-    (15, 55.87, "FireFunction-v1", "Fireworks", "Apache 2.0", 73.19, 87.00, 0.00, 0.00, 68.23, 76.00, 0.00, 5.00, 81.25),
-    (16, 55.68, "Gemini-1.0-Pro", "Google", "Proprietary", 79.71, 89.00, 0.00, 0.00, 51.19, 66.00, 0.00, 0.00, 78.30),
-    (17, 54.52, "GPT-4-0613", "OpenAI", "Proprietary", 74.55, 86.00, 0.00, 0.00, 50.00, 56.00, 0.00, 2.00, 87.08),
-    (18, 45.96, "Deepseek-v1.5", "Deepseek", "Deepseek License", 48.36, 61.00, 37.00, 47.50, 24.70, 2.00, 0.00, 7.50, 66.25),
-    (19, 44.40, "Gemma", "Google", "gemma-terms-of-use", 61.45, 60.00, 41.00, 32.00, 44.71, 48.00, 44.00, 25.50, 0.42),
-    (20, 33.37, "Gorilla-OpenFunctions-v0", "Gorilla LLM", "Apache 2.0", 60.00, 56.00, 0.00, 3.50, 38.24, 65.00, 0.00, 0.00, 4.58),
-    (21, 24.58, "Glaive-v1", "Glaive", "cc-by-sa-4.0", 34.55, 26.00, 0.00, 0.00, 21.18, 36.00, 0.00, 2.50, 46.25),
-]
+
+def parse_csv(text):
+    lines = text.split('\n')
+    result = []
+    for i in range(len(lines)):
+        row = lines[i].split(',')[:15]
+        row = [parse_value(value) for value in row]
+        row.insert(0, i)
+        overall_acc = row.pop(4)
+        row.insert(1, overall_acc)
+        row.pop(5)
+        row.pop(5)
+        result.append(row)
+    return result
+
+def parse_value(value):
+    if value.endswith('%'):
+        return float(value[:-1])
+    return value
+
+
+with open('./data.csv', 'r') as file:
+    csv_text = file.read()
+DATA = parse_csv(csv_text)
+
 
 MODELS = [
     "gorilla-openfunctions-v2",
@@ -977,7 +665,6 @@ MODELS = [

 def send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, vote):
     # Login and get access token
-    print("Sending feedback")
     login_url = 'https://us-west-2.aws.realm.mongodb.com/api/client/v2.0/app/data-onwzq/auth/providers/local-userpass/login'
     headers = {'Content-Type': 'application/json'}
     login_data = {
@@ -1020,6 +707,52 @@ def send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, vote):
     else:
         print('Error:', response.text)

+def get_voting_result():
+    login_url = 'https://us-west-2.aws.realm.mongodb.com/api/client/v2.0/app/data-onwzq/auth/providers/local-userpass/login'
+    headers = {'Content-Type': 'application/json'}
+    login_data = {
+        'username': 'website',
+        'password': mongoDBPassword
+    }
+    response = requests.post(login_url, headers=headers, json=login_data)
+    access_token = response.json()['access_token']
+
+    # Scanning the database
+    url = 'https://us-west-2.aws.data.mongodb-api.com/app/data-onwzq/endpoint/data/v1/action/find'
+    headers = {
+        'Content-Type': 'application/json',
+        'Access-Control-Request-Headers': '*',
+        'Authorization': f'Bearer {access_token}'
+    }
+    body = {
+        'collection': "vote",
+        'database': "gorilla-feedback",
+        'dataSource': "gorilla",
+    }
+    response = requests.post(url, headers=headers, json=body)
+    if response.ok:
+        data = response.json()
+        votes = data['documents']
+        votes = [vote for vote in votes if vote['result'] in ['positive', 'negative']]
+        # extract only the model, positive count, negative count
+        model_votes = {}
+        for vote in votes:
+            model = vote['model']
+            if model not in model_votes:
+                model_votes[model] = {'positive': 0, 'negative': 0}
+            model_votes[model][vote['result']] += 1
+        for model in model_votes:
+            model_votes[model]['accuracy'] = model_votes[model]['positive'] / (model_votes[model]['positive'] + model_votes[model]['negative'])
+
+        result = []
+        for model in model_votes:
+            result.append([model, model_votes[model]['accuracy'], model_votes[model]['positive'], model_votes[model]['negative']])
+        result = sorted(result, key=lambda x: x[1], reverse=True)
+        return pd.DataFrame(result, columns=['Model', 'Accuracy', 'Positive', 'Negative'])
+    else:
+        print('Error:', response.text)
+        return []
+
 def send_feedback_negative(prompt, function, model, temperature, codeOutput, jsonOutput):
     send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, "negative")
     return "Thank you for your feedback. We will use this to improve our service."
@@ -1221,12 +954,10 @@ def distribute_task(prompt, function, model, temperature):
 def get_leaderboard():
     # Convert the leaderboard data to a pandas DataFrame for easier handling and display
     leaderboard_df = pd.DataFrame(DATA, columns=COLUMNS)
+    leaderboard_df = leaderboard_df.sort_values(by="Rank")
     return leaderboard_df


-# Initialize the leaderboard data so it's loaded when the page is opened
-initial_leaderboard_data = get_leaderboard()
-
 prompt = gr.Textbox(label="Prompt", placeholder="Type your prompt here...", lines=4)
 funcDescription = gr.Textbox(
     label="Function Description", placeholder="Describe the function...", lines=20
@@ -1324,4 +1055,10 @@ with gr.Blocks() as demo:
             outputs=[feedbackMsg],
         )

+        with gr.TabItem("Voting Leaderboard"):
+            gr.Markdown("## This is a live leaderboard where you can see user's voting result on the agent's response.")
+            leaderboard_data = gr.Dataframe(
+                value=get_voting_result(), wrap=True
+            )
+
 demo.launch()
 
data.csv ADDED
@@ -0,0 +1,22 @@
+GPT-4-1106-Preview (FC),OpenAI,Proprietary,84.28%,86.06%,65.53%,80.73%,88.50%,90.50%,84.50%,74.12%,70.00%,68.00%,50.00%,88.75%
+GPT-4-0125-Preview (FC),OpenAI,Proprietary,84.16%,85.61%,67.24%,81.45%,89.00%,88.50%,83.50%,72.94%,78.00%,68.00%,50.00%,87.50%
+Gorilla-OpenFunctions-v2 (FC),Gorilla LLM,Apache 2.0,84.16%,84.33%,72.72%,87.82%,89.00%,82.50%,78.00%,85.88%,82.00%,68.00%,55.00%,71.67%
+Claude-3-Opus-20240229 (Prompt),Anthropic,Proprietary,83.67%,79.82%,73.73%,85.27%,83.00%,79.00%,72.00%,89.41%,80.00%,68.00%,57.50%,84.58%
+Mistral-Medium-2312 (Prompt),Mistral AI,Proprietary,81.75%,78.67%,66.93%,80.18%,84.50%,76.50%,73.50%,84.71%,76.00%,62.00%,45.00%,90.00%
+Claude-3-Sonnet-20240229 (Prompt),Anthropic,Proprietary,80.30%,84.91%,76.15%,85.64%,87.50%,83.50%,83.00%,90.59%,82.00%,72.00%,60.00%,41.25%
+GPT-3.5-Turbo-0125 (FC),OpenAI,Proprietary,80.30%,81.55%,69.43%,80.18%,84.50%,82.50%,79.00%,84.71%,80.00%,68.00%,45.00%,68.33%
+Functionary-Small (FC),MeetKai,N/A,79.07%,82.31%,64.40%,75.75%,89.50%,82.50%,81.50%,64.12%,78.00%,68.00%,47.50%,78.33%
+Functionary-Medium-v2.2 (FC),MeetKai,N/A,79.03%,82.25%,61.97%,76.00%,90.00%,85.00%,77.99%,65.88%,62.00%,70.00%,50.00%,79.17%
+Claude-2.1 (Prompt),Anthropic,Proprietary,77.41%,76.53%,53.93%,85.64%,83.00%,77.00%,60.50%,68.23%,48.00%,52.00%,47.50%,78.33%
+Mistral-tiny-2312 (Prompt),Mistral AI,Proprietary,61.75%,55.28%,53.42%,59.64%,62.50%,56.00%,43.00%,71.17%,74.00%,36.00%,32.50%,77.08%
+Claude-instant-1.2 (Prompt),Anthropic,Proprietary,61.02%,57.06%,49.88%,68.73%,59.00%,56.50%,44.00%,60.00%,52.00%,50.00%,37.50%,61.67%
+Mistral-small-2312 (Prompt),Mistral AI,Proprietary,56.87%,57.01%,36.18%,46.55%,68.00%,50.50%,63.00%,34.71%,32.00%,38.00%,40.00%,89.58%
+Mistral-large-2402 (FC),Mistral AI,Proprietary,56.81%,40.58%,38.49%,71.82%,90.50%,0.00%,0.00%,72.94%,76.00%,0.00%,5.00%,84.58%
+Nexusflow-Raven-v2 (FC),Nexusflow,Apache 2.0,55.90%,58.01%,63.67%,76.55%,83.50%,39.50%,32.50%,61.18%,84.00%,62.00%,47.50%,0.00%
+FireFunction-v1 (FC),Fireworks,Apache 2.0,55.87%,40.05%,37.31%,73.19%,87.00%,0.00%,0.00%,68.23%,76.00%,0.00%,5.00%,81.25%
+Gemini-1.0-Pro (FC),Google,Proprietary,55.68%,42.18%,29.30%,79.71%,89.00%,0.00%,0.00%,51.19%,66.00%,0.00%,0.00%,78.30%
+GPT-4-0613 (FC),OpenAI,Proprietary,54.52%,40.14%,27.12%,74.55%,86.00%,0.00%,0.00%,50.00%,56.00%,0.00%,2.50%,87.08%
+Deepseek-v1.5 (Prompt),Deepseek,Deepseek License,45.96%,48.59%,8.55%,48.36%,61.00%,37.50%,47.50%,24.70%,2.00%,0.00%,7.50%,66.25%
+Gemma,Google,gemma-terms-of-use,44.40%,48.61%,40.43%,61.45%,60.00%,41.00%,32.00%,44.71%,48.00%,44.00%,25.00%,0.42%
+Gorilla-OpenFunctions-v0 (FC),Gorilla LLM,Apache 2.0,33.37%,29.88%,24.06%,60.00%,56.00%,0.00%,3.50%,38.24%,58.00%,0.00%,0.00%,4.58%
+Glaive-v1 (FC),Glaive,cc-by-sa-4.0,24.58%,15.14%,14.92%,34.55%,26.00%,0.00%,0.00%,21.18%,36.00%,0.00%,2.50%,46.25%
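
For a quick sanity check of the new file, it can also be loaded directly with pandas; a minimal sketch, assuming the layout shown above (no header row, three text fields followed by twelve percentage-formatted scores):

import pandas as pd

# data.csv ships without a header row: model, organization, license,
# then twelve scores formatted as percentages.
df = pd.read_csv("data.csv", header=None)
score_cols = df.columns[3:]
df[score_cols] = df[score_cols].apply(lambda col: col.str.rstrip('%').astype(float))

print(df.shape)       # expected: (22, 15)
print(df.iloc[0, 0])  # expected: "GPT-4-1106-Preview (FC)"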