Spaces:

qiantong-xu
/

toolbench-leaderboard

Running

File size: 10,348 Bytes

eb33210
 
 
 
 
 
c3d52e7
ebdc5a0
3176152
 
 
cc310eb
 
 
3176152
 
 
 
d38ed8c
 
 
 
 
 
 
 
 
3176152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ebdc5a0
 
3176152
 
 
 
1307e8d
 
 
 
 
 
 
 
 
 
3176152
 
ebdc5a0
 
 
1307e8d
ebdc5a0
e938cdf
fdc4461
ebdc5a0
 
 
3176152
1307e8d
3176152
e938cdf
fdc4461
ebdc5a0
3176152
ebdc5a0
3176152
c3d52e7
eb33210
 
3176152
9dfb4f3
 
 
 
 
 
 
 
 
 
 
 
eb33210
 
 
 
a206e6c
eb33210
a206e6c
 
9dfb4f3
 
eb33210
 
9dfb4f3
 
 
 
 
 
 
 
ebdc5a0
 
 
1307e8d
ebdc5a0
 
eb33210
 
fdc4461
eb33210
23e7ae5
eb33210
 
3176152
eb33210
 
3176152
eb33210


__all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']

import gradio as gr
import pandas as pd

COLUMN_NAMES = ["Model", "Tuned on ToolBench", "Avg.", "Open Weather", "The Cat API", "Home Search", "Trip Booking", "Google Sheets", "VirtualHome", "WebShop Long", "WebShop Short", "Tabletop"]
UNTUNED_MODEL_RESULTS = '''[gpt4](https://platform.openai.com/docs/models/gpt-4)                    & 93.0 & 96.0 & 97.0 & 96.7 & 62.9 & 23.0 / 23.5 & 0.0 & 0.0 & 81.0 \\
[text-davinci-003](https://platform.openai.com/docs/models/gpt-3)      & 99.0 & 98.0 & 97.0 & 89.2 & 62.9 & 31.0 / 25.1 & 0.0 & 0.0 & 66.7 \\
[gpt-3.5-turbo](https://platform.openai.com/docs/models/gpt-3-5)           & 90.0 & 92.0 & 80.0 & 85.8 & 51.4 & 20.0 / 18.9 & 0.0        & 1.8        & 33.3 \\
[text-curie-001](https://platform.openai.com/docs/models/gpt-3)          & 8.0  & 58.0 & 6.0  & 6.7  & 1.4  & 12.0 / 4.1  & 0.0        & 0.0        & 1.0  \\
[Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) & 90.0 & 84.39 & 83.0 & 71.67 & 58.57 & 35.0 / 24.74 & 1.53 & 30.45 & 45.4 \\
[Llama-2-13b](https://huggingface.co/meta-llama/Llama-2-13b) & 85.0 & 77.0 & 68.0 & 53.33 & 30.0 & 33.0 / 21.67 & 0.6 & 31.67 & 23.81 \\
[Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) & 76.0 & 83.0 & 58.0 & 33.33 & 22.86 & 25.0 / 21.49 & 0.0 & 6.92 & 14.39
[llama-65b](https://huggingface.co/huggyllama/llama-65b)     & 90.0 & 80.0 & 84.0 & 65.8 & 32.9 & 32.0 / 20.3 & 0.0 & 41.2 & 30.5 \\
[llama-30b](https://huggingface.co/huggyllama/llama-30b)              & 78.0 & 84.0 & 66.0 & 45.0 & 37.1 & 27.0 / 21.7 & 0.0 & 30.6 & 34.3 \\
[llama-13b](https://huggingface.co/huggyllama/llama-13b)                & 70.0 & 74.0 & 45.0 & 35.8 & 5.7  & 28.0 / 18.9 & 0.0 & 27.6 & 17.1 \\
[llama-13b-alpaca](https://huggingface.co/chavinlo/gpt4-x-alpaca)    & 62.0 & 43.0 & 44.0 & 40.8 & 11.4 & 1.0 / 1.6   & 0.0 & 2.7  & 9.5  \\
[CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) & 86.0 & 92.0 & 74.0 & 63.33 & 38.08 & 35.0 / 21.97 & 0.0 & 0.0 & 11.16 \\
[CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf) & 90.0 & 94.0 & 78.0 & 61.67 & 41.27 & 32.0 / 21.95 & 0.0 & 0.0 & 16.98 \\
[CodeLlama-7b-Python-hf](https://huggingface.co/codellama/CodeLlama-7b-Python-hf) & 83.0 & 88.0 & 83.0 & 68.33 & 49.13 & 31.0 / 21.33 & 0.0 & 1.58 & 22.86 \\
[CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) & 96.0 & 86.83 & 87.0 & 70.83 & 51.26 & 35.0 / 22.28 & 0.0 & 0.0 & 40.95 \\
[CodeLlama-13b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf) & 98.0 & 89.58 & 85.0 & 72.5 & 48.97 & 31.0 / 22.56 & 0.0 & 9.7 & 57.62 \\
[CodeLlama-13b-Python-hf](https://huggingface.co/codellama/CodeLlama-13b-Python-hf) & 93.0 & 92.35 & 86.0 & 62.5 & 50.79 & 37.0 / 21.94 & 0.0 & 2.4 & 41.53 \\
[CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf) & 96.39 & 85.0 & 88.0 & 88.33 & 64.29 & 34.0 / 24.65 & 0.0 & 5.53 & 51.32 \\
[CodeLlama-34b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf) & 94.28 & 86.0 & 90.0 & 89.17 & 61.11 & 31.0 / 24.34 & 0.0 & 25.99 & 47.46 \\
[CodeLlama-34b-Python-hf](https://huggingface.co/codellama/CodeLlama-34b-Python-hf) & 91.11 & 88.42 & 91.0 & 85.83 & 55.87 & 28.0 / 21.24 & 0.0 & 6.47 & 33.33 \\
[starcoder](https://huggingface.co/bigcode/starcoder)                & 91.0 & 84.0 & 82.0 & 51.7 & 48.0 & 23.0 / 19.4 & 2.6 & 0.0  & 21.9 \\
[starcoderbase](https://huggingface.co/bigcode/starcoderbase)           & 90.0 & 86.0 & 79.0 & 63.3 & 42.9 & 24.0 / 16.3 & 5.8 & 23.1 & 17.1 \\
[codegen-16B-nl](https://huggingface.co/Salesforce/codegen-16B-nl)           & 51.0 & 75.0 & 37.0 & 21.7 & 7.1  & 43.0 / 18.0 & 0.0 & 0.0  & 16.2 \\
[codegen-16B-multi](https://huggingface.co/Salesforce/codegen-16B-multi)        & 56.0 & 75.0 & 47.0 & 7.5  & 21.4 & 31.0 / 14.1 & 0.0 & 0.5  & 8.6  \\
[codegen-16B-mono](https://huggingface.co/Salesforce/codegen-16B-mono)        & 63.7 & 72.0 & 52.0 & 28.3 & 31.5 & 28.0 / 15.7 & 1.5 & 6.6  & 15.2 \\
[bloomz](https://huggingface.co/bigscience/bloomz)            & 58.0 & 85.0 & 36.0 & 22.5 & 14.3 & 9.0 / 4.9   & 0.0 & 1.0  & 1.0  \\
[opt-iml-30b](https://huggingface.co/facebook/opt-iml-30b)              & 44.0 & 48.0 & 5.0  & 3.3  & 2.9  & 13.0 / 8.3  & 0.0 & 0.0  & 1.0  \\
[opt-30b](https://huggingface.co/facebook/opt-30b)                  & 46.0 & 35.0 & 2.0  & 3.3  & 8.6  & 24.0 / 11.7 & 0.0 & 0.0  & 1.0  \\
[opt-iml-1.3b](https://huggingface.co/facebook/opt-iml-1.3b)             & 20.0 & 28.0 & 0.0  & 0.0  & 4.3  & 13.0 / 3.1  & 0.0 & 0.0  & 1.0  \\
[opt-1.3b](https://huggingface.co/facebook/opt-1.3b)                 & 18.0 & 30.0 & 0.0  & 0.0  & 1.4  & 31.0 / 9.7  & 0.0 & 0.0  & 1.0  \\
[neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b)                & 55.0 & 69.0 & 27.0 & 10.8 & 18.6 & 28.0 / 15.3 & 0.0 & 8.8  & 6.7  \\
[GPT-NeoXT-Chat-Base-20B](https://huggingface.co/togethercomputer/GPT-NeoXT-Chat-Base-20B)  & 43.0 & 73.0 & 28.0 & 10.8 & 4.3  & 26.0 / 13.1 & 0.0 & 0.7  & 7.6  \\
[pythia-12b](https://huggingface.co/EleutherAI/pythia-12b)               & 53.0 & 65.0 & 12.0 & 0.8  & 11.4 & 17.0 / 12.1 & 0.0 & 0.0  & 1.9  \\
[dolly-v2-12b]()            & 0.0  & 1.0  & 10.0 & 5.0  & 7.1  & 11.0 / 8.9  & 0.0 & 0.0  & 7.6  \\
[pythia-6.9b](https://huggingface.co/EleutherAI/pythia-6.9b)   & 41.0 & 72.0 & 8.0  & 7.5  & 4.3  & 29.0 / 14.0 & 0.0 & 0.0  & 8.6  \\
[pythia-2.8b](https://huggingface.co/EleutherAI/pythia-2.8b)   & 49.0 & 54.0 & 7.0  & 3.3  & 12.9 & 24.0 / 14.8 & 0.0 & 0.0  & 7.6  \\
[pythia-1.4b](https://huggingface.co/EleutherAI/pythia-1.4b)   & 37.0 & 48.0 & 4.0  & 5.0  & 10.0 & 22.0 / 10.7 & 0.0 & 5.2  & 7.6  \\
[stablelm-base-alpha-7b](https://huggingface.co/stabilityai/stablelm-base-alpha-7b)   & 22.0 & 47.0 & 0.0  & 0.0  & 4.3  & 28.0 / 10.3 & 0.0 & 0.0  & 2.9  \\
[stablelm-tuned-alpha-7b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b)  & 23.0 & 38.0 & 0.0  & 0.0  & 1.4  & 26.0 / 7.3  & 0.0 & 0.0  & 3.8  \\
[stablelm-base-alpha-3b](https://huggingface.co/stabilityai/stablelm-base-alpha-3b)   & 6.0  & 28.0 & 0.0  & 0.0  & 1.4  & 29.0 / 5.3  & 0.0 & 0.0  & 1.0  \\
[stablelm-tuned-alpha-3b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b)  & 14.0 & 31.0 & 0.0  & 0.8  & 0.0  & 8.0 / 5.6   & 0.0 & 0.0  & 1.0  \\'''
TUNED_MODEL_RESULTS='''[llama-30b-toolbench](https://huggingface.co/sambanovasystems/LLaMA-30b-toolbench)          & 100.0          & 94.0           & 87.0           & 85.8           & 2.9            & 16.0/ 24.3& 0.0            & 0.0            & 7.5        \\
[starcoder-toolbench](https://huggingface.co/sambanovasystems/starcoder-toolbench)          & 99.0       & 97.0           & 83.0           & 80.8           & 21.2        & 31.0/ 18.4& 0.0            & 0.0            & 13.9        \\
[codegen-16B-mono-toolbench](https://huggingface.co/sambanovasystems/codegen-16B-mono-toolbench)   & 97.7          & 99.0           & 82.0           & 77.5           & 19.8     & 29.0/ 17.2& 0.0            & 3.5            & 16.2                   \\'''


def parse_line(line):
    model_results = line.replace(" ", "").strip("\\").split("&")
    for i in range(1, len(model_results)):
        if i == 6:
            res = model_results[6].split('/')[-1].strip()
        else:
            res = model_results[i]
        model_results[i] = float(res)
    return model_results
    
def get_baseline_df():
    df_data = []
    
    lines = UNTUNED_MODEL_RESULTS.split("\n")
    for line in lines:
        model_results = parse_line(line)
        assert len(model_results) == 10
        avg = sum(model_results[1:-3] + model_results[-2:]) / 8
        model_results.insert(1, avg)
        model_results.insert(1, "False")
        df_data.append(model_results)
    lines = TUNED_MODEL_RESULTS.split("\n")
    for line in lines:
        model_results = parse_line(line)
        assert len(model_results) == 10
        avg = sum(model_results[1:-3] + model_results[-2:]) / 8
        model_results.insert(1, avg)
        model_results.insert(1, "True")
        df_data.append(model_results)
        
    print(len(df_data))
    df = pd.DataFrame(df_data, columns=COLUMN_NAMES).round(1)
    return df


CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{xu2023tool,
      title={On the Tool Manipulation Capability of Open-source Large Language Models}, 
      author={Qiantong Xu and Fenglu Hong and Bo Li and Changran Hu and Zhengyu Chen and Jian Zhang},
      year={2023},
      eprint={2305.16504},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
}"""


block = gr.Blocks()

with block:
    gr.Markdown(
        """# Toolbench Leaderboard
    
    Welcome to the leaderboard of the ToolBench! 🏆 
    This is a community where participants create language models and action generation algorithms to generate API function calls based goals described in natural lanugage!
    Please refer to [our paper](https://arxiv.org/abs/2305.16504) for more details and join our [Discord](https://discord.com/invite/JehFG5HXKb) for further discussion.
    The [evaluation suite](https://github.com/sambanova/toolbench/) is now alive on Github.
    """
    )
    with gr.Row():
        with gr.Accordion("Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
            ).style(show_copy_button=True)

    
    gr.Markdown(
        """In the table below, we summarize the 3-shot performance of all the models.
        We use success rate as the primary evaluation metric for most tasks, except that we report rewards on WebShop, and the Longest Common Subsequence (LCS) on VirtualHome, following the original metrics proposed by the respective authors. 
    """
    )
    with gr.Row():
        data = gr.components.Dataframe(
            type="pandas", datatype=["markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
        )
    with gr.Row():
        data_run = gr.Button("Refresh")
        data_run.click(
            get_baseline_df, outputs=data
        )

    block.load(get_baseline_df, outputs=data)

block.launch()