qiantong-xu committed
Commit • ebdc5a0 • 1 Parent(s): 0f7c127
Update app.py

app.py CHANGED
@@ -4,8 +4,8 @@ __all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissi
 import gradio as gr
 import pandas as pd
 
-COLUMN_NAMES = ["model", "Open Weather", "The Cat API", "Home Search", "Trip Booking", "Google Sheets", "VirtualHome", "WebShop Long", "WebShop Short", "Tabletop"]
-BENCHMARK_RESULTS = '''[gpt4](https://platform.openai.com/docs/models/gpt-4) & 93.0 & 96.0 & 97.0 & 96.7 & 62.9 & 23.0 / 23.5 & 0.0 & 0.0 & 81.0 \\
+COLUMN_NAMES = ["model", "Tuned on ToolBench", "Open Weather", "The Cat API", "Home Search", "Trip Booking", "Google Sheets", "VirtualHome", "WebShop Long", "WebShop Short", "Tabletop"]
+UNTUNED_MODEL_RESULTS = '''[gpt4](https://platform.openai.com/docs/models/gpt-4) & 93.0 & 96.0 & 97.0 & 96.7 & 62.9 & 23.0 / 23.5 & 0.0 & 0.0 & 81.0 \\
 [text-davinci-003](https://platform.openai.com/docs/models/gpt-3) & 99.0 & 98.0 & 97.0 & 89.2 & 62.9 & 31.0 / 25.1 & 0.0 & 0.0 & 66.7 \\
 [gpt-3.5-turbo](https://platform.openai.com/docs/models/gpt-3-5) & 90.0 & 92.0 & 80.0 & 85.8 & 51.4 & 20.0 / 18.9 & 0.0 & 1.8 & 33.3 \\
 [text-curie-001](https://platform.openai.com/docs/models/gpt-3) & 8.0 & 58.0 & 6.0 & 6.7 & 1.4 & 12.0 / 4.1 & 0.0 & 0.0 & 1.0 \\
@@ -33,19 +33,28 @@ BENCHMARK_RESULTS = '''[gpt4](https://platform.openai.com/docs/models/gpt-4)
 [stablelm-base-alpha-7b](https://huggingface.co/stabilityai/stablelm-base-alpha-7b) & 22.0 & 47.0 & 0.0 & 0.0 & 4.3 & 28.0 / 10.3 & 0.0 & 0.0 & 2.9 \\
 [stablelm-tuned-alpha-7b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b) & 23.0 & 38.0 & 0.0 & 0.0 & 1.4 & 26.0 / 7.3 & 0.0 & 0.0 & 3.8 \\
 [stablelm-base-alpha-3b](https://huggingface.co/stabilityai/stablelm-base-alpha-3b) & 6.0 & 28.0 & 0.0 & 0.0 & 1.4 & 29.0 / 5.3 & 0.0 & 0.0 & 1.0 \\
-[stablelm-tuned-alpha-3b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b) & 14.0 & 31.0 & 0.0 & 0.8 & 0.0 & 8.0 / 5.6 & 0.0 & 0.0 & 1.0 \\
-[llama-30b-toolbench](https://huggingface.co/sambanovasystems/LLaMA-30b-toolbench) & 100.0 & 94.0 & 87.0 & 85.8 & 2.9 & 16.0/ 24.3& 0.0 & 0.0 & 7.5 \\
+[stablelm-tuned-alpha-3b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b) & 14.0 & 31.0 & 0.0 & 0.8 & 0.0 & 8.0 / 5.6 & 0.0 & 0.0 & 1.0 \\'''
+TUNED_MODEL_RESULTS = '''[llama-30b-toolbench](https://huggingface.co/sambanovasystems/LLaMA-30b-toolbench) & 100.0 & 94.0 & 87.0 & 85.8 & 2.9 & 16.0/ 24.3& 0.0 & 0.0 & 7.5 \\
 [starcoder-toolbench](https://huggingface.co/sambanovasystems/starcoder-toolbench) & 99.0 & 97.0 & 83.0 & 80.8 & 21.2 & 31.0/ 18.4& 0.0 & 0.0 & 13.9 \\
 [codegen-16B-mono-toolbench](https://huggingface.co/sambanovasystems/codegen-16B-mono-toolbench) & 97.7 & 99.0 & 82.0 & 77.5 & 19.8 & 29.0/ 17.2& 0.0 & 3.5 & 16.2 \\'''
 
 
 def get_baseline_df():
-    lines = BENCHMARK_RESULTS.split("\n")
     df_data = []
+
+    lines = UNTUNED_MODEL_RESULTS.split("\n")
+    for line in lines:
+        model_results = line.replace(" ", "").strip("\\").split("&")
+        assert len(model_results) == 10
+        model_results.insert(1, "False")
+        df_data.append(model_results)
+
+    lines = TUNED_MODEL_RESULTS.split("\n")
     for line in lines:
         model_results = line.replace(" ", "").strip("\\").split("&")
         assert len(model_results) == 10
+        model_results.insert(1, "True")
         df_data.append(model_results)
+
     print(len(df_data))
     df = pd.DataFrame(df_data, columns=COLUMN_NAMES)
     return df
@@ -75,7 +84,6 @@ with block:
     The [evaluation suite](https://github.com/sambanova/toolbench/) is now live on GitHub.
     """
     )
-
    with gr.Row():
        with gr.Accordion("Citation", open=False):
            citation_button = gr.Textbox(
@@ -84,9 +92,15 @@ with block:
                elem_id="citation-button",
            ).style(show_copy_button=True)
 
+
+    gr.Markdown(
+        """In the table below, we summarize the 3-shot performance of all the models.
+        We use success rate as the primary evaluation metric for most tasks, except for WebShop, where we report rewards, and VirtualHome, where we report executability and Longest Common Subsequence (LCS), following the original metrics proposed by the respective authors.
+        """
+    )
    with gr.Row():
        data = gr.components.Dataframe(
-            type="pandas", datatype=["markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
+            type="pandas", datatype=["markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
        )
    with gr.Row():
        data_run = gr.Button("Refresh")
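After this commit, each results row doubles as display markup and data: the model name is a markdown link, the nine task scores are "&"-separated LaTeX-style cells, and a "Tuned on ToolBench" flag is spliced into position 1 per block ("False" for untuned, "True" for tuned). A minimal, standalone sketch of that parsing step, using one row copied from the diff:

import pandas as pd

COLUMN_NAMES = ["model", "Tuned on ToolBench", "Open Weather", "The Cat API",
                "Home Search", "Trip Booking", "Google Sheets", "VirtualHome",
                "WebShop Long", "WebShop Short", "Tabletop"]

row = r"[gpt4](https://platform.openai.com/docs/models/gpt-4) & 93.0 & 96.0 & 97.0 & 96.7 & 62.9 & 23.0 / 23.5 & 0.0 & 0.0 & 81.0 \\"

# Strip spaces and the trailing "\\", then split on the LaTeX column separator.
model_results = row.replace(" ", "").strip("\\").split("&")
assert len(model_results) == 10       # one model link plus nine task scores
model_results.insert(1, "False")      # rows from UNTUNED_MODEL_RESULTS get "False"

df = pd.DataFrame([model_results], columns=COLUMN_NAMES)
print(df.loc[0, "Tuned on ToolBench"])  # -> False

Note that COLUMN_NAMES must hold exactly 11 entries to match the 11-element rows; the diff's list relies on an explicit comma between "Tuned on ToolBench" and "Open Weather" for that.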
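The diff never shows how the Refresh button is connected to get_baseline_df. The sketch below is a hedged reconstruction of the usual gradio 3.x Blocks pattern (the era implied by gr.components.Dataframe and .style(show_copy_button=True)); it assumes get_baseline_df from the diff above is in scope, and the click/load handlers are assumptions, not part of this commit:

import gradio as gr

block = gr.Blocks()
with block:
    with gr.Row():
        # Two markdown columns (model link, tuned flag) followed by nine scores.
        data = gr.components.Dataframe(
            type="pandas",
            datatype=["markdown", "markdown"] + ["number"] * 9,
        )
    with gr.Row():
        data_run = gr.Button("Refresh")
    # Assumed handlers: populate the table on page load and on demand.
    data_run.click(get_baseline_df, outputs=data)
    block.load(get_baseline_df, outputs=data)

block.launch()

The "markdown" datatypes let the model links render as clickable anchors in the table, which is why the second "markdown" entry accompanies the new "Tuned on ToolBench" column in this commit.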