qiantong-xu committed
Commit ebdc5a0
1 Parent(s): 0f7c127

Update app.py

Files changed (1)
  1. app.py +21 -7
app.py CHANGED
@@ -4,8 +4,8 @@ __all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissi
 import gradio as gr
 import pandas as pd
 
-COLUMN_NAMES = ["model", "Open Weather", "The Cat API", "Home Search", "Trip Booking", "Google Sheets", "VirtualHome", "WebShop Long", "WebShop Short", "Tabletop"]
-BENCHMARK_RESULTS = '''[gpt4](https://platform.openai.com/docs/models/gpt-4) & 93.0 & 96.0 & 97.0 & 96.7 & 62.9 & 23.0 / 23.5 & 0.0 & 0.0 & 81.0 \\
+COLUMN_NAMES = ["model", "Tuned on ToolBench", "Open Weather", "The Cat API", "Home Search", "Trip Booking", "Google Sheets", "VirtualHome", "WebShop Long", "WebShop Short", "Tabletop"]
+UNTUNED_MODEL_RESULTS = '''[gpt4](https://platform.openai.com/docs/models/gpt-4) & 93.0 & 96.0 & 97.0 & 96.7 & 62.9 & 23.0 / 23.5 & 0.0 & 0.0 & 81.0 \\
 [text-davinci-003](https://platform.openai.com/docs/models/gpt-3) & 99.0 & 98.0 & 97.0 & 89.2 & 62.9 & 31.0 / 25.1 & 0.0 & 0.0 & 66.7 \\
 [gpt-3.5-turbo](https://platform.openai.com/docs/models/gpt-3-5) & 90.0 & 92.0 & 80.0 & 85.8 & 51.4 & 20.0 / 18.9 & 0.0 & 1.8 & 33.3 \\
 [text-curie-001](https://platform.openai.com/docs/models/gpt-3) & 8.0 & 58.0 & 6.0 & 6.7 & 1.4 & 12.0 / 4.1 & 0.0 & 0.0 & 1.0 \\
@@ -33,19 +33,28 @@ BENCHMARK_RESULTS = '''[gpt4](https://platform.openai.com/docs/models/gpt-4)
 [stablelm-base-alpha-7b](https://huggingface.co/stabilityai/stablelm-base-alpha-7b) & 22.0 & 47.0 & 0.0 & 0.0 & 4.3 & 28.0 / 10.3 & 0.0 & 0.0 & 2.9 \\
 [stablelm-tuned-alpha-7b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b) & 23.0 & 38.0 & 0.0 & 0.0 & 1.4 & 26.0 / 7.3 & 0.0 & 0.0 & 3.8 \\
 [stablelm-base-alpha-3b](https://huggingface.co/stabilityai/stablelm-base-alpha-3b) & 6.0 & 28.0 & 0.0 & 0.0 & 1.4 & 29.0 / 5.3 & 0.0 & 0.0 & 1.0 \\
-[stablelm-tuned-alpha-3b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b) & 14.0 & 31.0 & 0.0 & 0.8 & 0.0 & 8.0 / 5.6 & 0.0 & 0.0 & 1.0 \\
-[llama-30b-toolbench](https://huggingface.co/sambanovasystems/LLaMA-30b-toolbench) & 100.0 & 94.0 & 87.0 & 85.8 & 2.9 & 16.0 / 24.3 & 0.0 & 0.0 & 7.5 \\
+[stablelm-tuned-alpha-3b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b) & 14.0 & 31.0 & 0.0 & 0.8 & 0.0 & 8.0 / 5.6 & 0.0 & 0.0 & 1.0 \\'''
+TUNED_MODEL_RESULTS = '''[llama-30b-toolbench](https://huggingface.co/sambanovasystems/LLaMA-30b-toolbench) & 100.0 & 94.0 & 87.0 & 85.8 & 2.9 & 16.0 / 24.3 & 0.0 & 0.0 & 7.5 \\
 [starcoder-toolbench](https://huggingface.co/sambanovasystems/starcoder-toolbench) & 99.0 & 97.0 & 83.0 & 80.8 & 21.2 & 31.0 / 18.4 & 0.0 & 0.0 & 13.9 \\
 [codegen-16B-mono-toolbench](https://huggingface.co/sambanovasystems/codegen-16B-mono-toolbench) & 97.7 & 99.0 & 82.0 & 77.5 & 19.8 & 29.0 / 17.2 & 0.0 & 3.5 & 16.2 \\'''
 
 
 def get_baseline_df():
-    lines = BENCHMARK_RESULTS.split("\n")
     df_data = []
+
+    lines = UNTUNED_MODEL_RESULTS.split("\n")
+    for line in lines:
+        model_results = line.replace(" ", "").strip("\\").split("&")
+        assert len(model_results) == 10
+        model_results.insert(1, "False")
+        df_data.append(model_results)
+    lines = TUNED_MODEL_RESULTS.split("\n")
     for line in lines:
         model_results = line.replace(" ", "").strip("\\").split("&")
         assert len(model_results) == 10
+        model_results.insert(1, "True")
         df_data.append(model_results)
+
     print(len(df_data))
     df = pd.DataFrame(df_data, columns=COLUMN_NAMES)
     return df
@@ -75,7 +84,6 @@ with block:
     The [evaluation suite](https://github.com/sambanova/toolbench/) is now live on GitHub.
     """
     )
-
    with gr.Row():
        with gr.Accordion("Citation", open=False):
            citation_button = gr.Textbox(
@@ -84,9 +92,15 @@ with block:
                elem_id="citation-button",
            ).style(show_copy_button=True)
 
+
+    gr.Markdown(
+        """In the table below, we summarize the 3-shot performance of all the models.
+        We use success rate as the primary evaluation metric for most tasks, except for WebShop, where we report rewards, and VirtualHome, where we use executability and Longest Common Subsequence (LCS), following the original metrics proposed by the respective authors.
+        """
+    )
    with gr.Row():
        data = gr.components.Dataframe(
-            type="pandas", datatype=["markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
+            type="pandas", datatype=["markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
        )
    with gr.Row():
        data_run = gr.Button("Refresh")
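
The refactored loader parses both tables with the same transform, asserting the 10-field layout (model name plus nine task scores) before inserting the tuned/untuned flag as a second column. Below is a minimal, self-contained sketch of that parsing path on one row from each table; the `parse_row` helper and the two sample-row constants are illustrative scaffolding, not names from app.py.

```python
import pandas as pd

# One sample row from each results table in app.py: LaTeX-style rows,
# '&'-delimited, terminated by a backslash pair.
UNTUNED_ROW = r"[gpt4](https://platform.openai.com/docs/models/gpt-4) & 93.0 & 96.0 & 97.0 & 96.7 & 62.9 & 23.0 / 23.5 & 0.0 & 0.0 & 81.0 \\"
TUNED_ROW = r"[llama-30b-toolbench](https://huggingface.co/sambanovasystems/LLaMA-30b-toolbench) & 100.0 & 94.0 & 87.0 & 85.8 & 2.9 & 16.0 / 24.3 & 0.0 & 0.0 & 7.5 \\"

COLUMN_NAMES = ["model", "Tuned on ToolBench", "Open Weather", "The Cat API",
                "Home Search", "Trip Booking", "Google Sheets", "VirtualHome",
                "WebShop Long", "WebShop Short", "Tabletop"]

def parse_row(line, tuned):
    # Same transform as get_baseline_df: drop spaces, strip the trailing
    # backslashes, split on '&' -> 10 fields (model + 9 scores).
    fields = line.replace(" ", "").strip("\\").split("&")
    assert len(fields) == 10
    fields.insert(1, str(tuned))  # the commit's new "Tuned on ToolBench" flag
    return fields

df = pd.DataFrame([parse_row(UNTUNED_ROW, False), parse_row(TUNED_ROW, True)],
                  columns=COLUMN_NAMES)
print(df[["model", "Tuned on ToolBench", "Open Weather"]])
```

The inserted flag makes eleven columns in total, which is why COLUMN_NAMES and the Dataframe's `datatype` list each grow by one entry; the flag column renders as text, hence the second "markdown" in `datatype`.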
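
The final hunk ends at the Refresh button's construction; its callback is outside this diff. A conventional Gradio 3.x wiring for this pattern would look like the following sketch (hypothetical, not shown in the commit):

```python
# Hypothetical wiring (not part of this diff): clicking Refresh re-runs
# the loader and renders the returned DataFrame into the table component.
data_run.click(get_baseline_df, outputs=data)
```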