jianguozhang commited on
Commit
4b1d5ee
1 Parent(s): f301eb8

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +127 -1
README.md CHANGED
@@ -31,4 +31,130 @@ inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
31
 
32
  outputs = model.generate(inputs, max_new_tokens=512)
33
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))
34
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  outputs = model.generate(inputs, max_new_tokens=512)
33
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))
34
+ ```
35
+
36
+
37
+
38
+ # Benchmarks
39
+
40
+ ## [BOLAA](https://github.com/salesforce/BOLAA)
41
+
42
+ ### Webshop
43
+
44
+
45
+ <div class="datagrid" style="width:700px;">
46
+ <table>
47
+ <!-- <thead><tr><th></th><th colspan="6"></th></tr></thead> -->
48
+ <thead><tr><th>LLM Name</th><th>ZS</th><th>ZST</th><th>ReAct</th><th>PlanAct</th><th>PlanReAct</th><th>BOLAA</th></tr></thead>
49
+ <tbody>
50
+ <tr><td>Llama-2-70B-chat </td><td>0.0089 </td><td>0.0102</td><td>0.4273</td><td>0.2809</td><td>0.3966</td><td>0.4986</td></tr>
51
+ <tr><td>Vicuna-33B </td><td>0.1527 </td><td>0.2122</td><td>0.1971</td><td>0.3766</td><td>0.4032</td><td>0.5618</td></tr>
52
+ <tr><td>Mixtral-8x7B-Instruct-v0.1 </td><td>0.4634 </td><td>0.4592</td><td><u>0.5638</u></td><td>0.4738</td><td>0.3339</td><td>0.5342</td></tr>
53
+ <tr><td>GPT-3.5-Turbo </td><td>0.4851 </td><td><u>0.5058</u></td><td>0.5047</td><td>0.4930</td><td><u>0.5436</u></td><td><u>0.6354</u></td></tr>
54
+ <tr><td>GPT-3.5-Turbo-Instruct </td><td>0.3785 </td><td>0.4195</td><td>0.4377</td><td>0.3604</td><td>0.4851</td><td>0.5811</td></tr>
55
+ <tr><td>GPT-4-0613</td><td><u>0.5002</u></td><td>0.4783 </td><td>0.4616</td><td><strong>0.7950</strong></td><td>0.4635</td><td>0.6129</td></tr>
56
+ <tr><td>xLAM-v0.1-r</td><td><strong>0.5201</strong></td><td><strong>0.5268</strong></td><td><strong>0.6486</strong></td><td><u>0.6573</u></td><td><strong>0.6611</strong></td><td><strong>0.6556</strong></td></tr>
57
+ </tbody>
58
+ </table>
59
+
60
+ ### HotpotQA
61
+
62
+ <div class="datagrid" style="width:700px;">
63
+ <table>
64
+ <!-- <thead><tr><th></th><th colspan="6"></th></tr></thead> -->
65
+ <thead><tr><th>LLM Name</th><th>ZS</th><th>ZST</th><th>ReAct</th><th>PlanAct</th><th>PlanReAct</th></tr></thead>
66
+ <tbody>
67
+ <tr><td>Mixtral-8x7B-Instruct-v0.1 </td><td>0.3912 </td><td>0.3971</td><td>0.3714</td><td>0.3195</td><td>0.3039</td></tr>
68
+ <tr><td>GPT-3.5-Turbo </td><td>0.4196 </td><td>0.3937</td><td>0.3868</td><td>0.4182</td><td>0.3960</td></tr>
69
+ <tr><td>GPT-4-0613</td><td><strong>0.5801</strong></td><td><strong>0.5709 </strong></td><td><strong>0.6129</strong></td><td><strong>0.5778</strong></td><td><strong>0.5716</strong></td></tr>
70
+ <tr><td>xLAM-v0.1-r</td><td><u>0.5492</u></td><td><u>0.4776</u></td><td><u>0.5020</u></td><td><u>0.5583</u></td><td><u>0.5030</u></td></tr>
71
+ </tbody>
72
+ </table>
73
+
74
+ ## [AgentLite](https://github.com/SalesforceAIResearch/AgentLite/tree/main)
75
+
76
+ **Please note:** All prompts provided by AgentLite are considered "unseen prompts" for xLAM-v0.1-r, meaning the model has not been trained with data related to these prompts.
77
+
78
+ #### Webshop
79
+
80
+ <div class="datagrid" style="width:780px;">
81
+ <table>
82
+ <!-- <thead><tr><th></th><th colspan="2">Easy</th><th colspan="2">Medium</th><th colspan="2">Hard</th></tr></thead> -->
83
+ <thead><tr><th>LLM Name</th><th>Act</th><th>ReAct</th><th>BOLAA</th></tr></thead>
84
+ <tbody>
85
+ <tr><td>GPT-3.5-Turbo-16k </td><td>0.6158 </td><td>0.6005</td><td>0.6652</td></tr>
86
+ <tr><td>GPT-4-0613</td><td><strong>0.6989 </strong></td><td><strong>0.6732</strong></td><td><strong>0.7154</strong></td></tr>
87
+ <tr><td>xLAM-v0.1-r</td><td><u>0.6563</u></td><td><u>0.6640</u></td><td><u>0.6854</u></td></tr>
88
+ </tbody>
89
+ </table>
90
+
91
+ #### HotpotQA
92
+
93
+ <div class="datagrid" style="width:700px;">
94
+ <table>
95
+ <thead><tr><th></th><th colspan="2">Easy</th><th colspan="2">Medium</th><th colspan="2">Hard</th></tr></thead>
96
+ <thead><tr><th>LLM Name</th><th>F1 Score</th><th>Accuracy</th><th>F1 Score</th><th>Accuracy</th><th>F1 Score</th><th>Accuracy</th></tr></thead>
97
+ <tbody>
98
+ <tr><td>GPT-3.5-Turbo-16k-0613 </td><td>0.410 </td><td>0.350</td><td>0.330</td><td>0.25</td><td>0.283</td><td>0.20</td></tr>
99
+ <tr><td>GPT-4-0613</td><td><strong>0.611</strong></td><td><strong>0.47</strong> </td><td><strong>0.610</strong></td><td><strong>0.480</strong></td><td><strong>0.527</strong></td><td><strong>0.38</strong></td></tr>
100
+ <tr><td>xLAM-v0.1-r</td><td><u>0.532</u></td><td><u>0.45</u></td><td><u>0.547</u></td><td><u>0.46</u></td><td><u>0.455</u></td><td><u>0.36</u></td></tr>
101
+ </tbody>
102
+ </table>
103
+
104
+
105
+
106
+ ## ToolBench
107
+
108
+ <div class="datagrid" style="width:780px;">
109
+ <table>
110
+ <!-- <thead><tr><th></th><th colspan="2">Easy</th><th colspan="2">Medium</th><th colspan="2">Hard</th></tr></thead> -->
111
+ <thead><tr><th>LLM Name</th><th>Unseen Insts & Same Set</th><th>Unseen Tools & Seen Cat</th><th>Unseen Tools & Unseen Cat</th></tr></thead>
112
+ <tbody>
113
+ <tr><td>ToolLlama V2 </td><td>0.4385 </td><td>0.4300</td><td>0.4350</td></tr>
114
+ <tr><td>GPT-3.5-Turbo-0125 </td><td>0.5000 </td><td>0.5150</td><td>0.4900</td></tr>
115
+ <tr><td>GPT-4-0125-preview</td><td><strong>0.5462</strong></td><td><u>0.5450</u></td><td><u>0.5050</u></td></tr>
116
+ <tr><td>xLAM-v0.1-r</td><td><u>0.5077</u></td><td><strong>0.5650</strong></td><td><strong>0.5200</strong></td></tr>
117
+ </tbody>
118
+ </table>
119
+
120
+ ## [MINT-BENCH](https://github.com/xingyaoww/mint-bench)
121
+
122
+
123
+ <div class="datagrid" style="width:780px;">
124
+ <table>
125
+ <!-- <thead><tr><th></th><th colspan="2">Easy</th><th colspan="2">Medium</th><th colspan="2">Hard</th></tr></thead> -->
126
+ <thead><tr><th>LLM Name</th><th>1-step</th><th>2-step</th><th>3-step</th><th>4-step</th><th>5-step</th></tr></thead>
127
+ <tbody>
128
+ <tr><td>GPT-4-0613</td><td>-</td><td>-</td><td>-</td><td>-</td><td>69.45</td></tr>
129
+ <tr><td>Claude-Instant-1</td><td>12.12</td><td>32.25</td><td>39.25</td><td>44.37</td><td>45.90</td></tr>
130
+ <tr><td>xLAM-v0.1-r</td><td>4.10</td><td>28.50</td><td>36.01</td><td>42.66</td><td>43.96</td></tr>
131
+ <tr><td>Claude-2 </td><td>26.45 </td><td>35.49</td><td>36.01</td><td>39.76</td><td>39.93</td></tr>
132
+ <tr><td>Lemur-70b-Chat-v1 </td><td>3.75 </td><td>26.96</td><td>35.67</td><td>37.54</td><td>37.03</td></tr>
133
+ <tr><td>GPT-3.5-Turbo-0613 </td><td>2.73</td><td>16.89</td><td>24.06</td><td>31.74</td><td>36.18</td></tr>
134
+ <tr><td>AgentLM-70b </td><td>6.48</td><td>17.75</td><td>24.91</td><td>28.16</td><td>28.67</td></tr>
135
+ <tr><td>CodeLlama-34b </td><td>0.17</td><td>16.21</td><td>23.04</td><td>25.94</td><td>28.16</td></tr>
136
+ <tr><td>Llama-2-70b-chat </td><td>4.27</td><td>14.33</td><td>15.70</td><td>16.55</td><td>17.92</td></tr>
137
+ </tbody>
138
+ </table>
139
+
140
+
141
+ ## [Tool-Query](https://github.com/hkust-nlp/AgentBoard)
142
+
143
+ <div class="datagrid" style="width:780px;">
144
+ <table>
145
+ <!-- <thead><tr><th></th><th colspan="2">Easy</th><th colspan="2">Medium</th><th colspan="2">Hard</th></tr></thead> -->
146
+ <thead><tr><th>LLM Name</th><th>Success Rate</th><th>Progress Rate</th></tr></thead>
147
+ <tbody>
148
+ <tr><td>xLAM-v0.1-r</td><td>0.433</td><td>0.677</td></tr>
149
+ <tr><td>DeepSeek-67B </td><td>0.400 </td><td>0.714</td></tr>
150
+ <tr><td>GPT-3.5-Turbo-0613 </td><td>0.367 </td><td>0.627</td></tr>
151
+ <tr><td>GPT-3.5-Turbo-16k </td><td>0.317</td><td>0.591</td></tr>
152
+ <tr><td>Lemur-70B </td><td>0.283</td><td>0.720</td></tr>
153
+ <tr><td>CodeLlama-13B </td><td>0.250</td><td>0.525</td></tr>
154
+ <tr><td>CodeLlama-34B </td><td>0.133</td><td>0.600</td></tr>
155
+ <tr><td>Mistral-7B </td><td>0.033</td><td>0.510</td></tr>
156
+ <tr><td>Vicuna-13B-16K </td><td>0.033</td><td>0.343</td></tr>
157
+ <tr><td>Llama-2-70B </td><td>0.000</td><td>0.483</td></tr>
158
+ </tbody>
159
+ </table>
160
+