jianguozhang
committed on
Commit
•
4b1d5ee
1
Parent(s):
f301eb8
Update README.md
Browse files
README.md
CHANGED
@@ -31,4 +31,130 @@ inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
|
|
31 |
|
32 |
outputs = model.generate(inputs, max_new_tokens=512)
|
33 |
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
34 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
outputs = model.generate(inputs, max_new_tokens=512)
|
33 |
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
34 |
+
```
|
35 |
+
|
36 |
+
|
37 |
+
|
38 |
+
# Benchmarks
|
39 |
+
|
40 |
+
## [BOLAA](https://github.com/salesforce/BOLAA)
|
41 |
+
|
42 |
+
### Webshop
|
43 |
+
|
44 |
+
|
45 |
+
<div class="datagrid" style="width:700px;">
|
46 |
+
<table>
|
47 |
+
<!-- <thead><tr><th></th><th colspan="6"></th></tr></thead> -->
|
48 |
+
<thead><tr><th>LLM Name</th><th>ZS</th><th>ZST</th><th>ReAct</th><th>PlanAct</th><th>PlanReAct</th><th>BOLAA</th></tr></thead>
|
49 |
+
<tbody>
|
50 |
+
<tr><td>Llama-2-70B-chat </td><td>0.0089 </td><td>0.0102</td><td>0.4273</td><td>0.2809</td><td>0.3966</td><td>0.4986</td></tr>
|
51 |
+
<tr><td>Vicuna-33B </td><td>0.1527 </td><td>0.2122</td><td>0.1971</td><td>0.3766</td><td>0.4032</td><td>0.5618</td></tr>
|
52 |
+
<tr><td>Mixtral-8x7B-Instruct-v0.1 </td><td>0.4634 </td><td>0.4592</td><td><u>0.5638</u></td><td>0.4738</td><td>0.3339</td><td>0.5342</td></tr>
|
53 |
+
<tr><td>GPT-3.5-Turbo </td><td>0.4851 </td><td><u>0.5058</u></td><td>0.5047</td><td>0.4930</td><td><u>0.5436</u></td><td><u>0.6354</u></td></tr>
|
54 |
+
<tr><td>GPT-3.5-Turbo-Instruct </td><td>0.3785 </td><td>0.4195</td><td>0.4377</td><td>0.3604</td><td>0.4851</td><td>0.5811</td></tr>
|
55 |
+
<tr><td>GPT-4-0613</td><td><u>0.5002</u></td><td>0.4783 </td><td>0.4616</td><td><strong>0.7950</strong></td><td>0.4635</td><td>0.6129</td></tr>
|
56 |
+
<tr><td>xLAM-v0.1-r</td><td><strong>0.5201</strong></td><td><strong>0.5268</strong></td><td><strong>0.6486</strong></td><td><u>0.6573</u></td><td><strong>0.6611</strong></td><td><strong>0.6556</strong></td></tr>
|
57 |
+
</tbody>
|
58 |
+
</table>
</div>
|
59 |
+
|
60 |
+
### HotpotQA
|
61 |
+
|
62 |
+
<div class="datagrid" style="width:700px;">
|
63 |
+
<table>
|
64 |
+
<!-- <thead><tr><th></th><th colspan="6"></th></tr></thead> -->
|
65 |
+
<thead><tr><th>LLM Name</th><th>ZS</th><th>ZST</th><th>ReAct</th><th>PlanAct</th><th>PlanReAct</th></tr></thead>
|
66 |
+
<tbody>
|
67 |
+
<tr><td>Mixtral-8x7B-Instruct-v0.1 </td><td>0.3912 </td><td>0.3971</td><td>0.3714</td><td>0.3195</td><td>0.3039</td></tr>
|
68 |
+
<tr><td>GPT-3.5-Turbo </td><td>0.4196 </td><td>0.3937</td><td>0.3868</td><td>0.4182</td><td>0.3960</td></tr>
|
69 |
+
<tr><td>GPT-4-0613</td><td><strong>0.5801</strong></td><td><strong>0.5709 </strong></td><td><strong>0.6129</strong></td><td><strong>0.5778</strong></td><td><strong>0.5716</strong></td></tr>
|
70 |
+
<tr><td>xLAM-v0.1-r</td><td><u>0.5492</u></td><td><u>0.4776</u></td><td><u>0.5020</u></td><td><u>0.5583</u></td><td><u>0.5030</u></td></tr>
|
71 |
+
</tbody>
|
72 |
+
</table>
</div>
|
73 |
+
|
74 |
+
## [AgentLite](https://github.com/SalesforceAIResearch/AgentLite/tree/main)
|
75 |
+
|
76 |
+
**Please note:** All prompts provided by AgentLite are considered "unseen prompts" for xLAM-v0.1-r, meaning the model has not been trained with data related to these prompts.
|
77 |
+
|
78 |
+
#### Webshop
|
79 |
+
|
80 |
+
<div class="datagrid" style="width:780px;">
|
81 |
+
<table>
|
82 |
+
<!-- <thead><tr><th></th><th colspan="2">Easy</th><th colspan="2">Medium</th><th colspan="2">Hard</th></tr></thead> -->
|
83 |
+
<thead><tr><th>LLM Name</th><th>Act</th><th>ReAct</th><th>BOLAA</th></tr></thead>
|
84 |
+
<tbody>
|
85 |
+
<tr><td>GPT-3.5-Turbo-16k </td><td>0.6158 </td><td>0.6005</td><td>0.6652</td></tr>
|
86 |
+
<tr><td>GPT-4-0613</td><td><strong>0.6989 </strong></td><td><strong>0.6732</strong></td><td><strong>0.7154</strong></td></tr>
|
87 |
+
<tr><td>xLAM-v0.1-r</td><td><u>0.6563</u></td><td><u>0.6640</u></td><td><u>0.6854</u></td></tr>
|
88 |
+
</tbody>
|
89 |
+
</table>
</div>
|
90 |
+
|
91 |
+
#### HotpotQA
|
92 |
+
|
93 |
+
<div class="datagrid" style="width:700px;">
|
94 |
+
<table>
|
95 |
+
<thead><tr><th></th><th colspan="2">Easy</th><th colspan="2">Medium</th><th colspan="2">Hard</th></tr></thead>
|
96 |
+
<thead><tr><th>LLM Name</th><th>F1 Score</th><th>Accuracy</th><th>F1 Score</th><th>Accuracy</th><th>F1 Score</th><th>Accuracy</th></tr></thead>
|
97 |
+
<tbody>
|
98 |
+
<tr><td>GPT-3.5-Turbo-16k-0613 </td><td>0.410 </td><td>0.350</td><td>0.330</td><td>0.25</td><td>0.283</td><td>0.20</td></tr>
|
99 |
+
<tr><td>GPT-4-0613</td><td><strong>0.611</strong></td><td><strong>0.47</strong> </td><td><strong>0.610</strong></td><td><strong>0.480</strong></td><td><strong>0.527</strong></td><td><strong>0.38</strong></td></tr>
|
100 |
+
<tr><td>xLAM-v0.1-r</td><td><u>0.532</u></td><td><u>0.45</u></td><td><u>0.547</u></td><td><u>0.46</u></td><td><u>0.455</u></td><td><u>0.36</u></td></tr>
|
101 |
+
</tbody>
|
102 |
+
</table>
</div>
|
103 |
+
|
104 |
+
|
105 |
+
|
106 |
+
## ToolBench
|
107 |
+
|
108 |
+
<div class="datagrid" style="width:780px;">
|
109 |
+
<table>
|
110 |
+
<!-- <thead><tr><th></th><th colspan="2">Easy</th><th colspan="2">Medium</th><th colspan="2">Hard</th></tr></thead> -->
|
111 |
+
<thead><tr><th>LLM Name</th><th>Unseen Insts & Same Set</th><th>Unseen Tools & Seen Cat</th><th>Unseen Tools & Unseen Cat</th></tr></thead>
|
112 |
+
<tbody>
|
113 |
+
<tr><td>ToolLLaMA V2 </td><td>0.4385 </td><td>0.4300</td><td>0.4350</td></tr>
|
114 |
+
<tr><td>GPT-3.5-Turbo-0125 </td><td>0.5000 </td><td>0.5150</td><td>0.4900</td></tr>
|
115 |
+
<tr><td>GPT-4-0125-preview</td><td><strong>0.5462</strong></td><td><u>0.5450</u></td><td><u>0.5050</u></td></tr>
|
116 |
+
<tr><td>xLAM-v0.1-r</td><td><u>0.5077</u></td><td><strong>0.5650</strong></td><td><strong>0.5200</strong></td></tr>
|
117 |
+
</tbody>
|
118 |
+
</table>
</div>
|
119 |
+
|
120 |
+
## [MINT-BENCH](https://github.com/xingyaoww/mint-bench)
|
121 |
+
|
122 |
+
|
123 |
+
<div class="datagrid" style="width:780px;">
|
124 |
+
<table>
|
125 |
+
<!-- <thead><tr><th></th><th colspan="2">Easy</th><th colspan="2">Medium</th><th colspan="2">Hard</th></tr></thead> -->
|
126 |
+
<thead><tr><th>LLM Name</th><th>1-step</th><th>2-step</th><th>3-step</th><th>4-step</th><th>5-step</th></tr></thead>
|
127 |
+
<tbody>
|
128 |
+
<tr><td>GPT-4-0613</td><td>-</td><td>-</td><td>-</td><td>-</td><td>69.45</td></tr>
|
129 |
+
<tr><td>Claude-Instant-1</td><td>12.12</td><td>32.25</td><td>39.25</td><td>44.37</td><td>45.90</td></tr>
|
130 |
+
<tr><td>xLAM-v0.1-r</td><td>4.10</td><td>28.50</td><td>36.01</td><td>42.66</td><td>43.96</td></tr>
|
131 |
+
<tr><td>Claude-2 </td><td>26.45 </td><td>35.49</td><td>36.01</td><td>39.76</td><td>39.93</td></tr>
|
132 |
+
<tr><td>Lemur-70b-Chat-v1 </td><td>3.75 </td><td>26.96</td><td>35.67</td><td>37.54</td><td>37.03</td></tr>
|
133 |
+
<tr><td>GPT-3.5-Turbo-0613 </td><td>2.73</td><td>16.89</td><td>24.06</td><td>31.74</td><td>36.18</td></tr>
|
134 |
+
<tr><td>AgentLM-70b </td><td>6.48</td><td>17.75</td><td>24.91</td><td>28.16</td><td>28.67</td></tr>
|
135 |
+
<tr><td>CodeLlama-34b </td><td>0.17</td><td>16.21</td><td>23.04</td><td>25.94</td><td>28.16</td></tr>
|
136 |
+
<tr><td>Llama-2-70b-chat </td><td>4.27</td><td>14.33</td><td>15.70</td><td>16.55</td><td>17.92</td></tr>
|
137 |
+
</tbody>
|
138 |
+
</table>
</div>
|
139 |
+
|
140 |
+
|
141 |
+
## [Tool-Query](https://github.com/hkust-nlp/AgentBoard)
|
142 |
+
|
143 |
+
<div class="datagrid" style="width:780px;">
|
144 |
+
<table>
|
145 |
+
<!-- <thead><tr><th></th><th colspan="2">Easy</th><th colspan="2">Medium</th><th colspan="2">Hard</th></tr></thead> -->
|
146 |
+
<thead><tr><th>LLM Name</th><th>Success Rate</th><th>Progress Rate</th></tr></thead>
|
147 |
+
<tbody>
|
148 |
+
<tr><td>xLAM-v0.1-r</td><td>0.433</td><td>0.677</td></tr>
|
149 |
+
<tr><td>DeepSeek-67B </td><td>0.400 </td><td>0.714</td></tr>
|
150 |
+
<tr><td>GPT-3.5-Turbo-0613 </td><td>0.367 </td><td>0.627</td></tr>
|
151 |
+
<tr><td>GPT-3.5-Turbo-16k </td><td>0.317</td><td>0.591</td></tr>
|
152 |
+
<tr><td>Lemur-70B </td><td>0.283</td><td>0.720</td></tr>
|
153 |
+
<tr><td>CodeLlama-13B </td><td>0.250</td><td>0.525</td></tr>
|
154 |
+
<tr><td>CodeLlama-34B </td><td>0.133</td><td>0.600</td></tr>
|
155 |
+
<tr><td>Mistral-7B </td><td>0.033</td><td>0.510</td></tr>
|
156 |
+
<tr><td>Vicuna-13B-16K </td><td>0.033</td><td>0.343</td></tr>
|
157 |
+
<tr><td>Llama-2-70B </td><td>0.000</td><td>0.483</td></tr>
|
158 |
+
</tbody>
|
159 |
+
</table>
</div>
|
160 |
+
|