Commit a4bff98 by vlff李飞飞 (1 parent: 522b6de): "优化" (Optimization)

Files changed:
- .gitattributes +1 -0
- .gitignore +0 -6
- README.md +0 -94
- README_CN.md +1 -93
- benchmark/README.md +0 -248
- benchmark/code_interpreter.py +0 -250
- benchmark/config.py +0 -66
- benchmark/inference_and_execute.py +0 -280
- benchmark/metrics/__init__.py +0 -0
- benchmark/metrics/code_execution.py +0 -257
- benchmark/metrics/gsm8k.py +0 -54
- benchmark/metrics/visualization.py +0 -179
- benchmark/models/__init__.py +0 -4
- benchmark/models/base.py +0 -17
- benchmark/models/dashscope.py +0 -40
- benchmark/models/llm.py +0 -26
- benchmark/models/qwen.py +0 -36
- benchmark/parser/__init__.py +0 -2
- benchmark/parser/internlm_parser.py +0 -11
- benchmark/parser/react_parser.py +0 -46
- benchmark/prompt/__init__.py +0 -4
- benchmark/prompt/internlm_react.py +0 -103
- benchmark/prompt/llama_react.py +0 -20
- benchmark/prompt/qwen_react.py +0 -80
- benchmark/prompt/react.py +0 -87
- benchmark/requirements.txt +0 -13
- benchmark/utils/__init__.py +0 -0
- benchmark/utils/code_utils.py +0 -31
- benchmark/utils/data_utils.py +0 -28
- setup.py +0 -16
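
This commit mostly removes the benchmark suite; every deleted file remains readable from the parent commit listed above. A minimal sketch, assuming a local clone of the Space repository:

```shell
# Print a deleted file as it existed in the parent commit 522b6de:
git show 522b6de:benchmark/README.md
# Or restore the entire deleted directory into the working tree:
git checkout 522b6de -- benchmark/
```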
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.ttf filter=lfs diff=lfs merge=lfs -text
+*.db filter=lfs diff=lfs merge=lfs -text
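
For reference, rules like the added `*.db` line are typically generated and verified with the git-lfs CLI. A minimal sketch, assuming git-lfs is installed (the `app.db` filename is only an example):

```shell
git lfs track "*.db"   # appends the attribute line added in this commit
git check-attr filter diff merge -- app.db   # expect: filter, diff, and merge all report "lfs"
```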
.gitignore CHANGED
@@ -9,12 +9,6 @@ __pycache__
 qwen_agent/llm/gpt.py
 qwen_agent/llm/tools.py
 #workspace/*
-
-benchmark/log/*
-benchmark/output_data/*
-benchmark/upload_file/*
-benchmark/upload_file_clean/*
-benchmark/eval_data/
 Qwen-Agent

 docqa/*
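
With the benchmark entries removed from `.gitignore`, those paths are no longer ignored, which matches the deletion of the benchmark directory elsewhere in this commit. A quick check, assuming a local clone (the log filename is hypothetical):

```shell
# Prints nothing and exits non-zero once the pattern is gone:
git check-ignore -v benchmark/log/run.log || echo "no longer ignored"
```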
README.md CHANGED
@@ -156,97 +156,3 @@ You can watch the following showcase videos to learn about the basic operations
 - Long-form writing based on visited webpages and PDFs [video](https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/showcase_write_article_based_on_webpages_and_pdfs.mp4)
 - Drawing a plot using code interpreter based on the given information [video](https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/showcase_chat_with_docs_and_code_interpreter.mp4)
 - Uploading files, multi-turn conversation, and data analysis using code interpreter [video](https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/showcase_code_interpreter_multi_turn_chat.mp4)
-
-# Evaluation Benchmark
-
-We have also open-sourced a benchmark for evaluating the performance of a model in writing Python code and using Code Interpreter for mathematical problem solving, data analysis, and other general tasks. The benchmark can be found in the [benchmark](benchmark/README.md) directory. The current evaluation results are as follows:
-
-<table>
-<tr><th colspan="5" align="center">In-house Code Interpreter Benchmark (Version 20231206)</th></tr>
-<tr><th rowspan="2" align="center">Model</th><th colspan="3" align="center">Accuracy of Code Execution Results (%)</th><th colspan="1" align="center">Executable Rate of Code (%)</th></tr>
-<tr><th align="center">Math↑</th><th align="center">Visualization-Hard↑</th><th align="center">Visualization-Easy↑</th><th align="center">General↑</th></tr>
-<tr><td>GPT-4</td><td align="center">82.8</td><td align="center">66.7</td><td align="center">60.8</td><td align="center">82.8</td></tr>
-<tr><td>GPT-3.5</td><td align="center">47.3</td><td align="center">33.3</td><td align="center">55.7</td><td align="center">74.1</td></tr>
-<tr><td>LLaMA2-13B-Chat</td><td align="center">8.3</td><td align="center">1.2</td><td align="center">15.2</td><td align="center">48.3</td></tr>
-<tr><td>CodeLLaMA-13B-Instruct</td><td align="center">28.2</td><td align="center">15.5</td><td align="center">21.5</td><td align="center">74.1</td></tr>
-<tr><td>InternLM-20B-Chat</td><td align="center">34.6</td><td align="center">10.7</td><td align="center">24.1</td><td align="center">65.5</td></tr>
-<tr><td>ChatGLM3-6B</td><td align="center">54.2</td><td align="center">4.8</td><td align="center">15.2</td><td align="center">62.1</td></tr>
-<tr><td>Qwen-1.8B-Chat</td><td align="center">25.6</td><td align="center">21.4</td><td align="center">22.8</td><td align="center">65.5</td></tr>
-<tr><td>Qwen-7B-Chat</td><td align="center">41.9</td><td align="center">23.8</td><td align="center">38.0</td><td align="center">67.2</td></tr>
-<tr><td>Qwen-14B-Chat</td><td align="center">58.4</td><td align="center">31.0</td><td align="center">45.6</td><td align="center">65.5</td></tr>
-<tr><td>Qwen-72B-Chat</td><td align="center">72.7</td><td align="center">41.7</td><td align="center">43.0</td><td align="center">82.8</td></tr>
-</table>
-
-# Disclaimer
-
-This project is not intended to be an official product, rather it serves as a proof-of-concept project that highlights the capabilities of the Qwen series models.
-
-> Important: The code interpreter is not sandboxed, and it executes code in your own environment. Please do not ask Qwen to perform dangerous tasks, and do not directly use the code interpreter for production purposes.
README_CN.md CHANGED
@@ -157,96 +157,4 @@ python run_server.py --model_server http://{MODEL_SERVER_IP}:7905/v1 --workstation_port 7864
 - 提取浏览内容使用代码解释器画图 [video](https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/showcase_chat_with_docs_and_code_interpreter.mp4)
 - 上传文件、多轮对话利用代码解释器分析数据 [video](https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/showcase_code_interpreter_multi_turn_chat.mp4)

-
-
-我们也开源了一个评测基准,用于评估一个模型写Python代码并使用Code Interpreter进行数学解题、数据分析、及其他通用任务时的表现。评测基准见 [benchmark](benchmark/README.md) 目录,当前的评测结果如下:
-
-<table>
-<tr><th colspan="5" align="center">In-house Code Interpreter Benchmark (Version 20231206)</th></tr>
-<tr><th rowspan="2" align="center">Model</th><th colspan="3" align="center">代码执行结果正确性 (%)</th><th colspan="1" align="center">生成代码的可执行率 (%)</th></tr>
-<tr><th align="center">Math↑</th><th align="center">Visualization-Hard↑</th><th align="center">Visualization-Easy↑</th><th align="center">General↑</th></tr>
-<tr><td>GPT-4</td><td align="center">82.8</td><td align="center">66.7</td><td align="center">60.8</td><td align="center">82.8</td></tr>
-<tr><td>GPT-3.5</td><td align="center">47.3</td><td align="center">33.3</td><td align="center">55.7</td><td align="center">74.1</td></tr>
-<tr><td>LLaMA2-13B-Chat</td><td align="center">8.3</td><td align="center">1.2</td><td align="center">15.2</td><td align="center">48.3</td></tr>
-<tr><td>CodeLLaMA-13B-Instruct</td><td align="center">28.2</td><td align="center">15.5</td><td align="center">21.5</td><td align="center">74.1</td></tr>
-<tr><td>InternLM-20B-Chat</td><td align="center">34.6</td><td align="center">10.7</td><td align="center">24.1</td><td align="center">65.5</td></tr>
-<tr><td>ChatGLM3-6B</td><td align="center">54.2</td><td align="center">4.8</td><td align="center">15.2</td><td align="center">62.1</td></tr>
-<tr><td>Qwen-1.8B-Chat</td><td align="center">25.6</td><td align="center">21.4</td><td align="center">22.8</td><td align="center">65.5</td></tr>
-<tr><td>Qwen-7B-Chat</td><td align="center">41.9</td><td align="center">23.8</td><td align="center">38.0</td><td align="center">67.2</td></tr>
-<tr><td>Qwen-14B-Chat</td><td align="center">58.4</td><td align="center">31.0</td><td align="center">45.6</td><td align="center">65.5</td></tr>
-<tr><td>Qwen-72B-Chat</td><td align="center">72.7</td><td align="center">41.7</td><td align="center">43.0</td><td align="center">82.8</td></tr>
-</table>
-
-# 免责声明
-
-本项目并非正式产品,而是一个概念验证项目,用于演示Qwen系列模型的能力。
-
-> 重要提示:代码解释器未进行沙盒隔离,会在部署环境中执行代码。请避免向Qwen发出危险指令,切勿将该代码解释器直接用于生产目的。
+
benchmark/README.md DELETED
@@ -1,248 +0,0 @@
# Code Interpreter Benchmark

## Introduction
To assess an LLM's ability to use the Python Code Interpreter for tasks such as mathematical problem solving, data visualization, and other general-purpose tasks such as file handling and web scraping, we have created and open-sourced a benchmark specifically designed for evaluating these capabilities.

### Metrics
The metrics are divided into two parts: code executability and code correctness.
- Code executability: evaluating whether the LLM-generated code can be executed.
- Code correctness: evaluating whether the LLM-generated code runs correctly.

### Domain
When evaluating the accuracy of the code execution results for code correctness, we further divide it into two specific domains: `Math` and `Visualization`.
In terms of code executability, we calculate the executable rate of the generated code for `General problem-solving`.

## Results
- Qwen-7B-Chat refers to the version updated after September 25, 2023.
- The code correctness judger model for `Visualization` changed from `Qwen-vl-chat` to `gpt-4-vision-preview` in version 20231206.

<table>
<tr><th colspan="5" align="center">In-house Code Interpreter Benchmark (Version 20231206)</th></tr>
<tr><th rowspan="2" align="center">Model</th><th colspan="3" align="center">Accuracy of Code Execution Results (%)</th><th colspan="1" align="center">Executable Rate of Code (%)</th></tr>
<tr><th align="center">Math↑</th><th align="center">Visualization-Hard↑</th><th align="center">Visualization-Easy↑</th><th align="center">General↑</th></tr>
<tr><td>GPT-4</td><td align="center">82.8</td><td align="center">66.7</td><td align="center">60.8</td><td align="center">82.8</td></tr>
<tr><td>GPT-3.5</td><td align="center">47.3</td><td align="center">33.3</td><td align="center">55.7</td><td align="center">74.1</td></tr>
<tr><td>LLaMA2-13B-Chat</td><td align="center">8.3</td><td align="center">1.2</td><td align="center">15.2</td><td align="center">48.3</td></tr>
<tr><td>CodeLLaMA-13B-Instruct</td><td align="center">28.2</td><td align="center">15.5</td><td align="center">21.5</td><td align="center">74.1</td></tr>
<tr><td>InternLM-20B-Chat</td><td align="center">34.6</td><td align="center">10.7</td><td align="center">24.1</td><td align="center">65.5</td></tr>
<tr><td>ChatGLM3-6B</td><td align="center">54.2</td><td align="center">4.8</td><td align="center">15.2</td><td align="center">62.1</td></tr>
<tr><td>Qwen-1.8B-Chat</td><td align="center">25.6</td><td align="center">21.4</td><td align="center">22.8</td><td align="center">65.5</td></tr>
<tr><td>Qwen-7B-Chat</td><td align="center">41.9</td><td align="center">23.8</td><td align="center">38.0</td><td align="center">67.2</td></tr>
<tr><td>Qwen-14B-Chat</td><td align="center">58.4</td><td align="center">31.0</td><td align="center">45.6</td><td align="center">65.5</td></tr>
<tr><td>Qwen-72B-Chat</td><td align="center">72.7</td><td align="center">41.7</td><td align="center">43.0</td><td align="center">82.8</td></tr>
</table>

Furthermore, we also provide the results of `Qwen-vl-plus` as the code correctness judger model for the `Visualization` task, to serve as a reference.

<table>
<tr><th colspan="3" align="center">Code Correctness Judger Model = Qwen-vl-plus</th></tr>
<tr><th rowspan="2" align="center">Model</th><th colspan="2" align="center">Accuracy of Code Execution Results (%)</th></tr>
<tr><th align="center">Visualization-Hard↑</th><th align="center">Visualization-Easy↑</th></tr>
<tr><td>LLaMA2-13B-Chat</td><td align="center">2.4</td><td align="center">17.7</td></tr>
<tr><td>CodeLLaMA-13B-Instruct</td><td align="center">17.9</td><td align="center">34.2</td></tr>
<tr><td>InternLM-20B-Chat</td><td align="center">9.5</td><td align="center">31.7</td></tr>
<tr><td>ChatGLM3-6B</td><td align="center">10.7</td><td align="center">29.1</td></tr>
<tr><td>Qwen-1.8B-Chat</td><td align="center">32.1</td><td align="center">32.9</td></tr>
<tr><td>Qwen-7B-Chat</td><td align="center">26.2</td><td align="center">39.2</td></tr>
<tr><td>Qwen-14B-Chat</td><td align="center">36.9</td><td align="center">41.8</td></tr>
<tr><td>Qwen-72B-Chat</td><td align="center">38.1</td><td align="center">38.0</td></tr>
</table>

## Usage

### Installation

```shell
git clone https://github.com/QwenLM/Qwen-Agent.git
cd Qwen-Agent/benchmark
pip install -r requirements.txt
```

### Dataset Download
```shell
cd benchmark
wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/benchmark_code_interpreter_data.zip
unzip benchmark_code_interpreter_data.zip
mkdir eval_data
mv eval_code_interpreter_v1.jsonl eval_data/
```

### Evaluation
To reproduce the comprehensive results of the benchmark, you can run the following script:

```shell
python inference_and_execute.py --model {model_name}
```

{model_name}:
- qwen-1.8b-chat
- qwen-7b-chat
- qwen-14b-chat
- qwen-72b-chat
- llama-2-7b-chat
- llama-2-13b-chat
- codellama-7b-instruct
- codellama-13b-instruct
- internlm-7b-chat-1.1
- internlm-20b-chat

The benchmark will run the test cases and generate the performance results. The results will be saved in the `output_data` directory.

**Notes**:
Please install the `simhei.ttf` font for proper display in matplotlib when evaluating the visualization task. You can do this by preparing `simhei.ttf` (which can be found on any Windows PC) and then running the following code snippet:
```python
import os
import matplotlib

target_font_path = os.path.join(
    os.path.abspath(
        os.path.join(matplotlib.matplotlib_fname(), os.path.pardir)),
    'fonts', 'ttf', 'simhei.ttf')
os.system(f'cp simhei.ttf {target_font_path}')
font_list_cache = os.path.join(matplotlib.get_cachedir(), 'fontlist-*.json')
os.system(f'rm -f {font_list_cache}')
```

#### Code Executable Rate
```shell
python inference_and_execute.py --task {task_name} --model {model_name}
```

{task_name}:
- `general`: General problem-solving task

#### Code Correctness Rate
```shell
python inference_and_execute.py --task {task_name} --model {model_name}
```

{task_name}:
- `visualization`: Visualization task
- `gsm8k`: Math task

## Configuration
The inference_and_execute.py file contains the following configurable options:

- `--model`: The model to test, one of `qwen-72b-chat`, `qwen-14b-chat`, `qwen-7b-chat`, `qwen-1.8b-chat`, `llama-2-7b-chat`, `llama-2-13b-chat`, `codellama-7b-instruct`, `codellama-13b-instruct`, `internlm-7b-chat-1.1`, `internlm-20b-chat`.
- `--task`: The test task, one of `all`, `visualization`, `general`, `gsm8k`.
- `--output-path`: The path for saving the evaluation results.
- `--input-path`: The path for placing the evaluation data.
- `--output-fname`: The file name for the evaluation results.
- `--input-fname`: The file name for the evaluation data.
- `--force`: Force generation, overwriting any cached results.
- `--eval-only`: Only calculate evaluation metrics without re-running inference.
- `--eval-code-exec-only`: Only evaluate the code executable rate.
- `--gen-exec-only`: Only generate and execute code, without calculating evaluation metrics.
- `--gen-only`: Only generate, without executing code or calculating evaluation metrics.
- `--vis-judger`: The model that judges result correctness for the `Visualization` task, one of `gpt-4-vision-preview`, `qwen-vl-chat`, `qwen-vl-plus`. It is set to `gpt-4-vision-preview` by default in version 20231206; `Qwen-vl-chat` has been deprecated.
benchmark/code_interpreter.py DELETED
@@ -1,250 +0,0 @@
import base64
import io
import json
import logging
import os
import queue
import re
import subprocess
import sys
import time
import traceback
import uuid

import matplotlib
import PIL.Image
from jupyter_client import BlockingKernelClient
from utils.code_utils import extract_code

WORK_DIR = os.getenv('CODE_INTERPRETER_WORK_DIR', '/tmp/workspace')

LAUNCH_KERNEL_PY = """
from ipykernel import kernelapp as app
app.launch_new_instance()
"""

_KERNEL_CLIENTS = {}


# Run this fix before jupyter starts if matplotlib cannot render CJK fonts.
# And we need to additionally run the following lines in the jupyter notebook.
# ```python
# import matplotlib.pyplot as plt
# plt.rcParams['font.sans-serif'] = ['SimHei']
# plt.rcParams['axes.unicode_minus'] = False
# ```
def fix_matplotlib_cjk_font_issue():
    local_ttf = os.path.join(
        os.path.abspath(
            os.path.join(matplotlib.matplotlib_fname(), os.path.pardir)),
        'fonts', 'ttf', 'simhei.ttf')
    if not os.path.exists(local_ttf):
        logging.warning(
            f'Missing font file `{local_ttf}` for matplotlib. It may cause some error when using matplotlib.'
        )


def start_kernel(pid):
    fix_matplotlib_cjk_font_issue()

    connection_file = os.path.join(WORK_DIR,
                                   f'kernel_connection_file_{pid}.json')
    launch_kernel_script = os.path.join(WORK_DIR, f'launch_kernel_{pid}.py')
    for f in [connection_file, launch_kernel_script]:
        if os.path.exists(f):
            logging.warning(f'{f} already exists')
            os.remove(f)

    os.makedirs(WORK_DIR, exist_ok=True)

    with open(launch_kernel_script, 'w') as fout:
        fout.write(LAUNCH_KERNEL_PY)

    kernel_process = subprocess.Popen([
        sys.executable,
        launch_kernel_script,
        '--IPKernelApp.connection_file',
        connection_file,
        '--matplotlib=inline',
        '--quiet',
    ],
                                      cwd=WORK_DIR)
    logging.info(f"INFO: kernel process's PID = {kernel_process.pid}")

    # Wait for kernel connection file to be written
    while True:
        if not os.path.isfile(connection_file):
            time.sleep(0.1)
        else:
            # Keep looping if JSON parsing fails, file may be partially written
            try:
                with open(connection_file, 'r') as fp:
                    json.load(fp)
                break
            except json.JSONDecodeError:
                pass

    # Client
    kc = BlockingKernelClient(connection_file=connection_file)
    kc.load_connection_file()
    kc.start_channels()
    kc.wait_for_ready()
    return kc


def escape_ansi(line):
    ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]')
    return ansi_escape.sub('', line)


def publish_image_to_local(image_base64: str):
    image_file = str(uuid.uuid4()) + '.png'
    local_image_file = os.path.join(WORK_DIR, image_file)

    png_bytes = base64.b64decode(image_base64)
    assert isinstance(png_bytes, bytes)
    bytes_io = io.BytesIO(png_bytes)
    PIL.Image.open(bytes_io).save(local_image_file, 'png')

    return local_image_file


START_CODE = """
import signal
def _m6_code_interpreter_timeout_handler(signum, frame):
    raise TimeoutError("M6_CODE_INTERPRETER_TIMEOUT")
signal.signal(signal.SIGALRM, _m6_code_interpreter_timeout_handler)

def input(*args, **kwargs):
    raise NotImplementedError('Python input() function is disabled.')

import os
if 'upload_file' not in os.getcwd():
    os.chdir("./upload_file/")

import math
import re
import json

import seaborn as sns
sns.set_theme()

import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

import numpy as np
import pandas as pd

from sympy import Eq, symbols, solve
"""


def code_interpreter(action_input_list: list, timeout=30, clear=False):
    code = ''
    for action_input in action_input_list:
        code += (extract_code(action_input) + '\n')
    fixed_code = []
    for line in code.split('\n'):
        fixed_code.append(line)
        if line.startswith('sns.set_theme('):
            fixed_code.append('plt.rcParams["font.sans-serif"] = ["SimHei"]')
            fixed_code.append('plt.rcParams["axes.unicode_minus"] = False')
    fixed_code = '\n'.join(fixed_code)
    if 'def solution()' in fixed_code:
        fixed_code += '\nsolution()'

    return _code_interpreter(fixed_code, timeout, clear)


def _code_interpreter(code: str, timeout, clear=False):
    if not code.strip():
        return ''
    if timeout:
        code = f'signal.alarm({timeout})\n{code}'
    if clear:
        code = "get_ipython().run_line_magic('reset', '-f')\n" + START_CODE + code

    pid = os.getpid()
    if pid not in _KERNEL_CLIENTS:
        _KERNEL_CLIENTS[pid] = start_kernel(pid)
        _code_interpreter(START_CODE, timeout=None)
    kc = _KERNEL_CLIENTS[pid]
    kc.wait_for_ready()
    kc.execute(code)
    result = ''
    image_idx = 0
    while True:
        text = ''
        image = ''
        finished = False
        msg_type = 'error'
        try:
            msg = kc.get_iopub_msg()
            msg_type = msg['msg_type']
            if msg_type == 'status':
                if msg['content'].get('execution_state') == 'idle':
                    finished = True
            elif msg_type == 'execute_result':
                text = msg['content']['data'].get('text/plain', '')
                if 'image/png' in msg['content']['data']:
                    image_b64 = msg['content']['data']['image/png']
                    image_url = publish_image_to_local(image_b64)
                    image_idx += 1
                    image = '![fig-%03d](%s)' % (image_idx, image_url)
            elif msg_type == 'display_data':
                if 'image/png' in msg['content']['data']:
                    image_b64 = msg['content']['data']['image/png']
                    image_url = publish_image_to_local(image_b64)
                    image_idx += 1
                    image = '![fig-%03d](%s)' % (image_idx, image_url)
                else:
                    text = msg['content']['data'].get('text/plain', '')
            elif msg_type == 'stream':
                msg_type = msg['content']['name']  # stdout, stderr
                text = msg['content']['text']
            elif msg_type == 'error':
                text = escape_ansi('\n'.join(msg['content']['traceback']))
                if 'M6_CODE_INTERPRETER_TIMEOUT' in text:
                    text = f'Timeout. No response after {timeout} seconds.'
        except queue.Empty:
            text = f'Timeout. No response after {timeout} seconds.'
            finished = True
        except Exception:
            text = 'The code interpreter encountered an unexpected error.'
            logging.warning(''.join(
                traceback.format_exception(*sys.exc_info())))
            finished = True
        if text:
            result += f'\n\n{msg_type}:\n\n```\n{text}\n```'
        if image:
            result += f'\n\n{image}'
        if finished:
            break
    result = result.lstrip('\n')
    if timeout:
        _code_interpreter('signal.alarm(0)', timeout=None)
    return result


def get_multiline_input(hint):
    print(hint)
    print('// Press ENTER to make a new line. Press CTRL-D to end input.')
    lines = []
    while True:
        try:
            line = input()
        except EOFError:  # CTRL-D
            break
        lines.append(line)
    print('// Input received.')
    if lines:
        return '\n'.join(lines)
    else:
        return ''


if __name__ == '__main__':
    while True:
        print(code_interpreter([get_multiline_input('Enter python code:')]))
benchmark/config.py DELETED
@@ -1,66 +0,0 @@
from parser import InternLMReActParser, ReActParser

from models import LLM, QwenVL, Qwen, QwenDashscopeVLModel
from prompt import InternLMReAct, LlamaReAct, QwenReAct

react_prompt_map = {
    'qwen': QwenReAct,
    'llama': LlamaReAct,
    'internlm': InternLMReAct,
}

react_parser_map = {
    'qwen': ReActParser,
    'llama': ReActParser,
    'internlm': InternLMReActParser,
}

model_map = {'qwen': Qwen, 'llama': LLM, 'internlm': LLM, 'qwen-vl-chat': QwenVL}

model_type_map = {
    'qwen-72b-chat': 'qwen',
    'qwen-14b-chat': 'qwen',
    'qwen-1.8b-chat': 'qwen',
    'qwen-7b-chat': 'qwen',
    'llama-2-7b-chat': 'llama',
    'llama-2-13b-chat': 'llama',
    'codellama-7b-instruct': 'llama',
    'codellama-13b-instruct': 'llama',
    'internlm-7b-chat-1.1': 'internlm',
    'internlm-20b-chat': 'internlm',
    'qwen-vl-chat': 'qwen-vl-chat',
}

model_path_map = {
    'qwen-72b-chat': 'Qwen/Qwen-72B-Chat',
    'qwen-14b-chat': 'Qwen/Qwen-14B-Chat',
    'qwen-7b-chat': 'Qwen/Qwen-7B-Chat',
    'qwen-1.8b-chat': 'Qwen/Qwen-1_8B-Chat',
    'llama-2-7b-chat': 'meta-llama/Llama-2-7b-chat-hf',
    'llama-2-13b-chat': 'meta-llama/Llama-2-13b-chat-hf',
    'codellama-7b-instruct': 'codellama/CodeLlama-7b-Instruct-hf',
    'codellama-13b-instruct': 'codellama/CodeLlama-13b-Instruct-hf',
    'internlm-7b-chat-1.1': 'internlm/internlm-chat-7b-v1_1',
    'internlm-20b-chat': 'internlm/internlm-chat-20b',
    'qwen-vl-chat': 'Qwen/Qwen-VL-Chat',
}


def get_react_prompt(model_name, query, lang, upload_fname_list):
    react_prompt_cls = react_prompt_map.get(model_type_map[model_name],
                                            QwenReAct)
    return react_prompt_cls(query, lang, upload_fname_list)


def get_react_parser(model_name):
    react_parser_cls = react_parser_map.get(model_type_map[model_name],
                                            ReActParser)
    return react_parser_cls()


def get_model(model_name):
    if model_name in ["qwen-vl-plus"]:
        return QwenDashscopeVLModel(model=model_name)
    model_path = model_path_map.get(model_name, None)
    model_cls = model_map.get(model_type_map[model_name], LLM)
    return model_cls(model_path)
benchmark/inference_and_execute.py DELETED
@@ -1,280 +0,0 @@
import argparse
import json
import logging
import os
from parser import ReActParser

import prettytable
import tqdm
from code_interpreter import code_interpreter
from config import (get_model, get_react_parser, get_react_prompt,
                    model_path_map)
from datasets import load_dataset
from metrics.code_execution import eval_code_execution_rate
from metrics.gsm8k import eval_gsm8k_acc, is_correct
from metrics.visualization import eval_visualization_acc
from utils.code_utils import replace_upload_fname
from utils.data_utils import load_jsonl

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)

WORK_DIR = os.getenv('CODE_INTERPRETER_WORK_DIR', '/tmp/workspace')
os.makedirs(WORK_DIR, exist_ok=True)
os.system(f'cp -r upload_file_clean {WORK_DIR}/upload_file')
os.system('cp -r upload_file_clean ./upload_file')

global_eval_result = {
    'code_executability': {
        'math': None,
        'visualization': None,
        'general': None,
    },
    'code_correctness': {
        'math': None,
        'visualization-hard': None,
        'visualization-easy': None,
    }
}


def llm_with_plugin(args, query, item=None, exec_limit=3):
    exec_count = 0

    # Build ReAct prompt
    upload_fname_list = item[
        'input_file_path'] if item and 'input_file_path' in item else []
    lang = item['lang'] if item and 'lang' in item else 'en'
    react_prompt_obj = get_react_prompt(args.model, query, lang,
                                        upload_fname_list)
    planning_prompt = react_prompt_obj.build_prompt()

    # Execute the code when providing the first action in the query
    if '<|im_start|>' in query:
        _, prepend_code, __ = ReActParser().parse_latest_plugin_call(query)
        prepend_code = replace_upload_fname(prepend_code, upload_fname_list)
        call_plugin(_, [prepend_code], clear=(exec_count == 0))
        exec_count += 1
        exec_limit += 1

    # Inference and execute
    text = ''
    while exec_count < exec_limit:
        stop_words_list = react_prompt_obj.get_stop_words_list()
        output = text_completion(args.llm,
                                 planning_prompt + text,
                                 stop_words=stop_words_list)

        if args.gen_only:
            text += output
            break

        react_parser = get_react_parser(args.model)
        action, action_input, output = react_parser.parse_latest_plugin_call(
            output)
        if action:
            action_input = replace_upload_fname(action_input,
                                                upload_fname_list)
            observation = call_plugin(action, [action_input],
                                      clear=(exec_count == 0))
            output += react_prompt_obj.build_observation(observation)
            text += output
            exec_count += 1
            if 'error:' in observation or 'Traceback' in observation:
                break
        else:
            text += output
            break
    return text


def text_completion(llm, input_text, stop_words=[]):
    logging.info('Generating'.center(60, '='))
    logging.info('Input'.center(60, '-'))
    logging.info(input_text)

    output = llm.generate(input_text, stop_words)

    logging.info('Output'.center(60, '-'))
    logging.info(output)
    return output


def call_plugin(plugin_name, plugin_args_list, clear=False):
    # Relax constraints on plugin name.
    logging.info('Call code interpreter'.center(60, '='))
    obs = code_interpreter(plugin_args_list, clear=clear)
    logging.info(obs)
    return obs


def process_code_interpreter(item, writer):
    query = item['query']
    exec_limit = 3 if 'visualization' in item['tags'] else 1
    response = llm_with_plugin(args=args,
                               query=query,
                               item=item,
                               exec_limit=exec_limit)
    item['gen'] = response

    writer.write(json.dumps(item, ensure_ascii=False) + '\n')
    writer.flush()


def process_gsm8k(doc, writer):
    context = doc['question']
    completion = llm_with_plugin(args=args, query=context)
    acc = is_correct(completion, doc['answer'])
    doc['completion'] = completion
    doc['acc'] = acc

    writer.write(json.dumps(doc, ensure_ascii=False) + '\n')
    writer.flush()


def sequential_processing(args, data_list, process_func, writer):
    for item in tqdm.tqdm(data_list):
        process_func(item, writer)


process_func_map = {
    'gsm8k': process_gsm8k,
    'visualization': process_code_interpreter
}


def gather_eval_result(model_name):
    for metric in global_eval_result:
        logging.info(metric)
        table = prettytable.PrettyTable()
        table.field_names = ['model'] + list(global_eval_result[metric].keys())
        row_data = [model_name]
        for item in global_eval_result[metric].values():
            item = str(item) if not item else str(round(item, 2))
            row_data.append(item)
        table.add_row(row_data)
        logging.info('\n' + str(table))


def eval_metrics(args, test_set, full_output_fname):
    # metrics
    assert os.path.exists(
        full_output_fname), f'Not Found File {full_output_fname}.'
    inference_res = load_jsonl(full_output_fname)
    assert len(inference_res) == len(
        test_set
    ), f'There are still {len(test_set)-len(inference_res)} cases left.'

    abs_output_fname = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                    full_output_fname)
    if args.task == 'gsm8k':
        math_code_correctness = eval_gsm8k_acc(abs_output_fname)
        global_eval_result['code_correctness'].update(math_code_correctness)
    else:
        code_executability = eval_code_execution_rate(abs_output_fname,
                                                      args.task, args.model)
        global_eval_result['code_executability'].update(code_executability)
        if args.task in ['all_ci', 'visualization'
                         ] and not args.eval_code_exec_only:
            visualization_code_correctness = eval_visualization_acc(
                abs_output_fname, args.model, args.vis_judger)
            global_eval_result['code_correctness'].update(
                visualization_code_correctness)


def main(args):
    current_dir = os.getcwd()
    os.makedirs(args.output_path, exist_ok=True)
    full_output_fname = os.path.join(
        args.output_path,
        (args.output_fname or f'{args.task}_{args.model}_res.jsonl'))

    if not os.path.exists(full_output_fname):
        with open(full_output_fname, 'w'):
            logging.info(f'Create file {full_output_fname} done.')

    # build data
    if args.task == 'gsm8k':
        dataset = load_dataset('gsm8k', 'main')
        test_set = dataset['test']
    else:
        eval_data_path = os.path.join(args.input_path, args.input_fname)
        test_set = [
            item for item in load_jsonl(eval_data_path)
            if args.task in item['tags']
        ]
    logging.info(f'Test set: {len(test_set)}')

    if args.eval_only:
        eval_metrics(args, test_set, full_output_fname)
    else:
        key = 'question' if args.task == 'gsm8k' else 'query'
        cache_question = [item[key] for item in load_jsonl(full_output_fname)
                          ] if not args.force else []
        data_list = [
            item for item in test_set if item[key] not in cache_question
        ]
        logging.info(f'Left cases: {len(data_list)}')

        # inference
        writer_mode = 'w' if args.force else 'a'
        f_output = open(full_output_fname, writer_mode, encoding='utf-8')
        process_func = process_func_map.get(args.task,
                                            process_code_interpreter)
        sequential_processing(args, data_list, process_func, f_output)
        f_output.close()

        # evaluate
        if not args.gen_exec_only:
            eval_metrics(args, test_set, full_output_fname)

    os.chdir(current_dir)


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        type=str,
                        default='qwen-14b-chat',
                        choices=list(model_path_map.keys()))
    parser.add_argument(
        '--task',
        type=str,
        default='all',
        choices=['all', 'gsm8k', 'visualization', 'general'])
    parser.add_argument('--output-path', type=str, default='output_data')
    parser.add_argument('--input-path', type=str, default='eval_data')
    parser.add_argument('-o', '--output-fname', type=str, default='')
    parser.add_argument('-i',
                        '--input-fname',
                        type=str,
                        default='eval_code_interpreter_v1.jsonl')
    parser.add_argument('-f', '--force', action='store_true', default=False)
    parser.add_argument('--eval-only', action='store_true', default=False)
    parser.add_argument('--eval-code-exec-only',
                        action='store_true',
                        default=False)
    parser.add_argument('--gen-exec-only', action='store_true', default=False)
    parser.add_argument('--gen-only', action='store_true', default=False)
    parser.add_argument('--vis-judger', type=str,
                        default='gpt-4-vision-preview',
                        choices=['gpt-4-vision-preview', 'qwen-vl-chat', 'qwen-vl-plus'])
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()
    if not args.eval_only:
        args.llm = get_model(args.model)
        logging.info(f'Init {args.model} done.')

    if args.task == 'all':
        for key in ['gsm8k', 'visualization', 'general']:
            args.task = key
            main(args)
    else:
        main(args)
    gather_eval_result(args.model)
benchmark/metrics/__init__.py DELETED
(empty file)
benchmark/metrics/code_execution.py DELETED
@@ -1,257 +0,0 @@
import logging
import os

import func_timeout
from config import get_react_parser
from func_timeout import func_set_timeout
from utils.code_utils import extract_code, replace_upload_fname
from utils.data_utils import load_jsonl, save_jsonl

pre_load = """
import os
if 'upload_file' not in os.getcwd():
    os.chdir("./upload_file/")

import seaborn as sns

import matplotlib
# matplotlib.use('Agg')
import matplotlib.pyplot as plt
plt.ion()

import numpy as np
import pandas as pd
from sympy import Eq, symbols, solve
import re
import json
import math
"""

tags_config = {
    'visualization': {
        'timelimit': True,
        'extract_first_code': True,
    },
    'math': {
        'timelimit': True,
        'extract_first_code': False,
    },
    'general': {
        'timelimit': False,
        'extract_first_code': True,
    }
}

code_executability = {'math': None, 'visualization': None, 'general': None}


@func_set_timeout(10)
def exec_limit_time(text):
    exec(text, locals())


def exec_code(text, timelimit=False):
    if timelimit:
        exec_limit_time(text)
    else:
        exec(text, locals())


def postprocess_code(gen_code, line):
    if '<|im_start|>' in line['query']:
        first_action_code = get_action_input_code(line['query'])
        gen_code = first_action_code + gen_code

    upload_fname_list = line[
        'input_file_path'] if line and 'input_file_path' in line else []
    gen_code = replace_upload_fname(gen_code, upload_fname_list)

    if 'def solution()' in gen_code:
        gen_code += '\nsolution()\n'

    if 'plt.show()' in gen_code:
        gen_code += "\nplt.pause(1)\nplt.close('all')\n"

    if 'sns.' in gen_code and 'plot' in gen_code:
        gen_code += "\nplt.close('all')\n"

    gen_code = pre_load + gen_code
    return gen_code


def get_action_input_code(text,
                          model_name='qwen-14b-chat',
                          extract_first_code=False):
    action_input_list = []
    tmp = text
    react_parser = get_react_parser(model_name)
    while True:
        action_input = react_parser.get_first_action_input(tmp)
        if not action_input:
            break
        action_input_list.append(action_input)
        tmp = tmp.split(action_input)[1]
        if not tmp or extract_first_code:
            break

    code = ''
    for action_input in action_input_list:
        code = code + '# concat\n' + extract_code(action_input) + '\n'
    return code


def eval_code_execution_rate(output_fname,
                             tag='all_ci',
                             model_name='qwen-14b-chat',
                             timelimit=False,
                             extract_first_code=False):
    data_list = load_jsonl(output_fname)
    pip_package = []

    for line_id, line in enumerate(data_list):
        line['idx'] = line_id
        tags_list = line['tags'].split(',')
        if tag not in tags_list:
            continue

        # update args
        for cur_tag in tags_list:
            if cur_tag != 'all_ci':
                timelimit = tags_config[cur_tag]['timelimit']
                extract_first_code = tags_config[cur_tag]['extract_first_code']

        line['executable_code'] = False
        line['missing_code'] = False
        line['code_error_info'] = ''

        # get Action Input code from response
        gen_code = get_action_input_code(line['gen'],
                                         model_name=model_name,
                                         extract_first_code=extract_first_code)

        if not gen_code:
            line['missing_code'] = True
            line['code'] = ''
            line['code_error_info'] = 'missing code'
            continue

        line['code'] = gen_code
        gen_code = postprocess_code(gen_code, line)

        while True:
            try:
                exec_code(gen_code, timelimit=timelimit)
                line['executable_code'] = True
                break
            except func_timeout.exceptions.FunctionTimedOut as ex:
                line['code_error_info'] = str(ex)
                break
            except (ImportError, ModuleNotFoundError) as ex:
                try:
                    package = str(ex).split("'")[1].strip()
                except Exception:
                    package = ''
                if package and package not in pip_package:  # install package
                    pip_package.append(package)
                    os.system('pip install ' + package)
                    logging.info(f'Automatic installation: {package}')
                else:
                    line['code_error_info'] = str(ex)
                    break
            except Exception as ex:
                line['code_error_info'] = str(ex)
                break

        # double check
        observation = get_react_parser(model_name).get_first_observation(
            line['gen'])
        if line['executable_code'] and ('error:' in observation):
            logging.warning(
                'The code executes correctly, but it has an error in IPython!')
            logging.warning(f'Code:\n{gen_code}')
            logging.warning(f'IPython error info:\n{observation}')
            logging.info('=' * 60)
        elif not line['executable_code'] and not ('error:' in observation):
            logging.warning(
                'The code has an execution error, but it runs correctly in IPython!'
            )
            logging.warning(f'Code:\n{gen_code}')
            logging.warning(f"Exec error info:\n{line['code_error_info']}")
            logging.warning(f'IPython observation:\n{observation}')
            logging.info('=' * 60)

    # save error data
    error_data_list = [
        item for item in data_list
        if not item['executable_code'] or item['missing_code']
    ]
    error_data_output_fname = os.path.splitext(
        output_fname)[0] + '_exec_error.jsonl'
    save_jsonl(error_data_list, error_data_output_fname)

    log_result(data_list)

    return code_executability


def log_result(data_list, verbose=True):
    if verbose:
        logging.info('*' * 60)
        logging.info('{:^60}'.format('Detail'))
        logging.info('*' * 60)
        for line_id, line in enumerate(data_list):
            logging.info(f'Question {line_id}'.center(60, '='))
            logging.info(line['query'])

            logging.info(f'Generated {line_id}'.center(60, '-'))
            logging.info('\n' + line['gen'])

            logging.info(f'Code {line_id}'.center(60, '-'))
            logging.info('\n' + line['code'])

            logging.info(f'Exec Result {line_id}'.center(60, '-'))
            prefix_info = 'Exec Success' if line[
                'executable_code'] else 'Exec Error: '
            exec_info = prefix_info + line['code_error_info']
            logging.info(exec_info)

    logging.info('=' * 60)
    logging.info('{:^60}'.format('Code Execution Rate'))
    logging.info('=' * 60)
    involved_tags = []
    for line in data_list:
        involved_tags += line['tags'].split(',')
    involved_tags = list(set(involved_tags))

    for key in involved_tags:
        logging.info(f'task: {key}'.center(60, '='))
        key_item_list = [item for item in data_list if key in item['tags']]
        all_count = len(key_item_list)
        missing_code_count = len(
            [item for item in key_item_list if item['missing_code']])
        executable_code_count = len(
            [item for item in key_item_list if item['executable_code']])

        logging.info(f'All Test: {all_count}')
        logging.info(f'Missing Code: {missing_code_count}')
        logging.info(f'Predict Exec Success: {executable_code_count}')
        logging.info('Codes available && Execution Rate: {:.2f}'.format(
            executable_code_count / (all_count - missing_code_count) * 100))
        logging.info('Execution Rate: {:.2f}'.format(executable_code_count /
                                                     all_count * 100))
        logging.info('Non-executable rate: {:.2f}'.format(
            (all_count - missing_code_count - executable_code_count) /
            all_count * 100))
        logging.info('Missing code rate: {:.2f}'.format(missing_code_count /
                                                        all_count * 100))

        if key != 'all_ci':
            code_executability[key] = executable_code_count / all_count * 100

        if verbose:
            logging.info('Error List: ')
            error_list = [(item['idx'], item['code_error_info'])
                          for item in key_item_list if item['code_error_info']]
            error_list.sort(key=lambda x: x[1])
            for x in error_list:
                logging.info(x)
benchmark/metrics/gsm8k.py
DELETED
@@ -1,54 +0,0 @@
import logging
import os
import re

import numpy as np
from utils.data_utils import load_jsonl, save_jsonl

INVALID_ANS = '[invalid]'


def extract_answer(completion):

    def _get_last_digit(s):
        _PAT_LAST_DIGIT = re.compile(
            r'(?<=(\s|[\$%#{]))([+-])?(?=(\S))(0|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?=(\s|[.,}]|$))'
        )
        match = list(_PAT_LAST_DIGIT.finditer(s))
        if match:
            last_digit = match[-1].group().replace(',', '').replace('+', '')
        else:
            last_digit = None
            logging.warning(f'No digits found in {s!r}')
        return last_digit

    job_gen = completion.strip('.').replace('\n', '\\n')
    last_digit = _get_last_digit(job_gen)
    if last_digit:
        return eval(last_digit)
    else:
        return INVALID_ANS


def is_correct(completion, answer):
    gold = extract_answer(answer)
    assert gold != INVALID_ANS, 'No ground truth answer found in the document.'
    return extract_answer(completion) == gold


def eval_gsm8k_acc(output_fname):
    data_list = load_jsonl(output_fname)
    acc_res = [item['acc'] for item in data_list]
    logging.info('=' * 60)
    logging.info('{:^60}'.format('Math Acc.'))
    logging.info('=' * 60)
    logging.info('Total num={:.2f}'.format(len(acc_res)))
    logging.info('Right num={:.2f}'.format(np.sum(acc_res)))
    logging.info('Zero-shot Acc={:.2f}'.format(np.mean(acc_res) * 100))

    error_data_list = [item for item in data_list if not item['acc']]
    error_data_output_fname = os.path.splitext(
        output_fname)[0] + '_gsm8k_error.jsonl'
    save_jsonl(error_data_list, error_data_output_fname)

    return {'math': np.mean(acc_res) * 100}

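For reference, a minimal usage sketch of the extractor above; the import path assumes the benchmark directory is on sys.path, and the sample strings are hypothetical:

from metrics.gsm8k import extract_answer, is_correct

completion = 'Thought: 3 + 4 = 7\nFinal Answer: 7'
reference = 'She needs 3 + 4 = 7 apples.\n#### 7'
extract_answer(completion)         # -> 7, the last number in the text
is_correct(completion, reference)  # -> True, both sides extract to 7
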
benchmark/metrics/visualization.py
DELETED
@@ -1,179 +0,0 @@
import logging
import os
import re
import base64
import torch
from config import get_model, get_react_parser
from utils.data_utils import load_jsonl, save_jsonl

torch.manual_seed(1234)

EVAL_VISUAL_PROMPT_ZH = """请判断图片是否与下面的[问题]一致,如果一致则回复“right”,不一致则回复“wrong”。
[问题]:{query}
"""

EVAL_VISUAL_PROMPT_EN = """Please judge whether the image is consistent with the [Question] below, if it is consistent then reply "right", if not then reply "wrong".
[Question]: {query}
"""

visualization_code_correctness = {
    'visualization-hard': None,
    'visualization-easy': None,
}


def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        a = base64.b64encode(image_file.read()).decode('utf-8')
    return a


def judger_model_inference(judger_model_name, judger_model, imgs=[], prompt=''):
    output = ""
    if judger_model_name == 'gpt-4-vision-preview':
        logging.warning("This is an example of `gpt-4-vision-preview`. "
                        "Please set the API key and use according to your actual situation.")
        from openai import OpenAI
        client = OpenAI()
        content_list = []
        content_list.append({"type": "text", "text": prompt})
        input_images = []
        for img in imgs:
            if 'http' not in img:
                base64_image = encode_image(img)
                img = f"data:image/jpeg;base64,{base64_image}"
            input_images.append({"type": "image_url", 'image_url': img})
        content_list.extend(input_images)
        response = client.chat.completions.create(
            model="gpt-4-vision-preview",
            messages=[
                {
                    "role": "user",
                    "content": content_list,
                }
            ],
            max_tokens=300,
        )
        # Take the text of the first choice; the raw Choice object is not a string.
        output = response.choices[0].message.content
    elif judger_model_name in ['qwen-vl-plus', 'qwen-vl-chat']:
        inputs = []
        for img in imgs:
            if 'http' not in img and judger_model_name == 'qwen-vl-plus':
                img = "file://" + img
            inputs.append({'image': img})
        inputs.append({'text': prompt})

        logging.info('Eval'.center(60, '-'))
        logging.info(inputs)
        output = judger_model.generate(inputs)
        logging.info(output)
        logging.info('=' * 60)
    return output


def extract_images(text):
    regex = re.compile(r'!\[fig-(.+)\]\((.+)\)')
    results = re.findall(regex, text)
    images = []
    for res in results:
        assert len(res) == 2
        if os.path.exists(res[1]):
            images.append(res[1])
    return images


def check_images_observation(text, images, model_name):
    start_flag = get_react_parser(model_name).observation
    for image in images:
        logging.info('Image'.center(60, '-'))
        logging.info(image)

        end_idx = text.find(image)
        tmp_text = text[:end_idx + len(image)]
        start_idx = tmp_text.rfind(start_flag)
        check_text = tmp_text[start_idx + len(start_flag):]

        logging.info('Observation'.center(60, '-'))
        logging.info(check_text)

        # As long as there exists a correctly executed observation, we consider it `True`.
        if 'error:' not in check_text and 'Traceback' not in check_text:
            return True
    return False


eval_visual_prompt = {'zh': EVAL_VISUAL_PROMPT_ZH, 'en': EVAL_VISUAL_PROMPT_EN}


def eval_visualization_acc(output_fname, model_name, judger_model_name='gpt-4-vision-preview'):
    if judger_model_name == 'gpt-4-vision-preview':
        judger_model = None
    elif judger_model_name in ['qwen-vl-chat', 'qwen-vl-plus']:
        if judger_model_name == 'qwen-vl-chat':
            logging.warning('In this benchmark of version 20231206, `Qwen-vl-chat` is no longer used as the '
                            'evaluation model for the `Visualization` task. If you insist on using it, '
                            'the evaluation results might differ from the official results.')
        judger_model = get_model(judger_model_name)
    else:
        raise Exception('Unsupported judger model.')

    one_action, one_action_right = 0, 0
    zero_action, zero_action_right = 0, 0

    data_list = load_jsonl(output_fname)
    for item in data_list:
        if 'visualization' not in item['tags']:
            continue

        item['vis_acc'] = False
        if '<|im_end|>' in item['query']:
            one_action += 1
            prompt = item['query'].split('<|im_end|>')[0]
        else:
            zero_action += 1
            prompt = item['query']

        images = extract_images(item['gen'])

        if images and check_images_observation(item['gen'], images,
                                               model_name):
            input_prompt = eval_visual_prompt[item.get('lang', 'en')]
            format_prompt = input_prompt.format(query=prompt)
            output = judger_model_inference(judger_model_name, judger_model, images, format_prompt)
            if 'right' in output.lower():
                item['vis_acc'] = True
                if '<|im_end|>' in item['query']:
                    one_action_right += 1
                else:
                    zero_action_right += 1

    logging.info('*' * 60)
    logging.info('{:^60}'.format('Visualization Acc.'))
    logging.info('*' * 60)
    logging.info(
        'Visualization-Hard count={}, Visualization-Hard right count={}, Visualization-Hard acc={:.2f}'
        .format(zero_action, zero_action_right,
                zero_action_right / zero_action * 100))
    logging.info(
        'Visualization-Easy count={}, Visualization-Easy right count={}, Visualization-Easy acc={:.2f}'
        .format(one_action, one_action_right,
                one_action_right / one_action * 100))
    logging.info('all count={}, all right={}, all acc={:.2f}'.format(
        zero_action + one_action, zero_action_right + one_action_right,
        (zero_action_right + one_action_right) / (zero_action + one_action) *
        100))

    visualization_code_correctness[
        'visualization-hard'] = zero_action_right / zero_action * 100
    visualization_code_correctness[
        'visualization-easy'] = one_action_right / one_action * 100

    error_data_list = [
        item for item in data_list
        if 'visualization' in item['tags'] and not item['vis_acc']
    ]
    error_data_output_fname = os.path.splitext(
        output_fname)[0] + '_vis_error.jsonl'
    save_jsonl(error_data_list, error_data_output_fname)

    return visualization_code_correctness

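For reference, `extract_images` matches the markdown image links that the code interpreter writes into a generation; a sketch with a hypothetical output path (kept only if the file exists on disk):

gen = 'Here is the chart.\n![fig-001](workspace/plot_001.png)\nObservation: ...'
images = extract_images(gen)  # ['workspace/plot_001.png'] if the file exists
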
benchmark/models/__init__.py
DELETED
@@ -1,4 +0,0 @@
from models.base import HFModel  # noqa
from models.llm import LLM  # noqa
from models.qwen import Qwen, QwenVL  # noqa
from models.dashscope import QwenDashscopeVLModel

benchmark/models/base.py
DELETED
@@ -1,17 +0,0 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig


class HFModel(object):

    def __init__(self, model_path):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path,
                                                       trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            device_map='auto',
            low_cpu_mem_usage=True).eval()
        self.model.generation_config = GenerationConfig.from_pretrained(
            model_path, trust_remote_code=True)
        self.model.generation_config.do_sample = False

benchmark/models/dashscope.py
DELETED
@@ -1,40 +0,0 @@
import logging
import os  # required by the os.getenv fallback below
from http import HTTPStatus
import time
import dashscope


class QwenDashscopeVLModel(object):
    def __init__(self, model, api_key):
        self.model = model
        dashscope.api_key = api_key.strip() or os.getenv('DASHSCOPE_API_KEY', default='')
        assert dashscope.api_key, 'DASHSCOPE_API_KEY is required.'

    def generate(self, prompt, stop_words=[]):
        if isinstance(prompt, str):
            prompt = [{'text': prompt}]

        MAX_TRY = 3
        count = 0
        while count < MAX_TRY:
            response = dashscope.MultiModalConversation.call(
                self.model,
                messages=[{'role': 'user', 'content': prompt}],
                top_p=0.01,
                top_k=1,
            )
            if response.status_code == HTTPStatus.OK:
                output = response.output.choices[0].message.content[0]['text']
                for stop_str in stop_words:
                    idx = output.find(stop_str)
                    if idx != -1:
                        output = output[: idx + len(stop_str)]
                return output
            else:
                err = 'Error code: %s, error message: %s' % (
                    response.code,
                    response.message,
                )
                logging.error(err)
            count += 1
            time.sleep(1)

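A minimal usage sketch, assuming a valid DashScope account; the image path and prompt are hypothetical:

judger = QwenDashscopeVLModel(model='qwen-vl-plus', api_key='')
# An empty api_key falls back to the DASHSCOPE_API_KEY environment variable.
reply = judger.generate([
    {'image': 'file:///tmp/plot.png'},
    {'text': 'Please judge whether the image matches the question.'},
])
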
benchmark/models/llm.py
DELETED
@@ -1,26 +0,0 @@
import torch
from models.base import HFModel


class LLM(HFModel):

    def __init__(self, model_path):
        super().__init__(model_path)

    def generate(self, input_text, stop_words=[], max_new_tokens=512):
        if isinstance(input_text, str):
            input_text = [input_text]

        input_ids = self.tokenizer(input_text)['input_ids']
        input_ids = torch.tensor(input_ids, device=self.model.device)
        gen_kwargs = {'max_new_tokens': max_new_tokens, 'do_sample': False}
        outputs = self.model.generate(input_ids, **gen_kwargs)
        s = outputs[0][input_ids.shape[1]:]
        output = self.tokenizer.decode(s, skip_special_tokens=True)

        for stop_str in stop_words:
            idx = output.find(stop_str)
            if idx != -1:
                output = output[:idx + len(stop_str)]

        return output

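A minimal usage sketch; the checkpoint path is a placeholder:

llm = LLM('/path/to/hf-checkpoint')
out = llm.generate('Question: 1 + 1 = ?\nThought:',
                   stop_words=['Observation:'])
# Decoding is greedy (do_sample=False from HFModel) and the text is cut
# just after the first stop word, matching the ReAct loop's expectations.
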
benchmark/models/qwen.py
DELETED
@@ -1,36 +0,0 @@
import torch
from models.base import HFModel


class Qwen(HFModel):

    def __init__(self, model_path):
        super().__init__(model_path)

    def generate(self, input_text, stop_words=[]):
        im_end = '<|im_end|>'
        if im_end not in stop_words:
            stop_words = stop_words + [im_end]
        stop_words_ids = [self.tokenizer.encode(w) for w in stop_words]

        input_ids = torch.tensor([self.tokenizer.encode(input_text)
                                  ]).to(self.model.device)
        output = self.model.generate(input_ids, stop_words_ids=stop_words_ids)
        output = output.tolist()[0]
        output = self.tokenizer.decode(output, errors='ignore')
        assert output.startswith(input_text)
        output = output[len(input_text):].replace('<|endoftext|>',
                                                  '').replace(im_end, '')

        return output


class QwenVL(HFModel):
    def __init__(self, model_path):
        super().__init__(model_path)

    def generate(self, inputs: list):
        query = self.tokenizer.from_list_format(inputs)
        response, _ = self.model.chat(self.tokenizer, query=query, history=None)

        return response

benchmark/parser/__init__.py
DELETED
@@ -1,2 +0,0 @@
from parser.internlm_parser import InternLMReActParser  # noqa
from parser.react_parser import ReActParser  # noqa

benchmark/parser/internlm_parser.py
DELETED
@@ -1,11 +0,0 @@
from parser.react_parser import ReActParser


class InternLMReActParser(ReActParser):

    def __init__(self):
        self.action = '\nAction:'
        self.action_input = '\nActionInput:'
        self.action_input_stop = '<eoa>'
        self.observation = '<|System|>:Response:'
        self.observation_stop = '<TOKENS_UNUSED_2>\n<|Bot|>:'

benchmark/parser/react_parser.py
DELETED
@@ -1,46 +0,0 @@
class ReActParser(object):

    def __init__(self):
        self.action = '\nAction:'
        self.action_input = '\nAction Input:'
        self.action_input_stop = '\nObservation:'
        self.observation = '\nObservation:'
        self.observation_stop = '\nThought:'

    def parse_latest_plugin_call(self, text):
        action = self.action
        action_input = self.action_input
        observation = self.action_input_stop
        plugin_name, plugin_args = '', ''
        i = text.rfind(action)
        j = text.rfind(action_input)
        k = text.rfind(observation)
        if 0 <= i < j:  # If the text has `Action` and `Action Input`,
            if k < j:  # but does not contain `Observation`,
                # then it is likely that `Observation` is omitted by the LLM,
                # because the output text may have discarded the stop word.
                text = text.rstrip() + observation  # Add it back.
                k = text.rfind(observation)
            plugin_name = text[i + len(action):j].strip()
            plugin_args = text[j + len(action_input):k].strip()
            text = text[:k]
        return plugin_name, plugin_args, text

    def _extract_first_target(self, text, start_flag, end_flag):
        target = ''
        i = text.find(start_flag)
        if i != -1:
            j = text.find(end_flag, i)
            if j != -1:
                target = text[i + len(start_flag):j].strip()
            else:
                target = text[i + len(start_flag):].strip()
        return target

    def get_first_observation(self, text):
        return self._extract_first_target(text, self.observation,
                                          self.observation_stop)

    def get_first_action_input(self, text):
        return self._extract_first_target(text, self.action_input,
                                          self.action_input_stop)

benchmark/prompt/__init__.py
DELETED
@@ -1,4 +0,0 @@
from prompt.internlm_react import InternLMReAct  # noqa
from prompt.llama_react import LlamaReAct  # noqa
from prompt.qwen_react import QwenReAct  # noqa
from prompt.react import ReAct  # noqa

benchmark/prompt/internlm_react.py
DELETED
@@ -1,103 +0,0 @@
from prompt.react import ReAct

INTERNLM_TOOL_DESCRIPTION = """用来执行Python代码。代码必须是一个函数,
函数名必须得是 'solution',代码对应你的思考过程。代码实例格式如下:
```python
# import 依赖包
import xxx
def solution():
    # 初始化一些变量
    variable_names_with_real_meaning = xxx
    # 步骤一
    mid_variable = func(variable_names_with_real_meaning)
    # 步骤 x
    mid_variable = func(mid_variable)
    # 最后结果
    final_answer = func(mid_variable)
    return final_answer
```"""

INTERNLM_TOOL = {'PythonInterpreter': INTERNLM_TOOL_DESCRIPTION}

INTERNLM_REACT_PROMPT_ZH = """<|System|>:你是一个可以调用外部工具的助手,可以使用的工具包括:
{tools_text}
如果使用工具请遵循以下格式回复:
```
Thought:思考你当前步骤需要解决什么问题,是否需要使用工具
Action:工具名称,你的工具必须从 [{tools_name_text}] 选择
ActionInput:工具输入参数
```
工具返回按照以下格式回复:
```
Response:调用工具后的结果
```
如果你已经知道了答案,或者你不需要工具,请遵循以下格式回复
```
Thought:给出最终答案的思考过程
FinalAnswer:最终答案
```
开始!<TOKENS_UNUSED_2>
<|User|>:{query}<eoh>
<|Bot|>:"""

INTERNLM_REACT_PROMPT_EN = """<|System|>:You are a assistant who can utilize external tools.
{tools_text}
To use a tool, please use the following format:
```
Thought: Think what you need to solve, do you need to use tools?
Action: the tool name, should be one of [{tools_name_text}]
ActionInput: the input to the action
```
The response after utilizing tools should using the following format:
```
Response: the results after call the tool.
``
If you already know the answer, or you do not need to use tools,
please using the following format to reply:
```
Thought: the thought process to get the final answer
FinalAnswer: final answer
```
Begin!<TOKENS_UNUSED_2>
<|User|>:{query}<eoh>
<|Bot|>:"""


class InternLMReAct(ReAct):

    def __init__(self, query, lang='en', upload_file_paths=[]):
        super().__init__(query, lang, upload_file_paths)
        self.react_template = INTERNLM_REACT_PROMPT_ZH if self.lang == 'zh' else INTERNLM_REACT_PROMPT_EN

    def build_prompt(self):
        planning_prompt = super().build_prompt()
        if '<|im_end|>' in self.query and planning_prompt.endswith(
                '<eoh>\n<|Bot|>:'):
            planning_prompt = planning_prompt[:-len('<eoh>\n<|Bot|>:')]

        if '<|im_end|>' in self.query:
            planning_prompt = planning_prompt.replace(
                '<|im_end|>\n<|im_start|>assistant\n',
                '<eoh>\n<|Bot|>:').replace(
                    'Observation:', '<eoa>\n<|System|>:Response:').replace(
                        '\nAction Input',
                        '\nActionInput').replace('code_interpreter',
                                                 'PythonInterpreter')
            assert planning_prompt.endswith('Thought:')
            planning_prompt = planning_prompt[:-len(
                'Thought:')] + '<TOKENS_UNUSED_2>\n<|Bot|>:'

        self.prompt = planning_prompt
        return planning_prompt

    def _build_tools_text(self):
        return INTERNLM_TOOL

    def _build_tools_name_text(self):
        return list(INTERNLM_TOOL.keys())

    def build_observation(self, observation):
        return f'<eoa>\n<|System|>:Response:{observation}\n<TOKENS_UNUSED_2>\n<|Bot|>:'

    def get_stop_words_list(self):
        return ['<eoa>']

benchmark/prompt/llama_react.py
DELETED
@@ -1,20 +0,0 @@
from prompt.react import ReAct


class LlamaReAct(ReAct):

    def __init__(self, query, lang='en', upload_file_paths=[]):
        super().__init__(query, lang, upload_file_paths)

    def build_prompt(self):
        planning_prompt = super().build_prompt()
        planning_prompt = '[INST] ' + planning_prompt + ' [/INST]'

        if '<|im_end|>' in self.query:
            planning_prompt = planning_prompt.replace(
                '<|im_end|>\n<|im_start|>assistant', ' [/INST] ')
            assert planning_prompt.endswith(' [/INST]')
            planning_prompt = planning_prompt[:-len(' [/INST]')]

        self.prompt = planning_prompt
        return planning_prompt

benchmark/prompt/qwen_react.py
DELETED
@@ -1,80 +0,0 @@
import json
import os

from prompt.react import ReAct

QWEN_TOOLS_LIST = [
    {
        'name_for_human': '代码解释器',
        'name_for_model': 'code_interpreter',
        'description_for_model': '代码解释器,可用于执行Python代码。',
        'parameters': [{
            'name': 'code',
            'type': 'string',
            'description': '待执行的代码'
        }],
        'args_format': 'code'
    },
]

TOOL_DESC = """{name_for_model}: Call this tool to interact with the {name_for_human} API. What is the {name_for_human} API useful for? {description_for_model} Parameters: {parameters}"""


class QwenReAct(ReAct):

    def __init__(self, query, lang='en', upload_file_paths=[]):
        super().__init__(query, lang, upload_file_paths)

        self.upload_file_paths = [
            f'{os.path.basename(fname)}' for fname in upload_file_paths
        ]
        self.list_of_plugin_info = QWEN_TOOLS_LIST
        self.fname_template = {
            'zh': '[上传文件{fname_str}]',
            'en': '[Upload file {fname_str}]',
            'en_multi': '[Upload file {fname_str}]'
        }

    def build_prompt(self):
        im_start = '<|im_start|>'
        im_end = '<|im_end|>'
        prompt = f'{im_start}system\nYou are a helpful assistant.{im_end}'

        query = super().build_prompt()

        query = query.lstrip('\n').rstrip()
        prompt += f'\n{im_start}user\n{query}{im_end}'
        if f'{im_start}assistant' not in query:
            prompt += f'\n{im_start}assistant\n{im_end}'
        assert prompt.endswith(f'\n{im_start}assistant\n{im_end}')

        prompt = prompt[:-len(f'{im_end}')]
        self.prompt = prompt
        return prompt

    def _build_tools_text(self):
        # tool info
        tools_text = []
        for plugin_info in self.list_of_plugin_info:
            tool = TOOL_DESC.format(
                name_for_model=plugin_info['name_for_model'],
                name_for_human=plugin_info['name_for_human'],
                description_for_model=plugin_info['description_for_model'],
                parameters=json.dumps(plugin_info['parameters'],
                                      ensure_ascii=False),
            )
            if plugin_info.get('args_format', 'json') == 'json':
                tool += ' Format the arguments as a JSON object.'
            elif plugin_info['args_format'] == 'code':
                tool += ' Enclose the code within triple backticks (`) at the beginning and end of the code.'
            else:
                raise NotImplementedError
            tools_text.append(tool)
        tools_text = '\n\n'.join(tools_text)
        return tools_text

    def _build_tools_name_text(self):
        return ', '.join([
            plugin_info['name_for_model']
            for plugin_info in self.list_of_plugin_info
        ])

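For reference, a sketch of the ChatML-style prompt `build_prompt` assembles for a fresh query (the query itself is hypothetical):

p = QwenReAct('Compute 2 ** 10.', lang='en').build_prompt()
# p now looks like:
#   <|im_start|>system
#   You are a helpful assistant.<|im_end|>
#   <|im_start|>user
#   Answer the following questions as best you can. ...
#   Question: Compute 2 ** 10.<|im_end|>
#   <|im_start|>assistant
# The trailing <|im_end|> is stripped so the model continues from here.
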
benchmark/prompt/react.py
DELETED
@@ -1,87 +0,0 @@
import os

tools_text = """code_interpreter: Call this tool to interact with the Code Interpreter API.
What is the Code Interpreter API useful for?
Code Interpreter is used to execute Python code to deal with the following tasks:
1. Solving mathematical problems, both quantitative and qualitative
2. Doing data analysis and visualization
3. Converting files between formats
Parameters:
```py
code
```
Enclose the code within triple backticks (```) at the beginning and end of the code.
"""

REACT_PROMPT = """Answer the following questions as best you can. You have access to the following tools:

{tools_text}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tools_name_text}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {query}"""

fname_template = {
    'zh': '文件{fname_str},',
    'en_multi': 'Files {fname_str}. ',
    'en': 'File {fname_str}. ',
}


class ReAct(object):

    def __init__(self, query, lang='en', upload_file_paths=[]):
        self.query = query
        self.lang = lang
        self.upload_file_paths = [
            f'`{os.path.basename(fname)}`' for fname in upload_file_paths
        ]

        self.fname_template = fname_template
        self.react_template = REACT_PROMPT
        self.prompt = ''

    def build_prompt(self):
        query = self._format_upload_fname() + self.query
        tools_text = self._build_tools_text()
        tools_name_text = self._build_tools_name_text()
        planning_prompt = self.react_template.format(
            query=query,
            tools_text=tools_text,
            tools_name_text=tools_name_text)

        self.prompt = planning_prompt
        return planning_prompt

    def _format_upload_fname(self):
        prefix = ''
        if self.upload_file_paths:
            fname_str = ', '.join(self.upload_file_paths)
            lang_key = 'en_multi' if self.lang == 'en' and len(
                self.upload_file_paths) > 1 else self.lang
            fname_template = self.fname_template[lang_key]
            prefix = fname_template.format(fname_str=fname_str)
        return prefix

    def _build_tools_text(self):
        return tools_text

    def _build_tools_name_text(self):
        return 'code_interpreter'

    def build_observation(self, observation):
        return f'\nObservation: {observation}\nThought:'

    def get_stop_words_list(self):
        return ['Observation:', 'Observation:\n']

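A minimal usage sketch of the base class; the uploaded file is hypothetical:

react = ReAct('Plot a sine wave.', lang='en',
              upload_file_paths=['/data/input.csv'])
prompt = react.build_prompt()
# The query is prefixed with 'File `input.csv`. ' and inserted into
# REACT_PROMPT; decoding should stop on ['Observation:', 'Observation:\n'].
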
benchmark/requirements.txt
DELETED
@@ -1,13 +0,0 @@
accelerate>=0.20.3
func_timeout
json5
matplotlib
numpy
pandas
PrettyTable
scipy
seaborn
sympy
transformers==4.33.1
transformers_stream_generator
openai

benchmark/utils/__init__.py
DELETED
File without changes
benchmark/utils/code_utils.py
DELETED
@@ -1,31 +0,0 @@
import os
import re

import json5


def replace_upload_fname(text, upload_fname_list):
    for full_input_fname in upload_fname_list:
        if full_input_fname not in text and os.path.basename(
                full_input_fname) in text:
            text = text.replace(os.path.basename(full_input_fname),
                                full_input_fname)
    return text


def extract_code(text):
    # Match triple-backtick blocks first
    triple_match = re.search(r'```[^\n]*\n(.+?)```', text, re.DOTALL)
    # Match single-backtick blocks second
    single_match = re.search(r'`([^`]*)`', text, re.DOTALL)
    if triple_match:
        text = triple_match.group(1)
    elif single_match:
        text = single_match.group(1)
    else:
        try:
            text = json5.loads(text)['code']
        except Exception:
            pass
    # If no code block was found, the original text is returned unchanged.
    return text

benchmark/utils/data_utils.py
DELETED
@@ -1,28 +0,0 @@
import json
import logging

from tqdm import tqdm


def load_jsonl(path):
    data = []
    with open(path, 'r', encoding='utf8') as f:
        for idx, line in enumerate(f, start=1):
            try:
                data.append(json.loads(line))
            except Exception as e:
                logging.info(line)
                logging.warning(f'Error at line {idx}: {e}')
                continue
    return data


def save_jsonl(data, path, progress=False, enabled=True):
    if not enabled:
        return
    with open(path, 'w', encoding='utf-8') as f:
        if progress:
            data = tqdm(data)
        for item in data:
            line = json.dumps(item, ensure_ascii=False)
            print(line, file=f)

setup.py
DELETED
@@ -1,16 +0,0 @@
from setuptools import find_packages, setup


def read_requirements():
    with open('requirements.txt') as req:
        content = req.read()
        requirements = content.split('\n')
    return requirements


setup(
    name='qwen_agent',
    version='0.0.1',
    packages=find_packages(),
    install_requires=read_requirements(),
)
