vlff李飞飞 committed on
Commit
a4bff98
1 Parent(s): 522b6de
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  *.ttf filter=lfs diff=lfs merge=lfs -text
+ *.db filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -9,12 +9,6 @@ __pycache__
  qwen_agent/llm/gpt.py
  qwen_agent/llm/tools.py
  #workspace/*
-
- benchmark/log/*
- benchmark/output_data/*
- benchmark/upload_file/*
- benchmark/upload_file_clean/*
- benchmark/eval_data/
  Qwen-Agent

  docqa/*
README.md CHANGED
@@ -156,97 +156,3 @@ You can watch the following showcase videos to learn about the basic operations
156
  - Long-form writing based on visited webpages and PDFs [video](https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/showcase_write_article_based_on_webpages_and_pdfs.mp4)
157
  - Drawing a plot using code interpreter based on the given information [video](https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/showcase_chat_with_docs_and_code_interpreter.mp4)
158
  - Uploading files, multi-turn conversation, and data analysis using code interpreter [video](https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/showcase_code_interpreter_multi_turn_chat.mp4)
159
-
160
- # Evaluation Benchmark
161
-
162
- We have also open-sourced a benchmark for evaluating the performance of a model in writing Python code and using Code Interpreter for mathematical problem solving, data analysis, and other general tasks. The benchmark can be found in the [benchmark](benchmark/README.md) directory. The current evaluation results are as follows:
163
-
164
- <table>
165
- <tr>
166
- <th colspan="5" align="center">In-house Code Interpreter Benchmark (Version 20231206)</th>
167
- </tr>
168
- <tr>
169
- <th rowspan="2" align="center">Model</th>
170
- <th colspan="3" align="center">Accuracy of Code Execution Results (%)</th>
171
- <th colspan="1" align="center">Executable Rate of Code (%)</th>
172
- </tr>
173
- <tr>
174
- <th align="center">Math↑</th><th align="center">Visualization-Hard↑</th><th align="center">Visualization-Easy↑</th><th align="center">General↑</th>
175
- </tr>
176
- <tr>
177
- <td>GPT-4</td>
178
- <td align="center">82.8</td>
179
- <td align="center">66.7</td>
180
- <td align="center">60.8</td>
181
- <td align="center">82.8</td>
182
- </tr>
183
- <tr>
184
- <td>GPT-3.5</td>
185
- <td align="center">47.3</td>
186
- <td align="center">33.3</td>
187
- <td align="center">55.7</td>
188
- <td align="center">74.1</td>
189
- </tr>
190
- <tr>
191
- <td>LLaMA2-13B-Chat</td>
192
- <td align="center">8.3</td>
193
- <td align="center">1.2</td>
194
- <td align="center">15.2</td>
195
- <td align="center">48.3</td>
196
- </tr>
197
- <tr>
198
- <td>CodeLLaMA-13B-Instruct</td>
199
- <td align="center">28.2</td>
200
- <td align="center">15.5</td>
201
- <td align="center">21.5</td>
202
- <td align="center">74.1</td>
203
- </tr>
204
- <tr>
205
- <td>InternLM-20B-Chat</td>
206
- <td align="center">34.6</td>
207
- <td align="center">10.7</td>
208
- <td align="center">24.1</td>
209
- <td align="center">65.5</td>
210
- </tr>
211
- <tr>
212
- <td>ChatGLM3-6B</td>
213
- <td align="center">54.2</td>
214
- <td align="center">4.8</td>
215
- <td align="center">15.2</td>
216
- <td align="center">62.1</td>
217
- </tr>
218
- <tr>
219
- <td>Qwen-1.8B-Chat</td>
220
- <td align="center">25.6</td>
221
- <td align="center">21.4</td>
222
- <td align="center">22.8</td>
223
- <td align="center">65.5</td>
224
- </tr>
225
- <tr>
226
- <td>Qwen-7B-Chat</td>
227
- <td align="center">41.9</td>
228
- <td align="center">23.8</td>
229
- <td align="center">38.0</td>
230
- <td align="center">67.2</td>
231
- </tr>
232
- <tr>
233
- <td>Qwen-14B-Chat</td>
234
- <td align="center">58.4</td>
235
- <td align="center">31.0</td>
236
- <td align="center">45.6</td>
237
- <td align="center">65.5</td>
238
- </tr>
239
- <tr>
240
- <td>Qwen-72B-Chat</td>
241
- <td align="center">72.7</td>
242
- <td align="center">41.7</td>
243
- <td align="center">43.0</td>
244
- <td align="center">82.8</td>
245
- </tr>
246
- </table>
247
-
248
- # Disclaimer
249
-
250
- This project is not intended to be an official product; rather, it serves as a proof of concept that highlights the capabilities of the Qwen series models.
251
-
252
- > Important: The code interpreter is not sandboxed, and it executes code in your own environment. Please do not ask Qwen to perform dangerous tasks, and do not directly use the code interpreter for production purposes.
 
README_CN.md CHANGED
@@ -157,96 +157,4 @@ python run_server.py --model_server http://{MODEL_SERVER_IP}:7905/v1 --workstati
157
  - 提取浏览内容使用代码解释器画图 [video](https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/showcase_chat_with_docs_and_code_interpreter.mp4)
158
  - 上传文件、多轮对话利用代码解释器分析数据 [video](https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/showcase_code_interpreter_multi_turn_chat.mp4)
159
 
160
- # 评测基准
161
-
162
- 我们也开源了一个评测基准,用于评估一个模型写Python代码并使用Code Interpreter进行数学解题、数据分析、及其他通用任务时的表现。评测基准见 [benchmark](benchmark/README.md) 目录,当前的评测结果如下:
163
-
164
- <table>
165
- <tr>
166
- <th colspan="5" align="center">In-house Code Interpreter Benchmark (Version 20231206)</th>
167
- </tr>
168
- <tr>
169
- <th rowspan="2" align="center">Model</th>
170
- <th colspan="3" align="center">代码执行结果正确性 (%)</th>
171
- <th colspan="1" align="center">生成代码的可执行率 (%)</th>
172
- </tr>
173
- <tr>
174
- <th align="center">Math↑</th><th align="center">Visualization-Hard↑</th><th align="center">Visualization-Easy↑</th><th align="center">General↑</th>
175
- </tr>
176
- <tr>
177
- <td>GPT-4</td>
178
- <td align="center">82.8</td>
179
- <td align="center">66.7</td>
180
- <td align="center">60.8</td>
181
- <td align="center">82.8</td>
182
- </tr>
183
- <tr>
184
- <td>GPT-3.5</td>
185
- <td align="center">47.3</td>
186
- <td align="center">33.3</td>
187
- <td align="center">55.7</td>
188
- <td align="center">74.1</td>
189
- </tr>
190
- <tr>
191
- <td>LLaMA2-13B-Chat</td>
192
- <td align="center">8.3</td>
193
- <td align="center">1.2</td>
194
- <td align="center">15.2</td>
195
- <td align="center">48.3</td>
196
- </tr>
197
- <tr>
198
- <td>CodeLLaMA-13B-Instruct</td>
199
- <td align="center">28.2</td>
200
- <td align="center">15.5</td>
201
- <td align="center">21.5</td>
202
- <td align="center">74.1</td>
203
- </tr>
204
- <tr>
205
- <td>InternLM-20B-Chat</td>
206
- <td align="center">34.6</td>
207
- <td align="center">10.7</td>
208
- <td align="center">24.1</td>
209
- <td align="center">65.5</td>
210
- </tr>
211
- <tr>
212
- <td>ChatGLM3-6B</td>
213
- <td align="center">54.2</td>
214
- <td align="center">4.8</td>
215
- <td align="center">15.2</td>
216
- <td align="center">62.1</td>
217
- </tr>
218
- <tr>
219
- <td>Qwen-1.8B-Chat</td>
220
- <td align="center">25.6</td>
221
- <td align="center">21.4</td>
222
- <td align="center">22.8</td>
223
- <td align="center">65.5</td>
224
- </tr>
225
- <tr>
226
- <td>Qwen-7B-Chat</td>
227
- <td align="center">41.9</td>
228
- <td align="center">23.8</td>
229
- <td align="center">38.0</td>
230
- <td align="center">67.2</td>
231
- </tr>
232
- <tr>
233
- <td>Qwen-14B-Chat</td>
234
- <td align="center">58.4</td>
235
- <td align="center">31.0</td>
236
- <td align="center">45.6</td>
237
- <td align="center">65.5</td>
238
- </tr>
239
- <tr>
240
- <td>Qwen-72B-Chat</td>
241
- <td align="center">72.7</td>
242
- <td align="center">41.7</td>
243
- <td align="center">43.0</td>
244
- <td align="center">82.8</td>
245
- </tr>
246
- </table>
247
-
248
- # 免责声明
249
-
250
- 本项目并非正式产品,而是一个概念验证项目,用于演示Qwen系列模型的能力。
251
-
252
- > 重要提示:代码解释器未进行沙盒隔离,会在部署环境中执行代码。请避免向Qwen发出危险指令,切勿将该代码解释器直接用于生产目的。
 
+
benchmark/README.md DELETED
@@ -1,248 +0,0 @@
1
- # Code Interpreter Benchmark
2
-
3
- ## Introduction
4
- To assess an LLM's ability to use the Python Code Interpreter for mathematical problem solving, data visualization, and general-purpose tasks such as file handling and web scraping, we have created and open-sourced a benchmark specifically designed to evaluate these capabilities.
5
-
6
- ### Metrics
- The metrics are divided into two parts: code executability and code correctness (a minimal aggregation sketch follows the list).
- - Code executability: whether the code generated by the LLM can be executed at all.
- - Code correctness: whether the generated code runs and produces the correct result.
-
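To make the two rates concrete, here is a minimal aggregation sketch (not part of the original benchmark code); the per-record field names `executable_code` and `correct` are illustrative assumptions rather than the benchmark's actual output schema.

```python
# Minimal sketch: aggregate executability and correctness over per-sample records.
# The field names `executable_code` and `correct` are assumed for illustration.
def summarize(records: list) -> dict:
    total = max(len(records), 1)  # avoid division by zero on empty input
    executable = sum(1 for r in records if r.get('executable_code'))
    correct = sum(1 for r in records if r.get('correct'))
    return {
        'code_executability_pct': 100.0 * executable / total,
        'code_correctness_pct': 100.0 * correct / total,
    }
```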
11
- ### Domain
12
- When evaluating the accuracy of code execution results for code correctness, we further divide it into two specific domains: `Math` and `Visualization`.
13
- In terms of code executability, we calculate the executable rate of the generated code for `General problem-solving` tasks.
14
-
15
- ## Results
16
- - Qwen-7B-Chat refers to the version updated after September 25, 2023.
17
- The code correctness judger model for `Visualization` was changed from `Qwen-vl-chat` to `gpt-4-vision-preview` in version 20231206.
18
-
19
- <table>
20
- <tr>
21
- <th colspan="5" align="center">In-house Code Interpreter Benchmark (Version 20231206)</th>
22
- </tr>
23
- <tr>
24
- <th rowspan="2" align="center">Model</th>
25
- <th colspan="3" align="center">Accuracy of Code Execution Results (%)</th>
26
- <th colspan="1" align="center">Executable Rate of Code (%)</th>
27
- </tr>
28
- <tr>
29
- <th align="center">Math↑</th><th align="center">Visualization-Hard↑</th><th align="center">Visualization-Easy↑</th><th align="center">General↑</th>
30
- </tr>
31
- <tr>
32
- <td>GPT-4</td>
33
- <td align="center">82.8</td>
34
- <td align="center">66.7</td>
35
- <td align="center">60.8</td>
36
- <td align="center">82.8</td>
37
- </tr>
38
- <tr>
39
- <td>GPT-3.5</td>
40
- <td align="center">47.3</td>
41
- <td align="center">33.3</td>
42
- <td align="center">55.7</td>
43
- <td align="center">74.1</td>
44
- </tr>
45
- <tr>
46
- <td>LLaMA2-13B-Chat</td>
47
- <td align="center">8.3</td>
48
- <td align="center">1.2</td>
49
- <td align="center">15.2</td>
50
- <td align="center">48.3</td>
51
- </tr>
52
- <tr>
53
- <td>CodeLLaMA-13B-Instruct</td>
54
- <td align="center">28.2</td>
55
- <td align="center">15.5</td>
56
- <td align="center">21.5</td>
57
- <td align="center">74.1</td>
58
- </tr>
59
- <tr>
60
- <td>InternLM-20B-Chat</td>
61
- <td align="center">34.6</td>
62
- <td align="center">10.7</td>
63
- <td align="center">24.1</td>
64
- <td align="center">65.5</td>
65
- </tr>
66
- <tr>
67
- <td>ChatGLM3-6B</td>
68
- <td align="center">54.2</td>
69
- <td align="center">4.8</td>
70
- <td align="center">15.2</td>
71
- <td align="center">62.1</td>
72
- </tr>
73
- <tr>
74
- <td>Qwen-1.8B-Chat</td>
75
- <td align="center">25.6</td>
76
- <td align="center">21.4</td>
77
- <td align="center">22.8</td>
78
- <td align="center">65.5</td>
79
- </tr>
80
- <tr>
81
- <td>Qwen-7B-Chat</td>
82
- <td align="center">41.9</td>
83
- <td align="center">23.8</td>
84
- <td align="center">38.0</td>
85
- <td align="center">67.2</td>
86
- </tr>
87
- <tr>
88
- <td>Qwen-14B-Chat</td>
89
- <td align="center">58.4</td>
90
- <td align="center">31.0</td>
91
- <td align="center">45.6</td>
92
- <td align="center">65.5</td>
93
- </tr>
94
- <tr>
95
- <td>Qwen-72B-Chat</td>
96
- <td align="center">72.7</td>
97
- <td align="center">41.7</td>
98
- <td align="center">43.0</td>
99
- <td align="center">82.8</td>
100
- </tr>
101
- </table>
102
-
103
- Furthermore, we also provide the results with `Qwen-vl-plus` as the code correctness judger model for the `Visualization` task, to serve as a reference.
104
-
105
- <table>
106
- <tr>
107
- <th colspan="3" align="center">Code Correctness Judger Model = Qwen-vl-plus</th>
108
- </tr>
109
- <tr>
110
- <th rowspan="2" align="center">Model</th>
111
- <th colspan="2" align="center">Accuracy of Code Execution Results (%)</th>
112
- </tr>
113
- <tr>
114
- <th align="center">Visualization-Hard↑</th>
115
- <th align="center">Visualization-Easy↑</th>
116
- </tr>
117
- <tr>
118
- <td>LLaMA2-13B-Chat</td>
119
- <td align="center">2.4</td>
120
- <td align="center">17.7</td>
121
- </tr>
122
- <tr>
123
- <td>CodeLLaMA-13B-Instruct</td>
124
- <td align="center">17.9</td>
125
- <td align="center">34.2</td>
126
- </tr>
127
- <tr>
128
- <td>InternLM-20B-Chat</td>
129
- <td align="center">9.5</td>
130
- <td align="center">31.7</td>
131
- </tr>
132
- <tr>
133
- <td>ChatGLM3-6B</td>
134
- <td align="center">10.7</td>
135
- <td align="center">29.1</td>
136
- </tr>
137
- <tr>
138
- <td>Qwen-1.8B-Chat</td>
139
- <td align="center">32.1</td>
140
- <td align="center">32.9</td>
141
- </tr>
142
- <tr>
143
- <td>Qwen-7B-Chat</td>
144
- <td align="center">26.2</td>
145
- <td align="center">39.2</td>
146
- </tr>
147
- <tr>
148
- <td>Qwen-14B-Chat</td>
149
- <td align="center">36.9</td>
150
- <td align="center">41.8</td>
151
- </tr>
152
- <tr>
153
- <td>Qwen-72B-Chat</td>
154
- <td align="center">38.1</td>
155
- <td align="center">38.0</td>
156
- </tr>
157
- </table>
158
-
159
-
160
-
161
- ## Usage
162
-
163
- ### Installation
164
-
165
- ```shell
166
- git clone https://github.com/QwenLM/Qwen-Agent.git
167
- cd benchmark
168
- pip install -r requirements.txt
169
- ```
170
-
171
- ### Dataset Download
172
- ```shell
173
- cd benchmark
174
- wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/assets/qwen_agent/benchmark_code_interpreter_data.zip
175
- unzip benchmark_code_interpreter_data.zip
176
- mkdir eval_data
177
- mv eval_code_interpreter_v1.jsonl eval_data/
178
- ```
179
-
180
- ### Evaluation
181
- To reproduce the full benchmark results, you can run the following script:
182
-
183
- ```Shell
184
- python inference_and_execute.py --model {model_name}
185
- ```
186
-
187
- {model_name}:
188
- - qwen-1.8b-chat
189
- - qwen-7b-chat
190
- - qwen-14b-chat
191
- - qwen-72b-chat
192
- - llama-2-7b-chat
193
- - llama-2-13b-chat
194
- - codellama-7b-instruct
195
- - codellama-13b-instruct
196
- - internlm-7b-chat-1.1
197
- - internlm-20b-chat
198
-
199
- The benchmark will run the test cases and generate the performance results. The results will be saved in the `output_data` directory.
200
-
201
- **Notes**:
202
- Please install the `simhei.ttf` font so that matplotlib displays correctly when evaluating the visualization task. You can do this by preparing `simhei.ttf` (which can be found on any Windows PC) and then running the following code snippet:
203
- ```python
204
- import os
205
- import matplotlib
206
- target_font_path = os.path.join(
207
- os.path.abspath(
208
- os.path.join(matplotlib.matplotlib_fname(), os.path.pardir)),
209
- 'fonts', 'ttf', 'simhei.ttf')
210
- os.system(f'cp simhei.ttf {target_font_path}')
211
- font_list_cache = os.path.join(matplotlib.get_cachedir(), 'fontlist-*.json')
212
- os.system(f'rm -f {font_list_cache}')
213
- ```
214
-
215
- #### Code Executable Rate
216
- ```Shell
217
- python inference_and_execute.py --task {task_name} --model {model_name}
218
- ```
219
-
220
- {task_name}:
221
- - `general`: General problem-solving task
222
-
223
-
224
- #### Code Correctness Rate
225
- ```Shell
226
- python inference_and_execute.py --task {task_name} --model {model_name}
227
- ```
228
-
229
- {task_name}:
230
- - `visualization`: Visualization task
231
- - `gsm8k`: Math task
232
-
233
-
234
- ## Configuration
- The `inference_and_execute.py` script supports the following configurable options (an example invocation is shown after the list):
- - `--model`: The model to test, which can be one of `qwen-72b-chat`, `qwen-14b-chat`, `qwen-7b-chat`, `qwen-1.8b-chat`, `llama-2-7b-chat`, `llama-2-13b-chat`, `codellama-7b-instruct`, `codellama-13b-instruct`, `internlm-7b-chat-1.1`, `internlm-20b-chat`.
- - `--task`: The test task, which can be one of `all`, `visualization`, `general`, `gsm8k`.
- - `--output-path`: The path for saving the evaluation results.
- - `--input-path`: The path containing the evaluation data.
- - `--output-fname`: The file name for the evaluation results.
- - `--input-fname`: The file name of the evaluation data.
- - `--force`: Force generation and overwrite any cached results.
- - `--eval-only`: Only calculate evaluation metrics without re-running inference.
- - `--eval-code-exec-only`: Only evaluate the code executable rate.
- - `--gen-exec-only`: Only generate and execute code without calculating evaluation metrics.
- - `--gen-only`: Only generate code, without executing it or calculating evaluation metrics.
- - `--vis-judger`: The model used to judge result correctness for the `Visualization` task, which can be one of `gpt-4-vision-preview`, `qwen-vl-chat`, `qwen-vl-plus`. It is set to `gpt-4-vision-preview` by default in version 20231206, and `Qwen-vl-chat` has been deprecated.
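For reference, a typical invocation combining several of these options might look like the following; the flags come from the list above, while the specific values are only an example.

```shell
# Example only: re-run the visualization task for Qwen-14B-Chat, overwrite any
# cached results, and judge the generated plots with gpt-4-vision-preview.
python inference_and_execute.py \
    --model qwen-14b-chat \
    --task visualization \
    --force \
    --vis-judger gpt-4-vision-preview \
    --output-path output_data \
    --output-fname visualization_qwen-14b-chat_res.jsonl
```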
 
benchmark/code_interpreter.py DELETED
@@ -1,250 +0,0 @@
1
- import base64
2
- import io
3
- import json
4
- import logging
5
- import os
6
- import queue
7
- import re
8
- import subprocess
9
- import sys
10
- import time
11
- import traceback
12
- import uuid
13
-
14
- import matplotlib
15
- import PIL.Image
16
- from jupyter_client import BlockingKernelClient
17
- from utils.code_utils import extract_code
18
-
19
- WORK_DIR = os.getenv('CODE_INTERPRETER_WORK_DIR', '/tmp/workspace')
20
-
21
- LAUNCH_KERNEL_PY = """
22
- from ipykernel import kernelapp as app
23
- app.launch_new_instance()
24
- """
25
-
26
- _KERNEL_CLIENTS = {}
27
-
28
-
29
- # Run this fix before jupyter starts if matplotlib cannot render CJK fonts.
30
- # And we need to additionally run the following lines in the jupyter notebook.
31
- # ```python
32
- # import matplotlib.pyplot as plt
33
- # plt.rcParams['font.sans-serif'] = ['SimHei']
34
- # plt.rcParams['axes.unicode_minus'] = False
35
- # ````
36
- def fix_matplotlib_cjk_font_issue():
37
- local_ttf = os.path.join(
38
- os.path.abspath(
39
- os.path.join(matplotlib.matplotlib_fname(), os.path.pardir)),
40
- 'fonts', 'ttf', 'simhei.ttf')
41
- if not os.path.exists(local_ttf):
42
- logging.warning(
43
- f'Missing font file `{local_ttf}` for matplotlib. It may cause some error when using matplotlib.'
44
- )
45
-
46
-
47
- def start_kernel(pid):
48
- fix_matplotlib_cjk_font_issue()
49
-
50
- connection_file = os.path.join(WORK_DIR,
51
- f'kernel_connection_file_{pid}.json')
52
- launch_kernel_script = os.path.join(WORK_DIR, f'launch_kernel_{pid}.py')
53
- for f in [connection_file, launch_kernel_script]:
54
- if os.path.exists(f):
55
- logging.warning(f'{f} already exists')
56
- os.remove(f)
57
-
58
- os.makedirs(WORK_DIR, exist_ok=True)
59
-
60
- with open(launch_kernel_script, 'w') as fout:
61
- fout.write(LAUNCH_KERNEL_PY)
62
-
63
- kernel_process = subprocess.Popen([
64
- sys.executable,
65
- launch_kernel_script,
66
- '--IPKernelApp.connection_file',
67
- connection_file,
68
- '--matplotlib=inline',
69
- '--quiet',
70
- ],
71
- cwd=WORK_DIR)
72
- logging.info(f"INFO: kernel process's PID = {kernel_process.pid}")
73
-
74
- # Wait for kernel connection file to be written
75
- while True:
76
- if not os.path.isfile(connection_file):
77
- time.sleep(0.1)
78
- else:
79
- # Keep looping if JSON parsing fails, file may be partially written
80
- try:
81
- with open(connection_file, 'r') as fp:
82
- json.load(fp)
83
- break
84
- except json.JSONDecodeError:
85
- pass
86
-
87
- # Client
88
- kc = BlockingKernelClient(connection_file=connection_file)
89
- kc.load_connection_file()
90
- kc.start_channels()
91
- kc.wait_for_ready()
92
- return kc
93
-
94
-
95
- def escape_ansi(line):
96
- ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]')
97
- return ansi_escape.sub('', line)
98
-
99
-
100
- def publish_image_to_local(image_base64: str):
101
- image_file = str(uuid.uuid4()) + '.png'
102
- local_image_file = os.path.join(WORK_DIR, image_file)
103
-
104
- png_bytes = base64.b64decode(image_base64)
105
- assert isinstance(png_bytes, bytes)
106
- bytes_io = io.BytesIO(png_bytes)
107
- PIL.Image.open(bytes_io).save(local_image_file, 'png')
108
-
109
- return local_image_file
110
-
111
-
112
- START_CODE = """
113
- import signal
114
- def _m6_code_interpreter_timeout_handler(signum, frame):
115
- raise TimeoutError("M6_CODE_INTERPRETER_TIMEOUT")
116
- signal.signal(signal.SIGALRM, _m6_code_interpreter_timeout_handler)
117
-
118
- def input(*args, **kwargs):
119
- raise NotImplementedError('Python input() function is disabled.')
120
-
121
- import os
122
- if 'upload_file' not in os.getcwd():
123
- os.chdir("./upload_file/")
124
-
125
- import math
126
- import re
127
- import json
128
-
129
- import seaborn as sns
130
- sns.set_theme()
131
-
132
- import matplotlib
133
- import matplotlib.pyplot as plt
134
- plt.rcParams['font.sans-serif'] = ['SimHei']
135
- plt.rcParams['axes.unicode_minus'] = False
136
-
137
- import numpy as np
138
- import pandas as pd
139
-
140
- from sympy import Eq, symbols, solve
141
- """
142
-
143
-
144
- def code_interpreter(action_input_list: list, timeout=30, clear=False):
145
- code = ''
146
- for action_input in action_input_list:
147
- code += (extract_code(action_input) + '\n')
148
- fixed_code = []
149
- for line in code.split('\n'):
150
- fixed_code.append(line)
151
- if line.startswith('sns.set_theme('):
152
- fixed_code.append('plt.rcParams["font.sans-serif"] = ["SimHei"]')
153
- fixed_code.append('plt.rcParams["axes.unicode_minus"] = False')
154
- fixed_code = '\n'.join(fixed_code)
155
- if 'def solution()' in fixed_code:
156
- fixed_code += '\nsolution()'
157
-
158
- return _code_interpreter(fixed_code, timeout, clear)
159
-
160
-
161
- def _code_interpreter(code: str, timeout, clear=False):
162
- if not code.strip():
163
- return ''
164
- if timeout:
165
- code = f'signal.alarm({timeout})\n{code}'
166
- if clear:
167
- code = "get_ipython().run_line_magic('reset', '-f')\n" + START_CODE + code
168
-
169
- pid = os.getpid()
170
- if pid not in _KERNEL_CLIENTS:
171
- _KERNEL_CLIENTS[pid] = start_kernel(pid)
172
- _code_interpreter(START_CODE, timeout=None)
173
- kc = _KERNEL_CLIENTS[pid]
174
- kc.wait_for_ready()
175
- kc.execute(code)
176
- result = ''
177
- image_idx = 0
178
- while True:
179
- text = ''
180
- image = ''
181
- finished = False
182
- msg_type = 'error'
183
- try:
184
- msg = kc.get_iopub_msg()
185
- msg_type = msg['msg_type']
186
- if msg_type == 'status':
187
- if msg['content'].get('execution_state') == 'idle':
188
- finished = True
189
- elif msg_type == 'execute_result':
190
- text = msg['content']['data'].get('text/plain', '')
191
- if 'image/png' in msg['content']['data']:
192
- image_b64 = msg['content']['data']['image/png']
193
- image_url = publish_image_to_local(image_b64)
194
- image_idx += 1
195
- image = '![fig-%03d](%s)' % (image_idx, image_url)
196
- elif msg_type == 'display_data':
197
- if 'image/png' in msg['content']['data']:
198
- image_b64 = msg['content']['data']['image/png']
199
- image_url = publish_image_to_local(image_b64)
200
- image_idx += 1
201
- image = '![fig-%03d](%s)' % (image_idx, image_url)
202
- else:
203
- text = msg['content']['data'].get('text/plain', '')
204
- elif msg_type == 'stream':
205
- msg_type = msg['content']['name'] # stdout, stderr
206
- text = msg['content']['text']
207
- elif msg_type == 'error':
208
- text = escape_ansi('\n'.join(msg['content']['traceback']))
209
- if 'M6_CODE_INTERPRETER_TIMEOUT' in text:
210
- text = f'Timeout. No response after {timeout} seconds.'
211
- except queue.Empty:
212
- text = f'Timeout. No response after {timeout} seconds.'
213
- finished = True
214
- except Exception:
215
- text = 'The code interpreter encountered an unexpected error.'
216
- logging.warning(''.join(
217
- traceback.format_exception(*sys.exc_info())))
218
- finished = True
219
- if text:
220
- result += f'\n\n{msg_type}:\n\n```\n{text}\n```'
221
- if image:
222
- result += f'\n\n{image}'
223
- if finished:
224
- break
225
- result = result.lstrip('\n')
226
- if timeout:
227
- _code_interpreter('signal.alarm(0)', timeout=None)
228
- return result
229
-
230
-
231
- def get_multiline_input(hint):
232
- print(hint)
233
- print('// Press ENTER to make a new line. Press CTRL-D to end input.')
234
- lines = []
235
- while True:
236
- try:
237
- line = input()
238
- except EOFError: # CTRL-D
239
- break
240
- lines.append(line)
241
- print('// Input received.')
242
- if lines:
243
- return '\n'.join(lines)
244
- else:
245
- return ''
246
-
247
-
248
- if __name__ == '__main__':
249
- while True:
250
- print(code_interpreter([get_multiline_input('Enter python code:')]))
 
benchmark/config.py DELETED
@@ -1,66 +0,0 @@
1
- from parser import InternLMReActParser, ReActParser
2
-
3
- from models import LLM, QwenVL, Qwen, QwenDashscopeVLModel
4
- from prompt import InternLMReAct, LlamaReAct, QwenReAct
5
-
6
- react_prompt_map = {
7
- 'qwen': QwenReAct,
8
- 'llama': LlamaReAct,
9
- 'internlm': InternLMReAct,
10
- }
11
-
12
- react_parser_map = {
13
- 'qwen': ReActParser,
14
- 'llama': ReActParser,
15
- 'internlm': InternLMReActParser,
16
- }
17
-
18
- model_map = {'qwen': Qwen, 'llama': LLM, 'internlm': LLM, 'qwen-vl-chat': QwenVL}
19
-
20
- model_type_map = {
21
- 'qwen-72b-chat': 'qwen',
22
- 'qwen-14b-chat': 'qwen',
23
- 'qwen-1.8b-chat': 'qwen',
24
- 'qwen-7b-chat': 'qwen',
25
- 'llama-2-7b-chat': 'llama',
26
- 'llama-2-13b-chat': 'llama',
27
- 'codellama-7b-instruct': 'llama',
28
- 'codellama-13b-instruct': 'llama',
29
- 'internlm-7b-chat-1.1': 'internlm',
30
- 'internlm-20b-chat': 'internlm',
31
- 'qwen-vl-chat': 'qwen-vl-chat',
32
- }
33
-
34
- model_path_map = {
35
- 'qwen-72b-chat': 'Qwen/Qwen-72B-Chat',
36
- 'qwen-14b-chat': 'Qwen/Qwen-14B-Chat',
37
- 'qwen-7b-chat': 'Qwen/Qwen-7B-Chat',
38
- 'qwen-1.8b-chat': 'Qwen/Qwen-1_8B-Chat',
39
- 'llama-2-7b-chat': 'meta-llama/Llama-2-7b-chat-hf',
40
- 'llama-2-13b-chat': 'meta-llama/Llama-2-13b-chat-hf',
41
- 'codellama-7b-instruct': 'codellama/CodeLlama-7b-Instruct-hf',
42
- 'codellama-13b-instruct': 'codellama/CodeLlama-13b-Instruct-hf',
43
- 'internlm-7b-chat-1.1': 'internlm/internlm-chat-7b-v1_1',
44
- 'internlm-20b-chat': 'internlm/internlm-chat-20b',
45
- 'qwen-vl-chat': 'Qwen/Qwen-VL-Chat',
46
- }
47
-
48
-
49
- def get_react_prompt(model_name, query, lang, upload_fname_list):
50
- react_prompt_cls = react_prompt_map.get(model_type_map[model_name],
51
- QwenReAct)
52
- return react_prompt_cls(query, lang, upload_fname_list)
53
-
54
-
55
- def get_react_parser(model_name):
56
- react_parser_cls = react_parser_map.get(model_type_map[model_name],
57
- ReActParser)
58
- return react_parser_cls()
59
-
60
-
61
- def get_model(model_name):
62
- if model_name in ["qwen-vl-plus"]:
63
- return QwenDashscopeVLModel(model=model_name)
64
- model_path = model_path_map.get(model_name, None)
65
- model_cls = model_map.get(model_type_map[model_name], LLM)
66
- return model_cls(model_path)
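As an illustration only (the query string and model name below are made up), these helpers resolve the prompt builder, output parser, and model wrapper for a given model:

```python
# Illustrative sketch of how config.py's helpers are combined; not part of the file.
model_name = 'qwen-14b-chat'
react_prompt = get_react_prompt(model_name, 'Plot y = x^2 for x in [0, 10]', 'en', [])
react_parser = get_react_parser(model_name)
# model = get_model(model_name)  # loads the Hugging Face checkpoint; needs GPU memory
```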
 
benchmark/inference_and_execute.py DELETED
@@ -1,280 +0,0 @@
1
- import argparse
2
- import json
3
- import logging
4
- import os
5
- from parser import ReActParser
6
-
7
- import prettytable
8
- import tqdm
9
- from code_interpreter import code_interpreter
10
- from config import (get_model, get_react_parser, get_react_prompt,
11
- model_path_map)
12
- from datasets import load_dataset
13
- from metrics.code_execution import eval_code_execution_rate
14
- from metrics.gsm8k import eval_gsm8k_acc, is_correct
15
- from metrics.visualization import eval_visualization_acc
16
- from utils.code_utils import replace_upload_fname
17
- from utils.data_utils import load_jsonl
18
-
19
- logging.basicConfig(
20
- format='%(asctime)s - %(levelname)s - %(message)s',
21
- datefmt='%Y-%m-%d %H:%M:%S',
22
- level=logging.INFO,
23
- )
24
-
25
- WORK_DIR = os.getenv('CODE_INTERPRETER_WORK_DIR', '/tmp/workspace')
26
- os.makedirs(WORK_DIR, exist_ok=True)
27
- os.system(f'cp -r upload_file_clean {WORK_DIR}/upload_file')
28
- os.system('cp -r upload_file_clean ./upload_file')
29
-
30
- global_eval_result = {
31
- 'code_executability': {
32
- 'math': None,
33
- 'visualization': None,
34
- 'general': None,
35
- },
36
- 'code_correctness': {
37
- 'math': None,
38
- 'visualization-hard': None,
39
- 'visualization-easy': None,
40
- }
41
- }
42
-
43
-
44
- def llm_with_plugin(args, query, item=None, exec_limit=3):
45
- exec_count = 0
46
-
47
- # Build ReAct prompt
48
- upload_fname_list = item[
49
- 'input_file_path'] if item and 'input_file_path' in item else []
50
- lang = item['lang'] if item and 'lang' in item else 'en'
51
- react_prompt_obj = get_react_prompt(args.model, query, lang,
52
- upload_fname_list)
53
- planning_prompt = react_prompt_obj.build_prompt()
54
-
55
- # Execute the code when providing the first action in the query
56
- if '<|im_start|>' in query:
57
- _, prepend_code, __ = ReActParser().parse_latest_plugin_call(query)
58
- prepend_code = replace_upload_fname(prepend_code, upload_fname_list)
59
- call_plugin(_, [prepend_code], clear=(exec_count == 0))
60
- exec_count += 1
61
- exec_limit += 1
62
-
63
- # Inference and execute
64
- text = ''
65
- while exec_count < exec_limit:
66
- stop_words_list = react_prompt_obj.get_stop_words_list()
67
- output = text_completion(args.llm,
68
- planning_prompt + text,
69
- stop_words=stop_words_list)
70
-
71
- if args.gen_only:
72
- text += output
73
- break
74
-
75
- react_parser = get_react_parser(args.model)
76
- action, action_input, output = react_parser.parse_latest_plugin_call(
77
- output)
78
- if action:
79
- action_input = replace_upload_fname(action_input,
80
- upload_fname_list)
81
- observation = call_plugin(action, [action_input],
82
- clear=(exec_count == 0))
83
- output += react_prompt_obj.build_observation(observation)
84
- text += output
85
- exec_count += 1
86
- if 'error:' in observation or 'Traceback' in observation:
87
- break
88
- else:
89
- text += output
90
- break
91
- return text
92
-
93
-
94
- def text_completion(llm, input_text, stop_words=[]):
95
- logging.info('Generating'.center(60, '='))
96
- logging.info('Input'.center(60, '-'))
97
- logging.info(input_text)
98
-
99
- output = llm.generate(input_text, stop_words)
100
-
101
- logging.info('Output'.center(60, '-'))
102
- logging.info(output)
103
- return output
104
-
105
-
106
- def call_plugin(plugin_name, plugin_args_list, clear=False):
107
- # Relax constraints on plugin name.
108
- logging.info('Call code interpreter'.center(60, '='))
109
- obs = code_interpreter(plugin_args_list, clear=clear)
110
- logging.info(obs)
111
- return obs
112
-
113
-
114
- def process_code_interpreter(item, writer):
115
- query = item['query']
116
- exec_limit = 3 if 'visualization' in item['tags'] else 1
117
- response = llm_with_plugin(args=args,
118
- query=query,
119
- item=item,
120
- exec_limit=exec_limit)
121
- item['gen'] = response
122
-
123
- writer.write(json.dumps(item, ensure_ascii=False) + '\n')
124
- writer.flush()
125
-
126
-
127
- def process_gsm8k(doc, writer):
128
- context = doc['question']
129
- completion = llm_with_plugin(args=args, query=context)
130
- acc = is_correct(completion, doc['answer'])
131
- doc['completion'] = completion
132
- doc['acc'] = acc
133
-
134
- writer.write(json.dumps(doc, ensure_ascii=False) + '\n')
135
- writer.flush()
136
-
137
-
138
- def sequential_processing(args, data_list, process_func, writer):
139
- for item in tqdm.tqdm(data_list):
140
- process_func(item, writer)
141
-
142
-
143
- process_func_map = {
144
- 'gsm8k': process_gsm8k,
145
- 'visualization': process_code_interpreter
146
- }
147
-
148
-
149
- def gather_eval_result(model_name):
150
- for metric in global_eval_result:
151
- logging.info(metric)
152
- table = prettytable.PrettyTable()
153
- table.field_names = ['model'] + list(global_eval_result[metric].keys())
154
- row_data = [model_name]
155
- for item in global_eval_result[metric].values():
156
- item = str(item) if not item else str(round(item, 2))
157
- row_data.append(item)
158
- table.add_row(row_data)
159
- logging.info('\n' + str(table))
160
-
161
-
162
- def eval_metrics(args, test_set, full_output_fname):
163
- # metrics
164
- assert os.path.exists(
165
- full_output_fname), f'Not Found File {full_output_fname}.'
166
- inference_res = load_jsonl(full_output_fname)
167
- assert len(inference_res) == len(
168
- test_set
169
- ), f'There are still {len(test_set)-len(inference_res)} cases left.'
170
-
171
- abs_output_fname = os.path.join(os.path.dirname(os.path.abspath(__file__)),
172
- full_output_fname)
173
- if args.task == 'gsm8k':
174
- math_code_correctness = eval_gsm8k_acc(abs_output_fname)
175
- global_eval_result['code_correctness'].update(math_code_correctness)
176
- else:
177
- code_executability = eval_code_execution_rate(abs_output_fname,
178
- args.task, args.model)
179
- global_eval_result['code_executability'].update(code_executability)
180
- if args.task in ['all_ci', 'visualization'
181
- ] and not args.eval_code_exec_only:
182
- visualization_code_correctness = eval_visualization_acc(
183
- abs_output_fname, args.model, args.vis_judger)
184
- global_eval_result['code_correctness'].update(
185
- visualization_code_correctness)
186
-
187
-
188
- def main(args):
189
- current_dir = os.getcwd()
190
- os.makedirs(args.output_path, exist_ok=True)
191
- full_output_fname = os.path.join(
192
- args.output_path,
193
- (args.output_fname or f'{args.task}_{args.model}_res.jsonl'))
194
-
195
- if not os.path.exists(full_output_fname):
196
- with open(full_output_fname, 'w'):
197
- logging.info(f'Create file {full_output_fname} done.')
198
-
199
- # build data
200
- if args.task == 'gsm8k':
201
- dataset = load_dataset('gsm8k', 'main')
202
- test_set = dataset['test']
203
- else:
204
- eval_data_path = os.path.join(args.input_path, args.input_fname)
205
- test_set = [
206
- item for item in load_jsonl(eval_data_path)
207
- if args.task in item['tags']
208
- ]
209
- logging.info(f'Test set: {len(test_set)}')
210
-
211
- if args.eval_only:
212
- eval_metrics(args, test_set, full_output_fname)
213
- else:
214
- key = 'question' if args.task == 'gsm8k' else 'query'
215
- cache_question = [item[key] for item in load_jsonl(full_output_fname)
216
- ] if not args.force else []
217
- data_list = [
218
- item for item in test_set if item[key] not in cache_question
219
- ]
220
- logging.info(f'Left cases: {len(data_list)}')
221
-
222
- # inference
223
- writer_mode = 'w' if args.force else 'a'
224
- f_output = open(full_output_fname, writer_mode, encoding='utf-8')
225
- process_func = process_func_map.get(args.task,
226
- process_code_interpreter)
227
- sequential_processing(args, data_list, process_func, f_output)
228
- f_output.close()
229
-
230
- # evaluate
231
- if not args.gen_exec_only:
232
- eval_metrics(args, test_set, full_output_fname)
233
-
234
- os.chdir(current_dir)
235
-
236
-
237
- def parse_args():
238
- parser = argparse.ArgumentParser()
239
- parser.add_argument('--model',
240
- type=str,
241
- default='qwen-14b-chat',
242
- choices=list(model_path_map.keys()))
243
- parser.add_argument(
244
- '--task',
245
- type=str,
246
- default='all',
247
- choices=['all', 'gsm8k', 'visualization', 'general'])
248
- parser.add_argument('--output-path', type=str, default='output_data')
249
- parser.add_argument('--input-path', type=str, default='eval_data')
250
- parser.add_argument('-o', '--output-fname', type=str, default='')
251
- parser.add_argument('-i',
252
- '--input-fname',
253
- type=str,
254
- default='eval_code_interpreter_v1.jsonl')
255
- parser.add_argument('-f', '--force', action='store_true', default=False)
256
- parser.add_argument('--eval-only', action='store_true', default=False)
257
- parser.add_argument('--eval-code-exec-only',
258
- action='store_true',
259
- default=False)
260
- parser.add_argument('--gen-exec-only', action='store_true', default=False)
261
- parser.add_argument('--gen-only', action='store_true', default=False)
262
- parser.add_argument('--vis-judger', type=str, default='gpt-4-vision-preview',
263
- choices=['gpt-4-vision-preview', 'qwen-vl-chat', 'qwen-vl-plus'])
264
- args = parser.parse_args()
265
- return args
266
-
267
-
268
- if __name__ == '__main__':
269
- args = parse_args()
270
- if not args.eval_only:
271
- args.llm = get_model(args.model)
272
- logging.info(f'Init {args.model} done.')
273
-
274
- if args.task == 'all':
275
- for key in ['gsm8k', 'visualization', 'general']:
276
- args.task = key
277
- main(args)
278
- else:
279
- main(args)
280
- gather_eval_result(args.model)
 
benchmark/metrics/__init__.py DELETED
File without changes
benchmark/metrics/code_execution.py DELETED
@@ -1,257 +0,0 @@
1
- import logging
2
- import os
3
-
4
- import func_timeout
5
- from config import get_react_parser
6
- from func_timeout import func_set_timeout
7
- from utils.code_utils import extract_code, replace_upload_fname
8
- from utils.data_utils import load_jsonl, save_jsonl
9
-
10
- pre_load = """
11
- import os
12
- if 'upload_file' not in os.getcwd():
13
- os.chdir("./upload_file/")
14
-
15
- import seaborn as sns
16
-
17
- import matplotlib
18
- # matplotlib.use('Agg')
19
- import matplotlib.pyplot as plt
20
- plt.ion()
21
-
22
- import numpy as np
23
- import pandas as pd
24
- from sympy import Eq, symbols, solve
25
- import re
26
- import json
27
- import math
28
- """
29
-
30
- tags_config = {
31
- 'visualization': {
32
- 'timelimit': True,
33
- 'extract_first_code': True,
34
- },
35
- 'math': {
36
- 'timelimit': True,
37
- 'extract_first_code': False,
38
- },
39
- 'general': {
40
- 'timelimit': False,
41
- 'extract_first_code': True,
42
- }
43
- }
44
-
45
- code_executability = {'math': None, 'visualization': None, 'general': None}
46
-
47
-
48
- @func_set_timeout(10)
49
- def exec_limit_time(text):
50
- exec(text, locals())
51
-
52
-
53
- def exec_code(text, timelimit=False):
54
- if timelimit:
55
- exec_limit_time(text)
56
- else:
57
- exec(text, locals())
58
-
59
-
60
- def postprocess_code(gen_code, line):
61
- if '<|im_start|>' in line['query']:
62
- first_action_code = get_action_input_code(line['query'])
63
- gen_code = first_action_code + gen_code
64
-
65
- upload_fname_list = line[
66
- 'input_file_path'] if line and 'input_file_path' in line else []
67
- gen_code = replace_upload_fname(gen_code, upload_fname_list)
68
-
69
- if 'def solution()' in gen_code:
70
- gen_code += '\nsolution()\n'
71
-
72
- if 'plt.show()' in gen_code:
73
- gen_code += "\nplt.pause(1)\nplt.close('all')\n"
74
-
75
- if 'sns.' in gen_code and 'plot' in gen_code:
76
- gen_code += "\nplt.close('all')\n"
77
-
78
- gen_code = pre_load + gen_code
79
- return gen_code
80
-
81
-
82
- def get_action_input_code(text,
83
- model_name='qwen-14b-chat',
84
- extract_first_code=False):
85
- action_input_list = []
86
- tmp = text
87
- react_parser = get_react_parser(model_name)
88
- while True:
89
- action_input = react_parser.get_first_action_input(tmp)
90
- if not action_input:
91
- break
92
- action_input_list.append(action_input)
93
- tmp = tmp.split(action_input)[1]
94
- if not tmp or extract_first_code:
95
- break
96
-
97
- code = ''
98
- for action_input in action_input_list:
99
- code = code + '# concat\n' + extract_code(action_input) + '\n'
100
- return code
101
-
102
-
103
- def eval_code_execution_rate(output_fname,
104
- tag='all_ci',
105
- model_name='qwen-14b-chat',
106
- timelimit=False,
107
- extract_first_code=False):
108
- data_list = load_jsonl(output_fname)
109
- pip_package = []
110
-
111
- for line_id, line in enumerate(data_list):
112
- line['idx'] = line_id
113
- tags_list = line['tags'].split(',')
114
- if tag not in tags_list:
115
- continue
116
-
117
- # update args
118
- for cur_tag in tags_list:
119
- if cur_tag != 'all_ci':
120
- timelimit = tags_config[cur_tag]['timelimit']
121
- extract_first_code = tags_config[cur_tag]['extract_first_code']
122
-
123
- line['executable_code'] = False
124
- line['missing_code'] = False
125
- line['code_error_info'] = ''
126
-
127
- # get Action Input code from response
128
- gen_code = get_action_input_code(line['gen'],
129
- model_name=model_name,
130
- extract_first_code=extract_first_code)
131
-
132
- if not gen_code:
133
- line['missing_code'] = True
134
- line['code'] = ''
135
- line['code_error_info'] = 'missing code'
136
- continue
137
-
138
- line['code'] = gen_code
139
- gen_code = postprocess_code(gen_code, line)
140
-
141
- while True:
142
- try:
143
- exec_code(gen_code, timelimit=timelimit)
144
- line['executable_code'] = True
145
- break
146
- except func_timeout.exceptions.FunctionTimedOut as ex:
147
- line['code_error_info'] = str(ex)
148
- break
149
- except (ImportError, ModuleNotFoundError) as ex:
150
- try:
151
- package = str(ex).split("'")[1].strip()
- except Exception:
- package = ''
- if package and package not in pip_package:  # install the missing package
- pip_package.append(package)
- os.system('pip install ' + package)
- logging.info(f'Automatic installation: {package}')
158
- else:
159
- line['code_error_info'] = str(ex)
160
- break
161
- except Exception as ex:
162
- line['code_error_info'] = str(ex)
163
- break
164
-
165
- # double check
166
- observation = get_react_parser(model_name).get_first_observation(
167
- line['gen'])
168
- if line['executable_code'] and ('error:' in observation):
169
- logging.warning(
170
- 'The code executes correctly, but it has an error in IPython!')
171
- logging.warning(f'Code:\n{gen_code}')
172
- logging.warning(f'IPython error info:\n{observation}')
173
- logging.info('=' * 60)
174
- elif not line['executable_code'] and not ('error:' in observation):
175
- logging.warning(
176
- 'The code has an execution error, but it runs correctly in IPython!'
177
- )
178
- logging.warning(f'Code:\n{gen_code}')
179
- logging.warning(f"Exec error info:\n{line['code_error_info']}")
180
- logging.warning(f'IPython observation:\n{observation}')
181
- logging.info('=' * 60)
182
-
183
- # save error data
184
- error_data_list = [
185
- item for item in data_list
186
- if not item['executable_code'] or item['missing_code']
187
- ]
188
- error_data_output_fname = os.path.splitext(
189
- output_fname)[0] + '_exec_error.jsonl'
190
- save_jsonl(error_data_list, error_data_output_fname)
191
-
192
- log_result(data_list)
193
-
194
- return code_executability
195
-
196
-
197
- def log_result(data_list, verbose=True):
198
- if verbose:
199
- logging.info('*' * 60)
200
- logging.info('{:^60}'.format('Detail'))
201
- logging.info('*' * 60)
202
- for line_id, line in enumerate(data_list):
203
- logging.info(f'Question {line_id}'.center(60, '='))
204
- logging.info(line['query'])
205
-
206
- logging.info(f'Generated {line_id}'.center(60, '-'))
207
- logging.info('\n' + line['gen'])
208
-
209
- logging.info(f'Code {line_id}'.center(60, '-'))
210
- logging.info('\n' + line['code'])
211
-
212
- logging.info(f'Exec Result {line_id}'.center(60, '-'))
213
- prefix_info = 'Exec Success' if line[
214
- 'executable_code'] else 'Exec Error: '
215
- exec_info = prefix_info + line['code_error_info']
216
- logging.info(exec_info)
217
-
218
- logging.info('=' * 60)
219
- logging.info('{:^60}'.format('Code Execution Rate'))
220
- logging.info('=' * 60)
221
- involved_tags = []
222
- for line in data_list:
223
- involved_tags += line['tags'].split(',')
224
- involved_tags = list(set(involved_tags))
225
-
226
- for key in involved_tags:
227
- logging.info(f'task: {key}'.center(60, '='))
228
- key_item_list = [item for item in data_list if key in item['tags']]
229
- all_count = len(key_item_list)
230
- missing_code_count = len(
231
- [item for item in key_item_list if item['missing_code']])
232
- executable_code_count = len(
233
- [item for item in key_item_list if item['executable_code']])
234
-
235
- logging.info(f'All Test: {all_count}')
236
- logging.info(f'Missing Code: {missing_code_count}')
237
- logging.info(f'Predict Exec Success: {executable_code_count}')
238
- logging.info('Codes available && Execution Rate: {:.2f}'.format(
239
- executable_code_count / (all_count - missing_code_count) * 100))
240
- logging.info('Execution Rate: {:.2f}'.format(executable_code_count /
241
- all_count * 100))
242
- logging.info('Non-executable rate: {:.2f}'.format(
243
- (all_count - missing_code_count - executable_code_count) /
244
- all_count * 100))
245
- logging.info('Missing code rate: {:.2f}'.format(missing_code_count /
246
- all_count * 100))
247
-
248
- if key != 'all_ci':
249
- code_executability[key] = executable_code_count / all_count * 100
250
-
251
- if verbose:
252
- logging.info('Error List: ')
253
- error_list = [(item['idx'], item['code_error_info'])
254
- for item in key_item_list if item['code_error_info']]
255
- error_list.sort(key=lambda x: x[1])
256
- for x in error_list:
257
- logging.info(x)
 
benchmark/metrics/gsm8k.py DELETED
@@ -1,54 +0,0 @@
1
- import logging
2
- import os
3
- import re
4
-
5
- import numpy as np
6
- from utils.data_utils import load_jsonl, save_jsonl
7
-
8
- INVALID_ANS = '[invalid]'
9
-
10
-
11
- def extract_answer(completion):
12
-
13
- def _get_last_digit(s):
14
- _PAT_LAST_DIGIT = re.compile(
15
- r'(?<=(\s|[\$%#{]))([+-])?(?=(\S))(0|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?=(\s|[.,}]|$))'
16
- )
17
- match = list(_PAT_LAST_DIGIT.finditer(s))
18
- if match:
19
- last_digit = match[-1].group().replace(',', '').replace('+', '')
20
- else:
21
- last_digit = None
22
- logging.warning(f'No digits found in {s!r}')
23
- return last_digit
24
-
25
- job_gen = completion.strip('.').replace('\n', '\\n')
26
- last_digit = _get_last_digit(job_gen)
27
- if last_digit:
28
- return eval(last_digit)
29
- else:
30
- return INVALID_ANS
31
-
32
-
33
- def is_correct(completion, answer):
34
- gold = extract_answer(answer)
35
- assert gold != INVALID_ANS, 'No ground truth answer found in the document.'
36
- return extract_answer(completion) == gold
37
-
38
-
39
- def eval_gsm8k_acc(output_fname):
40
- data_list = load_jsonl(output_fname)
41
- acc_res = [item['acc'] for item in data_list]
42
- logging.info('=' * 60)
43
- logging.info('{:^60}'.format('Math Acc.'))
44
- logging.info('=' * 60)
45
- logging.info('Total num={:.2f}'.format(len(acc_res)))
46
- logging.info('Right num={:.2f}'.format(np.sum(acc_res)))
47
- logging.info('Zero-shot Acc={:.2f}'.format(np.mean(acc_res) * 100))
48
-
49
- error_data_list = [item for item in data_list if not item['acc']]
50
- error_data_output_fname = os.path.splitext(
51
- output_fname)[0] + '_gsm8k_error.jsonl'
52
- save_jsonl(error_data_list, error_data_output_fname)
53
-
54
- return {'math': np.mean(acc_res) * 100}
 
benchmark/metrics/visualization.py DELETED
@@ -1,179 +0,0 @@
1
- import logging
2
- import os
3
- import re
4
- import base64
5
- import torch
6
- from config import get_model, get_react_parser
7
- from utils.data_utils import load_jsonl, save_jsonl
8
-
9
- torch.manual_seed(1234)
10
-
11
- EVAL_VISUAL_PROMPT_ZH = """请判断图片是否与下面的[问题]一致,如果一致则回复“right”,不一致则回复“wrong”。
12
- [问题]:{query}
13
- """
14
-
15
- EVAL_VISUAL_PROMPT_EN = """Please judge whether the image is consistent with the [Question] below, if it is consistent then reply "right", if not then reply "wrong".
16
- [Question]: {query}
17
- """
18
-
19
- visualization_code_correctness = {
20
- 'visualization-hard': None,
21
- 'visualization-easy': None,
22
- }
23
-
24
-
25
- def encode_image(image_path):
26
- with open(image_path, "rb") as image_file:
27
- a = base64.b64encode(image_file.read()).decode('utf-8')
28
- return a
29
-
30
-
31
- def judger_model_inference(judger_model_name, judger_model, imgs=[], prompt=''):
32
- output = ""
33
- if judger_model_name == 'gpt-4-vision-preview':
34
- logging.warning("This is an example of `gpt-4-vision-preview`. "
35
- "Please set the API key and use according to your actual situation.")
36
- from openai import OpenAI
37
- client = OpenAI()
38
- content_list = []
39
- content_list.append({"type": "text", "text": prompt})
40
- input_images = []
41
- for img in imgs:
42
- if 'http' not in img:
43
- base64_image = encode_image(img)
44
- img = f"data:image/jpeg;base64,{base64_image}"
45
- input_images.append({"type": "image_url", 'image_url': img})
46
- content_list.extend(input_images)
47
- response = client.chat.completions.create(
48
- model="gpt-4-vision-preview",
49
- messages=[
50
- {
51
- "role": "user",
52
- "content": content_list,
53
- }
54
- ],
55
- max_tokens=300,
56
- )
57
- output = response.choices[0]
58
- elif judger_model_name in ['qwen-vl-plus', 'qwen-vl-chat']:
59
- inputs = []
60
- for img in imgs:
61
- if 'http' not in img and judger_model_name == 'qwen-vl-plus':
62
- img = "file://" + img
63
- inputs.append({'image': img})
64
- inputs.append({'text': prompt})
65
-
66
- logging.info('Eval'.center(60, '-'))
67
- logging.info(inputs)
68
- output = judger_model.generate(inputs)
69
- logging.info(output)
70
- logging.info('=' * 60)
71
- return output
72
-
73
-
74
- def extract_images(text):
75
- regex = re.compile(r'!\[fig-(.+)\]\((.+)\)')
76
- results = re.findall(regex, text)
77
- images = []
78
- for res in results:
79
- assert len(res) == 2
80
- if os.path.exists(res[1]):
81
- images.append(res[1])
82
- return images
83
-
84
-
85
- def check_images_observation(text, images, model_name):
86
- start_flag = get_react_parser(model_name).observation
87
- for image in images:
88
- logging.info('Image'.center(60, '-'))
89
- logging.info(image)
90
-
91
- end_idx = text.find(image)
92
- tmp_text = text[:end_idx + len(image)]
93
- start_idx = tmp_text.rfind(start_flag)
94
- check_text = tmp_text[start_idx + len(start_flag):]
95
-
96
- logging.info('Observation'.center(60, '-'))
97
- logging.info(check_text)
98
-
99
- # As long as there exists correctly executed observation, we consider `True`
100
- if 'error:' not in check_text and 'Traceback' not in check_text:
101
- return True
102
- return False
103
-
104
-
105
- eval_visual_prompt = {'zh': EVAL_VISUAL_PROMPT_ZH, 'en': EVAL_VISUAL_PROMPT_EN}
106
-
107
-
108
- def eval_visualization_acc(output_fname, model_name, judger_model_name='gpt-4-vision-preview'):
109
- if judger_model_name == 'gpt-4-vision-preview':
110
- judger_model = None
111
- elif judger_model_name in ['qwen-vl-chat', 'qwen-vl-plus']:
112
- if judger_model_name == 'qwen-vl-chat':
113
- logging.warning('In this benchmark of version 20231206, `Qwen-vl-chat` is no longer used as the '
114
- 'evaluation model for `Visualization` task.. If you insist on using it, '
115
- 'the evaluation results might differ from the official results.')
116
- judger_model = get_model(judger_model_name)
117
- else:
118
- raise Exception("Not supported judger model.")
119
-
120
- one_action, one_action_right = 0, 0
121
- zero_action, zero_action_right = 0, 0
122
-
123
- data_list = load_jsonl(output_fname)
124
- for item in data_list:
125
- if 'visualization' not in item['tags']:
126
- continue
127
-
128
- item['vis_acc'] = False
129
- if '<|im_end|>' in item['query']:
130
- one_action += 1
131
- prompt = item['query'].split('<|im_end|>')[0]
132
- else:
133
- zero_action += 1
134
- prompt = item['query']
135
-
136
- images = extract_images(item['gen'])
137
-
138
- if images and check_images_observation(item['gen'], images,
139
- model_name):
140
- input_prompt = eval_visual_prompt[item.get('lang', 'en')]
141
- format_prompt = input_prompt.format(query=prompt)
142
- output = judger_model_inference(judger_model_name, judger_model, images, format_prompt)
143
- if 'right' in output.lower():
144
- item['vis_acc'] = True
145
- if '<|im_end|>' in item['query']:
146
- one_action_right += 1
147
- else:
148
- zero_action_right += 1
149
-
150
- logging.info('*' * 60)
151
- logging.info('{:^60}'.format('Visualization Acc.'))
152
- logging.info('*' * 60)
153
- logging.info(
154
- 'Visualization-Hard count={}, Visualization-Hard right count={}, Visualization-Hard acc={:.2f}'
155
- .format(zero_action, zero_action_right,
156
- zero_action_right / zero_action * 100))
157
- logging.info(
158
- 'Visualization-Easy count={}, Visualization-Easy right count={}, Visualization-Easy acc={:.2f}'
159
- .format(one_action, one_action_right,
160
- one_action_right / one_action * 100))
161
- logging.info('all count={}, all right={}, all acc={:.2f}'.format(
162
- zero_action + one_action, zero_action_right + one_action_right,
163
- (zero_action_right + one_action_right) / (zero_action + one_action) *
164
- 100))
165
-
166
- visualization_code_correctness[
167
- 'visualization-hard'] = zero_action_right / zero_action * 100
168
- visualization_code_correctness[
169
- 'visualization-easy'] = one_action_right / one_action * 100
170
-
171
- error_data_list = [
172
- item for item in data_list
173
- if 'visualization' in item['tags'] and not item['vis_acc']
174
- ]
175
- error_data_output_fname = os.path.splitext(
176
- output_fname)[0] + '_vis_error.jsonl'
177
- save_jsonl(error_data_list, error_data_output_fname)
178
-
179
- return visualization_code_correctness
 
benchmark/models/__init__.py DELETED
@@ -1,4 +0,0 @@
- from models.base import HFModel  # noqa
- from models.llm import LLM  # noqa
- from models.qwen import Qwen, QwenVL  # noqa
- from models.dashscope import QwenDashscopeVLModel
 
 
 
 
 
benchmark/models/base.py DELETED
@@ -1,17 +0,0 @@
- from transformers import AutoModelForCausalLM, AutoTokenizer
- from transformers.generation import GenerationConfig
-
-
- class HFModel(object):
-
-     def __init__(self, model_path):
-         self.tokenizer = AutoTokenizer.from_pretrained(model_path,
-                                                        trust_remote_code=True)
-         self.model = AutoModelForCausalLM.from_pretrained(
-             model_path,
-             trust_remote_code=True,
-             device_map='auto',
-             low_cpu_mem_usage=True).eval()
-         self.model.generation_config = GenerationConfig.from_pretrained(
-             model_path, trust_remote_code=True)
-         self.model.generation_config.do_sample = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmark/models/dashscope.py DELETED
@@ -1,40 +0,0 @@
- import logging
- import os
- import time
- from http import HTTPStatus
-
- import dashscope
-
-
- class QwenDashscopeVLModel(object):
-
-     def __init__(self, model, api_key):
-         self.model = model
-         dashscope.api_key = api_key.strip() or os.getenv('DASHSCOPE_API_KEY', default='')
-         assert dashscope.api_key, 'DASHSCOPE_API_KEY is required.'
-
-     def generate(self, prompt, stop_words=[]):
-         if isinstance(prompt, str):
-             prompt = [{'text': prompt}]
-
-         # Retry the API call up to MAX_TRY times; truncate the reply at the first stop word.
-         MAX_TRY = 3
-         count = 0
-         while count < MAX_TRY:
-             response = dashscope.MultiModalConversation.call(
-                 self.model,
-                 messages=[{'role': 'user', 'content': prompt}],
-                 top_p=0.01,
-                 top_k=1,
-             )
-             if response.status_code == HTTPStatus.OK:
-                 output = response.output.choices[0].message.content[0]['text']
-                 for stop_str in stop_words:
-                     idx = output.find(stop_str)
-                     if idx != -1:
-                         output = output[:idx + len(stop_str)]
-                 return output
-             else:
-                 err = 'Error code: %s, error message: %s' % (
-                     response.code,
-                     response.message,
-                 )
-                 logging.error(err)
-                 count += 1
-                 time.sleep(1)
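
A rough usage sketch for this wrapper, assuming a valid `DASHSCOPE_API_KEY` is available; the model name, image path, and judging instruction are illustrative only.

```python
import os

judger = QwenDashscopeVLModel(model='qwen-vl-plus', api_key=os.getenv('DASHSCOPE_API_KEY', ''))
# Multimodal prompts are passed as a list of content items, mixing images and text.
reply = judger.generate([
    {'image': 'file:///tmp/plot.png'},  # illustrative local image
    {'text': 'Does this chart correctly answer the question? Answer "right" or "wrong".'},
])
print(reply)
```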
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmark/models/llm.py DELETED
@@ -1,26 +0,0 @@
- import torch
- from models.base import HFModel
-
-
- class LLM(HFModel):
-
-     def __init__(self, model_path):
-         super().__init__(model_path)
-
-     def generate(self, input_text, stop_words=[], max_new_tokens=512):
-         if isinstance(input_text, str):
-             input_text = [input_text]
-
-         input_ids = self.tokenizer(input_text)['input_ids']
-         input_ids = torch.tensor(input_ids, device=self.model.device)
-         gen_kwargs = {'max_new_tokens': max_new_tokens, 'do_sample': False}
-         outputs = self.model.generate(input_ids, **gen_kwargs)
-         s = outputs[0][input_ids.shape[1]:]
-         output = self.tokenizer.decode(s, skip_special_tokens=True)
-
-         for stop_str in stop_words:
-             idx = output.find(stop_str)
-             if idx != -1:
-                 output = output[:idx + len(stop_str)]
-
-         return output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmark/models/qwen.py DELETED
@@ -1,36 +0,0 @@
- import torch
- from models.base import HFModel
-
-
- class Qwen(HFModel):
-
-     def __init__(self, model_path):
-         super().__init__(model_path)
-
-     def generate(self, input_text, stop_words=[]):
-         im_end = '<|im_end|>'
-         if im_end not in stop_words:
-             stop_words = stop_words + [im_end]
-         stop_words_ids = [self.tokenizer.encode(w) for w in stop_words]
-
-         input_ids = torch.tensor([self.tokenizer.encode(input_text)]).to(self.model.device)
-         output = self.model.generate(input_ids, stop_words_ids=stop_words_ids)
-         output = output.tolist()[0]
-         output = self.tokenizer.decode(output, errors='ignore')
-         assert output.startswith(input_text)
-         output = output[len(input_text):].replace('<|endoftext|>', '').replace(im_end, '')
-
-         return output
-
-
- class QwenVL(HFModel):
-
-     def __init__(self, model_path):
-         super().__init__(model_path)
-
-     def generate(self, inputs: list):
-         query = self.tokenizer.from_list_format(inputs)
-         response, _ = self.model.chat(self.tokenizer, query=query, history=None)
-
-         return response
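
Both wrappers reuse the `HFModel` loading logic above. A rough usage sketch (the checkpoint path and prompt are placeholders):

```python
model = Qwen('Qwen/Qwen-7B-Chat')  # placeholder checkpoint path
prompt = 'Question: what is 2 ** 10?\nThought:'
# Generation halts at the ReAct stop words, so the continuation can be cut
# at the point where an Observation would be inserted.
continuation = model.generate(prompt, stop_words=['Observation:', 'Observation:\n'])
print(continuation)
```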
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmark/parser/__init__.py DELETED
@@ -1,2 +0,0 @@
- from parser.internlm_parser import InternLMReActParser  # noqa
- from parser.react_parser import ReActParser  # noqa
 
 
 
benchmark/parser/internlm_parser.py DELETED
@@ -1,11 +0,0 @@
- from parser.react_parser import ReActParser
-
-
- class InternLMReActParser(ReActParser):
-
-     def __init__(self):
-         self.action = '\nAction:'
-         self.action_input = '\nActionInput:'
-         self.action_input_stop = '<eoa>'
-         self.observation = '<|System|>:Response:'
-         self.observation_stop = '<TOKENS_UNUSED_2>\n<|Bot|>:'
 
 
 
 
 
 
 
 
 
 
 
 
benchmark/parser/react_parser.py DELETED
@@ -1,46 +0,0 @@
- class ReActParser(object):
-
-     def __init__(self):
-         self.action = '\nAction:'
-         self.action_input = '\nAction Input:'
-         self.action_input_stop = '\nObservation:'
-         self.observation = '\nObservation:'
-         self.observation_stop = '\nThought:'
-
-     def parse_latest_plugin_call(self, text):
-         action = self.action
-         action_input = self.action_input
-         observation = self.action_input_stop
-         plugin_name, plugin_args = '', ''
-         i = text.rfind(action)
-         j = text.rfind(action_input)
-         k = text.rfind(observation)
-         if 0 <= i < j:  # If the text has `Action` and `Action Input`,
-             if k < j:  # but does not contain `Observation`,
-                 # then it is likely that `Observation` was omitted by the LLM,
-                 # because the output text may have discarded the stop word.
-                 text = text.rstrip() + observation  # Add it back.
-                 k = text.rfind(observation)
-             plugin_name = text[i + len(action):j].strip()
-             plugin_args = text[j + len(action_input):k].strip()
-             text = text[:k]
-         return plugin_name, plugin_args, text
-
-     def _extract_first_target(self, text, start_flag, end_flag):
-         target = ''
-         i = text.find(start_flag)
-         if i != -1:
-             j = text.find(end_flag, i)
-             if j != -1:
-                 target = text[i + len(start_flag):j].strip()
-             else:
-                 target = text[i + len(start_flag):].strip()
-         return target
-
-     def get_first_observation(self, text):
-         return self._extract_first_target(text, self.observation,
-                                           self.observation_stop)
-
-     def get_first_action_input(self, text):
-         return self._extract_first_target(text, self.action_input,
-                                           self.action_input_stop)
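
To make the parsing behaviour concrete, a small hand-written ReAct completion (the text is made up for the example):

```python
parser = ReActParser()
text = ('Thought: I should run some code.\n'
        'Action: code_interpreter\n'
        'Action Input:\n```py\nprint(1 + 1)\n```\n')
name, args, prefix = parser.parse_latest_plugin_call(text)
# name   -> 'code_interpreter'
# args   -> '```py\nprint(1 + 1)\n```'
# prefix -> the text up to the re-appended '\nObservation:' marker
```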
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmark/prompt/__init__.py DELETED
@@ -1,4 +0,0 @@
- from prompt.internlm_react import InternLMReAct  # noqa
- from prompt.llama_react import LlamaReAct  # noqa
- from prompt.qwen_react import QwenReAct  # noqa
- from prompt.react import ReAct  # noqa
 
 
 
 
 
benchmark/prompt/internlm_react.py DELETED
@@ -1,103 +0,0 @@
- from prompt.react import ReAct
-
- INTERNLM_TOOL_DESCRIPTION = """用来执行Python代码。代码必须是一个函数,
- 函数名必须得是 'solution',代码对应你的思考过程。代码实例格式如下:
- ```python
- # import 依赖包
- import xxx
- def solution():
-     # 初始化一些变量
-     variable_names_with_real_meaning = xxx
-     # 步骤一
-     mid_variable = func(variable_names_with_real_meaning)
-     # 步骤 x
-     mid_variable = func(mid_variable)
-     # 最后结果
-     final_answer = func(mid_variable)
-     return final_answer
- ```"""
-
- INTERNLM_TOOL = {'PythonInterpreter': INTERNLM_TOOL_DESCRIPTION}
-
- INTERNLM_REACT_PROMPT_ZH = """<|System|>:你是一个可以调用外部工具的助手,可以使用的工具包括:
- {tools_text}
- 如果使用工具请遵循以下格式回复:
- ```
- Thought:思考你当前步骤需要解决什么问题,是否需要使用工具
- Action:工具名称,你的工具必须从 [{tools_name_text}] 选择
- ActionInput:工具输入参数
- ```
- 工具返回按照以下格式回复:
- ```
- Response:调用工具后的结果
- ```
- 如果你已经知道了答案,或者你不需要工具,请遵循以下格式回复
- ```
- Thought:给出最终答案的思考过程
- FinalAnswer:最终答案
- ```
- 开始!<TOKENS_UNUSED_2>
- <|User|>:{query}<eoh>
- <|Bot|>:"""
-
- INTERNLM_REACT_PROMPT_EN = """<|System|>:You are a assistant who can utilize external tools.
- {tools_text}
- To use a tool, please use the following format:
- ```
- Thought: Think what you need to solve, do you need to use tools?
- Action: the tool name, should be one of [{tools_name_text}]
- ActionInput: the input to the action
- ```
- The response after utilizing tools should using the following format:
- ```
- Response: the results after call the tool.
- ``
- If you already know the answer, or you do not need to use tools,
- please using the following format to reply:
- ```
- Thought: the thought process to get the final answer
- FinalAnswer: final answer
- ```
- Begin!<TOKENS_UNUSED_2>
- <|User|>:{query}<eoh>
- <|Bot|>:"""
-
-
- class InternLMReAct(ReAct):
-
-     def __init__(self, query, lang='en', upload_file_paths=[]):
-         super().__init__(query, lang, upload_file_paths)
-         self.react_template = INTERNLM_REACT_PROMPT_ZH if self.lang == 'zh' else INTERNLM_REACT_PROMPT_EN
-
-     def build_prompt(self):
-         planning_prompt = super().build_prompt()
-         if '<|im_end|>' in self.query and planning_prompt.endswith(
-                 '<eoh>\n<|Bot|>:'):
-             planning_prompt = planning_prompt[:-len('<eoh>\n<|Bot|>:')]
-
-         if '<|im_end|>' in self.query:
-             planning_prompt = planning_prompt.replace(
-                 '<|im_end|>\n<|im_start|>assistant\n',
-                 '<eoh>\n<|Bot|>:').replace(
-                     'Observation:', '<eoa>\n<|System|>:Response:').replace(
-                         '\nAction Input',
-                         '\nActionInput').replace('code_interpreter',
-                                                  'PythonInterpreter')
-             assert planning_prompt.endswith('Thought:')
-             planning_prompt = planning_prompt[:-len(
-                 'Thought:')] + '<TOKENS_UNUSED_2>\n<|Bot|>:'
-
-         self.prompt = planning_prompt
-         return planning_prompt
-
-     def _build_tools_text(self):
-         return INTERNLM_TOOL
-
-     def _build_tools_name_text(self):
-         return list(INTERNLM_TOOL.keys())
-
-     def build_observation(self, observation):
-         return f'<eoa>\n<|System|>:Response:{observation}\n<TOKENS_UNUSED_2>\n<|Bot|>:'
-
-     def get_stop_words_list(self):
-         return ['<eoa>']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmark/prompt/llama_react.py DELETED
@@ -1,20 +0,0 @@
- from prompt.react import ReAct
-
-
- class LlamaReAct(ReAct):
-
-     def __init__(self, query, lang='en', upload_file_paths=[]):
-         super().__init__(query, lang, upload_file_paths)
-
-     def build_prompt(self):
-         planning_prompt = super().build_prompt()
-         planning_prompt = '[INST] ' + planning_prompt + ' [/INST]'
-
-         if '<|im_end|>' in self.query:
-             planning_prompt = planning_prompt.replace(
-                 '<|im_end|>\n<|im_start|>assistant', ' [/INST] ')
-             assert planning_prompt.endswith(' [/INST]')
-             planning_prompt = planning_prompt[:-len(' [/INST]')]
-
-         self.prompt = planning_prompt
-         return planning_prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmark/prompt/qwen_react.py DELETED
@@ -1,80 +0,0 @@
- import json
- import os
-
- from prompt.react import ReAct
-
- QWEN_TOOLS_LIST = [
-     {
-         'name_for_human': '代码解释器',
-         'name_for_model': 'code_interpreter',
-         'description_for_model': '代码解释器,可用于执行Python代码。',
-         'parameters': [{
-             'name': 'code',
-             'type': 'string',
-             'description': '待执行的代码'
-         }],
-         'args_format': 'code'
-     },
- ]
-
- TOOL_DESC = """{name_for_model}: Call this tool to interact with the {name_for_human} API. What is the {name_for_human} API useful for? {description_for_model} Parameters: {parameters}"""
-
-
- class QwenReAct(ReAct):
-
-     def __init__(self, query, lang='en', upload_file_paths=[]):
-         super().__init__(query, lang, upload_file_paths)
-
-         self.upload_file_paths = [
-             f'{os.path.basename(fname)}' for fname in upload_file_paths
-         ]
-         self.list_of_plugin_info = QWEN_TOOLS_LIST
-         self.fname_template = {
-             'zh': '[上传文件{fname_str}]',
-             'en': '[Upload file {fname_str}]',
-             'en_multi': '[Upload file {fname_str}]'
-         }
-
-     def build_prompt(self):
-         im_start = '<|im_start|>'
-         im_end = '<|im_end|>'
-         prompt = f'{im_start}system\nYou are a helpful assistant.{im_end}'
-
-         query = super().build_prompt()
-
-         query = query.lstrip('\n').rstrip()
-         prompt += f'\n{im_start}user\n{query}{im_end}'
-         if f'{im_start}assistant' not in query:
-             prompt += f'\n{im_start}assistant\n{im_end}'
-         assert prompt.endswith(f'\n{im_start}assistant\n{im_end}')
-
-         prompt = prompt[:-len(f'{im_end}')]
-         self.prompt = prompt
-         return prompt
-
-     def _build_tools_text(self):
-         # tool info
-         tools_text = []
-         for plugin_info in self.list_of_plugin_info:
-             tool = TOOL_DESC.format(
-                 name_for_model=plugin_info['name_for_model'],
-                 name_for_human=plugin_info['name_for_human'],
-                 description_for_model=plugin_info['description_for_model'],
-                 parameters=json.dumps(plugin_info['parameters'],
-                                       ensure_ascii=False),
-             )
-             if plugin_info.get('args_format', 'json') == 'json':
-                 tool += ' Format the arguments as a JSON object.'
-             elif plugin_info['args_format'] == 'code':
-                 tool += ' Enclose the code within triple backticks (`) at the beginning and end of the code.'
-             else:
-                 raise NotImplementedError
-             tools_text.append(tool)
-         tools_text = '\n\n'.join(tools_text)
-         return tools_text
-
-     def _build_tools_name_text(self):
-         return ', '.join([
-             plugin_info['name_for_model']
-             for plugin_info in self.list_of_plugin_info
-         ])
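
A sketch of what this builder produces (query and upload path are illustrative): the ReAct instructions are wrapped in ChatML markers, and the returned string ends with `<|im_start|>assistant\n`, ready for the model to continue.

```python
builder = QwenReAct('Plot a sine curve and save it as sine.png',
                    lang='en',
                    upload_file_paths=['upload_file/data.csv'])  # illustrative path
prompt = builder.build_prompt()
assert prompt.endswith('<|im_start|>assistant\n')
stop_words = builder.get_stop_words_list()  # ['Observation:', 'Observation:\n'], inherited from ReAct
```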
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmark/prompt/react.py DELETED
@@ -1,87 +0,0 @@
- import os
-
- tools_text = """code_interpreter: Call this tool to interact with the Code Interpreter API.
- What is the Code Interpreter API useful for?
- Code Interpreter is used to execute Python code to deal with the following tasks:
- 1. Solving mathematical problems, both quantitative and qualitative
- 2. Doing data analysis and visualization
- 3. Converting files between formats
- Parameters:
- ```py
- code
- ```
- Enclose the code within triple backticks (```) at the beginning and end of the code.
- """
-
- REACT_PROMPT = """Answer the following questions as best you can. You have access to the following tools:
-
- {tools_text}
-
- Use the following format:
-
- Question: the input question you must answer
- Thought: you should always think about what to do
- Action: the action to take, should be one of [{tools_name_text}]
- Action Input: the input to the action
- Observation: the result of the action
- ... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
- Thought: I now know the final answer
- Final Answer: the final answer to the original input question
-
- Begin!
-
- Question: {query}"""
-
- fname_template = {
-     'zh': '文件{fname_str},',
-     'en_multi': 'Files {fname_str}. ',
-     'en': 'File {fname_str}. ',
- }
-
-
- class ReAct(object):
-
-     def __init__(self, query, lang='en', upload_file_paths=[]):
-         self.query = query
-         self.lang = lang
-         self.upload_file_paths = [
-             f'`{os.path.basename(fname)}`' for fname in upload_file_paths
-         ]
-
-         self.fname_template = fname_template
-         self.react_template = REACT_PROMPT
-         self.prompt = ''
-
-     def build_prompt(self):
-         query = self._format_upload_fname() + self.query
-         tools_text = self._build_tools_text()
-         tools_name_text = self._build_tools_name_text()
-         planning_prompt = self.react_template.format(
-             query=query,
-             tools_text=tools_text,
-             tools_name_text=tools_name_text)
-
-         self.prompt = planning_prompt
-         return planning_prompt
-
-     def _format_upload_fname(self):
-         prefix = ''
-         if self.upload_file_paths:
-             fname_str = ', '.join(self.upload_file_paths)
-             lang_key = 'en_multi' if self.lang == 'en' and len(
-                 self.upload_file_paths) > 1 else self.lang
-             fname_template = self.fname_template[lang_key]
-             prefix = fname_template.format(fname_str=fname_str)
-         return prefix
-
-     def _build_tools_text(self):
-         return tools_text
-
-     def _build_tools_name_text(self):
-         return 'code_interpreter'
-
-     def build_observation(self, observation):
-         return f'\nObservation: {observation}\nThought:'
-
-     def get_stop_words_list(self):
-         return ['Observation:', 'Observation:\n']
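
Taken together, these hooks support the usual ReAct loop: build the prompt, let the model propose an Action, execute it, and append the result as an Observation. A rough single-step sketch, with `model` and the executed result standing in for the real inference and code-execution calls:

```python
react = ReAct('What is 2 ** 10?', lang='en')
prompt = react.build_prompt()

completion = model.generate(prompt, stop_words=react.get_stop_words_list())  # `model` is a stand-in
# The real harness trims the trailing stop word from `completion` before appending
# the executor's output via build_observation(), then asks the model to continue.
prompt += completion + react.build_observation('1024')  # '1024' stands in for the code result
```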
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmark/requirements.txt DELETED
@@ -1,13 +0,0 @@
- accelerate>=0.20.3
- func_timeout
- json5
- matplotlib
- numpy
- pandas
- PrettyTable
- scipy
- seaborn
- sympy
- transformers==4.33.1
- transformers_stream_generator
- openai
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmark/utils/__init__.py DELETED
File without changes
benchmark/utils/code_utils.py DELETED
@@ -1,31 +0,0 @@
- import os
- import re
-
- import json5
-
-
- def replace_upload_fname(text, upload_fname_list):
-     for full_input_fname in upload_fname_list:
-         if full_input_fname not in text and os.path.basename(
-                 full_input_fname) in text:
-             text = text.replace(os.path.basename(full_input_fname),
-                                 full_input_fname)
-     return text
-
-
- def extract_code(text):
-     # Match triple-backtick blocks first
-     triple_match = re.search(r'```[^\n]*\n(.+?)```', text, re.DOTALL)
-     # Match single-backtick blocks second
-     single_match = re.search(r'`([^`]*)`', text, re.DOTALL)
-     if triple_match:
-         text = triple_match.group(1)
-     elif single_match:
-         text = single_match.group(1)
-     else:
-         # Otherwise, try to parse the text as a JSON object with a 'code' field
-         try:
-             text = json5.loads(text)['code']
-         except Exception:
-             pass
-     # If nothing matched, the original text is returned unchanged
-     return text
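
A few illustrative calls showing the fallback order (triple backticks, then single backticks, then a JSON `code` field):

```python
print(extract_code("```py\nprint('hi')\n```"))  # -> "print('hi')\n"
print(extract_code('`1 + 1`'))                  # -> '1 + 1'
print(extract_code('{"code": "x = 1"}'))        # -> 'x = 1'
```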
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmark/utils/data_utils.py DELETED
@@ -1,28 +0,0 @@
- import json
- import logging
-
- from tqdm import tqdm
-
-
- def load_jsonl(path):
-     data = []
-     with open(path, 'r', encoding='utf8') as f:
-         for idx, line in enumerate(f, start=1):
-             try:
-                 data.append(json.loads(line))
-             except Exception as e:
-                 logging.info(line)
-                 logging.warning(f'Error at line {idx}: {e}')
-                 continue
-     return data
-
-
- def save_jsonl(data, path, progress=False, enabled=True):
-     if not enabled:
-         return
-     with open(path, 'w', encoding='utf-8') as f:
-         if progress:
-             data = tqdm(data)
-         for item in data:
-             line = json.dumps(item, ensure_ascii=False)
-             print(line, file=f)
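
A quick round trip with these helpers (the path is illustrative); malformed lines are skipped with a warning rather than aborting the whole load.

```python
rows = [{'query': 'plot a histogram', 'tags': ['visualization']}]
save_jsonl(rows, '/tmp/demo.jsonl')
assert load_jsonl('/tmp/demo.jsonl') == rows
```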
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
setup.py DELETED
@@ -1,16 +0,0 @@
- from setuptools import find_packages, setup
-
-
- def read_requirements():
-     with open('requirements.txt') as req:
-         content = req.read()
-         requirements = content.split('\n')
-     return requirements
-
-
- setup(
-     name='qwen_agent',
-     version='0.0.1',
-     packages=find_packages(),
-     install_requires=read_requirements(),
- )